diff --git a/.gitattributes b/.gitattributes index 412eeda78d..d63baf172c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -20,3 +20,7 @@ *.PDF diff=astextplain *.rtf diff=astextplain *.RTF diff=astextplain + +*.cu diff=cpp + + diff --git a/.gitignore b/.gitignore index 147df83941..d4fb4d4284 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ - ccminer *.o @@ -7,7 +6,6 @@ autom4te.cache Makefile Makefile.in -INSTALL aclocal.m4 configure configure.lineno @@ -15,7 +13,7 @@ depcomp missing install-sh stamp-h1 -cpuminer-config.h* +ccminer-config.h.in compile config.log config.status @@ -41,6 +39,8 @@ x64/Release/ *.opensdf *.sdf *.pdb +*.db +*.opendb .settings/ .project @@ -50,5 +50,3 @@ x64/Release/ .cproject .buildpath -compat/curl-for-windows/ - diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu index 21b0276eae..0cc43ec50b 100644 --- a/Algo256/blake256.cu +++ b/Algo256/blake256.cu @@ -10,10 +10,12 @@ extern "C" { #include "sph/sph_blake.h" -#include -#include } +#include +#include + + /* threads per block and throughput (intensity) */ #define TPB 128 @@ -21,7 +23,7 @@ extern "C" { extern "C" int blake256_rounds = 14; /* hash by cpu with blake 256 */ -extern "C" void blake256hash(void *output, const void *input, int8_t rounds = 14) +void blake256hash(void *output, const void *input, int8_t rounds = 14) { uchar hash[64]; sph_blake256_context ctx; @@ -38,7 +40,8 @@ extern "C" void blake256hash(void *output, const void *input, int8_t rounds = 14 #include "cuda_helper.h" #if PRECALC64 -__constant__ uint32_t _ALIGN(32) d_data[12]; +__constant__ uint32_t _ALIGN(32) d_data[15]; +static THREAD uint32_t *h_data; #else __constant__ static uint32_t _ALIGN(32) c_data[20]; /* midstate hash cache, this algo is run on 2 parts */ @@ -50,32 +53,11 @@ extern "C" uint32_t crc32_u32t(const uint32_t *buf, size_t size); /* 8 adapters max */ static uint32_t *d_resNonce[MAX_GPUS]; -static uint32_t *h_resNonce[MAX_GPUS]; +static THREAD uint32_t *h_resNonce; /* max count of 
found nonces in one call */ #define NBN 2 -static uint32_t extra_results[NBN] = { UINT32_MAX }; - -/* prefer uint32_t to prevent size conversions = speed +5/10 % */ -__constant__ -static uint32_t _ALIGN(32) c_sigma[16][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } -}; +static uint32_t extra_results[MAX_GPUS][NBN] = { UINT32_MAX }; #if !PRECALC64 __device__ __constant__ @@ -87,41 +69,18 @@ static const uint32_t __align__(32) c_IV256[8] = { }; #endif -__device__ __constant__ -static const uint32_t __align__(32) c_u256[16] = { - SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), - SPH_C32(0x13198A2E), SPH_C32(0x03707344), - SPH_C32(0xA4093822), SPH_C32(0x299F31D0), - SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89), - SPH_C32(0x452821E6), SPH_C32(0x38D01377), - SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), - SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), - SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917) -}; - -#define GS(a,b,c,d,x) { \ - const uint32_t idx1 = c_sigma[r][x]; \ - const uint32_t idx2 = c_sigma[r][x+1]; \ - v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \ - v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \ 
+#define GSPREC(a,b,c,d,x,y) { \ + v[a] += (m[x] ^ c_u256[y]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \ v[c] += v[d]; \ - v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ -\ - v[a] += (m[idx2] ^ c_u256[idx1]) + v[b]; \ - v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \ + v[b] = ROTR32(v[b] ^ v[c], 12); \ + v[a] += (m[y] ^ c_u256[x]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \ v[c] += v[d]; \ - v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ -} + v[b] = ROTR32(v[b] ^ v[c], 7); \ + } /* Second part (64-80) msg never change, store it */ -__device__ __constant__ -static const uint32_t __align__(32) c_Padding[16] = { - 0, 0, 0, 0, - 0x80000000UL, 0, 0, 0, - 0, 0, 0, 0, - 0, 1, 0, 640, -}; - __device__ static void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, const int rounds) { @@ -133,7 +92,29 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, co m[2] = block[2]; m[3] = block[3]; - for (uint32_t i = 4; i < 16; i++) { + const uint32_t c_u256[16] = + { + SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), + SPH_C32(0x13198A2E), SPH_C32(0x03707344), + SPH_C32(0xA4093822), SPH_C32(0x299F31D0), + SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89), + SPH_C32(0x452821E6), SPH_C32(0x38D01377), + SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), + SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), + SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917) + }; + + const uint32_t c_Padding[16] = { + 0, 0, 0, 0, + 0x80000000UL, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640, + }; + + + #pragma unroll + for (uint32_t i = 4; i < 16; i++) + { #if PRECALC64 m[i] = c_Padding[i]; #else @@ -141,7 +122,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, co #endif } - //#pragma unroll 8 +#pragma unroll for(uint32_t i = 0; i < 8; i++) v[i] = h[i]; @@ -155,22 +136,288 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, co v[14] = c_u256[6]; v[15] = c_u256[7]; - for (int r = 0; r < rounds; r++) { - /* column step */ - GS(0, 4, 0x8, 0xC, 0x0); 
- GS(1, 5, 0x9, 0xD, 0x2); - GS(2, 6, 0xA, 0xE, 0x4); - GS(3, 7, 0xB, 0xF, 0x6); - /* diagonal step */ - GS(0, 5, 0xA, 0xF, 0x8); - GS(1, 6, 0xB, 0xC, 0xA); - GS(2, 7, 0x8, 0xD, 0xC); - GS(3, 4, 0x9, 0xE, 0xE); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC,0,1); + GSPREC(1, 5, 0x9, 0xD,2,3); + GSPREC(2, 6, 0xA, 0xE, 4,5); + GSPREC(3, 7, 0xB, 0xF, 6,7); + GSPREC(0, 5, 0xA, 0xF, 8,9); + GSPREC(1, 6, 0xB, 0xC, 10,11); + GSPREC(2, 7, 0x8, 0xD, 12,13); + GSPREC(3, 4, 0x9, 0xE, 14,15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + 
GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 4, 0x9, 0xE, 1, 9); + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 8, 11); + +// { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); + GSPREC(3, 4, 0x9, 0xE, 2, 10); +// { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + GSPREC(0, 4, 0x8, 0xC, 6, 15); + GSPREC(1, 5, 0x9, 0xD, 14, 9); + GSPREC(2, 6, 0xA, 0xE, 11, 3); + GSPREC(3, 7, 0xB, 0xF, 0, 8); + GSPREC(0, 5, 0xA, 0xF, 12, 2); + GSPREC(1, 6, 0xB, 0xC, 13, 7); + GSPREC(2, 7, 0x8, 0xD, 1, 4); + GSPREC(3, 4, 0x9, 0xE, 10, 5); +// { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + GSPREC(0, 4, 0x8, 0xC, 10, 2); + GSPREC(1, 5, 0x9, 0xD, 8, 4); + GSPREC(2, 6, 0xA, 0xE, 7, 6); + GSPREC(3, 7, 0xB, 0xF, 1, 5); + GSPREC(0, 5, 0xA, 0xF, 15, 11); + GSPREC(1, 6, 0xB, 0xC, 9, 14); + GSPREC(2, 7, 0x8, 0xD, 3, 12); + GSPREC(3, 4, 0x9, 0xE, 13, 0); +// { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + +// { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 
0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + +// { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); +// GSPREC(3, 4, 0x9, 0xE, 15, 8); + + +#if PRECALC64 + // only compute h6 & 7 +// h[6U] ^= v[6U] ^ v[14U]; + h[7] ^= v[7] ^ v[15]; +#else + //#pragma unroll 16 + for (uint32_t i = 0; i < 16; i++) { + uint32_t j = i & 7U; + h[j] ^= v[i]; + } +#endif +} + + +/* Second part (64-80) msg never change, store it */ +__device__ static +void blake256_compress_8(uint32_t *const __restrict__ h, const uint32_t *const __restrict__ block) +{ + uint32_t /*_ALIGN(8)*/ m[16]; + uint32_t v[16]; + + m[0] = block[0]; + m[1] = block[1]; + m[2] = block[2]; + m[3] = block[3]; + + const uint32_t c_u256[16] = + { + SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), + SPH_C32(0x13198A2E), SPH_C32(0x03707344), + SPH_C32(0xA4093822), SPH_C32(0x299F31D0), + SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89), + SPH_C32(0x452821E6), SPH_C32(0x38D01377), + SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), + SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), + SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917) + }; + + const uint32_t c_Padding[16] = { + 0, 0, 0, 0, + 0x80000000UL, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640, + }; + + +#pragma unroll + for (int i = 4; i < 16; i++) + { + m[i] = c_Padding[i]; } + +#pragma unroll + for(int i = 0; i < 8; i++) + v[i] = h[i]; + + v[ 9] = c_u256[1]; + v[10] = c_u256[2]; + v[11] = 
c_u256[3]; + + v[13] = c_u256[5] ^ 640; + v[14] = c_u256[6]; + v[15] = c_u256[7]; + + v[0] = d_data[11]; + v[4] = d_data[12]; + v[8] = d_data[13]; + v[12] = d_data[14]; + + GSPREC(1, 5, 0x9, 0xD,2,3); + GSPREC(2, 6, 0xA, 0xE, 4,5); + GSPREC(3, 7, 0xB, 0xF, 6,7); + GSPREC(0, 5, 0xA, 0xF, 8,9); + GSPREC(1, 6, 0xB, 0xC, 10,11); + GSPREC(2, 7, 0x8, 0xD, 12,13); + GSPREC(3, 4, 0x9, 0xE, 14,15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 
4, 0x9, 0xE, 1, 9); + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 8, 11); + + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); +// GSPREC(3, 4, 0x9, 0xE, 2, 10); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + + #if PRECALC64 // only compute h6 & 7 - h[6U] ^= v[6U] ^ v[14U]; - h[7U] ^= v[7U] ^ v[15U]; +// h[6] ^= v[6] ^ v[14]; + h[7] ^= v[7] ^ v[15]; #else //#pragma unroll 16 for (uint32_t i = 0; i < 16; i++) { @@ -180,12 +427,13 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, co #endif } + #if !PRECALC64 /* original method */ __global__ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint64_t highTarget, const int crcsum, const int rounds) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { const uint32_t nonce = startNonce + thread; @@ -253,23 +501,23 @@ uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const ui if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) return result; - blake256_gpu_hash_80<<>>(threads, startNonce, d_resNonce[thr_id], highTarget, crcsum, (int) rounds); - cudaDeviceSynchronize(); - if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { + blake256_gpu_hash_80<<>>(threads, startNonce, d_resNonce[thr_id], highTarget, 
crcsum, (int) rounds); + //cudaDeviceSynchronize(); + if (cudaSuccess == cudaMemcpyAsync(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { //cudaDeviceSynchronize(); /* seems no more required */ result = h_resNonce[thr_id][0]; for (int n=0; n < (NBN-1); n++) - extra_results[n] = h_resNonce[thr_id][n+1]; + extra_results[thr_id][n] = h_resNonce[thr_id][n+1]; } return result; } __host__ -void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget) +void blake256_cpu_setBlock_80(int thr_id, uint32_t *pdata, const uint32_t *ptarget) { uint32_t data[20]; memcpy(data, pdata, 80); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice)); } #else @@ -278,10 +526,10 @@ void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget) __global__ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, - const uint64_t highTarget, const int rounds, const bool trace) + const uint32_t Target6, const uint32_t Target7, const int rounds, const bool trace) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { const uint32_t nonce = startNonce + thread; uint32_t _ALIGN(16) h[8]; @@ -301,14 +549,57 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin blake256_compress(h, ending, 640, rounds); - if (h[7] == 0 && cuda_swab32(h[6]) <= highTarget) { + if (h[7] <= Target7) + { #if NBN == 2 - /* keep the smallest nonce, + extra one if found */ - if (resNonce[0] > nonce) { - resNonce[1] = resNonce[0]; - resNonce[0] = nonce; + uint32_t tmp = atomicCAS(resNonce, 0xffffffff, nonce); + if(tmp != 0xffffffff) + resNonce[1] = nonce; +#else + resNonce[0] = nonce; +#endif +#ifdef _DEBUG + if (trace) { + uint64_t high64 
= ((uint64_t*)h)[3]; + printf("gpu: %16llx\n", high64); + printf("gpu: %08x.%08x\n", h[7], h[6]); + printf("tgt: %16llx\n", highTarget); } - else +#endif + } + } +} + + +__global__ +void blake256_gpu_hash_16_8(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, + const uint32_t Target6, const uint32_t Target7, const int rounds, const bool trace) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + uint32_t _ALIGN(16) h[8]; + +#pragma unroll + for (int i = 0; i < 8; i++) + h[i] = d_data[i]; + + // ------ Close: Bytes 64 to 80 ------ + + uint32_t _ALIGN(16) ending[4]; + ending[0] = d_data[8]; + ending[1] = d_data[9]; + ending[2] = d_data[10]; + ending[3] = nonce; /* our tested value */ + + blake256_compress_8(h, ending); + + if (h[7] <= Target7) + { +#if NBN == 2 + uint32_t tmp = atomicCAS(resNonce, 0xffffffff, nonce); + if(tmp != 0xffffffff) resNonce[1] = nonce; #else resNonce[0] = nonce; @@ -325,8 +616,9 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin } } + __host__ -static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget, +static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint32_t Target6, const uint32_t Target7, const int8_t rounds) { uint32_t result = UINT32_MAX; @@ -334,18 +626,23 @@ static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, c dim3 grid((threads + TPB-1)/TPB); dim3 block(TPB); - /* Check error on Ctrl+C or kill to prevent segfaults on exit */ - if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) - return result; + CUDA_SAFE_CALL(cudaMemsetAsync(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t), gpustream[thr_id])); - blake256_gpu_hash_16 <<>> (threads, startNonce, d_resNonce[thr_id], highTarget, (int) rounds, 
opt_tracegpu); - cudaDeviceSynchronize(); - if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { - //cudaDeviceSynchronize(); /* seems no more required */ - result = h_resNonce[thr_id][0]; - for (int n=0; n < (NBN-1); n++) - extra_results[n] = h_resNonce[thr_id][n+1]; + if(rounds == 8) + blake256_gpu_hash_16_8 << > > (threads, startNonce, d_resNonce[thr_id], Target6, Target7, (int)rounds, opt_tracegpu); + else + { + if(rounds == 14) + blake256_gpu_hash_16 << > > (threads, startNonce, d_resNonce[thr_id], Target6, Target7, (int)rounds, opt_tracegpu); + else + applog(LOG_ERR, "Number of blake rounds not supported"); } + CUDA_SAFE_CALL(cudaMemcpyAsync(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); cudaStreamSynchronize(gpustream[thr_id]); + CUDA_SAFE_CALL(cudaStreamSynchronize(gpustream[thr_id])); + result = h_resNonce[0]; + + for (int n=0; n < (NBN-1); n++) + extra_results[thr_id][n] = h_resNonce[n + 1]; return result; } @@ -364,119 +661,154 @@ static void blake256mid(uint32_t *output, const uint32_t *input, int8_t rounds = } __host__ -void blake256_cpu_setBlock_16(uint32_t *penddata, const uint32_t *midstate, const uint32_t *ptarget) +static void blake256_cpu_setBlock_16(int thr_id, uint32_t *penddata, const uint32_t *midstate, const uint32_t *ptarget) { - uint32_t _ALIGN(64) data[11]; - memcpy(data, midstate, 32); - data[8] = penddata[0]; - data[9] = penddata[1]; - data[10]= penddata[2]; - CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 32 + 12, 0, cudaMemcpyHostToDevice)); + memcpy(h_data, midstate, 32); + h_data[8] = penddata[0]; + h_data[9] = penddata[1]; + h_data[10] = penddata[2]; + + // precalc v[0], v[4], v[8], v[12] + h_data[11] = h_data[0] + (h_data[8] ^ 0x85A308D3) + h_data[4]; + h_data[14] = ROTL32(0xA4093822 ^ 640 ^ h_data[11], 16); + h_data[13] = 0x243F6A88 + h_data[14]; + h_data[12] = ROTR32(h_data[4] ^ h_data[13], 12); + h_data[11] += 
(h_data[9] ^ 0x243F6A88) + h_data[12]; + h_data[14] = ROTR32(h_data[14] ^ h_data[11], 8); + h_data[13] += h_data[14]; + h_data[12] = ROTR32(h_data[12] ^ h_data[13], 7); + + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(d_data, h_data, 15 * 4, 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } #endif -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds=14) +extern int scanhash_blake256(int thr_id, uint32_t *pdata, uint32_t *ptarget, + uint32_t max_nonce, uint32_t *hashes_done, int8_t blakerounds=14) { const uint32_t first_nonce = pdata[19]; - uint64_t targetHigh = ((uint64_t*)ptarget)[3]; uint32_t _ALIGN(64) endiandata[20]; #if PRECALC64 uint32_t _ALIGN(64) midstate[8]; #else uint32_t crcsum; #endif - unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 20; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity); - throughput = min(throughput, max_nonce - first_nonce); + unsigned int intensity = 28; + uint32_t throughput = device_intensity(device_map[thr_id], __func__, 1U << intensity); + throughput = min(throughput, max_nonce - first_nonce) & 0xfffffc00; int rc = 0; - if (opt_benchmark) { - targetHigh = 0x1ULL << 32; - ((uint32_t*)ptarget)[6] = swab32(0x4); + if (opt_benchmark) + { + ptarget[7] = 0x00000000; + ptarget[6] = 0xffffffff; } + uint32_t target6 = ptarget[6]; + uint32_t target7 = swab32(ptarget[7]); // don't ask me why - if (opt_tracegpu) { + if (opt_tracegpu) + { /* test call from util.c */ throughput = 1; for (int k = 0; k < 20; k++) pdata[k] = swab32(pdata[k]); } - if (!init[thr_id]) { - if (active_gpus > 1) - CUDA_CALL_OR_RET_X(cudaSetDevice(device_map[thr_id]),0); - CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t)), 0); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), 0); - init[thr_id] = true; + static THREAD volatile bool 
init = false; + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMallocHost(&h_data, 15 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMallocHost(&h_resNonce, NBN * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t))); + mining_has_stopped[thr_id] = false; + init = true; } #if PRECALC64 for (int k = 0; k < 16; k++) be32enc(&endiandata[k], pdata[k]); blake256mid(midstate, endiandata, blakerounds); - blake256_cpu_setBlock_16(&pdata[16], midstate, ptarget); + blake256_cpu_setBlock_16(thr_id, &pdata[16], midstate, ptarget); #else - blake256_cpu_setBlock_80(pdata, ptarget); + blake256_cpu_setBlock_80(thr_id, pdata, ptarget); crcsum = crc32_u32t(pdata, 64); #endif /* PRECALC64 */ do { - uint32_t foundNonce = #if PRECALC64 // GPU HASH (second block only, first is midstate) - blake256_cpu_hash_16(thr_id, throughput, pdata[19], targetHigh, blakerounds); + uint32_t foundNonce = blake256_cpu_hash_16(thr_id, throughput, pdata[19], target6, target7, blakerounds); #else // GPU FULL HASH - blake256_cpu_hash_80(thr_id, throughput, pdata[19], targetHigh, crcsum, blakerounds); + uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], targetHigh, crcsum, blakerounds); #endif - if (foundNonce != UINT32_MAX) + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce != UINT32_MAX) { - uint32_t vhashcpu[8]; - uint32_t Htarg = (uint32_t)targetHigh; + uint32_t vhashcpu[8] = { 0 }; for (int k=0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); - be32enc(&endiandata[19], foundNonce); - blake256hash(vhashcpu, endiandata, blakerounds); - + if(opt_verify) + { + be32enc(&endiandata[19], foundNonce); + blake256hash(vhashcpu, endiandata, blakerounds); + } //applog(LOG_BLUE, 
"%08x %16llx", vhashcpu[6], targetHigh); - if (vhashcpu[6] <= Htarg && fulltest(vhashcpu, ptarget)) + if (vhashcpu[7] <= target7 && fulltest(vhashcpu, ptarget)) { + if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, foundNonce); rc = 1; *hashes_done = pdata[19] - first_nonce + throughput; pdata[19] = foundNonce; #if NBN > 1 - if (extra_results[0] != UINT32_MAX) { - be32enc(&endiandata[19], extra_results[0]); - blake256hash(vhashcpu, endiandata, blakerounds); - if (vhashcpu[6] <= Htarg /* && fulltest(vhashcpu, ptarget) */) { - pdata[21] = extra_results[0]; - applog(LOG_BLUE, "1:%x 2:%x", foundNonce, extra_results[0]); + if (extra_results[thr_id][0] != UINT32_MAX) + { + if(opt_verify) + { + be32enc(&endiandata[19], extra_results[thr_id][0]); + blake256hash(vhashcpu, endiandata, blakerounds); + } + if (vhashcpu[7] <= target7 && fulltest(vhashcpu, ptarget)) + { + pdata[21] = extra_results[thr_id][0]; + if(opt_benchmark) applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, extra_results[thr_id][0]); +// applog(LOG_BLUE, "1:%x 2:%x", foundNonce, extra_results[thr_id][0]); rc = 2; } - extra_results[0] = UINT32_MAX; + else + { + if(vhashcpu[7]>target7) + applog(LOG_ERR, "GPU #%d: result for second nonce %08x does not validate on CPU!", device_map[thr_id], extra_results[thr_id][0]); + } + extra_results[thr_id][0] = UINT32_MAX; } #endif //applog_hash((uint8_t*)ptarget); //applog_compare_hash((uint8_t*)vhashcpu,(uint8_t*)ptarget); return rc; } - else if (opt_debug) { - applog_hash((uchar*)ptarget); - applog_compare_hash((uchar*)vhashcpu, (uchar*)ptarget); - applog(LOG_DEBUG, "GPU #%d: result for nonce %08x does not validate on CPU!", thr_id, foundNonce); + else + { + if(opt_debug) + { + applog_hash((uchar*)ptarget); + applog_compare_hash((uchar*)vhashcpu, (uchar*)ptarget); + } + if(vhashcpu[7]>target7) + applog(LOG_ERR, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce); } } - pdata[19] += throughput; + 
pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce; + *hashes_done = pdata[19] - first_nonce ; return rc; } diff --git a/Algo256/cuda_blake256.cu b/Algo256/cuda_blake256.cu index 718d18ecff..e599ef8e5d 100644 --- a/Algo256/cuda_blake256.cu +++ b/Algo256/cuda_blake256.cu @@ -8,16 +8,17 @@ extern "C" { } #include "cuda_helper.h" - #include -static __device__ uint64_t cuda_swab32ll(uint64_t x) { - return MAKE_ULONGLONG(cuda_swab32(_LOWORD(x)), cuda_swab32(_HIWORD(x))); -} +#define UINT2(x,y) make_uint2(x,y) + +//static __device__ __forceinline__ uint64_t cuda_swab32ll(uint64_t x) { +// return MAKE_UINT64(cuda_swab32(_LOWORD(x)), cuda_swab32(_HIWORD(x))); +//} -__constant__ static uint32_t c_data[20]; +__constant__ static uint32_t c_data[3]; -__constant__ static uint8_t sigma[16][16]; +//__constant__ static uint8_t sigma[16][16]; static uint8_t c_sigma[16][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, @@ -37,17 +38,22 @@ static uint8_t c_sigma[16][16] = { { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } }; -static const uint32_t c_IV256[8] = { - 0x6A09E667, 0xBB67AE85, - 0x3C6EF372, 0xA54FF53A, - 0x510E527F, 0x9B05688C, - 0x1F83D9AB, 0x5BE0CD19 -}; - __device__ __constant__ static uint32_t cpu_h[8]; - -__device__ __constant__ static uint32_t u256[16]; -static const uint32_t c_u256[16] = { +/* +__device__ __constant__ static uint32_t u256[16] = +{ + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 +}; +*/ +static const uint32_t c_u256[16] = +{ 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, 0xA4093822, 0x299F31D0, @@ -58,25 +64,9 @@ static const uint32_t c_u256[16] = { 
0x3F84D5B5, 0xB5470917 }; -#define GS2(a,b,c,d,x) { \ - const uint8_t idx1 = sigma[r][x]; \ - const uint8_t idx2 = sigma[r][x+1]; \ - v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \ - v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \ - v[c] += v[d]; \ - v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ -\ - v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \ - v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \ - v[c] += v[d]; \ - v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ -} - -//#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n))) -#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) #define hostGS(a,b,c,d,x) { \ const uint8_t idx1 = c_sigma[r][x]; \ - const uint8_t idx2 = c_sigma[r][x+1]; \ + const uint8_t idx2 = c_sigma[r][x + 1]; \ v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \ v[d] = ROTR32(v[d] ^ v[a], 16); \ v[c] += v[d]; \ @@ -88,30 +78,67 @@ static const uint32_t c_u256[16] = { v[b] = ROTR32(v[b] ^ v[c], 7); \ } +#define GSPREC(a,b,c,d,x,y) { \ + v[a] += (m[x] ^ u256[y]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 12); \ + v[a] += (m[y] ^ u256[x]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 7); \ + } + +__constant__ uint64_t keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +__constant__ uint2 keccak_round_constants35[24] = { + { 0x00000001ul, 0x00000000 }, { 0x00008082ul, 0x00000000 }, + { 0x0000808aul, 0x80000000 }, { 0x80008000ul, 0x80000000 
}, + { 0x0000808bul, 0x00000000 }, { 0x80000001ul, 0x00000000 }, + { 0x80008081ul, 0x80000000 }, { 0x00008009ul, 0x80000000 }, + { 0x0000008aul, 0x00000000 }, { 0x00000088ul, 0x00000000 }, + { 0x80008009ul, 0x00000000 }, { 0x8000000aul, 0x00000000 }, + { 0x8000808bul, 0x00000000 }, { 0x0000008bul, 0x80000000 }, + { 0x00008089ul, 0x80000000 }, { 0x00008003ul, 0x80000000 }, + { 0x00008002ul, 0x80000000 }, { 0x00000080ul, 0x80000000 }, + { 0x0000800aul, 0x00000000 }, { 0x8000000aul, 0x80000000 }, + { 0x80008081ul, 0x80000000 }, { 0x00008080ul, 0x80000000 }, + { 0x80000001ul, 0x00000000 }, { 0x80008008ul, 0x80000000 } +}; __host__ __forceinline__ -static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint32_t T0) +static void blake256_compress1st(uint32_t *h, const uint32_t *block) { uint32_t m[16]; - uint32_t v[16]; + uint32_t v[16] = + { + 0x6A09E667, 0xBB67AE85, + 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19, + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093A22, 0x299F33D0, + 0x082EFA98, 0xEC4E6C89 + }; for (int i = 0; i < 16; i++) { m[i] = block[i]; } - for (int i = 0; i < 8; i++) - v[i] = h[i]; - - v[8] = c_u256[0]; - v[9] = c_u256[1]; - v[10] = c_u256[2]; - v[11] = c_u256[3]; - - v[12] = c_u256[4] ^ T0; - v[13] = c_u256[5] ^ T0; - v[14] = c_u256[6]; - v[15] = c_u256[7]; - for (int r = 0; r < 14; r++) { /* column step */ hostGS(0, 4, 0x8, 0xC, 0x0); @@ -134,110 +161,528 @@ static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint3 h[6] ^= v[6] ^ v[14]; h[7] ^= v[7] ^ v[15]; } +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) -__device__ __forceinline__ -static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint32_t T0) +static void __forceinline__ __device__ keccak_block(uint2 *s) { - uint32_t v[16]; - uint32_t m[16]= - { - block[0], block[1], block[2], block[3], - 0x80000000, 0, 0, 0, - 0, 0, 0, 0, - 0, 1, 0, 640 - }; + uint2 bc[5], tmpxor[5], tmp1, 
tmp2; + // uint2 s[25]; - #pragma unroll 8 - for (int i = 0; i < 8; i++) - v[i] = h[i]; - - v[8] = u256[0]; - v[9] = u256[1]; - v[10] = u256[2]; - v[11] = u256[3]; - v[12] = u256[4] ^ T0; - v[13] = u256[5] ^ T0; - v[14] = u256[6]; - v[15] = u256[7]; - - for (int r = 0; r < 14; r++) { - /* column step */ - GS2(0, 4, 0x8, 0xC, 0x0); - GS2(1, 5, 0x9, 0xD, 0x2); - GS2(2, 6, 0xA, 0xE, 0x4); - GS2(3, 7, 0xB, 0xF, 0x6); - /* diagonal step */ - GS2(0, 5, 0xA, 0xF, 0x8); - GS2(1, 6, 0xB, 0xC, 0xA); - GS2(2, 7, 0x8, 0xD, 0xC); - GS2(3, 4, 0x9, 0xE, 0xE); +#pragma unroll 1 + for (int i = 0; i < 24; i++) + { +#pragma unroll + for (uint32_t x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = 
bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccak_round_constants35[i]; } - - h[0] ^= v[0] ^ v[8]; - h[1] ^= v[1] ^ v[9]; - h[2] ^= v[2] ^ v[10]; - h[3] ^= v[3] ^ v[11]; - h[4] ^= v[4] ^ v[12]; - h[5] ^= v[5] ^ v[13]; - h[6] ^= v[6] ^ v[14]; - h[7] ^= v[7] ^ v[15]; } -__global__ __launch_bounds__(256,4) -void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t * Hash) +//__launch_bounds__(256) +__global__ +void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t * Hash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) +// if (thread < threads) { const uint32_t nonce = startNonce + thread; uint32_t h[8]; - uint32_t input[4]; - +// uint32_t input[4]; + const uint32_t T0 = 640; #pragma unroll 8 for (int i = 0; i<8; i++) { h[i] = cpu_h[i];} - #pragma unroll 3 - for (int i = 0; i < 3; ++i) input[i] = c_data[16 + i]; + uint32_t v[16]; + + const 
uint32_t c_Padding[12] = { + 0x80000000, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640 + }; + + const uint32_t u256[16] = + { + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 + }; + + uint32_t m[16] = + { + c_data[0], c_data[1], c_data[2], nonce, + c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3], + c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7], + c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11] + }; + +#pragma unroll 8 + for (int i = 0; i < 8; i++) + v[i] = h[i]; + + v[8] = u256[0]; + v[9] = u256[1]; + v[10] = u256[2]; + v[11] = u256[3]; + v[12] = u256[4] ^ T0; + v[13] = u256[5] ^ T0; + v[14] = u256[6]; + v[15] = u256[7]; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 
0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 4, 0x9, 0xE, 1, 9); + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 8, 11); + + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); + GSPREC(3, 4, 0x9, 0xE, 2, 10); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + GSPREC(0, 4, 0x8, 0xC, 6, 15); + GSPREC(1, 5, 0x9, 0xD, 14, 9); + GSPREC(2, 6, 0xA, 0xE, 11, 3); + GSPREC(3, 7, 0xB, 0xF, 0, 8); + GSPREC(0, 5, 0xA, 0xF, 12, 2); + GSPREC(1, 6, 0xB, 0xC, 13, 7); + GSPREC(2, 7, 0x8, 0xD, 1, 4); + GSPREC(3, 4, 0x9, 0xE, 10, 5); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + GSPREC(0, 4, 0x8, 0xC, 10, 2); + GSPREC(1, 5, 0x9, 0xD, 8, 4); + GSPREC(2, 6, 0xA, 0xE, 7, 6); + GSPREC(3, 7, 0xB, 0xF, 1, 5); + GSPREC(0, 5, 0xA, 0xF, 15, 11); + GSPREC(1, 6, 0xB, 0xC, 9, 14); 
+ GSPREC(2, 7, 0x8, 0xD, 3, 12); + GSPREC(3, 4, 0x9, 0xE, 13, 0); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + + + + h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]); + h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]); + h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]); + h[3] = cuda_swab32(h[3] ^ v[3] ^ v[11]); + h[4] = cuda_swab32(h[4] ^ v[4] ^ v[12]); + h[5] = cuda_swab32(h[5] ^ v[5] ^ v[13]); + h[6] = cuda_swab32(h[6] ^ v[6] ^ v[14]); + h[7] = cuda_swab32(h[7] ^ v[7] ^ v[15]); + + uint2 keccak_gpu_state[25] = {0}; + keccak_gpu_state[0].x = h[0]; + keccak_gpu_state[0].y = h[1]; + keccak_gpu_state[1].x = h[2]; + keccak_gpu_state[1].y = h[3]; + keccak_gpu_state[2].x = h[4]; + keccak_gpu_state[2].y = h[5]; + keccak_gpu_state[3].x = h[6]; + 
keccak_gpu_state[3].y = h[7]; + keccak_gpu_state[4] = UINT2(1, 0); + + keccak_gpu_state[16] = UINT2(0, 0x80000000); + keccak_block(keccak_gpu_state); + uint64_t *outputHash = (uint64_t *)Hash; +#pragma unroll 4 + for (int i = 0; i<4; i++) + outputHash[i*threads + thread] = devectorize(keccak_gpu_state[i]); + } + - input[3] = nonce; - blake256_compress2nd(h, input, 640); - #pragma unroll - for (int i = 0; i<4; i++) { - Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1])); - } +} + + +__global__ __launch_bounds__(256, 4) +void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t * Hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + uint32_t h[8]; + // uint32_t input[4]; +#pragma unroll 8 + for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; } + + uint32_t v[16]; + + const uint32_t c_Padding[12] = { + 0x80000000, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640 + }; + const uint32_t u256[16] = + { + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 + }; + + uint32_t m[16] = + { + c_data[0], c_data[1], c_data[2], nonce, + c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3], + c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7], + c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11] + }; + +#pragma unroll 8 + for (int i = 0; i < 8; i++) + v[i] = h[i]; + + v[8] = u256[0]; + v[9] = u256[1]; + v[10] = u256[2]; + v[11] = u256[3]; + v[12] = u256[4] ^ 640; + v[13] = u256[5] ^ 640; + v[14] = u256[6]; + v[15] = u256[7]; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 
0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 4, 0x9, 0xE, 1, 9); + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 
8, 11); + + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); + GSPREC(3, 4, 0x9, 0xE, 2, 10); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + GSPREC(0, 4, 0x8, 0xC, 6, 15); + GSPREC(1, 5, 0x9, 0xD, 14, 9); + GSPREC(2, 6, 0xA, 0xE, 11, 3); + GSPREC(3, 7, 0xB, 0xF, 0, 8); + GSPREC(0, 5, 0xA, 0xF, 12, 2); + GSPREC(1, 6, 0xB, 0xC, 13, 7); + GSPREC(2, 7, 0x8, 0xD, 1, 4); + GSPREC(3, 4, 0x9, 0xE, 10, 5); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + GSPREC(0, 4, 0x8, 0xC, 10, 2); + GSPREC(1, 5, 0x9, 0xD, 8, 4); + GSPREC(2, 6, 0xA, 0xE, 7, 6); + GSPREC(3, 7, 0xB, 0xF, 1, 5); + GSPREC(0, 5, 0xA, 0xF, 15, 11); + GSPREC(1, 6, 0xB, 0xC, 9, 14); + GSPREC(2, 7, 0x8, 0xD, 3, 12); + GSPREC(3, 4, 0x9, 0xE, 13, 0); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 
2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]); + h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]); + h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]); + h[3] = cuda_swab32(h[3] ^ v[3] ^ v[11]); + h[4] = cuda_swab32(h[4] ^ v[4] ^ v[12]); + h[5] = cuda_swab32(h[5] ^ v[5] ^ v[13]); + h[6] = cuda_swab32(h[6] ^ v[6] ^ v[14]); + h[7] = cuda_swab32(h[7] ^ v[7] ^ v[15]); + + Hash[((0 * threads) + thread)*2] = (h[0]); + Hash[((0 * threads) + thread) * 2 + 1] = (h[1]); + Hash[((1 * threads) + thread) * 2] = (h[2]); + Hash[((1 * threads) + thread) * 2 + 1] = (h[3]); + Hash[((2 * threads) + thread) * 2] = (h[4]); + Hash[((2 * threads) + thread) * 2 + 1] = (h[5]); + Hash[((3 * threads) + thread) * 2] = (h[6]); + Hash[((3 * threads) + thread) * 2 + 1] = (h[7]); } } __host__ -void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order) +void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash) { const uint32_t threadsperblock = 64; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - blake256_gpu_hash_80 <<>> (threads, startNonce, Hash); + blake256_gpu_hash_80 <<>> (threads, startNonce, (uint32_t*)Hash); + CUDA_SAFE_CALL(cudaGetLastError()); } __host__ -void blake256_cpu_setBlock_80(uint32_t *pdata) +void blake256_cpu_setBlock_80(int thr_id, uint32_t *pdata) { - uint32_t h[8]; - uint32_t data[20]; - memcpy(data, pdata, 80); - for (int i = 0; i<8; i++) { - h[i] = c_IV256[i]; - } - blake256_compress1st(h, pdata, 512); + uint32_t h[8] = + { + 0x6A09E667, 0xBB67AE85, + 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19 + }; + + 
CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_data, pdata + 16, 3 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); - cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice); + blake256_compress1st(h, pdata); + + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(cpu_h, h, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } __host__ -void blake256_cpu_init(int thr_id, uint32_t threads) +void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash) { - cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice); -} + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + blakeKeccak256_gpu_hash_80 << > > (threads, startNonce, (uint32_t *)Hash); + CUDA_SAFE_CALL(cudaGetLastError()); +} \ No newline at end of file diff --git a/Algo256/cuda_bmw256.cu b/Algo256/cuda_bmw256.cu new file mode 100644 index 0000000000..1982df8ee9 --- /dev/null +++ b/Algo256/cuda_bmw256.cu @@ -0,0 +1,328 @@ +#include +#include + +#include "cuda_helper.h" + +static uint32_t *h_nonce[MAX_GPUS]; +static uint32_t *d_nonce[MAX_GPUS]; + +#define shl(x, n) ((x) << (n)) +#define shr(x, n) ((x) >> (n)) +//#define SHR(x, n) SHR2(x, n) +//#define SHL(x, n) SHL2(x, n) + +#undef SPH_ROTL32 +#define SPH_ROTL32 ROTL32 + + +#define ROTL32host(x, n) ROTL32(x,n) +// #define SPH_ROTL32 SPH_ROTL32 +#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19)) +#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ __byte_perm(x,0,0x2103) ^ SPH_ROTL32((x), 23)) +#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25)) +#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29)) +#define ss4(x) (shr((x), 
1) ^ (x)) +#define ss5(x) (shr((x), 2) ^ (x)) +#define rs1(x) SPH_ROTL32((x), 3) +#define rs2(x) SPH_ROTL32((x), 7) +#define rs3(x) SPH_ROTL32((x), 13) +#define rs4(x) __byte_perm(x,0,0x1032) +#define rs5(x) SPH_ROTL32((x), 19) +#define rs6(x) SPH_ROTL32((x), 23) +#define rs7(x) SPH_ROTL32((x), 27) + + +/* Message expansion function 1 */ +__forceinline__ __device__ uint32_t expand32_1(int i, const uint32_t *message, const uint32_t *H, const uint32_t *Q) +{ + return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13]) + + ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9]) + + ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5]) + + ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1]) + + ((i*(0x05555555ul) + SPH_ROTL32(message[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(message[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(message[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); +} + +/* Message expansion function 2 */ +__forceinline__ __device__ uint32_t expand32_2(const int i, const uint32_t *message, const uint32_t *H, const uint32_t *Q) +{ + return ( + rs2(Q[i - 13]) + rs3(Q[i - 11]) + rs4(Q[i - 9]) + rs1(Q[i - 15]) + + +rs5(Q[i - 7]) + rs6(Q[i - 5]) + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1])); +} + +#define TPB 512 +__global__ __launch_bounds__(TPB, 2) +void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *const __restrict__ nonceVector, uint32_t Target) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + // if (thread < threads) + { + uint32_t backup = Target; + uint32_t message[16] = {0}; + + ((uint2*)message)[0] = __ldg(&g_hash[thread]); + ((uint2*)message)[1] = __ldg(&g_hash[thread + 1 * threads]); + ((uint2*)message)[2] = __ldg(&g_hash[thread + 2 * threads]); + ((uint2*)message)[3] = __ldg(&g_hash[thread + 3 * threads]); + + + const uint32_t h1[16] = { + (0x40414243), (0x44454647), + (0x48494A4B), (0x4C4D4E4F), + (0x50515253), 
(0x54555657), + (0x58595A5B), (0x5C5D5E5F), + (0x60616263), (0x64656667), + (0x68696A6B), (0x6C6D6E6F), + (0x70717273), (0x74757677), + (0x78797A7B), (0x7C7D7E7F) + }; + + message[8] = 0x80; + message[14] = 0x100; + + uint32_t XL32, XH32, Q[32]; + + Q[0] = (message[5] ^ h1[5]) - (message[7] ^ h1[7]) + (message[10] ^ h1[10]) + (message[13] ^ h1[13]) + (message[14] ^ h1[14]); + Q[1] = (message[6] ^ h1[6]) - (message[8] ^ h1[8]) + (message[11] ^ h1[11]) + (message[14] ^ h1[14]) - (message[15] ^ h1[15]); + Q[2] = (message[0] ^ h1[0]) + (message[7] ^ h1[7]) + (message[9] ^ h1[9]) - (message[12] ^ h1[12]) + (message[15] ^ h1[15]); + Q[3] = (message[0] ^ h1[0]) - (message[1] ^ h1[1]) + (message[8] ^ h1[8]) - (message[10] ^ h1[10]) + (message[13] ^ h1[13]); + Q[4] = (message[1] ^ h1[1]) + (message[2] ^ h1[2]) + (message[9] ^ h1[9]) - (message[11] ^ h1[11]) - (message[14] ^ h1[14]); + Q[5] = (message[3] ^ h1[3]) - (message[2] ^ h1[2]) + (message[10] ^ h1[10]) - (message[12] ^ h1[12]) + (message[15] ^ h1[15]); + Q[6] = (message[4] ^ h1[4]) - (message[0] ^ h1[0]) - (message[3] ^ h1[3]) - (message[11] ^ h1[11]) + (message[13] ^ h1[13]); + Q[7] = (message[1] ^ h1[1]) - (message[4] ^ h1[4]) - (message[5] ^ h1[5]) - (message[12] ^ h1[12]) - (message[14] ^ h1[14]); + Q[8] = (message[2] ^ h1[2]) - (message[5] ^ h1[5]) - (message[6] ^ h1[6]) + (message[13] ^ h1[13]) - (message[15] ^ h1[15]); + Q[9] = (message[0] ^ h1[0]) - (message[3] ^ h1[3]) + (message[6] ^ h1[6]) - (message[7] ^ h1[7]) + (message[14] ^ h1[14]); + Q[10] = (message[8] ^ h1[8]) - (message[1] ^ h1[1]) - (message[4] ^ h1[4]) - (message[7] ^ h1[7]) + (message[15] ^ h1[15]); + Q[11] = (message[8] ^ h1[8]) - (message[0] ^ h1[0]) - (message[2] ^ h1[2]) - (message[5] ^ h1[5]) + (message[9] ^ h1[9]); + Q[12] = (message[1] ^ h1[1]) + (message[3] ^ h1[3]) - (message[6] ^ h1[6]) - (message[9] ^ h1[9]) + (message[10] ^ h1[10]); + Q[13] = (message[2] ^ h1[2]) + (message[4] ^ h1[4]) + (message[7] ^ h1[7]) + (message[10] ^ h1[10]) 
+ (message[11] ^ h1[11]); + Q[14] = (message[3] ^ h1[3]) - (message[5] ^ h1[5]) + (message[8] ^ h1[8]) - (message[11] ^ h1[11]) - (message[12] ^ h1[12]); + Q[15] = (message[12] ^ h1[12]) - (message[4] ^ h1[4]) - (message[6] ^ h1[6]) - (message[9] ^ h1[9]) + (message[13] ^ h1[13]); + + Q[0] = ss0(Q[0]) + h1[1]; + Q[1] = ss1(Q[1]) + h1[2]; + Q[2] = ss2(Q[2]) + h1[3]; + Q[3] = ss3(Q[3]) + h1[4]; + Q[4] = ss4(Q[4]) + h1[5]; + Q[5] = ss0(Q[5]) + h1[6]; + Q[6] = ss1(Q[6]) + h1[7]; + Q[7] = ss2(Q[7]) + h1[8]; + Q[8] = ss3(Q[8]) + h1[9]; + Q[9] = ss4(Q[9]) + h1[10]; + Q[10] = ss0(Q[10]) + h1[11]; + Q[11] = ss1(Q[11]) + h1[12]; + Q[12] = ss2(Q[12]) + h1[13]; + Q[13] = ss3(Q[13]) + h1[14]; + Q[14] = ss4(Q[14]) + h1[15]; + Q[15] = ss0(Q[15]) + h1[0]; + + Q[16] = ss1(Q[16 - 16]) + ss2(Q[16 - 15]) + ss3(Q[16 - 14]) + ss0(Q[16 - 13]) + + ss1(Q[16 - 12]) + ss2(Q[16 - 11]) + ss3(Q[16 - 10]) + ss0(Q[16 - 9]) + + ss1(Q[16 - 8]) + ss2(Q[16 - 7]) + ss3(Q[16 - 6]) + ss0(Q[16 - 5]) + + ss1(Q[16 - 4]) + ss2(Q[16 - 3]) + ss3(Q[16 - 2]) + ss0(Q[16 - 1]) + + ((16 * (0x05555555ul) + SPH_ROTL32(message[0], ((16 - 16) % 16) + 1) + SPH_ROTL32(message[3], ((16 - 13) % 16) + 1)) ^ h1[(16 - 16 + 7) % 16]); + + Q[17] = ss1(Q[17 - 16]) + ss2(Q[17 - 15]) + ss3(Q[17 - 14]) + ss0(Q[17 - 13]) + + ss1(Q[17 - 12]) + ss2(Q[17 - 11]) + ss3(Q[17 - 10]) + ss0(Q[17 - 9]) + + ss1(Q[17 - 8]) + ss2(Q[17 - 7]) + ss3(Q[17 - 6]) + ss0(Q[17 - 5]) + + ss1(Q[17 - 4]) + ss2(Q[17 - 3]) + ss3(Q[17 - 2]) + ss0(Q[17 - 1]) + + ((17 * (0x05555555ul) + SPH_ROTL32(message[(17 - 16) % 16], ((17 - 16) % 16) + 1) + SPH_ROTL32(message[(17 - 13) % 16], ((17 - 13) % 16) + 1)) ^ h1[(17 - 16 + 7) % 16]); + + uint32_t precalc = Q[18 - 16] + Q[18 - 14] + Q[18 - 12] + Q[18 - 10] + Q[18 - 8] + Q[18 - 6]; //+ Q[18 - 4] + uint32_t precalc2 = Q[19 - 16] + Q[19 - 14] + Q[19 - 12] + Q[19 - 10] + Q[19 - 8] + Q[19 - 6];//+ Q[19 - 4] + + precalc = precalc + Q[18 - 4]; + precalc2 = precalc2 + Q[18 + 1 - 4]; + uint32_t p1 = ((18 * (0x05555555ul) + 
SPH_ROTL32(message[2], ((18 - 16) % 16) + 1) + SPH_ROTL32(message[5], ((18 - 13) % 16) + 1)) ^ h1[(18 - 16 + 7) % 16]); + uint32_t p2 = (((18 + 1)*(0x05555555ul) + SPH_ROTL32(message[3], (((18 + 1) - 16) % 16) + 1) + SPH_ROTL32(message[6], (((18 + 1) - 13) % 16) + 1)) ^ h1[((18 + 1) - 16 + 7) % 16]); + Q[18] = precalc + expand32_2(18, message, h1, Q) + p1; + Q[18 + 1] = precalc2 + expand32_2(18 + 1, message, h1, Q) + p2; + precalc = precalc - Q[18 - 16]; + precalc2 = precalc2 - Q[18 + 1 - 16]; + + precalc = precalc + Q[20 - 4]; + precalc2 = precalc2 + Q[20 + 1 - 4]; + p1 = ((20 * (0x05555555ul) + SPH_ROTL32(message[4], ((20 - 16) % 16) + 1) + SPH_ROTL32(message[7], ((20 - 13) % 16) + 1) - (0x100 << 15)) ^ h1[(20 - 16 + 7) % 16]); + p2 = (((20 + 1)*(0x05555555ul) + SPH_ROTL32(message[5], (((20 + 1) - 16) % 16) + 1) + (0x80 << 9)) ^ h1[((20 + 1) - 16 + 7) % 16]); + Q[20] = precalc + expand32_2(20, message, h1, Q) + p1; + Q[20 + 1] = precalc2 + expand32_2(20 + 1, message, h1, Q) + p2; + precalc = precalc - Q[20 - 16]; + precalc2 = precalc2 - Q[20 + 1 - 16]; + + precalc = precalc + Q[22 - 4]; + precalc2 = precalc2 + Q[22 + 1 - 4]; + p1 = ((22 * (0x05555555ul) + SPH_ROTL32(message[6], ((22 - 16) % 16) + 1) - SPH_ROTL32(message[0], ((22 - 6) % 16) + 1)) ^ h1[(22 - 16 + 7) % 16]); + p2 = (((22 + 1)*(0x05555555ul) + SPH_ROTL32(message[7], (((22 + 1) - 16) % 16) + 1) - SPH_ROTL32(message[1], (((22 + 1) - 6) % 16) + 1)) ^ h1[((22 + 1) - 16 + 7) % 16]); + Q[22] = precalc + expand32_2(22, message, h1, Q) + p1; + Q[22 + 1] = precalc2 + expand32_2(22 + 1, message, h1, Q) + p2; + precalc = precalc - Q[22 - 16]; + precalc2 = precalc2 - Q[22 + 1 - 16]; + + precalc = precalc + Q[24 - 4]; + precalc2 = precalc2 + Q[24 + 1 - 4]; + p1 = ((24 * (0x05555555ul) + (0x80 << 9) - SPH_ROTL32(message[2], ((24 - 6) % 16) + 1)) ^ h1[(24 - 16 + 7) % 16]); + p2 = (((24 + 1)*(0x05555555ul) - SPH_ROTL32(message[3], (((24 + 1) - 6) % 16) + 1)) ^ h1[((24 + 1) - 16 + 7) % 16]); + Q[24] = precalc + 
expand32_2(24, message, h1, Q) + p1; + Q[24 + 1] = precalc2 + expand32_2(24 + 1, message, h1, Q) + p2; + precalc = precalc - Q[24 - 16]; + precalc2 = precalc2 - Q[24 + 1 - 16]; + + precalc = precalc + Q[26 - 4]; + precalc2 = precalc2 + Q[26 + 1 - 4]; + p1 = ((26 * (0x05555555ul) - SPH_ROTL32(message[4], ((26 - 6) % 16) + 1)) ^ h1[(26 - 16 + 7) % 16]); + p2 = (((26 + 1)*(0x05555555ul) + (0x100 << 15) - SPH_ROTL32(message[5], (((26 + 1) - 6) % 16) + 1)) ^ h1[((26 + 1) - 16 + 7) % 16]); + Q[26] = precalc + expand32_2(26, message, h1, Q) + p1; + Q[26 + 1] = precalc2 + expand32_2(26 + 1, message, h1, Q) + p2; + precalc = precalc - Q[26 - 16]; + precalc2 = precalc2 - Q[26 + 1 - 16]; + + precalc = precalc + Q[28 - 4]; + precalc2 = precalc2 + Q[28 + 1 - 4]; + p1 = ((28 * (0x05555555ul) - SPH_ROTL32(message[6], ((28 - 6) % 16) + 1)) ^ h1[(28 - 16 + 7) % 16]); + p2 = (((28 + 1)*(0x05555555ul) + SPH_ROTL32(message[0], (((28 + 1) - 13) % 16) + 1) - SPH_ROTL32(message[7], (((28 + 1) - 6) % 16) + 1)) ^ h1[((28 + 1) - 16 + 7) % 16]); + Q[28] = precalc + expand32_2(28, message, h1, Q) + p1; + Q[28 + 1] = precalc2 + expand32_2(28 + 1, message, h1, Q) + p2; + precalc = precalc - Q[28 - 16]; + precalc2 = precalc2 - Q[28 + 1 - 16]; + + precalc = precalc + Q[30 - 4]; + precalc2 = precalc2 + Q[30 + 1 - 4]; + p1 = ((30 * (0x05555555ul) + (0x100 << 15) + SPH_ROTL32(message[1], ((30 - 13) % 16) + 1) - (0x80 << 9)) ^ h1[(30 - 16 + 7) % 16]); + p2 = (((30 + 1)*(0x05555555ul) + SPH_ROTL32(message[2], (((30 + 1) - 13) % 16) + 1)) ^ h1[((30 + 1) - 16 + 7) % 16]); + Q[30] = precalc + expand32_2(30, message, h1, Q) + p1; + Q[30 + 1] = precalc2 + expand32_2(30 + 1, message, h1, Q) + p2; + precalc = precalc - Q[30 - 16]; + precalc2 = precalc2 - Q[30 + 1 - 16]; + + XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23]; + XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31]; + + message[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ message[0]) + (XL32 ^ Q[24] ^ Q[0]); + 
message[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ message[1]) + (XL32 ^ Q[25] ^ Q[1]); + message[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ message[2]) + (XL32 ^ Q[26] ^ Q[2]); + message[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ message[3]) + (XL32 ^ Q[27] ^ Q[3]); + message[4] = (shr(XH32, 3) ^ Q[20] ^ message[4]) + (XL32 ^ Q[28] ^ Q[4]); + message[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ message[5]) + (XL32 ^ Q[29] ^ Q[5]); + message[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ message[6]) + (XL32 ^ Q[30] ^ Q[6]); + message[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ message[7]) + (XL32 ^ Q[31] ^ Q[7]); + + message[8] = SPH_ROTL32(message[4], 9) + (XH32 ^ Q[24] ^ message[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]); + message[9] = SPH_ROTL32(message[5], 10) + (XH32 ^ Q[25] ^ message[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]); + message[10] = SPH_ROTL32(message[6], 11) + (XH32 ^ Q[26] ^ message[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]); + message[11] = SPH_ROTL32(message[7], 12) + (XH32 ^ Q[27] ^ message[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]); + message[12] = SPH_ROTL32(message[0], 13) + (XH32 ^ Q[28] ^ message[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]); + message[13] = SPH_ROTL32(message[1], 14) + (XH32 ^ Q[29] ^ message[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]); + message[14] = SPH_ROTL32(message[2], 15) + (XH32 ^ Q[30] ^ message[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]); + message[15] = SPH_ROTL32(message[3], 16) + (XH32 ^ Q[31] ^ message[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); + + const uint32_t h2[16] = { + (0xaaaaaaa0), (0xaaaaaaa1), (0xaaaaaaa2), + (0xaaaaaaa3), (0xaaaaaaa4), (0xaaaaaaa5), + (0xaaaaaaa6), (0xaaaaaaa7), (0xaaaaaaa8), + (0xaaaaaaa9), (0xaaaaaaaa), (0xaaaaaaab), + (0xaaaaaaac), (0xaaaaaaad), (0xaaaaaaae), + (0xaaaaaaaf) + }; + + Q[0] = (message[5] ^ h2[5]) - (message[7] ^ h2[7]) + (message[10] ^ h2[10]) + (message[13] ^ h2[13]) + (message[14] ^ h2[14]); + Q[1] = (message[6] ^ h2[6]) - (message[8] ^ h2[8]) + (message[11] ^ h2[11]) + (message[14] ^ h2[14]) - (message[15] ^ h2[15]); + Q[2] = (message[0] ^ h2[0]) + 
(message[7] ^ h2[7]) + (message[9] ^ h2[9]) - (message[12] ^ h2[12]) + (message[15] ^ h2[15]); + Q[3] = (message[0] ^ h2[0]) - (message[1] ^ h2[1]) + (message[8] ^ h2[8]) - (message[10] ^ h2[10]) + (message[13] ^ h2[13]); + Q[4] = (message[1] ^ h2[1]) + (message[2] ^ h2[2]) + (message[9] ^ h2[9]) - (message[11] ^ h2[11]) - (message[14] ^ h2[14]); + Q[5] = (message[3] ^ h2[3]) - (message[2] ^ h2[2]) + (message[10] ^ h2[10]) - (message[12] ^ h2[12]) + (message[15] ^ h2[15]); + Q[6] = (message[4] ^ h2[4]) - (message[0] ^ h2[0]) - (message[3] ^ h2[3]) - (message[11] ^ h2[11]) + (message[13] ^ h2[13]); + Q[7] = (message[1] ^ h2[1]) - (message[4] ^ h2[4]) - (message[5] ^ h2[5]) - (message[12] ^ h2[12]) - (message[14] ^ h2[14]); + Q[8] = (message[2] ^ h2[2]) - (message[5] ^ h2[5]) - (message[6] ^ h2[6]) + (message[13] ^ h2[13]) - (message[15] ^ h2[15]); + Q[9] = (message[0] ^ h2[0]) - (message[3] ^ h2[3]) + (message[6] ^ h2[6]) - (message[7] ^ h2[7]) + (message[14] ^ h2[14]); + Q[10] = (message[8] ^ h2[8]) - (message[1] ^ h2[1]) - (message[4] ^ h2[4]) - (message[7] ^ h2[7]) + (message[15] ^ h2[15]); + Q[11] = (message[8] ^ h2[8]) - (message[0] ^ h2[0]) - (message[2] ^ h2[2]) - (message[5] ^ h2[5]) + (message[9] ^ h2[9]); + Q[12] = (message[1] ^ h2[1]) + (message[3] ^ h2[3]) - (message[6] ^ h2[6]) - (message[9] ^ h2[9]) + (message[10] ^ h2[10]); + Q[13] = (message[2] ^ h2[2]) + (message[4] ^ h2[4]) + (message[7] ^ h2[7]) + (message[10] ^ h2[10]) + (message[11] ^ h2[11]); + Q[14] = (message[3] ^ h2[3]) - (message[5] ^ h2[5]) + (message[8] ^ h2[8]) - (message[11] ^ h2[11]) - (message[12] ^ h2[12]); + Q[15] = (message[12] ^ h2[12]) - (message[4] ^ h2[4]) - (message[6] ^ h2[6]) - (message[9] ^ h2[9]) + (message[13] ^ h2[13]); + + Q[0] = ss0(Q[0]) + h2[1]; + Q[1] = ss1(Q[1]) + h2[2]; + Q[2] = ss2(Q[2]) + h2[3]; + Q[3] = ss3(Q[3]) + h2[4]; + Q[4] = ss4(Q[4]) + h2[5]; + Q[5] = ss0(Q[5]) + h2[6]; + Q[6] = ss1(Q[6]) + h2[7]; + Q[7] = ss2(Q[7]) + h2[8]; + Q[8] = ss3(Q[8]) + h2[9]; + 
Q[9] = ss4(Q[9]) + h2[10]; + Q[10] = ss0(Q[10]) + h2[11]; + Q[11] = ss1(Q[11]) + h2[12]; + Q[12] = ss2(Q[12]) + h2[13]; + Q[13] = ss3(Q[13]) + h2[14]; + Q[14] = ss4(Q[14]) + h2[15]; + Q[15] = ss0(Q[15]) + h2[0]; + +#pragma unroll + for(int i = 0; i<2; i++) + Q[i + 16] = expand32_1(i + 16, message, h2, Q); + + precalc = Q[18 - 16] + Q[18 - 14] + Q[18 - 12] + Q[18 - 10] + Q[18 - 8] + Q[18 - 6]; + precalc2 = Q[19 - 16] + Q[19 - 14] + Q[19 - 12] + Q[19 - 10] + Q[19 - 8] + Q[19 - 6]; + +#pragma unroll + for(int i = 2 + 16; i < 16 + 16; i += 2) + { + precalc = precalc + Q[i - 4]; + precalc2 = precalc2 + Q[i + 1 - 4]; + p1 = ((i*(0x05555555ul) + SPH_ROTL32(message[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(message[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(message[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ h2[(i - 16 + 7) % 16]); + p2 = (((i + 1)*(0x05555555ul) + SPH_ROTL32(message[((i + 1) - 16) % 16], (((i + 1) - 16) % 16) + 1) + SPH_ROTL32(message[((i + 1) - 13) % 16], (((i + 1) - 13) % 16) + 1) - SPH_ROTL32(message[((i + 1) - 6) % 16], (((i + 1) - 6) % 16) + 1)) ^ h2[((i + 1) - 16 + 7) % 16]); + Q[i] = precalc + expand32_2(i, message, h2, Q) + p1; + Q[i + 1] = precalc2 + expand32_2(i + 1, message, h2, Q) + p2; + precalc = precalc - Q[i - 16]; + precalc2 = precalc2 - Q[i + 1 - 16]; + } + + XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23]; + XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31]; + + message[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ message[3]) + (XL32 ^ Q[27] ^ Q[3]); + message[15] = SPH_ROTL32(message[3], 16) + (XH32 ^ Q[31] ^ message[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); + + if(message[15] <= backup) + { + + uint32_t tmp = atomicExch(nonceVector, startNounce + thread); + if(tmp != 0) + nonceVector[1] = tmp; + } + } +} + + +__host__ +void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t Target) +{ + 
CUDA_SAFE_CALL(cudaMemsetAsync(d_nonce[thr_id], 0x0, 2 * sizeof(uint32_t), gpustream[thr_id])); + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + TPB - 1) / TPB); + dim3 block(TPB); + + bmw256_gpu_hash_32 << > >(threads, startNounce, (uint2 *)g_hash, d_nonce[thr_id], Target); + CUDA_SAFE_CALL(cudaGetLastError()); + CUDA_SAFE_CALL(cudaMemcpy(h_nonce[thr_id], d_nonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + resultnonces[0] = *(h_nonce[thr_id]); + resultnonces[1] = *(h_nonce[thr_id] + 1); +} + + +__host__ +void bmw256_cpu_init(int thr_id) +{ + CUDA_SAFE_CALL(cudaMalloc(&d_nonce[thr_id], 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMallocHost(&h_nonce[thr_id], 2 * sizeof(uint32_t))); +} + +/* +__host__ +void bmw256_setTarget(int thr_id, const void *pTargetIn) +{ +cudaMemcpyToSymbolAsync(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); +} +*/ \ No newline at end of file diff --git a/Algo256/cuda_cubehash256.cu b/Algo256/cuda_cubehash256.cu new file mode 100644 index 0000000000..8071d812c3 --- /dev/null +++ b/Algo256/cuda_cubehash256.cu @@ -0,0 +1,471 @@ +#include "cuda_helper.h" + +#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ +#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ +#define TPB 1024 + +#define ROTATEUPWARDS7(a) ROTL32(a,7) +#define ROTATEUPWARDS11(a) ROTL32(a,11) + +//#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } +#define SWAP(a,b) { a ^= b; b ^=a; a ^=b;} +__device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2]) +{ + int r; + int j; + int k; + int l; + int m; + + #pragma unroll 2 + for (r = 0; r < CUBEHASH_ROUNDS; ++r) { + + /* "add x_0jklm into x_1jklmn modulo 2^32" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 7 bits" */ 
+#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); + + /* "swap x_00klm with x_01klm" */ +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jk0m with x_1jk1m" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) + + /* "add x_0jklm into x_1jklm modulo 2^32" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 11 bits" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); + + /* "swap x_0j0lm with x_0j1lm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jkl0 with x_1jkl1" */ +#pragma unroll 2 + for (j = 0; j < 2; 
++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) + SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) + + } +} + +__device__ __forceinline__ void block_tox(const uint32_t *in, uint32_t x[2][2][2][2][2]) +{ + x[0][0][0][0][0] ^= in[0]; + x[0][0][0][0][1] ^= in[1]; + x[0][0][0][1][0] ^= in[2]; + x[0][0][0][1][1] ^= in[3]; + x[0][0][1][0][0] ^= in[4]; + x[0][0][1][0][1] ^= in[5]; + x[0][0][1][1][0] ^= in[6]; + x[0][0][1][1][1] ^= in[7]; +} + +__device__ __forceinline__ void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2]) +{ + out[0] = x[0][0][0][0][0]; + out[1] = x[0][0][0][0][1]; + out[2] = x[0][0][0][1][0]; + out[3] = x[0][0][0][1][1]; + out[4] = x[0][0][1][0][0]; + out[5] = x[0][0][1][0][1]; + out[6] = x[0][0][1][1][0]; + out[7] = x[0][0][1][1][1]; + +} + +void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const uint32_t *data) +{ + /* "xor the block into the first b bytes of the state" */ + /* "and then transform the state invertibly through r identical rounds" */ + block_tox(data, x); + rrounds(x); +} + +void __device__ __forceinline__ Update32_const(uint32_t x[2][2][2][2][2]) +{ + x[0][0][0][0][0] ^= 0x80; + rrounds(x); +} + + + +void __device__ __forceinline__ Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) +{ + int i; + + /* "the integer 1 is xored into the last state word x_11111" */ + x[1][1][1][1][1] ^= 1; + + /* "the state is then transformed invertibly through 10r identical rounds" */ + #pragma unroll 2 + for (i = 0; i < 10; ++i) rrounds(x); + + /* "output the first h/8 bytes of the state" */ + hash_fromx(hashval, x); +} + + +#if __CUDA_ARCH__ <500 +__global__ __launch_bounds__(TPB, 1) +#else +__global__ __launch_bounds__(TPB, 1) +#endif +void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) + { + + uint2 Hash[4]; + + + Hash[0]= __ldg(&g_hash[thread]); + Hash[1] = 
__ldg(&g_hash[thread + 1 * threads]); // LOHI(Hash[2], Hash[3], __ldg(&g_hash[thread + 1 * threads])); + Hash[2] = __ldg(&g_hash[thread + 2 * threads]); // LOHI(Hash[4], Hash[5], __ldg(&g_hash[thread + 2 * threads])); + Hash[3] = __ldg(&g_hash[thread + 3 * threads]); // LOHI(Hash[6], Hash[7], __ldg(&g_hash[thread + 3 * threads])); + + uint32_t x[2][2][2][2][2] = + { + 0xEA2BD4B4, 0xCCD6F29F, 0x63117E71, + 0x35481EAE, 0x22512D5B, 0xE5D94E63, + 0x7E624131, 0xF4CC12BE, 0xC2D0B696, + 0x42AF2070, 0xD0720C35, 0x3361DA8C, + 0x28CCECA4, 0x8EF8AD83, 0x4680AC00, + 0x40E5FBAB, 0xD89041C3, 0x6107FBD5, + 0x6C859D41, 0xF0B26679, 0x09392549, + 0x5FA25603, 0x65C892FD, 0x93CB6285, + 0x2AF2B5AE, 0x9E4B4E60, 0x774ABFDD, + 0x85254725, 0x15815AEB, 0x4AB6AAD6, + 0x9CDAF8AF, 0xD6032C0A + + }; + x[0][0][0][0][0] ^= Hash[0].x; + x[0][0][0][0][1] ^= Hash[0].y; + x[0][0][0][1][0] ^= Hash[1].x; + x[0][0][0][1][1] ^= Hash[1].y; + x[0][0][1][0][0] ^= Hash[2].x; + x[0][0][1][0][1] ^= Hash[2].y; + x[0][0][1][1][0] ^= Hash[3].x; + x[0][0][1][1][1] ^= Hash[3].y; + +// rrounds(x); + int r; + int j; + int k; + int l; + int m; + + /* "add x_0jklm into x_1jklmn modulo 2^32" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 7 bits" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); + + /* "swap x_00klm with x_01klm" */ +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma 
unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jk0m with x_1jk1m" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) + + /* "add x_0jklm into x_1jklm modulo 2^32" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 11 bits" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); + + /* "swap x_0j0lm with x_0j1lm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jkl0 with x_1jkl1" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) + SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) + + +#pragma unroll 1 + for (r = 1; r < CUBEHASH_ROUNDS; ++r) + { + + /* "add x_0jklm into x_1jklmn modulo 2^32" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 7 bits" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for 
(k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); + + /* "swap x_00klm with x_01klm" */ +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jk0m with x_1jk1m" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) + + /* "add x_0jklm into x_1jklm modulo 2^32" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 11 bits" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); + + /* "swap x_0j0lm with x_0j1lm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + for (l = 0; l < 2; ++l) +#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jkl0 with x_1jkl1" */ +#pragma unroll 2 + for (j = 0; j < 2; ++j) +#pragma unroll 2 + for (k = 0; k < 2; ++k) +#pragma unroll 2 + 
for (l = 0; l < 2; ++l) + SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) + + } + + + + x[0][0][0][0][0] ^= 0x80; + rrounds(x); + + Final(x, (uint32_t *)Hash); + + g_hash[thread] = ((uint2*)Hash)[0]; + g_hash[1 * threads + thread] = ((uint2*)Hash)[1]; + g_hash[2 * threads + thread] = ((uint2*)Hash)[2]; + g_hash[3 * threads + thread] = ((uint2*)Hash)[3]; + } +} + + +__host__ +void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash) +{ + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + (TPB) - 1) / (TPB)); + dim3 block(TPB); + + cubehash256_gpu_hash_32 << > >(threads, startNounce, (uint2 *)d_hash); + CUDA_SAFE_CALL(cudaGetLastError()); +} diff --git a/Algo256/cuda_fugue256.cu b/Algo256/cuda_fugue256.cu index 08950a5d41..7285e2cd70 100644 --- a/Algo256/cuda_fugue256.cu +++ b/Algo256/cuda_fugue256.cu @@ -6,10 +6,11 @@ #include "cuda_helper.h" #include + #define USE_SHARED 1 -uint32_t *d_fugue256_hashoutput[MAX_GPUS]; -uint32_t *d_resultNonce[MAX_GPUS]; +static uint32_t *d_fugue256_hashoutput[MAX_GPUS]; +static uint32_t *d_resultNonce[MAX_GPUS]; __constant__ uint32_t GPUstate[30]; // Single GPU __constant__ uint32_t pTarget[8]; // Single GPU @@ -540,7 +541,7 @@ static const uint32_t mixtab3_cpu[] = { #define S34 (sc[34]) #define S35 (sc[35]) -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) +//#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) /* GPU - FUNKTIONEN */ #if USE_SHARED @@ -561,8 +562,8 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp __syncthreads(); #endif - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { /* Nimm den State und verarbeite das letztenByte (die Nounce) */ uint32_t sc[30]; 
@@ -571,7 +572,7 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp for(int i=0;i<30;i++) sc[i] = GPUstate[i]; - uint32_t nounce = startNounce + thread; // muss noch ermittelt werden + const uint32_t nounce = startNounce + thread; // muss noch ermittelt werden uint32_t q; @@ -679,11 +680,11 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp uint32_t hash[8]; #pragma unroll 4 for(int i=0;i<4;i++) - ((uint32_t*)hash)[i] = SWAB32(sc[19+i]); + ((uint32_t*)hash)[i] = cuda_swab32(sc[19+i]); #pragma unroll 4 for(int i=0;i<4;i++) - ((uint32_t*)hash)[i+4] = SWAB32(sc[3+i]); + ((uint32_t*)hash)[i + 4] = cuda_swab32(sc[3 + i]); int i; bool rc = true; @@ -710,7 +711,7 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp #define texDef(texname, texmem, texsource, texsize) \ unsigned int *texmem; \ cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + cudaMemcpyAsync(texmem, texsource, texsize, cudaMemcpyHostToDevice, gpustream[thr_id]); \ texname.normalized = 0; \ texname.filterMode = cudaFilterModePoint; \ texname.addressMode[0] = cudaAddressModeClamp; \ @@ -721,6 +722,8 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp void fugue256_cpu_init(int thr_id, uint32_t threads) { CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); // Kopiere die Hash-Tabellen in den GPU-Speicher texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); @@ -729,8 +732,8 @@ void fugue256_cpu_init(int thr_id, uint32_t threads) texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads); - cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); + 
CUDA_SAFE_CALL(cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t))); } __host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn) @@ -740,15 +743,15 @@ __host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn) sph_fugue256_init(&ctx_fugue_const); sph_fugue256 (&ctx_fugue_const, data, 80); // State speichern - cudaMemcpyToSymbol( GPUstate, + cudaMemcpyToSymbolAsync( GPUstate, ctx_fugue_const.S, - sizeof(uint32_t) * 30 ); + sizeof(uint32_t) * 30 , 0,cudaMemcpyHostToDevice, gpustream[thr_id]); - cudaMemcpyToSymbol( pTarget, + cudaMemcpyToSymbolAsync( pTarget, pTargetIn, - sizeof(uint32_t) * 8 ); + sizeof(uint32_t) * 8, 0, cudaMemcpyHostToDevice, gpustream[thr_id]); - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); + cudaMemsetAsync(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t), gpustream[thr_id]); } __host__ void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce) @@ -762,8 +765,8 @@ __host__ void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, v dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - fugue256_gpu_hash<<>>(thr_id, threads, startNounce, d_fugue256_hashoutput[thr_id], d_resultNonce[thr_id]); + fugue256_gpu_hash<<>>(thr_id, threads, startNounce, d_fugue256_hashoutput[thr_id], d_resultNonce[thr_id]); - //cudaMemcpy(outputHashes, d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t), cudaMemcpyDeviceToHost); - cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + //cudaMemcpyAsync(outputHashes, d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + CUDA_SAFE_CALL(cudaMemcpyAsync(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); cudaStreamSynchronize(gpustream[thr_id]); } diff --git 
a/Algo256/cuda_groestl256.cu b/Algo256/cuda_groestl256.cu index 4700fa0cbf..1f48acc6c1 100644 --- a/Algo256/cuda_groestl256.cu +++ b/Algo256/cuda_groestl256.cu @@ -1,23 +1,10 @@ #include - #include "cuda_helper.h" -uint32_t *d_gnounce[MAX_GPUS]; -uint32_t *d_GNonce[MAX_GPUS]; +static uint32_t *d_GNonce[MAX_GPUS]; __constant__ uint32_t pTarget[8]; -#define C32e(x) \ - ((SPH_C32(x) >> 24) \ - | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ - | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ - | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) - -#define PC32up(j, r) ((uint32_t)((j) + (r))) -#define PC32dn(j, r) 0 -#define QC32up(j, r) 0xFFFFFFFF -#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ ~((uint32_t)(j) << 24)) - #define B32_0(x) __byte_perm(x, 0, 0x4440) //((x) & 0xFF) #define B32_1(x) __byte_perm(x, 0, 0x4441) @@ -91,32 +78,566 @@ texture t3dn2; ^ T3up(B32_3(a[b7])); \ } while (0) - -extern uint32_t T0up_cpu[]; -extern uint32_t T0dn_cpu[]; -extern uint32_t T1up_cpu[]; -extern uint32_t T1dn_cpu[]; -extern uint32_t T2up_cpu[]; -extern uint32_t T2dn_cpu[]; -extern uint32_t T3up_cpu[]; -extern uint32_t T3dn_cpu[]; +#ifndef SPH_C32 +#define SPH_C32(x) ((uint32_t)(x ## U)) +#endif +#define C32e(x) ((SPH_C32(x) >> 24) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) + +uint32_t T0up_cpu[] = { + C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d), + C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54), + C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d), + C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a), + C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287), + C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b), + C32e(0x416e2fec), C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea), + C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), C32e(0x9b76ed5b), + 
C32e(0x75285dc2), C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a), + C32e(0x6c82ee5a), C32e(0x7ebdc341), C32e(0xf5f30602), C32e(0x8352d14f), + C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808), + C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f), + C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), C32e(0x9d7fe25e), + C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5), + C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d), + C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f), + C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e), + C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb), + C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce), + C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297), + C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c), + C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced), + C32e(0xd40dd9be), C32e(0x8d47ca46), C32e(0x671770d9), C32e(0x72afdd4b), + C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a), + C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16), + C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794), + C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881), + C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3), + C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a), + C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04), + C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563), + C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d), + C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f), + C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39), + C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), C32e(0x7ab3c947), + 
C32e(0xc827efac), C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495), + C32e(0xc03bfba0), C32e(0x19aab398), C32e(0x9ef668d1), C32e(0xa322817f), + C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83), + C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c), + C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), C32e(0xad379a76), + C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e), + C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4), + C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6), + C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b), + C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7), + C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0), + C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25), + C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818), + C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672), + C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351), + C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321), + C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485), + C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa), + C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612), + C32e(0xc23cfea3), C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0), + C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9), + C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533), + C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7), + C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020), + C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a), + C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917), + C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), C32e(0xd003d3b8), + 
C32e(0x82dc5ec3), C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311), + C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a) +}; + +uint32_t T0dn_cpu[] = { + C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6), + C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491), + C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56), + C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec), + C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa), + C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb), + C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45), + C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b), + C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c), + C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83), + C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9), + C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a), + C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d), + C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f), + C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf), + C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea), + C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34), + C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b), + C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d), + C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713), + C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1), + C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6), + C32e(0xd9b3bed4), C32e(0xca01468d), C32e(0x70ced967), C32e(0xdde44b72), + C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85), + C32e(0xbd6d6bbb), C32e(0x7e912ac5), 
C32e(0x349ee54f), C32e(0x3ac116ed), + C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411), + C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe), + C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b), + C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05), + C32e(0xec7ead3f), C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1), + C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342), + C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf), + C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3), + C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e), + C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a), + C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6), + C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3), + C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), C32e(0x9e16830b), + C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28), + C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad), + C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14), + C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8), + C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4), + C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2), + C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da), + C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049), + C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf), + C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810), + C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c), + C32e(0x6c702438), C32e(0x08aef157), C32e(0x52e6c773), C32e(0xf3355197), + C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e), + C32e(0x7c37dd96), C32e(0x7fc2dc61), 
C32e(0x911a860d), C32e(0x941e850f), + C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc), + C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c), + C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069), + C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927), + C32e(0x48a938d9), C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322), + C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733), + C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9), + C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5), + C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a), + C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0), + C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e), + C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c) +}; + +uint32_t T1up_cpu[] = { + C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c), + C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc), + C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187), + C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5), + C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892), + C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d), + C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025), + C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed), + C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be), + C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1), + C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118), + C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41), + C32e(0x08081c14), C32e(0x959563f6), C32e(0x4646e9af), C32e(0x9d9d7fe2), + C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), C32e(0x2f2febc4), + 
C32e(0x0e0e151b), C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847), + C32e(0xcdcda76a), C32e(0x4e4ef5bb), C32e(0x7f7f334c), C32e(0xeaea50ba), + C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672), + C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16), + C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), C32e(0x7d7d3449), + C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2), + C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574), + C32e(0x4040e0a0), C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c), + C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd), + C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde), + C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a), + C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7), + C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698), + C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e), + C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085), + C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c), + C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5), + C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7), + C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271), + C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b), + C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9), + C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4), + C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281), + C32e(0x4444eeaa), C32e(0x5454d682), C32e(0x3b3bdde6), C32e(0x0b0b959e), + C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44), + C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a), + C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), C32e(0x14143622), + 
C32e(0x9292e476), C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37), + C32e(0x9f9f78e7), C32e(0xbdbd0fb2), C32e(0x4343692a), C32e(0xc4c435f1), + C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486), + C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2), + C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), C32e(0x4949723b), + C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f), + C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828), + C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96), + C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3), + C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63), + C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94), + C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5), + C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36), + C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b), + C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0), + C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755), + C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2), + C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960), + C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e), + C32e(0x0303898a), C32e(0x59594a13), C32e(0x0909929b), C32e(0x1a1a2339), + C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3), + C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33), + C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e) +}; + +uint32_t T1dn_cpu[] = { + C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d), + C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954), + C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d), + C32e(0x192bd519), C32e(0x62a67162), 
C32e(0xe6319ae6), C32e(0x9ab5c39a), + C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), C32e(0x8792ef87), + C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b), + C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea), + C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b), + C32e(0xc25deac2), C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a), + C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f), + C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908), + C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f), + C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e), + C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5), + C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d), + C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f), + C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e), + C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb), + C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face), + C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697), + C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c), + C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed), + C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b), + C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a), + C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116), + C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294), + C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781), + C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3), + C32e(0xf30e5ff3), C32e(0xfe19bafe), C32e(0xc05b1bc0), C32e(0x8a850a8a), + C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904), + C32e(0xdf7ac6df), C32e(0xc158eec1), 
C32e(0x759f4575), C32e(0x63a58463), + C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d), + C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f), + C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39), + C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447), + C32e(0xacef8bac), C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795), + C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f), + C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683), + C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c), + C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176), + C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e), + C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4), + C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6), + C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), C32e(0x8b86ff8b), + C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7), + C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0), + C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525), + C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018), + C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872), + C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551), + C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21), + C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85), + C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), C32e(0xaae583aa), + C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812), + C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0), + C32e(0x91a82e91), C32e(0x58e82958), C32e(0x27697427), C32e(0xb9d04eb9), + C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433), + C32e(0xbbd6bfbb), C32e(0x70904970), 
C32e(0x89800e89), C32e(0xa7f266a7), + C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920), + C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a), + C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417), + C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8), + C32e(0xc35e1fc3), C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11), + C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a) +}; + +uint32_t T2up_cpu[] = { + C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a), + C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d), + C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1), + C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59), + C32e(0x458f8f40), C32e(0x9d1f1fa3), C32e(0x40898949), C32e(0x87fafa68), + C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6), + C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560), + C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76), + C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2), + C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352), + C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1), + C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b), + C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f), + C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb), + C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98), + C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50), + C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446), + C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d), + C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34), + C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), C32e(0x971313b1), + 
C32e(0xf5a6a6a2), C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5), + C32e(0x604040e0), C32e(0x1fe3e3c2), C32e(0xc879793a), C32e(0xedb6b69a), + C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af), + C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b), + C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), C32e(0x16ededd7), + C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6), + C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66), + C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75), + C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580), + C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd), + C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7), + C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08), + C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2), + C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65), + C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3), + C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642), + C32e(0xa0c0c03b), C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322), + C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95), + C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c), + C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37), + C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436), + C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f), + C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435), + C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274), + C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18), + C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972), + C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), C32e(0x25cfcfa0), + 
C32e(0xafcaca20), C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038), + C32e(0xd56f6f0b), C32e(0x88f0f073), C32e(0x6f4a4afb), C32e(0x725c5cca), + C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764), + C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d), + C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), C32e(0x850f0f9b), + C32e(0x90e0e04b), C32e(0x427c7cba), C32e(0xc4717126), C32e(0xaacccc29), + C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a), + C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902), + C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7), + C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277), + C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1), + C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9), + C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b), + C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23), + C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003), + C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d), + C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62) +}; + +uint32_t T2dn_cpu[] = { + C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7), + C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39), + C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac), + C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3), + C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef), + C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded), + C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a), + C32e(0xdabfda46), C32e(0x02f702a6), C32e(0xa196a1d3), C32e(0xed5bed2d), + C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98), + C32e(0xee5aeed8), C32e(0xc341c3fc), 
C32e(0x060206f1), C32e(0xd14fd11d), + C32e(0xe45ce4d0), C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9), + C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154), + C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221), + C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e), + C32e(0x1b091b1c), C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5), + C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf), + C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268), + C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6), + C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa), + C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), C32e(0xa297a226), + C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499), + C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77), + C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4), + C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11), + C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1), + C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722), + C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7), + C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96), + C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a), + C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9), + C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584), + C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765), + C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d), + C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c), + C32e(0xf957f93d), C32e(0x0df20daa), C32e(0x9d829de3), C32e(0xc947c9f4), + C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7), + C32e(0xfba0fb9b), C32e(0xb398b332), 
C32e(0x68d16827), C32e(0x817f815d), + C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16), + C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450), + C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41), + C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228), + C32e(0x76db763f), C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b), + C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193), + C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff), + C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af), + C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92), + C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85), + C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820), + C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8), + C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335), + C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c), + C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e), + C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583), + C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638), + C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2), + C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e), + C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544), + C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266), + C32e(0xc1b6c15a), C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089), + C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51), + C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934), + C32e(0x75da75ca), C32e(0x533153b5), C32e(0x51c65113), C32e(0xd3b8d3bb), + C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c), + C32e(0x46cb46f6), C32e(0x1ffc1f4b), 
C32e(0x61d661da), C32e(0x4e3a4e58) +}; + +uint32_t T3up_cpu[] = { + C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6), + C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191), + C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656), + C32e(0xd519e7e7), C32e(0x7162b5b5), C32e(0x9ae64d4d), C32e(0xc39aecec), + C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa), + C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb), + C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545), + C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b), + C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c), + C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383), + C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9), + C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a), + C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d), + C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f), + C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf), + C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea), + C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434), + C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b), + C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d), + C32e(0xa47b5252), C32e(0xa13edddd), C32e(0xbc715e5e), C32e(0x26971313), + C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1), + C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6), + C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272), + C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585), + C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded), + C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), C32e(0x22941111), + 
C32e(0x0fcf8a8a), C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe), + C32e(0x5bf0a0a0), C32e(0xf0447878), C32e(0x4aba2525), C32e(0x96e34b4b), + C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505), + C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1), + C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), C32e(0x84634242), + C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf), + C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3), + C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e), + C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a), + C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6), + C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3), + C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b), + C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828), + C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad), + C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414), + C32e(0x3fdb9292), C32e(0x180a0c0c), C32e(0x906c4848), C32e(0x6be4b8b8), + C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4), + C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2), + C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada), + C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949), + C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf), + C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010), + C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c), + C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797), + C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e), + C32e(0x37dd9696), C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f), + C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), C32e(0x83aacccc), + 
C32e(0x3bd89090), C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c), + C32e(0x9fa3c2c2), C32e(0xd45f6a6a), C32e(0x47f9aeae), C32e(0xd2d06969), + C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727), + C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222), + C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), C32e(0x66a73333), + C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9), + C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5), + C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a), + C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0), + C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e), + C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c) +}; + +uint32_t T3dn_cpu[] = { + C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c), + C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc), + C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87), + C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5), + C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792), + C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d), + C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25), + C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed), + C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe), + C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1), + C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818), + C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41), + C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2), + C32e(0x48782878), C32e(0xcff8a1f8), C32e(0x1b110f11), C32e(0xebc4b5c4), + C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47), + C32e(0xa76a266a), C32e(0xf5bb69bb), 
C32e(0x334ccd4c), C32e(0x50ba9fba), + C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72), + C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16), + C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49), + C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2), + C32e(0xa204f504), C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74), + C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c), + C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd), + C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade), + C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), C32e(0xd73a163a), + C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7), + C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198), + C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e), + C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85), + C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c), + C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5), + C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7), + C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71), + C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b), + C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), C32e(0xb3c947c9), + C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4), + C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81), + C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e), + C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44), + C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a), + C32e(0x964d3b4d), C32e(0x9efa56fa), C32e(0xa6d24ed2), C32e(0x36221e22), + C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437), + C32e(0x78e75de7), C32e(0x0fb26eb2), 
C32e(0x692aef2a), C32e(0x35f1a6f1), + C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86), + C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2), + C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b), + C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f), + C32e(0x20eaafea), C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828), + C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296), + C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3), + C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163), + C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594), + C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5), + C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236), + C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b), + C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0), + C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355), + C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2), + C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060), + C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e), + C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739), + C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3), + C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133), + C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e) +}; __device__ __forceinline__ -void groestl256_perm_P(uint32_t thread, uint32_t *a, uint32_t *mixtabs) +void groestl256_perm_P(uint32_t *const __restrict__ a, const uint32_t *const __restrict__ mixtabs) { #pragma unroll 10 for (int r = 0; r<10; r++) { uint32_t t[16]; - a[0x0] ^= PC32up(0x00, r); - a[0x2] ^= PC32up(0x10, r); - a[0x4] ^= PC32up(0x20, r); - a[0x6] ^= PC32up(0x30, r); - a[0x8] ^= PC32up(0x40, 
r); - a[0xA] ^= PC32up(0x50, r); - a[0xC] ^= PC32up(0x60, r); - a[0xE] ^= PC32up(0x70, r); + a[0x0] ^= 0x00 + r; + a[0x2] ^= 0x10 + r; + a[0x4] ^= 0x20 + r; + a[0x6] ^= 0x30 + r; + a[0x8] ^= 0x40 + r; + a[0xA] ^= 0x50 + r; + a[0xC] ^= 0x60 + r; + a[0xE] ^= 0x70 + r; RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF); RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1); RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3); @@ -131,31 +652,69 @@ void groestl256_perm_P(uint32_t thread, uint32_t *a, uint32_t *mixtabs) a[k] = t[k]; } } +__device__ __forceinline__ + +void groestl256_perm_P_final(uint32_t *const __restrict__ a, const uint32_t *const __restrict__ mixtabs) +{ + uint32_t t[16]; +#pragma unroll + for(int r = 0; r<9; r++) + { + a[0x0] ^= 0x00 + r; + a[0x2] ^= 0x10 + r; + a[0x4] ^= 0x20 + r; + a[0x6] ^= 0x30 + r; + a[0x8] ^= 0x40 + r; + a[0xA] ^= 0x50 + r; + a[0xC] ^= 0x60 + r; + a[0xE] ^= 0x70 + r; + RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF); + RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1); + RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3); + RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5); + RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7); + RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9); + RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB); + RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD); + +#pragma unroll 16 + for(int k = 0; k<16; k++) + a[k] = t[k]; + } + a[15] = T0dn(B32_0(a[14] ^ 0x79)) + ^ T1dn(B32_1(a[ 0] ^ 0x09)) + ^ T2dn(B32_2(a[ 2] ^ 0x19)) + ^ T3dn(B32_3(a[ 4] ^ 0x29)) + ^ T0up(B32_0(a[ 7])) + ^ T1up(B32_1(a[ 9])) + ^ T2up(B32_2(a[11])) + ^ T3up(B32_3(a[13])); +} __device__ __forceinline__ -void groestl256_perm_Q(uint32_t thread, uint32_t *a, uint32_t *mixtabs) +void groestl256_perm_Q(uint32_t *const __restrict__ a, const uint32_t *const __restrict__ mixtabs) { #pragma unroll - for (int r = 0; r<10; r++) + for (uint32_t r = 0; r<0x0a000000; 
r+=0x01000000) { uint32_t t[16]; - a[0x0] ^= QC32up(0x00, r); - a[0x1] ^= QC32dn(0x00, r); - a[0x2] ^= QC32up(0x10, r); - a[0x3] ^= QC32dn(0x10, r); - a[0x4] ^= QC32up(0x20, r); - a[0x5] ^= QC32dn(0x20, r); - a[0x6] ^= QC32up(0x30, r); - a[0x7] ^= QC32dn(0x30, r); - a[0x8] ^= QC32up(0x40, r); - a[0x9] ^= QC32dn(0x40, r); - a[0xA] ^= QC32up(0x50, r); - a[0xB] ^= QC32dn(0x50, r); - a[0xC] ^= QC32up(0x60, r); - a[0xD] ^= QC32dn(0x60, r); - a[0xE] ^= QC32up(0x70, r); - a[0xF] ^= QC32dn(0x70, r); + a[0x0] ^= 0xFFFFFFFF; + a[0x1] ^= ~r; + a[0x2] ^= 0xFFFFFFFF; + a[0x3] ^= r ^ 0xefffffff; + a[0x4] ^= 0xFFFFFFFF; + a[0x5] ^= r ^ 0xdfffffff; + a[0x6] ^= 0xFFFFFFFF; + a[0x7] ^= r ^ 0xcfffffff; + a[0x8] ^= 0xFFFFFFFF; + a[0x9] ^= r ^ 0xbfffffff; + a[0xA] ^= 0xFFFFFFFF; + a[0xB] ^= r ^ 0xafffffff; + a[0xC] ^= 0xFFFFFFFF; + a[0xD] ^= r ^ 0x9fffffff; + a[0xE] ^= 0xFFFFFFFF; + a[0xF] ^= r ^ 0x8fffffff; RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD); RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF); RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1); @@ -172,7 +731,7 @@ void groestl256_perm_Q(uint32_t thread, uint32_t *a, uint32_t *mixtabs) } __global__ __launch_bounds__(256,1) -void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ outputHash, uint32_t *const __restrict__ nonceVector) +void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, const uint64_t *const __restrict__ outputHash, uint32_t *const __restrict__ nonceVector) { #if USE_SHARED __shared__ uint32_t mixtabs[2048]; @@ -191,8 +750,8 @@ void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *con __syncthreads(); #endif - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { // GROESTL uint32_t message[16]; @@ -218,22 +777,22 @@ void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, 
uint64_t *con // Perm #if USE_SHARED - groestl256_perm_P(thread, state, mixtabs); + groestl256_perm_P(state, mixtabs); state[15] ^= 0x10000; - groestl256_perm_Q(thread, message, mixtabs); + groestl256_perm_Q(message, mixtabs); #else - groestl256_perm_P(thread, state, NULL); + groestl256_perm_P(state, NULL); state[15] ^= 0x10000; - groestl256_perm_P(thread, message, NULL); + groestl256_perm_P(message, NULL); #endif #pragma unroll 16 for (int u = 0; u<16; u++) state[u] ^= message[u]; #pragma unroll 16 for (int u = 0; u<16; u++) message[u] = state[u]; #if USE_SHARED - groestl256_perm_P(thread, message, mixtabs); + groestl256_perm_P_final(message, mixtabs); #else - groestl256_perm_P(thread, message, NULL); + groestl256_perm_P(message, NULL); #endif state[15] ^= message[15]; @@ -248,13 +807,13 @@ void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *con #define texDef(texname, texmem, texsource, texsize) \ unsigned int *texmem; \ - cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + CUDA_SAFE_CALL(cudaMalloc(&texmem, texsize)); \ + CUDA_SAFE_CALL(cudaMemcpyAsync(texmem, texsource, texsize, cudaMemcpyHostToDevice, gpustream[thr_id])); \ texname.normalized = 0; \ texname.filterMode = cudaFilterModePoint; \ texname.addressMode[0] = cudaAddressModeClamp; \ { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ + CUDA_SAFE_CALL(cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize)); } __host__ void groestl256_cpu_init(int thr_id, uint32_t threads) @@ -270,28 +829,27 @@ void groestl256_cpu_init(int thr_id, uint32_t threads) texDef(t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256); texDef(t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256); - cudaMalloc(&d_GNonce[thr_id], 2*sizeof(uint32_t)); - cudaMallocHost(&d_gnounce[thr_id], 2*sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMalloc(&d_GNonce[thr_id], 2 * sizeof(uint32_t))); } 
__host__ -void groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order, uint32_t *resultnonces) +void groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, uint32_t *resultnonces) { - cudaMemset(d_GNonce[thr_id], 0, 2*sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMemsetAsync(d_GNonce[thr_id], 0, 2 * sizeof(uint32_t), gpustream[thr_id])); const uint32_t threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - groestl256_gpu_hash32<<>>(threads, startNounce, d_outputHash, d_GNonce[thr_id]); - cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost); - resultnonces[0] = *(d_gnounce[thr_id]); - resultnonces[1] = *(d_gnounce[thr_id] + 1); + groestl256_gpu_hash32<<>>(threads, startNounce, d_outputHash, d_GNonce[thr_id]); + CUDA_SAFE_CALL(cudaGetLastError()); + CUDA_SAFE_CALL(cudaMemcpyAsync(resultnonces, d_GNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaStreamSynchronize(gpustream[thr_id])); } __host__ -void groestl256_setTarget(const void *pTargetIn) +void groestl256_setTarget(int thr_id, const void *pTargetIn) { - cudaMemcpyToSymbol(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } diff --git a/Algo256/cuda_keccak256.cu b/Algo256/cuda_keccak256.cu index 7b99cd7fcc..3cd851e44f 100644 --- a/Algo256/cuda_keccak256.cu +++ b/Algo256/cuda_keccak256.cu @@ -1,12 +1,13 @@ #include "miner.h" - -extern "C" { +#ifdef __cplusplus +#include +#else #include +#endif #include -} - #include "cuda_helper.h" + #define UINT2(x,y) make_uint2(x,y) static uint32_t *d_KNonce[MAX_GPUS]; @@ -43,602 +44,217 @@ __constant__ uint2 keccak_round_constants35[24] = { }; 
-__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding?) - -#if __CUDA_ARCH__ >= 350 -__device__ __forceinline__ -static void keccak_blockv35_32(uint2 *s) -{ - int i; - uint2 t1, t[5], u[5], v, w; - - t1 = s[1] ^ s[16]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = s[4] ^ ROL2(t1, 1); - u[1] = s[0] ^ ROL2(s[2], 1); - u[2] = t1 ^ ROL2(s[3], 1); - u[3] = s[2] ^ ROL2(s[4], 1); - u[4] = s[3] ^ ROL2(s[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; 
- v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] = s[0]^1; //vectorize(keccak_round_constants[0]); - -#pragma unroll - for (i = 1; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) 
*/ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants35[i]; //vectorize(keccak_round_constants[i]); - } -} -#else - -__device__ __forceinline__ -static void keccak_blockv30_32(uint64_t *s, const uint64_t *keccak_round_constants) -{ - int i; - uint64_t t1, t[5], u[5], v, w; - - /* absorb input */ - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t1 = s[1] ^ s[16]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = s[4] ^ ROTL64(t1, 1); - u[1] = s[0] ^ ROTL64(s[2], 1); - u[2] = t1 ^ ROTL64(s[3], 1); - u[3] = s[2] ^ ROTL64(s[4], 1); - u[4] = s[3] ^ ROTL64(s[0], 1); - - /* theta: a[0,i], a[1,i], .. 
a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROTL64(s[6], 44); - s[6] = ROTL64(s[9], 20); - s[9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[2], 62); - s[2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[4], 27); - s[4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[8], 55); - s[8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[5], 36); - s[5] = ROTL64(s[3], 28); - s[3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[7], 6); - s[7] = ROTL64(s[10], 3); - s[10] = ROTL64(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= 1;//keccak_round_constants[0]; - - for (i = 1; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ 
.. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; +__constant__ uint2 c_PaddedMessage80[10]; // padded message (80 bytes + padding?) - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROTL64(s[6], 44); - s[6] = ROTL64(s[9], 20); - s[9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[2], 62); - s[2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[4], 27); - s[4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[8], 55); - s[8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[5], 36); - s[5] = ROTL64(s[3], 28); - s[3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[7], 6); - s[7] = ROTL64(s[10], 3); - s[10] = ROTL64(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = 
s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } -} -#endif +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) -#if __CUDA_ARCH__ >= 350 -__device__ __forceinline__ -static void keccak_blockv35_80(uint2 *s) +static void __forceinline__ __device__ keccak_block(uint2 *s) { - int i; - uint2 t[5], u[5], v, w; - - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10]; - t[1] = s[1] ^ s[6] ^ s[16]; - t[2] = s[2] ^ s[7]; - t[3] = s[3] ^ s[8]; - t[4] = s[4] ^ s[9]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[16] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) 
*/ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(u[2], 61); - s[22] = ROL2(u[4], 39); - s[14] = ROL2(u[0], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(u[2], 43); - s[12] = ROL2(u[3], 25); - s[13] = ROL2(u[4], 8); - s[19] = ROL2(u[3], 56); - s[23] = ROL2(u[0], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(u[4], 14); - s[24] = ROL2(u[1], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(u[3], 21); - s[18] = ROL2(u[2], 15); - s[17] = ROL2(u[1], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] = s[0]^1; //keccak_round_constants[0]; - - #pragma unroll - for (i = 1; i < 23; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + uint2 bc[5], tmpxor[5], tmp1, tmp2; +// uint2 s[25]; - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & 
s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ +#pragma unroll 1 + for (int i= 0; i < 24; i++) + { +#pragma unroll + for (uint32_t x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], 
s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); s[0] ^= keccak_round_constants35[i]; } - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - s[0] ^= t[4] ^ ROL2(t[1], 1); - s[18] ^= t[2] ^ ROL2(t[4], 1); - s[24] ^= t[3] ^ ROL2(t[0], 1); - - s[3] = ROL2(s[18], 21) ^ ((~ROL2(s[24], 14)) & s[0]); } -#else -__device__ __forceinline__ -static void keccak_blockv30_80(uint64_t *s, const uint64_t *keccak_round_constants) +__global__ __launch_bounds__(512) +void keccak256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ resNounce) { - int i; - uint64_t t[5], u[5], v, w; - - /* absorb input */ - /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10]; - t[1] = s[1] ^ s[6] ^ s[16]; - t[2] = s[2] ^ s[7]; - t[3] = s[3] ^ s[8] ; - t[4] = s[4] ^ s[9]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[16] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROTL64(s[6], 44); - s[6] = ROTL64(s[9], 20); - s[9] = ROTL64(u[2], 61); - s[22] = ROTL64(u[4], 39); - s[14] = ROTL64(u[0], 18); - s[20] = ROTL64(s[2], 62); - s[2] = ROTL64(u[2], 43); - s[12] = ROTL64(u[3], 25); - s[13] = ROTL64(u[4], 8); - s[19] = ROTL64(u[3], 56); - s[23] = ROTL64(u[0], 41); - s[15] = ROTL64(s[4], 27); - s[4] = ROTL64(u[4], 14); - s[24] = ROTL64(u[1], 2); - s[21] = ROTL64(s[8], 55); - s[8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[5], 36); - s[5] = ROTL64(s[3], 28); - s[3] = ROTL64(u[3], 21); - s[18] = ROTL64(u[2], 15); - s[17] = ROTL64(u[1], 10); - s[11] = ROTL64(s[7], 6); - s[7] = ROTL64(s[10], 3); - s[10] = ROTL64(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= 
(~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[0]; - - for (i = 1; i < 23; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) + { + const uint32_t nounce = startNounce + thread; + uint2 bc[5], tmpxor[5], tmp1, tmp2; + uint2 s[25]; + + s[9] = make_uint2(c_PaddedMessage80[9].x, cuda_swab32(nounce)); + s[10] = make_uint2(1, 0); + s[16] = make_uint2(0, 0x80000000); + + tmpxor[0] = c_PaddedMessage80[0] ^ c_PaddedMessage80[5] ^ s[10]; + tmpxor[1] = c_PaddedMessage80[1] ^ c_PaddedMessage80[6] ^ s[16]; + tmpxor[2] = c_PaddedMessage80[2] ^ c_PaddedMessage80[7]; + tmpxor[3] = c_PaddedMessage80[3] ^ c_PaddedMessage80[8]; + tmpxor[4] = c_PaddedMessage80[4] ^ s[9]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = c_PaddedMessage80[1] ^ bc[0]; + + s[0] = c_PaddedMessage80[0] ^ bc[4]; + s[1] = ROL2(c_PaddedMessage80[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(bc[1], 61); + s[22] = ROL2(bc[3], 39); + s[14] = ROL2(bc[4], 18); + s[20] = ROL2(c_PaddedMessage80[2] ^ bc[1], 62); + s[2] = ROL2(bc[1], 43); + s[12] = ROL2(bc[2], 25); + s[13] = ROL8(bc[3]); + s[19] = ROR8(bc[2]); + s[23] = ROL2(bc[4], 41); + s[15] = ROL2(c_PaddedMessage80[4] ^ bc[3], 27); + s[4] = ROL2(bc[3], 14); + s[24] = ROL2(bc[0], 2); + s[21] = ROL2(c_PaddedMessage80[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(c_PaddedMessage80[5] ^ bc[4], 36); + s[5] = ROL2(c_PaddedMessage80[3] ^ bc[2], 28); + s[3] = ROL2( bc[2], 21); + s[18] = ROL2(bc[1], 15); + s[17] = ROL2(bc[0], 10); + s[11] = ROL2(c_PaddedMessage80[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] 
= bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0].x ^= 1; + +#pragma unroll 2 + for (int i = 1; i < 23; i++) + { + +#pragma unroll + for (uint32_t x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] 
= ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccak_round_constants35[i]; + } + uint2 t[5]; t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; t[4] = s[4] ^ s[9] 
^ s[14] ^ s[19] ^ s[24]; - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[ 1]; - s[ 1] = ROTL64(s[ 6], 44); - s[ 6] = ROTL64(s[ 9], 20); - s[ 9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[ 2], 62); - s[ 2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[ 4], 27); - s[ 4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[ 8], 55); - s[ 8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[ 5], 36); - s[ 5] = ROTL64(s[ 3], 28); - s[ 3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[ 7], 6); - s[ 7] = ROTL64(s[10], 3); - s[10] = ROTL64( v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; - v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) 
& v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - s[0] ^= t[4] ^ ROTL64(t[1], 1); - s[18] ^= t[2] ^ ROTL64(t[4], 1); - s[24] ^= t[3] ^ ROTL64(t[0], 1); + s[0] ^= t[4] ^ ROL2(t[1], 1); + s[18] ^= t[2] ^ ROL2(t[4], 1); + s[24] ^= t[3] ^ ROL2(t[0], 1); - s[3] = ROTL64(s[18], 21) ^ ((~ROTL64(s[24], 14)) & s[0]); -} -#endif + s[3] = ROL2(s[18], 21) ^ ((~ROL2(s[24], 14)) & s[0]); -__global__ __launch_bounds__(128,5) -void keccak256_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = startNounce + thread; - -#if __CUDA_ARCH__ >= 350 - uint2 keccak_gpu_state[25]; - #pragma unroll 25 - for (int i=0; i<25; i++) { - if (i<9) keccak_gpu_state[i] = vectorize(c_PaddedMessage80[i]); - else keccak_gpu_state[i] = UINT2(0, 0); - } - - keccak_gpu_state[9]= vectorize(c_PaddedMessage80[9]); - keccak_gpu_state[9].y = cuda_swab32(nounce); - keccak_gpu_state[10] = UINT2(1, 0); - keccak_gpu_state[16] = UINT2(0, 0x80000000); - keccak_blockv35_80(keccak_gpu_state); - if (devectorize(keccak_gpu_state[3]) <= ((uint64_t*)pTarget)[3]) - { - uint32_t tmp = atomicCAS(resNounce, 0xffffffff, nounce); - if (tmp != 0xffffffff) - resNounce[1] = nounce; - } -#else - uint64_t keccak_gpu_state[25]; - #pragma unroll 25 - for (int i=0; i<25; i++) { - if (i<9) keccak_gpu_state[i] = c_PaddedMessage80[i]; - else keccak_gpu_state[i] = 0; - } - keccak_gpu_state[9] = REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce)); - 
keccak_gpu_state[10] = 0x0000000000000001; - keccak_gpu_state[16] = 0x8000000000000000; - - keccak_blockv30_80(keccak_gpu_state, keccak_round_constants); - if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) + if (devectorize(s[3]) <= ((uint64_t*)pTarget)[3]) { uint32_t tmp = atomicCAS(resNounce, 0xffffffff, nounce); if (tmp != 0xffffffff) resNounce[1] = nounce; } -#endif } } __host__ -void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order, uint32_t *h_nounce) +void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *h_nounce) { - cudaMemset(d_KNonce[thr_id], 0xff, 4*sizeof(uint32_t)); - const uint32_t threadsperblock = 128; + cudaMemsetAsync(d_KNonce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); + const uint32_t threadsperblock = 512; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - - keccak256_gpu_hash_80<<>>(threads, startNounce, d_outputHash, d_KNonce[thr_id]); + keccak256_gpu_hash_80<<>>(threads, startNounce, d_KNonce[thr_id]); //MyStreamSynchronize(NULL, order, thr_id); - cudaMemcpy(h_nounce, d_KNonce[thr_id], 4 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + CUDA_SAFE_CALL(cudaMemcpyAsync(h_nounce, d_KNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); cudaStreamSynchronize(gpustream[thr_id]); } __global__ __launch_bounds__(256,3) void keccak256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { -#if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */ uint2 keccak_gpu_state[25]; #pragma unroll 25 for (int i = 0; i<25; i++) { @@ -647,53 +263,37 @@ void keccak256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *out } keccak_gpu_state[4] = UINT2(1, 0); 
keccak_gpu_state[16] = UINT2(0, 0x80000000); - keccak_blockv35_32(keccak_gpu_state); + keccak_block(keccak_gpu_state); #pragma unroll 4 for (int i=0; i<4; i++) outputHash[i*threads+thread] = devectorize(keccak_gpu_state[i]); -#else - uint64_t keccak_gpu_state[25]; - #pragma unroll 25 - for (int i = 0; i<25; i++) { - if (i<4) - keccak_gpu_state[i] = outputHash[i*threads+thread]; - else - keccak_gpu_state[i] = 0; - } - keccak_gpu_state[4] = 0x0000000000000001; - keccak_gpu_state[16] = 0x8000000000000000; - - keccak_blockv30_32(keccak_gpu_state, keccak_round_constants); - #pragma unroll 4 - for (int i = 0; i<4; i++) - outputHash[i*threads + thread] = keccak_gpu_state[i]; -#endif } } __host__ -void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) +void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash) { const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - keccak256_gpu_hash_32 <<>> (threads, startNounce, d_outputHash); + keccak256_gpu_hash_32 <<>> (threads, startNounce, d_outputHash); + CUDA_SAFE_CALL(cudaGetLastError()); } __host__ -void keccak256_setBlock_80(void *pdata,const void *pTargetIn) +void keccak256_setBlock_80(int thr_id, void *pdata,const void *pTargetIn) { unsigned char PaddedMessage[80]; memcpy(PaddedMessage, pdata, 80); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_PaddedMessage80, PaddedMessage, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } __host__ void keccak256_cpu_init(int thr_id, uint32_t 
threads) { - CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], 4*sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], 2*sizeof(uint32_t))); } \ No newline at end of file diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu index d2060e0911..0361d66ee1 100644 --- a/Algo256/cuda_skein256.cu +++ b/Algo256/cuda_skein256.cu @@ -2,146 +2,142 @@ #include "cuda_helper.h" -#if 0 -static __constant__ uint64_t SKEIN_IV512_256[8] = { - 0xCCD044A12FDB3E13, 0xE83590301A79A9EB, - 0x55AEA0614F816E6F, 0x2A2767A4AE9B94DB, - 0xEC06025E74DD7683, 0xE7A436CDC4746251, - 0xC36FBAF9393AD185, 0x3EEDBA1833EDFC13 -}; -#endif - -static __constant__ uint2 vSKEIN_IV512_256[8] = { - { 0x2FDB3E13, 0xCCD044A1 }, - { 0x1A79A9EB, 0xE8359030 }, - { 0x4F816E6F, 0x55AEA061 }, - { 0xAE9B94DB, 0x2A2767A4 }, - { 0x74DD7683, 0xEC06025E }, - { 0xC4746251, 0xE7A436CD }, - { 0x393AD185, 0xC36FBAF9 }, - { 0x33EDFC13, 0x3EEDBA18 } -}; - -static __constant__ int ROT256[8][4] = -{ - 46,36, 19, 37, - 33,27, 14, 42, - 17,49, 36, 39, - 44, 9, 54, 56, - 39,30, 34, 24, - 13,50, 10, 17, - 25,29, 39, 43, - 8, 35, 56, 22, -}; - -static __constant__ uint2 skein_ks_parity = { 0xA9FC1A22,0x1BD11BDA}; -static __constant__ uint2 t12[6] = { - { 0x20, 0 }, - { 0, 0xf0000000 }, - { 0x20, 0xf0000000 }, - { 0x08, 0 }, - { 0, 0xff000000 }, - { 0x08, 0xff000000 } -}; - -#if 0 -static __constant__ uint64_t t12_30[6] = { - 0x20, - 0xf000000000000000, - 0xf000000000000020, - 0x08, - 0xff00000000000000, - 0xff00000000000008 -}; -#endif - static __forceinline__ __device__ -void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int ROT) +void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, + const int ROT0, const int ROT1, const int ROT2, const int ROT3) { - p0 += p1; p1 = ROL2(p1, ROT256[ROT][0]); p1 ^= p0; - p2 += p3; p3 = ROL2(p3, ROT256[ROT][1]); p3 ^= p2; - p4 += p5; p5 = ROL2(p5, ROT256[ROT][2]); p5 ^= p4; - p6 
+= p7; p7 = ROL2(p7, ROT256[ROT][3]); p7 ^= p6; + p0 += p1; p1 = ROL2(p1, ROT0) ^ p0; + p2 += p3; p3 = ROL2(p3, ROT1) ^ p2; + p4 += p5; p5 = ROL2(p5, ROT2) ^ p4; + p6 += p7; p7 = ROL2(p7, ROT3) ^ p6; } +__forceinline__ __device__ +void Round_8_512v35(const uint2 *const __restrict__ ks, const uint2 *const __restrict__ ts, + uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, const int R) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[(R+0) % 9]; + p1 += ks[(R+1) % 9]; + p2 += ks[(R+2) % 9]; + p3 += ks[(R+3) % 9]; + p4 += ks[(R+4) % 9]; + p5 += ks[(R+5) % 9] + ts[(R+0) % 3]; + p6 += ks[(R+6) % 9] + ts[(R+1) % 3]; + p7 += ks[(R+7) % 9] + make_uint2(R, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[(R+1) % 9]; + p1 += ks[(R+2) % 9]; + p2 += ks[(R+3) % 9]; + p3 += ks[(R+4) % 9]; + p4 += ks[(R+5) % 9]; + p5 += ks[(R+6) % 9] + ts[(R+1) % 3]; + p6 += ks[(R+7) % 9] + ts[(R+2) % 3]; + p7 += ks[(R+8) % 9] + make_uint2(R+1, 0); +} -static __forceinline__ __device__ -void Round_8_512v35(uint2 *ks, uint2 *ts, - uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, - uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int R) +__forceinline__ __device__ +void Round_8_512v35_final(const uint2 *const __restrict__ ks, const uint2 *const __restrict__ ts, + uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) { - Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 0); - Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 1); - Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 2); - Round512v35(p6, p1, p0, p7, p2, p5, p4, 
p3, 3); - p0 += ks[((R)+0) % 9]; /* inject the key schedule value */ - p1 += ks[((R)+1) % 9]; - p2 += ks[((R)+2) % 9]; - p3 += ks[((R)+3) % 9]; - p4 += ks[((R)+4) % 9]; - p5 += ks[((R)+5) % 9] + ts[((R)+0) % 3]; - p6 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; - p7 += ks[((R)+7) % 9] + make_uint2((R),0); - Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 4); - Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 5); - Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 6); - Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 7); - p0 += ks[((R)+1) % 9]; /* inject the key schedule value */ - p1 += ks[((R)+2) % 9]; - p2 += ks[((R)+3) % 9]; - p3 += ks[((R)+4) % 9]; - p4 += ks[((R)+5) % 9]; - p5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; - p6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; - p7 += ks[((R)+8) % 9] + make_uint2((R)+1, 0); + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[8]; + p1 += ks[0]; + p2 += ks[1]; + p3 += ks[2]; + p4 += ks[3]; + p5 += ks[4] + ts[2]; + p6 += ks[5] + ts[0]; + p7 += ks[6] + make_uint2(17, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[0]; + p1 += ks[1]; + p2 += ks[2]; + p3 += ks[3]; } -__global__ __launch_bounds__(256,3) + +__global__ __launch_bounds__(256,4) void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint2 skein_ks_parity = { 0xA9FC1A22, 0x1BD11BDA }; + + const uint2 h2[9] = { + { 0x2FDB3E13, 0xCCD044A1 }, + { 0x1A79A9EB, 0xE8359030 }, + { 
0x4F816E6F, 0x55AEA061 }, + { 0xAE9B94DB, 0x2A2767A4 }, + { 0x74DD7683, 0xEC06025E }, + { 0xC4746251, 0xE7A436CD }, + { 0x393AD185, 0xC36FBAF9 }, + { 0x33EDFC13, 0x3EEDBA18 }, + { 0xC73A4E2A, 0xB69D3CFC } + }; + const uint2 t12[6] = { + { 0x20, 0 }, + { 0, 0xf0000000 }, + { 0x20, 0xf0000000 }, + { 0x08, 0 }, + { 0, 0xff000000 }, + { 0x08, 0xff000000 } + }; + +// if (thread < threads) { - uint2 h[9]; - uint2 t[3]; + uint2 dt0,dt1,dt2,dt3; uint2 p0, p1, p2, p3, p4, p5, p6, p7; - h[8] = skein_ks_parity; - for (int i = 0; i<8; i++) { - h[i] = vSKEIN_IV512_256[i]; - h[8] ^= h[i]; - } - - t[0]=t12[0]; - t[1]=t12[1]; - t[2]=t12[2]; - LOHI(dt0.x,dt0.y,outputHash[thread]); LOHI(dt1.x,dt1.y,outputHash[threads+thread]); LOHI(dt2.x,dt2.y,outputHash[2*threads+thread]); LOHI(dt3.x,dt3.y,outputHash[3*threads+thread]); - p0 = h[0] + dt0; - p1 = h[1] + dt1; - p2 = h[2] + dt2; - p3 = h[3] + dt3; - p4 = h[4]; - p5 = h[5] + t[0]; - p6 = h[6] + t[1]; - p7 = h[7]; - - #pragma unroll - for (int i = 1; i<19; i+=2) { - Round_8_512v35(h,t,p0,p1,p2,p3,p4,p5,p6,p7,i); - } + p0 = h2[0] + dt0; + p1 = h2[1] + dt1; + p2 = h2[2] + dt2; + p3 = h2[3] + dt3; + p4 = h2[4]; + p5 = h2[5] + t12[0]; + p6 = h2[6] + t12[1]; + p7 = h2[7]; + + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 1); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 3); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 5); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 7); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 9); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 11); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 13); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 15); + Round_8_512v35(h2, t12, p0, p1, p2, p3, p4, p5, p6, p7, 17); p0 ^= dt0; p1 ^= dt1; p2 ^= dt2; p3 ^= dt3; + uint2 h[9]; h[0] = p0; h[1] = p1; h[2] = p2; @@ -150,23 +146,21 @@ void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outp h[5] = p5; h[6] = p6; h[7] 
= p7; - h[8] = skein_ks_parity; - - #pragma unroll 8 - for (int i = 0; i<8; i++) { - h[8] ^= h[i]; - } - - t[0] = t12[3]; - t[1] = t12[4]; - t[2] = t12[5]; - p5 += t[0]; //p5 already equal h[5] - p6 += t[1]; - - #pragma unroll - for (int i = 1; i<19; i+=2) { - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i); - } + h[8] = skein_ks_parity ^ h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7]; + + const uint2 *t = t12+3; + p5 += t12[3]; //p5 already equal h[5] + p6 += t12[4]; + + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 1); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 3); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 5); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 7); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 9); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 11); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 13); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 15); + Round_8_512v35_final(h, t, p0, p1, p2, p3, p4, p5, p6, p7); outputHash[thread] = devectorize(p0); outputHash[threads+thread] = devectorize(p1); @@ -178,18 +172,17 @@ void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outp __host__ void skein256_cpu_init(int thr_id, uint32_t threads) { - //empty } __host__ -void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) +void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash) { - const uint32_t threadsperblock = 256; + const uint32_t threadsperblock = 32; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - skein256_gpu_hash_32<<>>(threads, startNounce, d_outputHash); - + skein256_gpu_hash_32<<>>(threads, startNounce, d_outputHash); + CUDA_SAFE_CALL(cudaGetLastError()); } diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu index 5d4db89b46..cec9ded7ba 100644 --- a/Algo256/keccak256.cu +++ 
b/Algo256/keccak256.cu @@ -8,21 +8,18 @@ extern "C" #include "sph/sph_shavite.h" #include "sph/sph_simd.h" #include "sph/sph_keccak.h" - -#include "miner.h" } +#include "miner.h" -#include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; -static uint32_t *h_nounce[MAX_GPUS]; +#include "cuda_helper.h" extern void keccak256_cpu_init(int thr_id, uint32_t threads); -extern void keccak256_setBlock_80(void *pdata,const void *ptarget); -extern void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order, uint32_t *h_nounce); +extern void keccak256_setBlock_80(int thr_id, void *pdata,const void *ptarget); +extern void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *h_nounce); // CPU Hash -extern "C" void keccak256_hash(void *state, const void *input) +void keccak256_hash(void *state, const void *input) { sph_keccak_context ctx_keccak; @@ -35,78 +32,94 @@ extern "C" void keccak256_hash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_keccak256(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *h_nounce = nullptr; + const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 21); // 256*256*8*4 - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t intensity = (device_sm[device_map[thr_id]] > 500) ? 
1 << 28 : 1 << 27;; + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); // 256*4096 + uint32_t throughput = min(throughputmax, max_nonce - first_nonce) & 0xfffffc00; + if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0005; + ptarget[7] = 0x0002; - if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + static THREAD volatile bool init = false; + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); - - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); - keccak256_cpu_init(thr_id, (int)throughput); - CUDA_SAFE_CALL(cudaMallocHost(&h_nounce[thr_id], 4 * sizeof(uint32_t))); - init[thr_id] = true; + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMallocHost(&h_nounce, 2 * sizeof(uint32_t))); + keccak256_cpu_init(thr_id, (int)throughputmax); +// CUDA_SAFE_CALL(cudaMallocHost(&h_nounce, 2 * sizeof(uint32_t))); + mining_has_stopped[thr_id] = false; + init = true; } uint32_t endiandata[20]; for (int k=0; k < 20; k++) { - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); } - keccak256_setBlock_80((void*)endiandata, ptarget); + keccak256_setBlock_80(thr_id, (void*)endiandata, ptarget); do { - int order = 0; - keccak256_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++, h_nounce[thr_id]); - if (h_nounce[thr_id][0] != UINT32_MAX) + keccak256_cpu_hash_80(thr_id, (int) throughput, pdata[19], h_nounce); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_nounce[0] != UINT32_MAX) { uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], h_nounce[thr_id][0]); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_nounce[0]); keccak256_hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && 
fulltest(vhash64, ptarget)) + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; // check if there was some other ones... *hashes_done = pdata[19] - first_nonce + throughput; - if (h_nounce[thr_id][1] != 0xffffffff) + if (h_nounce[1] != 0xffffffff) { - pdata[21] = h_nounce[thr_id][1]; - res++; - if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, h_nounce[thr_id][1], vhash64[7], Htarg); + if(opt_verify){ be32enc(&endiandata[19], h_nounce[1]); + keccak256_hash(vhash64, endiandata); + + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = h_nounce[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_nounce[1]); + } + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_nounce[1]); + } + } } - pdata[19] = h_nounce[thr_id][0]; + pdata[19] = h_nounce[0]; if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, h_nounce[thr_id][0], vhash64[7], Htarg); + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_nounce[0]); return res; } else { if (vhash64[7] != Htarg) { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, h_nounce[thr_id][0]); + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_nounce[0]); } } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - - *hashes_done = pdata[19] - first_nonce; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/JHA/cuda_jha_compactionTest.cu b/JHA/cuda_jha_compactionTest.cu index 9d4c5d63e4..faf8373ab7 100644 --- a/JHA/cuda_jha_compactionTest.cu +++ b/JHA/cuda_jha_compactionTest.cu @@ -4,6 +4,7 @@ #include "cuda_helper.h" #include + static uint32_t 
*d_tempBranch1Nonces[MAX_GPUS]; static uint32_t *d_numValid[MAX_GPUS]; static uint32_t *h_numValid[MAX_GPUS]; @@ -32,8 +33,8 @@ cuda_compactTestFunction_t h_JackpotTrueFunction[MAX_GPUS], h_JackpotFalseFuncti // Setup-Funktionen __host__ void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads) { - cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t)); - cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t)); + cudaMemcpyFromSymbolAsync(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t), 0, cudaMemcpyDeviceToHost, gpustream[thr_id]); + cudaMemcpyFromSymbolAsync(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t), 0, cudaMemcpyDeviceToHost, gpustream[thr_id]); // wir brauchen auch Speicherplatz auf dem Device cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2); @@ -47,18 +48,10 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads) cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) } -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 -/** - * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 - */ -#undef __shfl_up -#define __shfl_up(var, delta, width) (0) -#endif - // Die Summenfunktion (vom NVIDIA SDK) __global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) { - extern __shared__ uint32_t sums[]; + __shared__ uint32_t sums[32]; int id = ((blockIdx.x * blockDim.x) + threadIdx.x); //int lane_id = id % warpSize; int lane_id = id % width; @@ -192,7 +185,7 @@ __global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t 
*outp, c uint32_t value; if (id < threads) { -// uint32_t nounce = startNounce + id; +// const uint32_t nounce = startNounce + id; uint32_t *inpHash; if(d_validNonceTable == NULL) { @@ -252,38 +245,38 @@ __host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, uint32_t thre bool callThrid = (thr2 > 0) ? true : false; // Erster Initialscan - jackpot_compactTest_gpu_SCAN<<>>( + jackpot_compactTest_gpu_SCAN<<>>( d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable); // weitere Scans if(callThrid) { - jackpot_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); - jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); + jackpot_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); + jackpot_compactTest_gpu_SCAN<<<1, thr2, 0, gpustream[thr_id]>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); }else { - jackpot_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2); + jackpot_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], (blockSize2>32) ? 
32 : blockSize2); } if(callThrid) - cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpyAsync(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); else - cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpyAsync(nrm, &(d_partSum[0][thr_id])[nSummen - 1], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); // Addieren if(callThrid) { - jackpot_compactTest_gpu_ADD<<>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2); + jackpot_compactTest_gpu_ADD<<>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2); } - jackpot_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads); + jackpot_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads); // Scatter - jackpot_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, + jackpot_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, function, orgThreads, startNounce, inpHashes, d_validNonceTable); // Sync - cudaDeviceSynchronize(); + cudaStreamSynchronize(gpustream[thr_id]); } ////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) @@ -301,33 +294,32 @@ __host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, uint32_t thread int thr2 = threads / (blockSize*blockSize); // 1 - jackpot_compactTest_gpu_SCAN<<>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes); - jackpot_compactTest_gpu_SCAN<<>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); - jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 
32 : thr2); - cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); - jackpot_compactTest_gpu_ADD<<>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); - jackpot_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); + jackpot_compactTest_gpu_SCAN<<>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes); + jackpot_compactTest_gpu_SCAN<<>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); + jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t), 0, gpustream[thr_id]>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); + cudaMemcpyAsync(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + jackpot_compactTest_gpu_ADD<<>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); + jackpot_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); // 2 - jackpot_compactTest_gpu_SCAN<<>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes); - jackpot_compactTest_gpu_SCAN<<>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); - jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 
32 : thr2); - cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); - jackpot_compactTest_gpu_ADD<<>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); - jackpot_compactTest_gpu_ADD<<>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); + jackpot_compactTest_gpu_SCAN<<>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes); + jackpot_compactTest_gpu_SCAN<<>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); + jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t), 0, gpustream[thr_id]>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); + cudaMemcpyAsync(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + jackpot_compactTest_gpu_ADD<<>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); + jackpot_compactTest_gpu_ADD<<>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); // Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten // Schritt 3: Scatter - jackpot_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes); - jackpot_compactTest_gpu_SCATTER<<>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes); + jackpot_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes); + jackpot_compactTest_gpu_SCATTER<<>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes); cudaDeviceSynchronize(); */ } __host__ void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, - uint32_t *d_nonces2, uint32_t *nrm2, - int order) + uint32_t *d_nonces2, 
uint32_t *nrm2) { // Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind // "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu index 37b15ee578..47c3594d9e 100644 --- a/JHA/cuda_jha_keccak512.cu +++ b/JHA/cuda_jha_keccak512.cu @@ -3,6 +3,7 @@ #include "cuda_helper.h" + __constant__ uint64_t c_State[25]; __constant__ uint32_t c_PaddedMessage[18]; @@ -12,23 +13,21 @@ __constant__ uint32_t c_PaddedMessage[18]; #define U64TO32_LE(p, v) \ *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); -static const uint64_t host_keccak_round_constants[24] = { - 0x0000000000000001ull, 0x0000000000008082ull, - 0x800000000000808aull, 0x8000000080008000ull, - 0x000000000000808bull, 0x0000000080000001ull, - 0x8000000080008081ull, 0x8000000000008009ull, - 0x000000000000008aull, 0x0000000000000088ull, - 0x0000000080008009ull, 0x000000008000000aull, - 0x000000008000808bull, 0x800000000000008bull, - 0x8000000000008089ull, 0x8000000000008003ull, - 0x8000000000008002ull, 0x8000000000000080ull, - 0x000000000000800aull, 0x800000008000000aull, - 0x8000000080008081ull, 0x8000000000008080ull, - 0x0000000080000001ull, 0x8000000080008008ull +__constant__ uint64_t c_keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull }; -__constant__ uint64_t c_keccak_round_constants[24]; - static __device__ __forceinline__ void 
keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { int i; @@ -102,12 +101,12 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const __global__ void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = startNounce + thread; + const uint32_t nounce = startNounce + thread; - int hashPosition = nounce - startNounce; + const uint32_t hashPosition = nounce - startNounce; // Nachricht kopieren uint32_t message[18]; @@ -147,11 +146,6 @@ __global__ void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounc // Setup-Funktionen __host__ void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads) { - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( c_keccak_round_constants, - host_keccak_round_constants, - sizeof(host_keccak_round_constants), - 0, cudaMemcpyHostToDevice); } #define cKeccakB 1600 @@ -161,7 +155,7 @@ __host__ void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads) #define crypto_hash_BYTES 64 #if (cKeccakB == 1600) - typedef unsigned long long UINT64; + typedef uint64_t UINT64; typedef UINT64 tKeccakLane; #define cKeccakNumberOfRounds 24 #endif @@ -487,7 +481,7 @@ void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ) } // inlen kann 72...143 betragen -__host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen) +__host__ void jackpot_keccak512_cpu_setBlock(int thr_id, void *pdata, size_t inlen) { const unsigned char *in = (const unsigned char*)pdata; @@ -503,10 +497,10 @@ __host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen) // Kopiere den state nach der ersten Runde (nach Absorption von 72 Bytes Inputdaten) // ins Constant Memory - cudaMemcpyToSymbol( c_State, + cudaMemcpyToSymbolAsync( c_State, state, 
sizeof(state), - 0, cudaMemcpyHostToDevice); + 0, cudaMemcpyHostToDevice, gpustream[thr_id]); // padding memcpy( temp, in, (size_t)inlen ); @@ -516,13 +510,13 @@ __host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen) // Kopiere den Rest der Message und das Padding ins Constant Memory - cudaMemcpyToSymbol( c_PaddedMessage, + cudaMemcpyToSymbolAsync( c_PaddedMessage, temp, cKeccakR_SizeInBytes, - 0, cudaMemcpyHostToDevice); + 0, cudaMemcpyHostToDevice, gpustream[thr_id]); } -__host__ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) +__host__ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { const uint32_t threadsperblock = 256; @@ -530,5 +524,5 @@ __host__ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - jackpot_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); + jackpot_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); } diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index e455acee6d..eaa5ea16c5 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -10,38 +10,29 @@ extern "C" #include "miner.h" #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads); -extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen); -extern void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void jackpot_keccak512_cpu_setBlock(int thr_id, void *pdata, size_t inlen); +extern void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void 
quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads); extern void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, - uint32_t *d_nonces2, uint32_t *nrm2, - int order); - -extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); + uint32_t *d_nonces2, uint32_t *nrm2); -// Speicher zur Generierung der Noncevektoren für die bedingten Hashes -static uint32_t *d_jackpotNonces[MAX_GPUS]; -static uint32_t *d_branch1Nonces[MAX_GPUS]; -static uint32_t *d_branch2Nonces[MAX_GPUS]; -static uint32_t *d_branch3Nonces[MAX_GPUS]; +extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash); // Original 
jackpothash Funktion aus einem miner Quelltext -extern "C" unsigned int jackpothash(void *state, const void *input) +unsigned int jackpothash(void *state, const void *input) { sph_blake512_context ctx_blake; sph_groestl512_context ctx_groestl; @@ -83,155 +74,171 @@ extern "C" unsigned int jackpothash(void *state, const void *input) return round; } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_jackpot(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 20); - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << 20); + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000f; + ptarget[7] = 0x000f; - if (!init[thr_id]) + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *d_jackpotNonces = nullptr; + static THREAD uint32_t *d_branch1Nonces = nullptr; + static THREAD uint32_t *d_branch2Nonces = nullptr; + static THREAD uint32_t *d_branch3Nonces = nullptr; + static THREAD volatile bool init = false; + + if (!init) { - CUDA_CALL_OR_RET_X(cudaSetDevice(device_map[thr_id]), 0); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); - jackpot_keccak512_cpu_init(thr_id, throughput); - jackpot_compactTest_cpu_init(thr_id, throughput); - 
quark_groestl512_cpu_init(thr_id, throughput); + jackpot_keccak512_cpu_init(thr_id, throughputmax); + jackpot_compactTest_cpu_init(thr_id, throughputmax); + quark_groestl512_cpu_init(thr_id, throughputmax); quark_skein512_cpu_init(thr_id); - cuda_check_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughputmax); - cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2); - cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2); - cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput*2); + cudaMalloc(&d_branch1Nonces, sizeof(uint32_t)*throughputmax * 1.25/2); + cudaMalloc(&d_branch2Nonces, sizeof(uint32_t)*throughputmax * 1.25/2); + cudaMalloc(&d_branch3Nonces, sizeof(uint32_t)*throughputmax * 1.25); // 25% more than we need, just in case - CUDA_SAFE_CALL(cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2)); + CUDA_SAFE_CALL(cudaMalloc(&d_jackpotNonces, sizeof(uint32_t)*throughputmax * 2)); + mining_has_stopped[thr_id] = false; - init[thr_id] = true; + init = true; } uint32_t endiandata[22]; for (int k=0; k < 22; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - jackpot_keccak512_cpu_setBlock((void*)endiandata, 80); - cuda_check_cpu_setTarget(ptarget); + jackpot_keccak512_cpu_setBlock(thr_id, (void*)endiandata, 80); + cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; - // erstes Keccak512 Hash mit CUDA - jackpot_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + jackpot_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash); uint32_t nrm1, nrm2, nrm3; // Runde 1 (ohne Gröstl) - jackpot_compactTest_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, - d_branch1Nonces[thr_id], &nrm1, - d_branch3Nonces[thr_id], &nrm3, - order++); + jackpot_compactTest_cpu_hash_64(thr_id, throughput, pdata[19], d_hash, NULL, + d_branch1Nonces, &nrm1, + d_branch3Nonces, &nrm3); // verfolge den skein-pfad weiter - 
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces, d_hash); // noch schnell Blake & JH - jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash, d_branch3Nonces, + d_branch1Nonces, &nrm1, + d_branch2Nonces, &nrm2); if (nrm1+nrm2 == nrm3) { - quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces, d_hash); + quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces, d_hash); } // Runde 3 (komplett) // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) - jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash, d_branch3Nonces, + d_branch1Nonces, &nrm1, + d_branch2Nonces, &nrm2); if (nrm1+nrm2 == nrm3) { - quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces, d_hash); + quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces, d_hash); } // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) - jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + 
jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash, d_branch3Nonces, + d_branch1Nonces, &nrm1, + d_branch2Nonces, &nrm2); if (nrm1+nrm2 == nrm3) { - quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces, d_hash); + quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces, d_hash); } // Runde 3 (komplett) // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) - jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash, d_branch3Nonces, + d_branch1Nonces, &nrm1, + d_branch2Nonces, &nrm2); if (nrm1+nrm2 == nrm3) { - quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces, d_hash); + quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces, d_hash); } // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) - jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash, d_branch3Nonces, + d_branch1Nonces, &nrm1, + d_branch2Nonces, &nrm2); if (nrm1+nrm2 == nrm3) { - quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, nrm1, 
pdata[19], d_branch1Nonces, d_hash); + quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces, d_hash); } - uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - if (foundNonce != 0xffffffff) + uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces, d_hash); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce != 0xffffffff) { unsigned int rounds; - uint32_t vhash64[8]; + uint32_t vhash64[8]={0}; uint32_t Htarg = ptarget[7]; - be32enc(&endiandata[19], foundNonce); + if(opt_verify){ be32enc(&endiandata[19], foundNonce); // diese jackpothash Funktion gibt die Zahl der Runden zurück rounds = jackpothash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); + uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash, foundNonce); *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; + if (secNonce != 0) + { + if(opt_verify){ be32enc(&endiandata[19], secNonce); + rounds = jackpothash(vhash64, endiandata); + + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = secNonce; + res++; + } + else + { + if(opt_verify) + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU (%d rounds)!", device_map[thr_id], secNonce, rounds); + } } pdata[19] = foundNonce; return res; } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU (%d rounds)!", thr_id, foundNonce, rounds); + if(opt_verify) + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU (%d rounds)!", device_map[thr_id], foundNonce, rounds); } } - pdata[19] += throughput; 
+ pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/Makefile.am b/Makefile.am index ff320b399c..e593739769 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,12 +1,13 @@ # allow to use Host cuda functions in C/C++ DEF_INCLUDES = @CUDA_INCLUDES@ -JANSSON_INCLUDES= if WANT_JANSSON -JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson +JANSSON_INCLUDES = -I$(top_srcdir)/compat/jansson +else +JANSSON_INCLUDES = endif -EXTRA_DIST = autogen.sh README.txt LICENSE.txt \ +EXTRA_DIST = autogen.sh README.txt LICENSE.txt \ cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \ compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in @@ -14,33 +15,31 @@ SUBDIRS = compat bin_PROGRAMS = ccminer -ccminer_SOURCES = elist.h miner.h compat.h \ +ccminer_SOURCES = elist.h miner.h compat.h \ compat/inttypes.h compat/stdbool.h compat/unistd.h \ compat/sys/time.h compat/getopt/getopt.h \ - crc32.c hefty1.c scrypt.c \ + crc32.c hefty1.c \ ccminer.cpp util.cpp \ api.cpp hashlog.cpp nvml.cpp stats.cpp sysinfos.cpp cuda.cpp \ - heavy/heavy.cu \ - heavy/cuda_blake512.cu heavy/cuda_blake512.h \ - heavy/cuda_combine.cu heavy/cuda_combine.h \ - heavy/cuda_groestl512.cu heavy/cuda_groestl512.h \ - heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \ - heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \ - heavy/cuda_sha256.cu heavy/cuda_sha256.h \ - fuguecoin.cpp Algo256/cuda_fugue256.cu sph/fugue.c uint256.h \ + cuda_helper.h cuda_vector.h \ + sph/neoscrypt.h sph/neoscrypt.cpp \ + sph/sha256_Y.h sph/sha256_Y.c sph/sph_sha2.c \ + fuguecoin.cpp Algo256/cuda_fugue256.cu sph/fugue.c \ groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \ myriadgroestl.cpp cuda_myriadgroestl.cu \ lyra2/Lyra2.c lyra2/Sponge.c \ - lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \ + 
lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \ + Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \ + Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \ + Algo256/blake256.cu Algo256/keccak256.cu \ JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \ quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \ quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/cuda_jh512keccak512.cu \ - quark/quarkcoin.cu quark/animecoin.cu \ + quark/quarkcoin.cu \ quark/cuda_quark_compactionTest.cu \ - cuda_nist5.cu pentablake.cu \ + cuda_nist5.cu pentablake.cu skein.cu \ + Sia/sia.cu Sia/cuda_sia.cu \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \ @@ -53,25 +52,36 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \ x15/whirlpool.cu \ x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \ - x11/s3.cu \ - bitcoin.cu cuda_bitcoin.cu - + x11/s3.cu x11/c11.cu \ + bitcoin.cu cuda_bitcoin.cu \ + x15/cuda_whirlpoolx.cu x15/whirlpoolx.cu \ + neoscrypt/neoscrypt.cu neoscrypt/cuda_neoscrypt.cu \ + neoscrypt/cuda_neoscrypt_tpruvot.cu \ + neoscrypt/cuda_vector_tpruvot.cuh neoscrypt/cuda_vector_uint2x4.cuh + +# scrypt +# ccminer_SOURCES += scrypt.cpp scrypt-jane.cpp \ +# scrypt/blake.cu scrypt/keccak.cu scrypt/sha256.cu \ +# scrypt/salsa_kernel.cu scrypt/test_kernel.cu \ +# scrypt/fermi_kernel.cu scrypt/kepler_kernel.cu \ +# scrypt/nv_kernel.cu scrypt/nv_kernel2.cu scrypt/titan_kernel.cu + + if HAVE_NVML nvml_defs = -DUSE_WRAPNVML nvml_libs = -ldl endif -if HAVE_WINDOWS -ccminer_SOURCES += compat/winansi.c -endif - ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@
@CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ $(nvml_libs) ccminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(CPPFLAGS) $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) $(DEF_INCLUDES) $(nvml_defs) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME -nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -#nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\" -#nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\" +nvcc_ARCH = -gencode=arch=compute_61,code=sm_61 +nvcc_ARCH += -gencode=arch=compute_52,code=sm_52 +nvcc_ARCH += -gencode=arch=compute_50,code=sm_50 +nvcc_ARCH += -gencode=arch=compute_37,code=sm_37 +nvcc_ARCH += -gencode=arch=compute_35,code=sm_35 +nvcc_ARCH += -gencode=arch=compute_30,code=sm_30 nvcc_FLAGS = $(nvcc_ARCH) @CUDA_INCLUDES@ -I. @CUDA_CFLAGS@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v" @@ -113,3 +123,14 @@ quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $< + +# This kernel need also an older SM to be able to autotune kernels +# scrypt/salsa_kernel.o: scrypt/salsa_kernel.cu +# $(NVCC) $(nvcc_FLAGS) -gencode=arch=compute_20,code=\"sm_21,compute_20\" --maxrregcount=80 -o $@ -c $< + +skein.o: skein.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< + +# This kernel requires at least sm_35 +neoscrypt/cuda_neoscrypt.o: neoscrypt/cuda_neoscrypt.cu + $(NVCC) -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_37,code=sm_37 @CUDA_INCLUDES@ -I. 
@CUDA_CFLAGS@ $(JANSSON_INCLUDES) --ptxas-options="-v" -o $@ -c $< diff --git a/README.md b/README.md index b03f3ee719..1177648cc7 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,23 @@ -ccminer -======= +# ccminer Based on Christian Buchner's & Christian H.'s CUDA project based on the Fork by tpruvot@github with X14,X15,X17,WHIRL,Blake256 and LYRA2 support , and some others, check the [README.txt](README.txt) Reforked and optimized by sp-hash@github and KlausT@github -SP-HASH: BTC donation address: 1CTiNJyoUmbdMRACtteRWXhGqtSETYd6Vd +* KlausT: BTC donation address: 1H2BHSyuwLP9vqt2p3bK9G3mDJsAi7qChw +* sp-hash: BTC donation address: 1CTiNJyoUmbdMRACtteRWXhGqtSETYd6Vd +* tpruvot: BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo -A part of the recent algos were originally wrote by [djm34](https://github.com/djm34). +A part of the recent algos were originally written by [djm34](https://github.com/djm34). -This variant was tested and built on Linux (ubuntu server 14.04) and VStudio 2013 on Windows 7. +This variant was tested and built with Visual Studio 2015 on Windows 10 -Note that the x86 releases are generally faster than x64 ones on Windows. - -About source code dependencies ------------------------------- +## About source code dependencies This project requires some libraries to be built : +* OpenSSL (prebuilt for win) +* Curl (prebuilt for win) +* pthreads (prebuilt for win) -- OpenSSL (prebuilt for win) - -- Curl (prebuilt for win) - -- pthreads (prebuilt for win) - -The tree now contains recent prebuilt openssl and curl .lib for both x86 and x64 platforms (windows). - -To rebuild them, you need to clone this repository and its submodules : - git clone https://github.com/peters/curl-for-windows.git compat/curl-for-windows - -There is also a [Tutorial for windows](http://cudamining.co.uk/url/tutorials/id/3) on [CudaMining](http://cudamining.co.uk) website. +This fork now contains these libraries for both x86 and x64 platforms (windows). 
diff --git a/README.txt b/README.txt index 910aad4850..9b70c7f6e1 100644 --- a/README.txt +++ b/README.txt @@ -1,11 +1,13 @@ - -ccMiner release 1.5.2-tpruvot (SP_MOD) (Jan 2015) - "Happy new Year!" +ccMiner release 8.12(KlausT-mod) (August 17th, 2017) --------------------------------------------------------------- *************************************************************** If you find this tool useful and like to support its continued development, then consider a donation. +KlausT @github: + BTC 1H2BHSyuwLP9vqt2p3bK9G3mDJsAi7qChw + tpruvot@github: BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo DRK : XeVrkPrWB7pDbdFLfKhF1Z3xpqhsx6wkH3 @@ -29,6 +31,7 @@ cbuchner v1.2: This is a CUDA accelerated mining application which handle : +Bitcoin HeavyCoin & MjollnirCoin FugueCoin GroestlCoin & Myriad-Groestl @@ -38,11 +41,13 @@ TalkCoin DarkCoin and other X11 coins NEOS blake (256 14-rounds) BlakeCoin (256 8-rounds) -Keccak (Maxcoin) Deep, Doom and Qubit +Keccak (Maxcoin) Pentablake (Blake 512 x5) S3 (OneCoin) +Skein (Skein + SHA) Lyra2RE (new VertCoin algo) +Neoscrypt where some of these coins have a VERY NOTABLE nVidia advantage over competing AMD (OpenCL Only) implementations. @@ -60,71 +65,85 @@ that the most of our comments are in german. This code is based on the pooler cpuminer 2.3.2 release and inherits its command line interface and options. 
- -a, --algo=ALGO specify the algorithm to use - anime use to mine Animecoin - blake use to mine NEOS (Blake 256) - blakecoin use to mine Old Blake 256 - deep use to mine Deepcoin - dmd-gr use to mine Diamond-Groestl - fresh use to mine Freshcoin - fugue256 use to mine Fuguecoin - groestl use to mine Groestlcoin - heavy use to mine Heavycoin - jackpot use to mine Jackpotcoin - keccak use to mine Maxcoin - luffa use to mine Doomcoin - lyra2 use to mine Vertcoin - mjollnir use to mine Mjollnircoin - myr-gr use to mine Myriad-Groest - nist5 use to mine TalkCoin - penta use to mine Joincoin / Pentablake - quark use to mine Quarkcoin - qubit use to mine Qubit Algo - s3 use to mine 1coin - whirl use to mine Whirlcoin - x11 use to mine DarkCoin - x14 use to mine X14Coin - x15 use to mine Halcyon - x17 use to mine X17 - - -d, --devices gives a comma separated list of CUDA device IDs - to operate on. Device IDs start counting from 0! - Alternatively give string names of your card like - gtx780ti or gt640#2 (matching 2nd gt640 in the PC). 
- - -i, --intensity GPU threads per call 8-31 (default: 0=auto) - Decimals are allowed for fine tuning - -f, --diff Divide difficulty by this factor (std is 1) - -v, --vote Heavycoin block vote (default: 512) + -a, --algo=ALGO specify the hash algorithm to use + bitcoin Bitcoin + blake Blake 256 (SFR/NEOS) + blakecoin Fast Blake 256 (8 rounds) + c11 X11 variant + deep Deepcoin + dmd-gr Diamond-Groestl + fresh Freshcoin (shavite 80) + fugue256 Fuguecoin + groestl Groestlcoin + jackpot Jackpot + keccak Keccak-256 (Maxcoin) + luffa Doomcoin + lyra2v2 VertCoin + myr-gr Myriad-Groestl + neoscrypt neoscrypt (FeatherCoin) + nist5 NIST5 (TalkCoin) + penta Pentablake hash (5x Blake 512) + quark Quark + qubit Qubit + sia Siacoin (at pools compatible to siamining.com) + skein Skein SHA2 (Skeincoin) + s3 S3 (1Coin) + spread Spread + x11 X11 (DarkCoin) + x13 X13 (MaruCoin) + x14 X14 + x15 X15 + x17 X17 (peoplecurrency) + vanilla Blake 256 8 rounds + yescrypt yescrypt + whirl Whirlcoin (old whirlpool) + whirlpoolx Vanillacoin + -d, --devices Comma separated list of CUDA devices to use. + Device IDs start counting from 0! 
Alternatively takes + string names of your cards like gtx780ti or gt640#2 + (matching 2nd gt640 in the PC) + -i --intensity=N GPU intensity 8-31 (default: auto) + Decimals are allowed for fine tuning + -f, --diff-factor Divide difficulty by this factor (default 1.0) + -m, --diff-multiplier Multiply difficulty by this value (default 1.0) + -v, --vote=VOTE block reward vote (for HeavyCoin) -o, --url=URL URL of mining server -O, --userpass=U:P username:password pair for mining server -u, --user=USERNAME username for mining server -p, --pass=PASSWORD password for mining server --cert=FILE certificate for mining server using SSL -x, --proxy=[PROTOCOL://]HOST[:PORT] connect through a proxy - -t, --threads=N number of miner threads (default: number of nVidia GPUs in your system) + -t, --threads=N number of miner threads (default: number of nVidia GPUs) -r, --retries=N number of times to retry if a network call fails (default: retry indefinitely) - -R, --retry-pause=N time to pause between retries, in seconds (default: 15) + -R, --retry-pause=N time to pause between retries, in seconds (default: 30) -T, --timeout=N network timeout, in seconds (default: 270) -s, --scantime=N upper bound on time spent scanning current work when long polling is unavailable, in seconds (default: 5) + -n, --ndevs list cuda devices -N, --statsavg number of samples used to display hashrate (default: 30) --no-gbt disable getblocktemplate support (height check in solo) --no-longpoll disable X-Long-Polling support --no-stratum disable X-Stratum support + -e disable extranonce -q, --quiet disable per-thread hashmeter output + --no-color disable colored output -D, --debug enable debug output -P, --protocol-dump verbose dump of protocol-level activities + --cpu-affinity set process affinity to cpu core(s), mask 0x3 for cores 0 and 1 + --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest) -b, --api-bind IP/Port for the miner API (default: 127.0.0.1:4068) + -S, --syslog use system 
log for output messages + --syslog-prefix=... allow to change syslog tool name + -B, --background run the miner in the background --benchmark run in offline benchmark mode --cputest debug hashes from cpu algorithms - --cpu-affinity set process affinity to specific cpu core(s) mask - --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest) + --no-cpu-verify don't verify the found results -c, --config=FILE load a JSON-format configuration file - --no-color disable colored console output + --plimit=N Set the gpu power limit to N Watt (driver version >=352.21) + (needs administrator rights under Windows) -V, --version display version information and exit - -h, --help display this help text and exit + -h, --help display this help text and exit >>> Examples <<< @@ -179,142 +198,45 @@ features. >>> RELEASE HISTORY <<< - Jan. 2015 v1.5.2 - Allow per device intensity, example: -i 20,19.5 - Add process CPU priority and affinity mask parameters - Intelligent duplicate shares check feature (enabled if needed) - api: Fan RPM (windows), Cuda threads count, linux kernel ver. - More X11 optimisations from sp and KlausT - SM 3.0 enhancements - - Dec. 16th 2014 v1.5.1 - Add lyra2RE algo for Vertcoin based on djm34/vtc code - Multiple shares support (2 for the moment) - X11 optimisations (From klaust and sp-hash) - HTML5 WebSocket api compatibility (see api/websocket.htm) - Solo mode height checks with getblocktemplate rpc calls - - Nov. 27th 2014 v1.5.0 - Upgrade compat jansson to 2.6 (for windows) - Add pool mining.set_extranonce support - Allow intermediate intensity with decimals - Update prebuilt x86 openssl lib to 1.0.1i - Fix heavy algo on linux (broken since 1.4) - Some internal changes to use the C++ compiler - New API 1.2 with some new commands (read only) - Add some of sp x11/x15 optimisations (and tsiv x13) - - Nov.
15th 2014 v1.4.9 - Support of nvml and nvapi(windows) to monitor gpus - Fix (again) displayed hashrate for multi gpus systems - Average is now made by card (30 scans of the card) - Final API v1.1 (new fields + histo command) - Add support of telnet queries "telnet 127.0.0.1 4068" - add histo api command to get performance debug details - Add a rig sample php ui using json wrapper (php) - Restore quark/jackpot previous speed (differently) - - Nov. 12th 2014 v1.4.8 - Add a basic API and a sample php json wrapper - Add statsavg (def 20) and api-bind parameters - - Nov. 11th 2014 v1.4.7 - Average hashrate (based on the 20 last scans) - Rewrite blake algo - Add the -i (gpu threads/intensity parameter) - Add some X11 optimisations based on sp_ commits - Fix quark reported hashrate and benchmark mode for some algos - Enhance json config file param (int/float/false) (-c config.json) - Update windows prebuilt curl to 7.38.0 - - Oct. 26th 2014 v1.4.6 - Add S3 algo reusing existing code (onecoin) - Small X11 (simd512) enhancement - - Oct. 20th 2014 v1.4.5 - Add keccak algo from djm34 repo (maxcoin) - Curl 7.35 and OpenSSL are now included in the binary (and win tree) - Enhance windows terminal support (--help was broken) - - Sep. 27th 2014 v1.4.4 - First SM 5.2 Release (GTX 970 & 980) - CUDA Runtime included in binary - Colors enabled by default - - Sep. 10th 2014 v1.4.3 - Add algos from djm34 repo (deep, doom, qubit) - Goalcoin seems to be dead, not imported. - Create also the pentablake algo (5x Blake 512) - - Sept 6th 2014 Almost twice the speed on blake256 algos with the "midstate" cache - - Sep. 1st 2014 add X17, optimized x15 and whirl - add blake (256 variant) - color support on Windows, - remove some dll dependencies (pthreads, msvcp) - - Aug. 18th 2014 add X14, X15, Whirl, and Fresh algos, - also add colors and nvprof cmd line support - - June 15th 2014 add X13 and Diamond Groestl support. - Thanks to tsiv and to Bombadil for the contributions! 
- - June 14th 2014 released Killer Groestl quad version which I deem - sufficiently hard to port over to AMD. It isn't - the fastest option for Compute 3.5 and 5.0 cards, - but it is still much faster than the table based - versions. - - May 10th 2014 added X11, but without the bells & whistles - (no killer Groestl, SIMD hash quite slow still) - - May 6th 2014 this adds the quark and animecoin algorithms. - - May 3rd 2014 add the MjollnirCoin hash algorithm for the upcomin - MjollnirCoin relaunch. - - Add the -f (--diff) option to adjust the difficulty - e.g. for the erebor Dwarfpool myr-gr SaffronCoin pool. - Use -f 256 there. - - May 1st 2014 adapt the Jackpot algorithms to changes made by the - coin developers. We keep our unique nVidia advantage - because we have a way to break up the divergence. - NOTE: Jackpot Hash now requires Compute 3.0 or later. - - April, 27 2014 this release adds Myriad-Groestl and Jackpot Coin. - we apply an optimization to Jackpot that turns this - into a Keccak-only CUDA coin ;) Jackpot is tested with - solo--mining only at the moment. - - March, 27 2014 Heavycoin exchange rates soar, and as a result this coin - gets some love: We greatly optimized the Hefty1 kernel - for speed. Expect some hefty gains, especially on 750Ti's! - - By popular demand, we added the -d option as known from - cudaminer. - - different compute capability builds are now provided until - we figure out how to pack everything into a single executable - in a Windows build. - - March, 24 2014 fixed Groestl pool support - - went back to Compute 1.x for cuda_hefty1.cu kernel by - default after numerous reports of ccminer v0.2/v0.3 - not working with HeavyCoin for some people. - - March, 23 2014 added Groestlcoin support. stratum status unknown - (the only pool is currently down for fixing issues) - - March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates - on Fermi and Maxwell devices. 
Kepler may suffer slightly - (3-5%) - - Fixed Stratum for Fuguecoin. Tested on dwarfpool. - - March, 18 2014 initial release. - +2015-02-01 Release 1.0, forked from tpruvot and sp-hash +2015-02-03 v1.01: bug fix for cards with compute capability 3.0 (untested) +2015-02-09 v1.02: various bug fixes and optimizations +2015-03-08 v2.00: added whirlpoolx algo (Vanillacoin), also various optimizations and bug fixes +2015-03-30 v3.00: added skein (for Myriadcoin for example) +2015-05-06 v4.00: added Neoscrypt +2015-05-15 v4.01: fixed crash after ctrl-c (Windows), fixed -g option +2015-07-06 v5.00: -g option removed, some bug fixes and optimizations +2015-07-08 v5.01: lyra2 optimization +2015-08-22 v6.00: remove Lyra2RE, add Lyra2REv2, remove Animecoin, remove yescrypt +2016-05-03 v6.01: various bug fixes and optimizations +2016-05-12 v6.02: faster x17 and quark +2016-05-16 v7.00: added Vanillacoin, optimized blake and blakecoin, + added stratum methods used by yiimp.ccminer.org +2016-05-16 v7.01: stratum.get_stats bug fix +2016-06-02 v7.02: fix default intensity for Nist5 + fix power usage statistics +2016-06-11 v7.03: faster lyra2v2 +2016-06-18 v7.04: Neoscrypt optimization + Bug Fixes +2016-08-11 v8.00: added Siacoin +2016-08-12 v8.01: increase default intensity for Sia + fix Linux build +2016-09-29 v8.02: change to CUDA 8.0 on Windows + various small changes +2016-12-08 v8.03: fix memory leak in Neoscrypt +2016-12-13 v8.04: fix illegal memory access in X11-X17 + fix duplicate shares in skein +2016-12-17 v8.05: fix Skein bug +2017-03-12 v8.06: Heavy and Mjollnir algos removed +2017-05-18 v8.07: Bitcredit algo removed + fixed bugs in bitcoin and jackpot algo +2017-05-19 v8.08: fix Makefile and configure.ac for Linux +2017-06-07 v8.09: some minor bug fixes +2017-07-17 v8.10: fix Orbitcoin solo mining (Neoscrypt) +2017-07-25 v8.11: change some timeout values + fix Feathercoin solo mining (Neoscrypt) + show chance to find a block while solo mining +2017-08-17 v8.12: fix
Myriad-Groestl speed bug >>> AUTHORS <<< @@ -322,7 +244,7 @@ Notable contributors to this application are: Christian Buchner, Christian H. (Germany): Initial CUDA implementation -djm34, tsiv, sp for cuda algos implementation and optimisation +djm34, tsiv, sp and KlausT for cuda algos implementation and optimisation Tanguy Pruvot : 750Ti tuning, blake, colors, general code cleanup/opts API monitoring, linux Config/Makefile and vstudio stuff... diff --git a/Sia/cuda_sia.cu b/Sia/cuda_sia.cu new file mode 100644 index 0000000000..81f403e588 --- /dev/null +++ b/Sia/cuda_sia.cu @@ -0,0 +1,314 @@ +/* +Copyright (c) 2015 KlausT and Vorksholk + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. 
+*/ + + +#include +#include "cuda_helper.h" +#include "sia.h" + +#ifdef _MSC_VER +#define THREAD __declspec(thread) +#else +#define THREAD __thread +#endif + +#ifdef __INTELLISENSE__ +#define __launch_bounds__(blocksize) +#endif + +static THREAD uint64_t *vpre_h; +static THREAD uint32_t *nonceOut_d; +static THREAD uint64_t *hash_d; +__constant__ uint64_t vpre[16]; +__constant__ uint64_t header[10]; + +__device__ __forceinline__ +static uint64_t __byte_perm_64(const uint64_t source, const uint32_t grab1, const uint32_t grab2) +{ + uint64_t r; + uint32_t r1; + uint32_t r2; + + uint32_t i1; + uint32_t i2; + + asm("mov.b64 {%0, %1}, %2;" : "=r"(i1), "=r"(i2) : "l"(source)); + asm("prmt.b32 %0, %1, %2, %3;" : "=r"(r1) : "r"(i1), "r"(i2), "r"(grab1)); + asm("prmt.b32 %0, %1, %2, %3;" : "=r"(r2) : "r"(i1), "r"(i2), "r"(grab2)); + asm("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(r1), "r"(r2)); + + return r; +} + +__device__ __forceinline__ +static uint64_t __swap_hilo(const uint64_t source) +{ + uint64_t r; + uint32_t s1; + uint32_t s2; + + asm("mov.b64 {%0, %1}, %2;" : "=r"(s1), "=r"(s2) : "l"(source)); + asm("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(s2), "r"(s1)); + + return r; +} + +__device__ unsigned int numberofresults; + +__global__ void __launch_bounds__(blocksize, 3) siakernel(uint32_t * __restrict__ nonceOut, uint64_t target, uint64_t startnonce) +{ + uint64_t v[16]; + const uint64_t start = startnonce + (blockDim.x * blockIdx.x + threadIdx.x)*npt; + const uint64_t end = start + npt; + + numberofresults = 0; + + for(uint64_t n = start; n < end; n++) + { + v[2] = 0x5BF2CD1EF9D6B596u + n; v[14] = __swap_hilo(~0x1f83d9abfb41bd6bu ^ v[2]); v[10] = 0x3c6ef372fe94f82bu + v[14]; v[6] = __byte_perm_64(0x1f83d9abfb41bd6bu ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6] + header[5]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = 0x130C253729B586Au + header[6]; v[15] = __swap_hilo(0x5be0cd19137e2179u ^ v[3]); 
v[11] = 0xa54ff53a5f1d36f1u + v[15]; v[7] = __byte_perm_64(0x5be0cd19137e2179u ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7] + header[7]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = vpre[0] + vpre[5] + header[8]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(vpre[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5] + header[9]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = vpre[1] + v[6]; v[12] = __swap_hilo(vpre[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7]; v[13] = __swap_hilo(vpre[13] ^ v[2]); v[8] = vpre[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + vpre[4]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = vpre[9] + v[14]; v[4] = __byte_perm_64(vpre[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + n; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] + header[8]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6] + header[9]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = 
__byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7] + header[6]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5] + header[1]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6] + header[0]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6] + header[2]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7] + header[7]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4] + header[5]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4] + header[3]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4] + header[8]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] 
+ header[0]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6] + header[5]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6] + header[2]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6] + header[3]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6] + header[6]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7] + header[7]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7] + header[1]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4] + header[9]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4] + n; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4] + header[7]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4] + header[9]; v[12] = __byte_perm_64(v[12] ^ 
v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + header[3]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] + header[1]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5] + header[2]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5] + header[6]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6] + header[5]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7] + n; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7] + header[0]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4] + header[8]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 
63); + + v[0] = v[0] + v[4] + header[9]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4] + header[0]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + header[5]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] + header[7]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6] + header[2]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6] + n; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5] + header[1]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7] + header[6]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7] + header[8]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4] + header[3]; v[14] = 
__swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4] + header[2]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + header[6]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6] + header[0]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7] + header[8]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7] + header[3]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5] + n; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6] + header[7]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6] + header[5]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = 
__byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4] + header[1]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4] + header[9]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4] + header[5]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + header[1]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7] + n; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5] + header[0]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5] + header[7]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6] + header[6]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6] + 
header[3]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7] + header[9]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7] + header[2]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4] + header[8]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + header[7]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6] + header[1]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7] + header[3]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7] + header[9]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5] + header[5]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5] + header[0]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 
0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6] + n; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7] + header[8]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7] + header[6]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4] + header[2]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4] + header[6]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] + header[9]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6] + header[3]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7] + header[0]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7] + header[8]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + 
v[0] = v[0] + v[5]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5] + header[2]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6] + header[7]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7] + header[1]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7] + n; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4] + header[5]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4] + header[2]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + header[8]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] + n; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6] + header[7]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107); + v[2] = v[2] + v[6] + header[6]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7] + header[1]; v[15] = __swap_hilo(v[15] ^ 
v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7] + header[5]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6] + header[9]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7] + header[3]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4] + header[0]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4] + header[0]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4] + header[1]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + header[2]; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] + header[3]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63); + v[2] = v[2] + v[6] + n; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 
0x6543, 0x2107); + v[2] = v[2] + v[6] + header[5]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63); + v[3] = v[3] + v[7] + header[6]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107); + v[3] = v[3] + v[7] + header[7]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63); + v[0] = v[0] + v[5] + header[8]; v[15] = __swap_hilo(v[15] ^ v[0]); v[10] = v[10] + v[15]; v[5] = __byte_perm_64(v[5] ^ v[10], 0x6543, 0x2107); + v[0] = v[0] + v[5] + header[9]; v[15] = __byte_perm_64(v[15] ^ v[0], 0x5432, 0x1076); v[10] = v[10] + v[15]; v[5] = ROTR64(v[5] ^ v[10], 63); + v[1] = v[1] + v[6]; v[12] = __swap_hilo(v[12] ^ v[1]); v[11] = v[11] + v[12]; v[6] = __byte_perm_64(v[6] ^ v[11], 0x6543, 0x2107); + v[1] = v[1] + v[6]; v[12] = __byte_perm_64(v[12] ^ v[1], 0x5432, 0x1076); v[11] = v[11] + v[12]; v[6] = ROTR64(v[6] ^ v[11], 63); + v[2] = v[2] + v[7]; v[13] = __swap_hilo(v[13] ^ v[2]); v[8] = v[8] + v[13]; v[7] = __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107); + v[2] = v[2] + v[7]; v[13] = __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076); v[8] = v[8] + v[13]; v[7] = ROTR64(v[7] ^ v[8], 63); + v[3] = v[3] + v[4]; v[14] = __swap_hilo(v[14] ^ v[3]); v[9] = v[9] + v[14]; v[4] = __byte_perm_64(v[4] ^ v[9], 0x6543, 0x2107); + v[3] = v[3] + v[4]; v[14] = __byte_perm_64(v[14] ^ v[3], 0x5432, 0x1076); v[9] = v[9] + v[14]; v[4] = ROTR64(v[4] ^ v[9], 63); + + v[0] = v[0] + v[4]; v[12] = __swap_hilo(v[12] ^ v[0]); v[8] = v[8] + v[12]; v[4] = __byte_perm_64(v[4] ^ v[8], 0x6543, 0x2107); + v[0] = v[0] + v[4]; v[12] = __byte_perm_64(v[12] ^ v[0], 0x5432, 0x1076); v[8] = v[8] + v[12]; v[4] = ROTR64(v[4] ^ v[8], 63); + v[1] = v[1] + v[5] + n; v[13] = __swap_hilo(v[13] ^ v[1]); v[9] = v[9] + v[13]; v[5] = __byte_perm_64(v[5] ^ v[9], 0x6543, 0x2107); + v[1] = v[1] + v[5] + header[8]; v[13] = __byte_perm_64(v[13] ^ v[1], 0x5432, 
0x1076); v[9] = v[9] + v[13]; v[5] = ROTR64(v[5] ^ v[9], 63);
+		v[2] = v[2] + v[6] + header[9]; v[14] = __swap_hilo(v[14] ^ v[2]); v[10] = v[10] + v[14]; v[6] = __byte_perm_64(v[6] ^ v[10], 0x6543, 0x2107);
+		v[2] = v[2] + v[6]; v[14] = __byte_perm_64(v[14] ^ v[2], 0x5432, 0x1076); v[10] = v[10] + v[14]; v[6] = ROTR64(v[6] ^ v[10], 63);
+		v[3] = v[3] + v[7]; v[15] = __swap_hilo(v[15] ^ v[3]); v[11] = v[11] + v[15]; v[7] = __byte_perm_64(v[7] ^ v[11], 0x6543, 0x2107);
+		v[3] = v[3] + v[7] + header[6]; v[15] = __byte_perm_64(v[15] ^ v[3], 0x5432, 0x1076); v[11] = v[11] + v[15]; v[7] = ROTR64(v[7] ^ v[11], 63);
+		// final round is only partially evaluated: just the lanes that feed
+		// the first 64 bits of the digest (h[0]) are computed below
+		v[0] = v[0] + v[5] + header[1];
+		v[0] = v[0] + __byte_perm_64(v[5] ^ (v[10] + __swap_hilo(v[15] ^ v[0])), 0x6543, 0x2107);
+		v[2] = v[2] + v[7];
+		v[13] = __swap_hilo(v[13] ^ v[2]);
+		v[8] = v[8] + v[13];
+		v[2] = v[2] + __byte_perm_64(v[7] ^ v[8], 0x6543, 0x2107) + header[7];
+
+		if(cuda_swab64(0x6A09E667F2BDC928 ^ v[0] ^ (v[8] + __byte_perm_64(v[13] ^ v[2], 0x5432, 0x1076))) < target)
+		{
+			int i = atomicAdd(&numberofresults, 1);
+			if(i < MAXRESULTS)
+				nonceOut[i] = n & 0xffffffff;
+			return;
+		}
+	}
+}
+
+// Host wrapper: searches `threads` nonces starting at `startnonce` on the GPU
+// and copies the result slots back into pinned host memory `nonceOut`.
+// `threads` must be a multiple of blocksize*npt (the caller rounds it down).
+void sia_gpu_hash(cudaStream_t cudastream, int thr_id, uint32_t threads, uint32_t *nonceOut, uint64_t target, uint64_t startnonce)
+{
+	// each block runs `blocksize` threads and each thread tests `npt` nonces,
+	// so the grid needs threads / (blocksize * npt) blocks
+	siakernel<<<threads / (blocksize * npt), blocksize, 0, cudastream>>>(nonceOut_d, target, startnonce);
+	CUDA_SAFE_CALL(cudaGetLastError());
+	CUDA_SAFE_CALL(cudaMemcpyAsync(nonceOut, nonceOut_d, 4 * MAXRESULTS, cudaMemcpyDeviceToHost, cudastream));
+	CUDA_SAFE_CALL(cudaStreamSynchronize(cudastream));
+}
+
+// Per-thread one-time allocation of host/device buffers used by the scanner.
+void sia_gpu_init(int thr_id)
+{
+	CUDA_SAFE_CALL(cudaMallocHost(&vpre_h, 16 * 8));
+	CUDA_SAFE_CALL(cudaMalloc(&nonceOut_d, MAXRESULTS * 4));
+	CUDA_SAFE_CALL(cudaMalloc(&hash_d, 4 * 8));
+}
+
+// Precompute the nonce-independent part of the BLAKE2b state from the block
+// header and upload it (plus the header) to constant memory.
+void sia_precalc(cudaStream_t cudastream, const uint64_t *blockHeader)
+{
+	vpre_h[0] = 0xBB1838E7A0A44BF9u + blockHeader[0]; vpre_h[12] = ROTR64(0x510E527FADE68281u ^ vpre_h[0], 32); vpre_h[8] = 0x6a09e667f3bcc908u + vpre_h[12]; vpre_h[4] = ROTR64(0x510e527fade682d1u ^
vpre_h[8], 24); + vpre_h[0] = vpre_h[0] + vpre_h[4] + blockHeader[1]; vpre_h[12] = ROTR64(vpre_h[12] ^ vpre_h[0], 16); vpre_h[8] = vpre_h[8] + vpre_h[12]; vpre_h[4] = ROTR64(vpre_h[4] ^ vpre_h[8], 63); + vpre_h[1] = 0x566D1711B009135Au + blockHeader[2]; vpre_h[13] = ROTR64(0x9b05688c2b3e6c1fu ^ vpre_h[1], 32); vpre_h[9] = 0xbb67ae8584caa73bu + vpre_h[13]; vpre_h[5] = ROTR64(0x9b05688c2b3e6c1fu ^ vpre_h[9], 24); + vpre_h[1] = vpre_h[1] + vpre_h[5] + blockHeader[3]; vpre_h[13] = ROTR64(vpre_h[13] ^ vpre_h[1], 16); vpre_h[9] = vpre_h[9] + vpre_h[13]; vpre_h[5] = ROTR64(vpre_h[5] ^ vpre_h[9], 63); + + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(vpre, vpre_h, 16 * 8, 0, cudaMemcpyHostToDevice, cudastream)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(header, blockHeader, 10 * 8, 0, cudaMemcpyHostToDevice, cudastream)); + CUDA_SAFE_CALL(cudaMemsetAsync(nonceOut_d, 0, 4 * MAXRESULTS, cudastream)); +} \ No newline at end of file diff --git a/Sia/sia.cu b/Sia/sia.cu new file mode 100644 index 0000000000..ad02c1d569 --- /dev/null +++ b/Sia/sia.cu @@ -0,0 +1,306 @@ +/*- +* blake2b C code from https://github.com/SiaMining/sgminer/blob/master/algorithm/sia.c +* +* Copyright 2009 Colin Percival, 2014 savale +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* 1. Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* +* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +* ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +* SUCH DAMAGE. +* +* This file was originally written by Colin Percival as part of the Tarsnap +* online backup system. +*/ +#include "miner.h" +#include "cuda_helper.h" +#include +using namespace std; +#include +#include "sia.h" + +extern void applog_hex(void *data, int len); +extern bool fulltest_sia(const uint64_t *hash, const uint64_t *target); + +#define B2B_GET64(p) \ + (((uint64_t) ((uint8_t *) (p))[0]) ^ \ + (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \ + (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \ + (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \ + (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \ + (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \ + (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \ + (((uint64_t) ((uint8_t *) (p))[7]) << 56)) + +#define B2B_G(a, b, c, d, x, y) { \ + v[a] = v[a] + v[b] + x; \ + v[d] = ROTR64(v[d] ^ v[a], 32); \ + v[c] = v[c] + v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 24); \ + v[a] = v[a] + v[b] + y; \ + v[d] = ROTR64(v[d] ^ v[a], 16); \ + v[c] = v[c] + v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 63); } + +static const uint64_t blake2b_iv[8] = +{ + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, + 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, + 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 +}; + +typedef struct +{ + uint8_t b[128]; // input buffer + uint64_t h[8]; // chained state + uint64_t t[2]; // total number of bytes + size_t c; // pointer for b[] + size_t outlen; // digest size +} blake2b_ctx; + +void 
blake2b_update(blake2b_ctx *ctx, const void *in, size_t inlen);
+
+// BLAKE2b compression function; processes the 128-byte buffer in ctx->b.
+// "last" indicates the final block (sets the finalization flag on v[14]).
+static void blake2b_compress(blake2b_ctx *ctx, int last)
+{
+	// message word permutation schedule (rounds 10/11 repeat rounds 0/1);
+	// static so the table is not rebuilt on every call
+	static const uint8_t sigma[12][16] =
+	{
+		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+		{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+		{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+		{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+		{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
+		{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+		{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+		{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+		{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
+		{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
+		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+		{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}
+	};
+	int i;
+	uint64_t v[16], m[16];
+
+	for(i = 0; i < 8; i++)
+	{	// init work variables
+		v[i] = ctx->h[i];
+		v[i + 8] = blake2b_iv[i];
+	}
+
+	v[12] ^= ctx->t[0];		// low 64 bits of offset
+	v[13] ^= ctx->t[1];		// high 64 bits
+	if(last)				// last block flag set ?
+		v[14] = ~v[14];
+
+	for(i = 0; i < 16; i++)	// get little-endian words
+		m[i] = B2B_GET64(&ctx->b[8 * i]);
+
+	for(i = 0; i < 12; i++)
+	{	// twelve rounds
+		B2B_G(0, 4, 8, 12, m[sigma[i][0]], m[sigma[i][1]]);
+		B2B_G(1, 5, 9, 13, m[sigma[i][2]], m[sigma[i][3]]);
+		B2B_G(2, 6, 10, 14, m[sigma[i][4]], m[sigma[i][5]]);
+		B2B_G(3, 7, 11, 15, m[sigma[i][6]], m[sigma[i][7]]);
+		B2B_G(0, 5, 10, 15, m[sigma[i][8]], m[sigma[i][9]]);
+		B2B_G(1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
+		B2B_G(2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
+		B2B_G(3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
+	}
+
+	for(i = 0; i < 8; ++i)
+		ctx->h[i] ^= v[i] ^ v[i + 8];
+}
+
+// Initialize the hashing context "ctx" with optional key "key".
+// 1 <= outlen <= 64 gives the digest size in bytes.
+// Secret key (also <= 64 bytes) is optional (keylen = 0).
+int blake2b_init(blake2b_ctx *ctx, size_t outlen, const void *key, size_t keylen)	// (keylen=0: no key)
+{
+	size_t i;
+
+	if(outlen == 0 || outlen > 64 || keylen > 64)
+		return -1;				// illegal parameters
+
+	for(i = 0; i < 8; i++)		// state, "param block"
+		ctx->h[i] = blake2b_iv[i];
+	ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
+
+	ctx->t[0] = 0;				// input count low word
+	ctx->t[1] = 0;				// input count high word
+	ctx->c = 0;					// pointer within buffer
+	ctx->outlen = outlen;
+
+	for(i = keylen; i < 128; i++)	// zero input block
+		ctx->b[i] = 0;
+	if(keylen > 0)
+	{
+		blake2b_update(ctx, key, keylen);
+		ctx->c = 128;			// at the end
+	}
+
+	return 0;
+}
+
+// Add "inlen" bytes from "in" into the hash.
+void blake2b_update(blake2b_ctx *ctx, const void *in, size_t inlen)
+{
+	size_t i;
+
+	for(i = 0; i < inlen; i++)
+	{
+		if(ctx->c == 128)
+		{	// buffer full ?
+			ctx->t[0] += ctx->c;		// add counters
+			if(ctx->t[0] < ctx->c)		// carry overflow ?
+				ctx->t[1]++;			// high word
+			blake2b_compress(ctx, 0);	// compress (not last)
+			ctx->c = 0;					// counter to zero
+		}
+		ctx->b[ctx->c++] = ((const uint8_t *)in)[i];
+	}
+}
+
+// Generate the message digest (size given in init).
+// Result placed in "out".
+void blake2b_final(blake2b_ctx *ctx, void *out) +{ + size_t i; + + ctx->t[0] += ctx->c; // mark last block offset + if(ctx->t[0] < ctx->c) // carry overflow + ctx->t[1]++; // high word + + while(ctx->c < 128) // fill up with zeros + ctx->b[ctx->c++] = 0; + blake2b_compress(ctx, 1); // final block flag = 1 + + // little endian convert and store + for(i = 0; i < ctx->outlen; i++) + { + ((uint8_t *)out)[i] = + (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF; + } +} + +void siahash(const void *data, unsigned int len, void *hash) +{ + blake2b_ctx ctx; + blake2b_init(&ctx, 32, NULL, 0); + blake2b_update(&ctx, data, len); + blake2b_final(&ctx, hash); +} + +/***************************************************************************/ + +int scanhash_sia(int thr_id, uint32_t *pdata, uint32_t *ptarget, uint32_t max_nonce, uint32_t *hashes_done) +{ + static THREAD uint32_t *h_nounce = nullptr; + const uint32_t first_nonce = pdata[8]; + static THREAD uint32_t throughputmax; + + if(opt_benchmark) + ptarget[7] = 0x00000001; + + static THREAD volatile bool init = false; + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMallocHost(&h_nounce, MAXRESULTS * sizeof(uint32_t))); + sia_gpu_init(thr_id); + + throughputmax = device_intensity(device_map[thr_id], __func__, 1U << 28); + mining_has_stopped[thr_id] = false; + init = true; + } + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)); + throughput -= throughput % (blocksize*npt); + + sia_precalc(gpustream[thr_id], (uint64_t *)pdata); + + uint32_t endiandata[20]; + for(int k = 0; k < 20; k++) + le32enc(&endiandata[k], pdata[k]); + + do + { + sia_gpu_hash(gpustream[thr_id], thr_id, throughput, h_nounce, ((uint64_t*)ptarget)[3], ((uint64_t*)pdata)[4]); + if(stop_mining) + { + cudaDeviceSynchronize(); + 
cudaStreamDestroy(gpustream[thr_id]); + cudaProfilerStop(); + mining_has_stopped[thr_id] = true; + pthread_exit(nullptr); + } + if(h_nounce[0] != 0) + { + const uint64_t Htarg = ((uint64_t*)ptarget)[3]; + uint64_t vhash64[4] = {0}; + if(opt_verify) + { + le32enc(&endiandata[8], h_nounce[0]); + siahash(endiandata, 80, vhash64); + } + if(swab64(vhash64[0]) <= Htarg && fulltest_sia(vhash64, (uint64_t*)ptarget)) + { + int res = 1; + *hashes_done = pdata[8] - first_nonce + throughput; + if(opt_benchmark || opt_debug) applog(LOG_INFO, "GPU #%d: Found nonce %08x", device_map[thr_id], h_nounce[0]); + // check if there was some other ones... + if(h_nounce[1] != 0) + { + if(opt_verify) + { + le32enc(&endiandata[8], h_nounce[1]); + siahash(vhash64, 80, endiandata); + + } + if(swab64(vhash64[0]) <= Htarg && fulltest_sia(vhash64, (uint64_t*)ptarget)) + { + pdata[20] = h_nounce[1]; + res++; + if(opt_benchmark || opt_debug) applog(LOG_INFO, "GPU #%d: Found second nonce", device_map[thr_id]); + } + else + { + if(vhash64[0] != Htarg) // don't show message if it is equal but fails fulltest + applog(LOG_INFO, "GPU #%d: result does not validate on CPU!", device_map[thr_id]); + } + } + pdata[8] = h_nounce[0]; +// applog(LOG_INFO, "hashes done = %08x", *hashes_done); + return res; + } + else + { + if(vhash64[0] != Htarg) // don't show message if it is equal but fails fulltest + applog(LOG_INFO, "GPU #%d: result does not validate on CPU!", device_map[thr_id]); + } + } + pdata[8] += throughput; + CUDA_SAFE_CALL(cudaGetLastError()); + + } while(!work_restart[thr_id].restart && ((uint64_t)max_nonce >((uint64_t)pdata[8] + (uint64_t)throughput))); + *hashes_done = pdata[8] - first_nonce; + return 0; +} \ No newline at end of file diff --git a/Sia/sia.h b/Sia/sia.h new file mode 100644 index 0000000000..e180cf6363 --- /dev/null +++ b/Sia/sia.h @@ -0,0 +1,10 @@ +#pragma once + +#define MAXRESULTS 8 + +#define npt 1 +#define blocksize 512 + +void sia_gpu_init(int thr_id); +void 
sia_precalc(cudaStream_t cudastream, const uint64_t *blockHeader); +void sia_gpu_hash(cudaStream_t cudastream, int thr_id, uint32_t threads, uint32_t *nonceOut, uint64_t target, uint64_t startnonce); diff --git a/api.cpp b/api.cpp index c1860f6d18..38572f1456 100644 --- a/api.cpp +++ b/api.cpp @@ -15,19 +15,18 @@ //# include #endif -#include +#include #include -#include -#include -#include +#include +#include #include #include #include -#include -#include -#include -#include - +#include +#include +#include +#include +using namespace std; #include #include @@ -90,7 +89,7 @@ static time_t startup = 0; static int bye = 0; extern char *opt_api_allow; -extern int opt_api_listen; /* port */ +extern uint16_t opt_api_listen; /* port */ extern uint32_t accepted_count; extern uint32_t rejected_count; extern int num_cpus; @@ -122,6 +121,7 @@ static void gpustatus(int thr_id) cgpu->gpu_temp = gpu_temp(cgpu); cgpu->gpu_fan = (uint16_t) gpu_fanpercent(cgpu); cgpu->gpu_fan_rpm = (uint16_t) gpu_fanrpm(cgpu); + cgpu->gpu_power = gpu_power(cgpu); #endif cuda_gpu_clocks(cgpu); @@ -270,12 +270,7 @@ static const char* os_name() #ifdef WIN32 return "windows"; #else - FILE *fd = fopen("/proc/version", "r"); - if (!fd || !fscanf(fd, "Linux version %48s", &os_version[6])) return "linux"; - fclose(fd); - os_version[48] = '\0'; - return (const char*) os_version; #endif } @@ -426,8 +421,10 @@ static size_t base64_encode(const uchar *indata, size_t insize, char *outptr, si memset(outptr, 0, outlen); outbuf = output = (char*)calloc(1, inlen * 4 / 3 + 4); - if (outbuf == NULL) { - return -1; + if(outbuf == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); } while (inlen > 0) { @@ -479,7 +476,7 @@ static size_t base64_encode(const uchar *indata, size_t insize, char *outptr, si return len; } -#include "compat/curl-for-windows/openssl/openssl/crypto/sha/sha.h" +#include "openssl/sha.h" /* websocket handshake (tested in Chrome) */ static int websocket_handshake(SOCKETTYPE c, char 
*result, char *clientkey) @@ -539,8 +536,11 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey) size_t handlen = strlen(answer); uchar *data = (uchar*) calloc(1, handlen + frames + (size_t) datalen + 1); - if (data == NULL) - return -1; + if(data == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } else { uchar *p = data; // HTTP header 101 @@ -565,8 +565,11 @@ static void setup_ipaccess() char group; buf = (char*) calloc(1, strlen(opt_api_allow) + 1); - if (unlikely(!buf)) - proper_exit(1);//, "Failed to malloc ipaccess buf"); + if(buf == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } strcpy(buf, opt_api_allow); ipcount = 1; @@ -576,8 +579,11 @@ static void setup_ipaccess() // possibly more than needed, but never less ipaccess = (struct IP4ACCESS *) calloc(ipcount, sizeof(struct IP4ACCESS)); - if (unlikely(!ipaccess)) - proper_exit(1);//, "Failed to calloc ipaccess"); + if(ipaccess == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } ips = 0; ptr = buf; @@ -676,11 +682,11 @@ static bool check_connect(struct sockaddr_in *cli, char **connectaddr, char *gro static void api() { const char *addr = opt_api_allow; - short int port = opt_api_listen; // 4068 + uint16_t port = opt_api_listen; // 4068 char buf[MYBUFSIZ]; int c, n, bound; - char *connectaddr; - char *binderror; + char *connectaddr = nullptr; + char *binderror = nullptr; char group; time_t bindstart; struct sockaddr_in serv; @@ -688,11 +694,11 @@ static void api() socklen_t clisiz; bool addrok = false; long long counter; - char *result; - char *params; + char *result = nullptr; + char *params = nullptr; int i; - SOCKETTYPE *apisock; + SOCKETTYPE *apisock = nullptr; if (!opt_api_listen && opt_debug) { applog(LOG_DEBUG, "API disabled"); return; @@ -706,6 +712,11 @@ static void api() } apisock = (SOCKETTYPE*) calloc(1, sizeof(*apisock)); + if(apisock == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } 
*apisock = INVSOCK; sleep(1); @@ -748,14 +759,29 @@ static void api() binderror = strerror(errno); if ((time(NULL) - bindstart) > 61) break; - else { + else if (opt_api_listen == 4068) { + /* when port is default one, use first available */ + if (opt_debug) + applog(LOG_DEBUG, "API bind to port %d failed, trying port %u", + port, (uint32_t) port+1); + port++; + serv.sin_port = htons(port); + sleep(1); + } else { if (!opt_quiet || opt_debug) - applog(LOG_WARNING, "API bind to port %d failed - trying again in 20sec", port); + applog(LOG_WARNING, "API bind to port %u failed - trying again in 20sec", + (uint32_t) port); sleep(20); } } - else + else { bound = 1; + if (opt_api_listen != port) { + applog(LOG_WARNING, "API bind to port %d failed - using port %u", + opt_api_listen, (uint32_t) port); + opt_api_listen = port; + } + } } if (bound == 0) { @@ -772,13 +798,19 @@ static void api() } buffer = (char *) calloc(1, MYBUFSIZ + 1); + if(buffer == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } counter = 0; while (bye == 0) { counter++; clisiz = sizeof(cli); - if (SOCKETFAIL(c = accept(*apisock, (struct sockaddr *)(&cli), &clisiz))) { + c = accept(*apisock, (struct sockaddr*) (&cli), &clisiz); + if (SOCKETFAIL(c)) { applog(LOG_ERR, "API failed (%s)%s", strerror(errno), UNAVAILABLE); CLOSESOCKET(*apisock); free(apisock); diff --git a/autogen.sh b/autogen.sh index 8261a2c136..a4768b525a 100755 --- a/autogen.sh +++ b/autogen.sh @@ -1 +1 @@ -aclocal && autoheader && automake --add-missing --gnu --copy && autoconf +aclocal && autoheader && automake --add-missing --gnu --copy && autoconf diff --git a/bitcoin.cu b/bitcoin.cu index 6f8b2b1107..aecf49c35c 100644 --- a/bitcoin.cu +++ b/bitcoin.cu @@ -1,8 +1,6 @@ #include "miner.h" #include "cuda_helper.h" -static uint32_t *h_nounce[MAX_GPUS]; - extern void bitcoin_cpu_init(int thr_id); extern void bitcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *const ms, uint32_t merkle, 
uint32_t time, uint32_t compacttarget, uint32_t *const h_nounce); extern void bitcoin_midstate(const uint32_t *data, uint32_t *midstate); @@ -112,39 +110,42 @@ void bitcoin_hash(uint32_t *output, const uint32_t *data, uint32_t nonce, const b = a; a = t1 + t2; } - output[0] = a + hc[0]; - output[1] = b + hc[1]; - output[2] = c + hc[2]; - output[3] = d + hc[3]; - output[4] = e + hc[4]; - output[5] = f + hc[5]; - output[6] = g + hc[6]; - output[7] = h + hc[7]; + be32enc(&output[0], a + hc[0]); + be32enc(&output[1], b + hc[1]); + be32enc(&output[2], c + hc[2]); + be32enc(&output[3], d + hc[3]); + be32enc(&output[4], e + hc[4]); + be32enc(&output[5], f + hc[5]); + be32enc(&output[6], g + hc[6]); + be32enc(&output[7], h + hc[7]); } -static bool init[MAX_GPUS] = { 0 }; int scanhash_bitcoin(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *h_nounce = nullptr; + const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 24); - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughput = device_intensity(device_map[thr_id], __func__, 1U << 28); + throughput = min(throughput, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0005; + ptarget[7] = 0x0005; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); bitcoin_cpu_init(thr_id); - CUDA_SAFE_CALL(cudaMallocHost(&h_nounce[thr_id], 2 * sizeof(uint32_t))); - init[thr_id] = true; + CUDA_SAFE_CALL(cudaMallocHost(&h_nounce, 2 * sizeof(uint32_t))); + mining_has_stopped[thr_id] = false; + init 
= true; } uint32_t ms[8]; @@ -152,40 +153,52 @@ int scanhash_bitcoin(int thr_id, uint32_t *pdata, do { - bitcoin_cpu_hash(thr_id, (int)throughput, pdata[19], ms, pdata[16], pdata[17], pdata[18], h_nounce[thr_id]); - if (h_nounce[thr_id][0] != UINT32_MAX) + bitcoin_cpu_hash(thr_id, throughput, pdata[19], ms, pdata[16], pdata[17], pdata[18], h_nounce); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_nounce[0] != UINT32_MAX) { - uint32_t vhash64[8]; - bitcoin_hash(vhash64, pdata, h_nounce[thr_id][0], ms); - if (vhash64[7] == 0 && fulltest(vhash64, ptarget)) + uint32_t vhash64[8]={0}; + bitcoin_hash(vhash64, pdata, h_nounce[0], ms); + if (!opt_verify || (vhash64[7] == 0 && fulltest(vhash64, ptarget))) { int res = 1; // check if there was some other ones... *hashes_done = pdata[19] - first_nonce + throughput; - if (h_nounce[thr_id][1] != 0xffffffff) + if (h_nounce[1] != 0xffffffff) { - pdata[21] = h_nounce[thr_id][1]; - res++; - if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, h_nounce[thr_id][1]); + bitcoin_hash(vhash64, pdata, h_nounce[1], ms); + if (!opt_verify || (vhash64[7] == 0 && fulltest(vhash64, ptarget))) + { + pdata[21] = h_nounce[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_nounce[1]); + } + else + { + if (vhash64[7] > 0) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_nounce[1]); + } + } } - pdata[19] = h_nounce[thr_id][0]; + pdata[19] = h_nounce[0]; if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, h_nounce[thr_id][0]); + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_nounce[0]); return res; } else { if (vhash64[7] > 0) { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, h_nounce[thr_id][0]); + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate 
on CPU!", device_map[thr_id], h_nounce[0]); } } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/bitslice_transformations_quad.cu b/bitslice_transformations_quad.cu index ddbeb1aa81..acfd6e17a2 100644 --- a/bitslice_transformations_quad.cu +++ b/bitslice_transformations_quad.cu @@ -1,72 +1,81 @@ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 -/** - * __shfl() returns the value of var held by the thread whose ID is given by srcLane. - * If srcLane is outside the range 0..width-1, the thread's own value of var is returned. - */ -#undef __shfl -#define __shfl(var, srcLane, width) (uint32_t)(var) -#endif -#define merge8(z,x,y)\ - z=__byte_perm(x, y, 0x5140); \ +#define merge8(z, x, y, b)\ + z=__byte_perm(x, y, b); \ #define SWAP8(x,y)\ x=__byte_perm(x, y, 0x5410); \ y=__byte_perm(x, y, 0x7632); #define SWAP4(x,y)\ - t = (y<<4); \ - t = (x ^ t); \ - t = 0xf0f0f0f0UL & t; \ + t = 0xf0f0f0f0UL & (x ^ (y<<4)); \ x = (x ^ t); \ t= t>>4;\ y= y ^ t; +#ifndef NOASM +#define SWAP4_final(x,y)\ + asm("and.b32 %0, %0, 0x0f0f0f0f;"\ + "and.b32 %1, %1, 0x0f0f0f0f;"\ + "vshl.u32.u32.u32.clamp.add %0, %1, 4, %0;\n\t"\ + : "+r"(x) : "r"(y)); +#else +#define SWAP4_final(x,y)\ + t = 0xf0f0f0f0UL & (x ^ (y << 4)); \ + x = (x ^ (0xf0f0f0f0UL & (x ^ (y << 4)))); +#endif + #define SWAP2(x,y)\ - t = (y<<2); \ - t = (x ^ t); \ - t = 0xccccccccUL & t; \ + t = 0xccccccccUL & (x ^ (y<<2)); \ x = (x ^ t); \ t= t>>2;\ y= y ^ t; #define SWAP1(x,y)\ - t = (y+y); \ - t = (x ^ t); \ - t = 0xaaaaaaaaUL & t; \ + t = 0xaaaaaaaaUL & (x ^ (y<<1)); \ x = (x ^ t); \ - t= t>>1;\ - y= y ^ t; - + t = t>>1;\ + y = y ^ t; __device__ __forceinline__ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) { - 
uint32_t other[8]; + uint32_t other[8]; uint32_t t; - #pragma unroll - for (int i = 0; i < 8; i++) + const uint32_t perm = (threadIdx.x & 1) ? 0x7362 : 0x5140; + const unsigned int n = threadIdx.x & 3; +#pragma unroll + for(int i = 0; i < 4; i++) { - const unsigned int n = threadIdx.x & 3; input[i] = __shfl((int)input[i], n ^ (3 * (n >= 1 && n <= 2)), 4); - other[i] = __shfl((int)input[i], (threadIdx.x + 1) & 3, 4); - input[i] = __shfl((int)input[i], threadIdx.x & 2, 4); - other[i] = __shfl((int)other[i], threadIdx.x & 2, 4); - if (threadIdx.x & 1) { - input[i] = __byte_perm(input[i], 0, 0x1032); - other[i] = __byte_perm(other[i], 0, 0x1032); - } - } - - merge8(output[0], input[0], input[4]); - merge8(output[1], other[0], other[4]); - merge8(output[2], input[1], input[5]); - merge8(output[3], other[1], other[5]); - merge8(output[4], input[2], input[6]); - merge8(output[5], other[2], other[6]); - merge8(output[6], input[3], input[7]); - merge8(output[7], other[3], other[7]); + other[i] = __shfl((int)input[i], (threadIdx.x + 1) & 3, 4); + input[i] = __shfl((int)input[i], threadIdx.x & 2, 4); + other[i] = __shfl((int)other[i], threadIdx.x & 2, 4); + } + + if((threadIdx.x & 3) < 2) + { + input[4] = 0x80; + } + else + { + input[4] = 0; + } + + if((threadIdx.x & 3) > 1) + other[7] = 0x01000000; + else + other[7] = 0; + input[7] = 0; + + merge8(output[0], input[0], input[4], perm); + merge8(output[1], other[0], 0, perm); + merge8(output[2], input[1], 0, perm); + merge8(output[3], other[1], 0, perm); + merge8(output[4], input[2], 0, perm); + merge8(output[5], other[2], 0, perm); + merge8(output[6], input[3], 0, perm); + merge8(output[7], other[3], other[7], perm); SWAP1(output[0], output[1]); SWAP1(output[2], output[3]); @@ -85,15 +94,67 @@ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __rest } __device__ __forceinline__ -void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) +void 
myr_to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) { + uint32_t other[8]; + uint32_t t; + + const uint32_t perm = (threadIdx.x & 1) ? 0x7362 : 0x5140; + const unsigned int n = threadIdx.x & 3; +#pragma unroll + for(int i = 0; i < 5; i++) + { + input[i] = __shfl((int)input[i], n ^ (3 * (n >= 1 && n <= 2)), 4); + other[i] = __shfl((int)input[i], (threadIdx.x + 1) & 3, 4); + input[i] = __shfl((int)input[i], threadIdx.x & 2, 4); + other[i] = __shfl((int)other[i], threadIdx.x & 2, 4); + } + if(n < 2) + { + input[5] = 0x80; + other[7] = 0; + } + else + { + input[5] = 0; + other[7] = 0x01000000; + } + + merge8(output[0], input[0], input[4], perm); + merge8(output[1], other[0], other[4], perm); + merge8(output[2], input[1], input[5], perm); + output[3] = __byte_perm(other[1], 0, perm); + output[4] = __byte_perm(input[2], 0, perm); + output[5] = __byte_perm(other[2], 0, perm); + output[6] = __byte_perm(input[3], 0, perm); + merge8(output[7], other[3], other[7], perm); + SWAP1(output[0], output[1]); + SWAP1(output[2], output[3]); + SWAP1(output[4], output[5]); + SWAP1(output[6], output[7]); + + SWAP2(output[0], output[2]); + SWAP2(output[1], output[3]); + SWAP2(output[4], output[6]); + SWAP2(output[5], output[7]); + + SWAP4(output[0], output[4]); + SWAP4(output[1], output[5]); + SWAP4(output[2], output[6]); + SWAP4(output[3], output[7]); +} + +__device__ __forceinline__ +void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) +{ uint32_t t; + const uint32_t perm = 0x7531;//(threadIdx.x & 1) ? 
0x3175 : 0x7531; - output[0] = __byte_perm(input[0], input[4], 0x7531); - output[2] = __byte_perm(input[1], input[5], 0x7531); - output[8] = __byte_perm(input[2], input[6], 0x7531); - output[10] = __byte_perm(input[3], input[7], 0x7531); + output[0] = __byte_perm(input[0], input[4], perm); + output[2] = __byte_perm(input[1], input[5], perm); + output[8] = __byte_perm(input[2], input[6], perm); + output[10] = __byte_perm(input[3], input[7], perm); SWAP1(output[0], output[2]); SWAP1(output[8], output[10]); @@ -112,57 +173,87 @@ void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *cons SWAP4(output[0], output[8]); SWAP4(output[2], output[10]); - output[4] = output[0]; - output[6] = output[2]; - output[12] = output[8]; - output[14] = output[10]; - - if (threadIdx.x & 1) + if(threadIdx.x & 1) { + output[14] = __byte_perm(output[10], 0, 0x3232); + output[12] = __byte_perm(output[8], 0, 0x3232); + output[6] = __byte_perm(output[2], 0, 0x3232); + output[4] = __byte_perm(output[0], 0, 0x3232); + output[0] = __byte_perm(output[0], 0, 0x1032); output[2] = __byte_perm(output[2], 0, 0x1032); - output[4] = __byte_perm(output[4], 0, 0x3232); - output[6] = __byte_perm(output[6], 0, 0x3232); output[8] = __byte_perm(output[8], 0, 0x1032); output[10] = __byte_perm(output[10], 0, 0x1032); - output[12] = __byte_perm(output[12], 0, 0x3232); - output[14] = __byte_perm(output[14], 0, 0x3232); + } + else + { + output[4] = output[0]; + output[6] = output[2]; + output[12] = output[8]; + output[14] = output[10]; } output[0] = __byte_perm(output[0], __shfl((int)output[0], (threadIdx.x + 1) & 3, 4), 0x7610); - output[0 + 1] = __shfl((int)output[0], (threadIdx.x + 2) & 3, 4); - output[2] = __byte_perm(output[2], __shfl((int)output[2], (threadIdx.x + 1) & 3, 4), 0x7610); - output[2 + 1] = __shfl((int)output[2], (threadIdx.x + 2) & 3, 4); - output[4] = __byte_perm(output[4], __shfl((int)output[4], (threadIdx.x + 1) & 3, 4), 0x7632); - output[4 + 1] = __shfl((int)output[4], 
(threadIdx.x + 2) & 3, 4); - output[6] = __byte_perm(output[6], __shfl((int)output[6], (threadIdx.x + 1) & 3, 4), 0x7632); - output[6 + 1] = __shfl((int)output[6], (threadIdx.x + 2) & 3, 4); - output[8] = __byte_perm(output[8], __shfl((int)output[8], (threadIdx.x + 1) & 3, 4), 0x7610); - output[8 + 1] = __shfl((int)output[8], (threadIdx.x + 2) & 3, 4); - output[10] = __byte_perm(output[10], __shfl((int)output[10], (threadIdx.x + 1) & 3, 4), 0x7610); - output[10 + 1] = __shfl((int)output[10], (threadIdx.x + 2) & 3, 4); - output[12] = __byte_perm(output[12], __shfl((int)output[12], (threadIdx.x + 1) & 3, 4), 0x7632); - output[12 + 1] = __shfl((int)output[12], (threadIdx.x + 2) & 3, 4); - output[14] = __byte_perm(output[14], __shfl((int)output[14], (threadIdx.x + 1) & 3, 4), 0x7632); + + output[0 + 1] = __shfl((int)output[0], (threadIdx.x + 2) & 3, 4); + output[2 + 1] = __shfl((int)output[2], (threadIdx.x + 2) & 3, 4); + output[4 + 1] = __shfl((int)output[4], (threadIdx.x + 2) & 3, 4); + output[6 + 1] = __shfl((int)output[6], (threadIdx.x + 2) & 3, 4); + output[8 + 1] = __shfl((int)output[8], (threadIdx.x + 2) & 3, 4); + output[10 + 1] = __shfl((int)output[10], (threadIdx.x + 2) & 3, 4); + output[12 + 1] = __shfl((int)output[12], (threadIdx.x + 2) & 3, 4); output[14 + 1] = __shfl((int)output[14], (threadIdx.x + 2) & 3, 4); -/* if (threadIdx.x & 3) +} + +__device__ __forceinline__ +void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) +{ + uint32_t t; + const uint32_t perm = 0x7531;//(threadIdx.x & 1) ? 
0x3175 : 0x7531; + + output[0] = __byte_perm(input[0], input[4], perm); + output[2] = __byte_perm(input[1], input[5], perm); + output[8] = __byte_perm(input[2], input[6], perm); + output[10] = __byte_perm(input[3], input[7], perm); + + SWAP1(output[0], output[2]); + SWAP1(output[8], output[10]); + + SWAP2(output[2], output[10]); + + output[6] = __byte_perm(output[2], output[10], 0x5410); + output[10] = __byte_perm(output[2], output[10], 0x7632); + + if(threadIdx.x & 3) { - output[0] = output[0 + 1] = 0; - output[2] = output[2 + 1] = 0; - output[4] = output[4 + 1] = 0; - output[6] = output[6 + 1] = 0; - output[8] = output[8 + 1] = 0; - output[10] = output[10 + 1] = 0; - output[12] = output[12 + 1] = 0; - output[14] = output[14 + 1] = 0; + SWAP4_final(output[6], output[10]); + output[6] = __byte_perm(output[6], 0, 0x3232); } -*/ + else + { + output[2] = output[6]; + + SWAP4(output[2], output[10]); + + if(threadIdx.x & 1) + { + output[6] = __byte_perm(output[2], 0, 0x3232); + } + else + { + output[6] = output[2]; + } + } + + output[6] = __byte_perm(output[6], __shfl((int)output[6], (threadIdx.x + 1) & 3, 4), 0x7632); + output[7] = __shfl((int)output[6], (threadIdx.x + 2) & 3, 4); + } diff --git a/build.sh b/build.sh index 17935f3968..9a15d5c01d 100755 --- a/build.sh +++ b/build.sh @@ -13,4 +13,5 @@ rm -f config.status # CFLAGS="-O2" ./configure ./configure.sh -make -j 4 +make -j4 + diff --git a/cpuminer-config.h b/ccminer-config-win.h similarity index 93% rename from cpuminer-config.h rename to ccminer-config-win.h index 51fca9fe5d..661e032627 100644 --- a/cpuminer-config.h +++ b/ccminer-config-win.h @@ -14,7 +14,7 @@ /* Define to 1 if you have and it should be used (not on Ultrix). */ -#define HAVE_ALLOCA_H 1 +//#define HAVE_ALLOCA_H 1 /* Define to 1 if you have the declaration of `be32dec', and to 0 if you don't. */ @@ -39,7 +39,7 @@ #define HAVE_INTTYPES_H 1 /* Define to 1 if you have the `crypto' library (-lcrypto). 
*/ -#define HAVE_LIBCRYPTO 1 +//#define HAVE_LIBCRYPTO 1 /* Define to 1 if you have a functional curl library. */ #define HAVE_LIBCURL 1 @@ -57,31 +57,31 @@ #define HAVE_STDLIB_H 1 /* Define to 1 if you have the header file. */ -#define HAVE_STRINGS_H 1 +//#define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 /* Define to 1 if you have the header file. */ -#define HAVE_SYSLOG_H 1 +//#define HAVE_SYSLOG_H 1 /* Define to 1 if you have the header file. */ /* #undef HAVE_SYS_ENDIAN_H */ /* Define to 1 if you have the header file. */ -#define HAVE_SYS_PARAM_H 1 +//#define HAVE_SYS_PARAM_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_STAT_H 1 /* Define to 1 if you have the header file. */ -#define HAVE_SYS_SYSCTL_H 1 +//#define HAVE_SYS_SYSCTL_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 /* Define to 1 if you have the header file. */ -#define HAVE_UNISTD_H 1 +//#define HAVE_UNISTD_H 1 /* Defined if libcurl supports AsynchDNS */ #define LIBCURL_FEATURE_ASYNCHDNS 1 @@ -156,7 +156,7 @@ #define PACKAGE_NAME "ccminer" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "ccminer 1.5.31-git(SP-MOD)" +#define PACKAGE_STRING "ccminer 8.12-KlausT" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "ccminer" @@ -165,7 +165,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "1.5.31-git(SP-MOD)" +#define PACKAGE_VERSION "8.12-KlausT" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be @@ -179,16 +179,16 @@ #define STDC_HEADERS 1 /* Define to 1 if AVX assembly is available. */ -#define USE_AVX 1 +#define USE_AVX 0 /* Define to 1 if AVX2 assembly is available. */ -#define USE_AVX2 1 +#define USE_AVX2 0 /* Define to 1 if XOP assembly is available. 
*/ -#define USE_XOP 1 +//#define USE_XOP 1 /* Version number of package */ -#define VERSION "1.5.31-git(SP-MOD)" +#define VERSION "8.12-KlausT" /* Define curl_free() as free() if our version of curl lacks curl_free. */ /* #undef curl_free */ diff --git a/ccminer.cpp b/ccminer.cpp index 6445654c25..74e919d3b4 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -1,6 +1,7 @@ /* * Copyright 2010 Jeff Garzik * Copyright 2012-2014 pooler + * Copyright 2014-2015 tpruvot * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -8,18 +9,22 @@ * any later version. See COPYING for more details. */ -#include "cpuminer-config.h" +#ifndef WIN32 +#include "ccminer-config.h" +#else +#include "ccminer-config-win.h" +#endif +#include "cuda_runtime_api.h" -#include -#include -#include -#include -#include +#include +#include +#include +#include #include -#include +#include #include -#include -#include +#include +#include #include #include @@ -27,7 +32,7 @@ #ifdef WIN32 #include -#include +#include #else #include #include @@ -39,6 +44,7 @@ #include #endif #endif +using namespace std; #include "miner.h" @@ -50,15 +56,17 @@ BOOL WINAPI ConsoleHandler(DWORD); #endif #define PROGRAM_NAME "ccminer" -#define LP_SCANTIME 60 -#define HEAVYCOIN_BLKHDR_SZ 84 +#define LP_SCANTIME 25 #define MNR_BLKHDR_SZ 80 +double expectedblocktime(const uint32_t *target); + // from cuda.cpp int cuda_num_devices(); void cuda_devicenames(); void cuda_devicereset(); int cuda_finddevice(char *name); +void cuda_print_devices(); #include "nvml.h" #ifdef USE_WRAPNVML @@ -78,70 +86,43 @@ struct workio_cmd { } u; }; -enum sha_algos { - ALGO_ANIME, - ALGO_BITCOIN, - ALGO_BLAKE, - ALGO_BLAKECOIN, - ALGO_DEEP, - ALGO_DMD_GR, - ALGO_DOOM, - ALGO_FRESH, - ALGO_FUGUE256, /* Fugue256 */ - ALGO_GROESTL, - ALGO_HEAVY, /* Heavycoin hash */ - ALGO_KECCAK, - ALGO_JACKPOT, - ALGO_LUFFA_DOOM, - ALGO_LYRA2, - ALGO_MJOLLNIR, /* Hefty hash */ - 
ALGO_MYR_GR, - ALGO_NIST5, - ALGO_PENTABLAKE, - ALGO_QUARK, - ALGO_QUBIT, - ALGO_S3, - ALGO_SPREADX11, - ALGO_WHC, - ALGO_X11, - ALGO_X13, - ALGO_X14, - ALGO_X15, - ALGO_X17, -}; - static const char *algo_names[] = { - "anime", "bitcoin", "blake", "blakecoin", + "c11", "deep", "dmd-gr", "doom", /* is luffa */ "fresh", "fugue256", "groestl", - "heavy", "keccak", "jackpot", "luffa", - "lyra2", - "mjollnir", + "lyra2v2", "myr-gr", "nist5", "penta", "quark", "qubit", + "sia", + "skein", "s3", "spread", "whirl", + "whirlpoolx", "x11", "x13", "x14", "x15", "x17", + "vanilla", + "neoscrypt" }; +char curl_err_str[CURL_ERROR_SIZE]; +bool opt_verify = true; bool opt_debug = false; bool opt_protocol = false; bool opt_benchmark = false; @@ -150,6 +131,7 @@ bool have_longpoll = false; bool want_stratum = true; bool have_stratum = false; bool allow_gbt = true; +bool allow_mininginfo = true; bool check_dups = false; static bool submit_old = false; bool use_syslog = false; @@ -157,54 +139,63 @@ bool use_colors = true; static bool opt_background = false; bool opt_quiet = false; static int opt_retries = -1; -static int opt_fail_pause = 30; +static int opt_fail_pause = 10; int opt_timeout = 270; -static int opt_scantime = 5; -static json_t *opt_config; +static int opt_scantime = 25; +static json_t *opt_config = nullptr; static const bool opt_time = true; -static enum sha_algos opt_algo = ALGO_X11; +enum sha_algos opt_algo; int opt_n_threads = 0; int opt_affinity = -1; int opt_priority = 0; static double opt_difficulty = 1; // CH +static bool opt_extranonce = true; bool opt_trust_pool = false; -uint16_t opt_vote = 9999; int num_cpus; int active_gpus; -char * device_name[MAX_GPUS]; -int device_map[MAX_GPUS] = { 0, 1, 2, 3, 4, 5, 6, 7,8,9,10,11,12,13,14,15 }; +char * device_name[MAX_GPUS] = { nullptr }; +int device_map[MAX_GPUS] = { 0 }; long device_sm[MAX_GPUS] = { 0 }; -uint32_t gpus_intensity[MAX_GPUS] = { 0 }; +uint32_t gpus_intensity[MAX_GPUS] = {0}; +uint32_t 
device_gpu_clocks[MAX_GPUS] = {0}; +uint32_t device_mem_clocks[MAX_GPUS] = {0}; +uint32_t device_plimit[MAX_GPUS] = {0}; +int8_t device_pstate[MAX_GPUS]; char *rpc_user = NULL; -static char *rpc_url; -static char *rpc_userpass; -static char *rpc_pass; +static char *rpc_url = nullptr; +static char *rpc_userpass = nullptr; +static char *rpc_pass = nullptr; static char *short_url = NULL; -char *opt_cert; -char *opt_proxy; +char *opt_cert = nullptr; +char *opt_proxy = nullptr; long opt_proxy_type; -struct thr_info *thr_info; +struct thr_info *thr_info = nullptr; static int work_thr_id; -struct thr_api *thr_api; +struct thr_api *thr_api = nullptr; int longpoll_thr_id = -1; int stratum_thr_id = -1; int api_thr_id = -1; bool stratum_need_reset = false; struct work_restart *work_restart = NULL; struct stratum_ctx stratum = { 0 }; +bool stop_mining = false; +volatile bool mining_has_stopped[MAX_GPUS]; -pthread_mutex_t applog_lock; -static pthread_mutex_t stats_lock; +pthread_mutex_t applog_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t stats_lock = PTHREAD_MUTEX_INITIALIZER; uint32_t accepted_count = 0L; uint32_t rejected_count = 0L; -static double *thr_hashrates; +static double thr_hashrates[MAX_GPUS]; uint64_t global_hashrate = 0; double global_diff = 0.0; +uint64_t net_hashrate = 0; +uint64_t net_blocks = 0; + int opt_statsavg = 30; -// strdup on char* to allow a common free() if used -static char* opt_syslog_pfx = strdup(PROGRAM_NAME); -char *opt_api_allow = strdup("127.0.0.1"); /* 0.0.0.0 for all ips */ -int opt_api_listen = 4068; /* 0 to disable */ +uint16_t opt_api_listen = 4068; /* 0 to disable */ +bool opt_stratum_stats = true; +static char* opt_syslog_pfx = nullptr; +char *opt_api_allow = nullptr; #ifdef HAVE_GETOPT_LONG #include @@ -220,26 +211,28 @@ struct option { static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ - -a, --algo=ALGO specify the hash algorithm to use\n\ - anime Animecoin\n\ + -a, --algo=ALGO specify the 
hash algorithm to use\n\ + bitcoin Bitcoin\n\ blake Blake 256 (SFR/NEOS)\n\ blakecoin Fast Blake 256 (8 rounds)\n\ + c11 X11 variant\n\ deep Deepcoin\n\ dmd-gr Diamond-Groestl\n\ fresh Freshcoin (shavite 80)\n\ fugue256 Fuguecoin\n\ groestl Groestlcoin\n\ - heavy Heavycoin\n\ jackpot Jackpot\n\ keccak Keccak-256 (Maxcoin)\n\ luffa Doomcoin\n\ - lyra2 VertCoin\n\ - mjollnir Mjollnircoin\n\ + lyra2v2 VertCoin\n\ myr-gr Myriad-Groestl\n\ + neoscrypt neoscrypt (FeatherCoin)\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ quark Quark\n\ qubit Qubit\n\ + sia Siacoin (at pools compatible to siamining.com) \n\ + skein Skein SHA2 (Skeincoin)\n\ s3 S3 (1Coin)\n\ spread Spread\n\ x11 X11 (DarkCoin)\n\ @@ -247,16 +240,18 @@ Options:\n\ x14 X14\n\ x15 X15\n\ x17 X17 (peoplecurrency)\n\ + vanilla Blake 256 8 rounds\n\ + yescrypt yescrypt\n\ whirl Whirlcoin (old whirlpool)\n\ - -d, --devices Comma separated list of CUDA devices to use.\n\ + whirlpoolx Vanillacoin \n\ + -d, --devices Comma separated list of CUDA devices to use. \n\ Device IDs start counting from 0! 
Alternatively takes\n\ string names of your cards like gtx780ti or gt640#2\n\ (matching 2nd gt640 in the PC)\n\ -i --intensity=N GPU intensity 8-31 (default: auto) \n\ Decimals are allowed for fine tuning \n\ - -f, --diff Divide difficulty by this factor (std is 1) \n\ - -v, --vote=VOTE block reward vote (for HeavyCoin)\n\ - -m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\ + -f, --diff-factor Divide difficulty by this factor (default 1.0) \n\ + -m, --diff-multiplier Multiply difficulty by this value (default 1.0) \n\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ -u, --user=USERNAME username for mining server\n\ @@ -270,45 +265,48 @@ Options:\n\ -T, --timeout=N network timeout, in seconds (default: 270)\n\ -s, --scantime=N upper bound on time spent scanning current work when\n\ long polling is unavailable, in seconds (default: 5)\n\ + -n, --ndevs list cuda devices\n\ -N, --statsavg number of samples used to display hashrate (default: 30)\n\ --no-gbt disable getblocktemplate support (height check in solo)\n\ --no-longpoll disable X-Long-Polling support\n\ --no-stratum disable X-Stratum support\n\ + -e disable extranonce\n\ -q, --quiet disable per-thread hashmeter output\n\ --no-color disable colored output\n\ -D, --debug enable debug output\n\ -P, --protocol-dump verbose dump of protocol-level activities\n\ --cpu-affinity set process affinity to cpu core(s), mask 0x3 for cores 0 and 1\n\ --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest)\n\ - -b, --api-bind IP/Port for the miner API (default: 127.0.0.1:4068)\n" - -#ifdef HAVE_SYSLOG_H -"\ + -b, --api-bind IP/Port for the miner API (default: 127.0.0.1:4068)\n\ -S, --syslog use system log for output messages\n\ - --syslog-prefix=... allow to change syslog tool name\n" -#endif -#ifndef WIN32 -"\ - -B, --background run the miner in the background\n" -#endif -"\ + --syslog-prefix=... 
allow to change syslog tool name\n\ + -B, --background run the miner in the background\n\ --benchmark run in offline benchmark mode\n\ --cputest debug hashes from cpu algorithms\n\ + --no-cpu-verify don't verify the found results\n\ -c, --config=FILE load a JSON-format configuration file\n\ -V, --version display version information and exit\n\ - -h, --help display this help text and exit\n\ -"; + -h, --help display this help text and exit\n" +#if defined(USE_WRAPNVML) && (defined(__linux) || defined(_WIN64)) /* via nvml */ +"\ + --mem-clock=N Set the gpu memory max clock (346.72+ driver)\n\ + --gpu-clock=N Set the gpu engine max clock (346.72+ driver)\n\ + --pstate=N Set the gpu power state (352.21+ driver)\n\ + --plimit=N Set the gpu power limit(352.21 + driver)\n" +#endif +""; static char const short_options[] = #ifndef WIN32 - "B" +"B" #endif #ifdef HAVE_SYSLOG_H - "S" +"S" #endif - "a:c:i:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:f:mv:N:b:"; +"a:c:i:Dhp:Px:nqr:R:s:t:T:o:u:O:Vd:f:m:N:b:e"; -static struct option const options[] = { +static struct option const options[] = +{ { "algo", 1, NULL, 'a' }, { "api-bind", 1, NULL, 'b' }, #ifndef WIN32 @@ -316,6 +314,7 @@ static struct option const options[] = { #endif { "benchmark", 0, NULL, 1005 }, { "cert", 1, NULL, 1001 }, + { "no-cpu-verify", 0, NULL, 1022 }, { "config", 1, NULL, 'c' }, { "cputest", 0, NULL, 1006 }, { "cpu-affinity", 1, NULL, 1020 }, @@ -323,6 +322,7 @@ static struct option const options[] = { { "debug", 0, NULL, 'D' }, { "help", 0, NULL, 'h' }, { "intensity", 1, NULL, 'i' }, + { "ndevs", 0, NULL, 'n' }, { "no-color", 0, NULL, 1002 }, { "no-gbt", 0, NULL, 1011 }, { "no-longpoll", 0, NULL, 1003 }, @@ -340,65 +340,84 @@ static struct option const options[] = { { "syslog-prefix", 1, NULL, 1008 }, #endif { "threads", 1, NULL, 't' }, - { "vote", 1, NULL, 'v' }, - { "trust-pool", 0, NULL, 'm' }, + { "Disable extranounce support", 1, NULL, 'e' }, { "timeout", 1, NULL, 'T' }, { "url", 1, NULL, 'o' }, { "user", 1, NULL, 'u' 
}, { "userpass", 1, NULL, 'O' }, { "version", 0, NULL, 'V' }, { "devices", 1, NULL, 'd' }, - { "diff", 1, NULL, 'f' }, - { 0, 0, 0, 0 } + { "diff-multiplier", 1, NULL, 'm' }, + { "diff-factor", 1, NULL, 'f' }, + { "diff", 1, NULL, 'f' }, // compat + {"gpu-clock", 1, NULL, 1070}, + {"mem-clock", 1, NULL, 1071}, + {"pstate", 1, NULL, 1072}, + {"plimit", 1, NULL, 1073}, + {0, 0, 0, 0} }; -static struct work _ALIGN(64) g_work; -static time_t g_work_time; -static pthread_mutex_t g_work_lock; +struct work _ALIGN(64) g_work; +time_t g_work_time; +static pthread_mutex_t g_work_lock = PTHREAD_MUTEX_INITIALIZER; #ifdef __linux /* Linux specific policy and affinity management */ #include -static inline void drop_policy(void) { +static inline void drop_policy(void) +{ struct sched_param param; param.sched_priority = 0; #ifdef SCHED_IDLE - if (unlikely(sched_setscheduler(0, SCHED_IDLE, ¶m) == -1)) + if(unlikely(sched_setscheduler(0, SCHED_IDLE, ¶m) == -1)) #endif #ifdef SCHED_BATCH sched_setscheduler(0, SCHED_BATCH, ¶m); #endif } -static void affine_to_cpu_mask(int id, uint8_t mask) { +static void affine_to_cpu_mask(int id, uint8_t mask) +{ cpu_set_t set; CPU_ZERO(&set); - for (uint8_t i = 0; i < num_cpus; i++) { + for(uint8_t i = 0; i < num_cpus; i++) + { // cpu mask - if (mask & (1< -static inline void drop_policy(void) { } -static void affine_to_cpu_mask(int id, uint8_t mask) { +static inline void drop_policy(void) +{} +static void affine_to_cpu_mask(int id, uint8_t mask) +{ cpuset_t set; CPU_ZERO(&set); - for (uint8_t i = 0; i < num_cpus; i++) { - if (mask & (1<data), target_size = sizeof(work->target); - int adata_sz = ARRAY_SIZE(work->data), atarget_sz = ARRAY_SIZE(work->target); + int target_size; + int midstate_size = sizeof(work->midstate); + int atarget_sz = ARRAY_SIZE(work->target); int i; - if (unlikely(!jobj_binary(val, "data", work->data, data_size))) { - applog(LOG_ERR, "JSON inval data"); + size_t data_size = jobj_binary(val, "data", work->data, 
sizeof(work->data)); + + if(opt_algo != ALGO_NEO && data_size != 128) + { + applog(LOG_ERR, "JSON invalid data"); return false; } - if (unlikely(!jobj_binary(val, "target", work->target, target_size))) { - applog(LOG_ERR, "JSON inval target"); + work->datasize = data_size; + int adata_sz = (int)data_size / 4; + + target_size = (int)jobj_binary(val, "target", work->target, sizeof(work->target)); + if(target_size != sizeof(work->target)) + { + applog(LOG_ERR, "JSON invalid target", target_size); return false; } - if (opt_algo == ALGO_HEAVY) { - if (unlikely(!jobj_binary(val, "maxvote", &work->maxvote, sizeof(work->maxvote)))) { - work->maxvote = 2048; - } - } else work->maxvote = 0; - - for (i = 0; i < adata_sz; i++) + for(i = 0; i < adata_sz; i++) work->data[i] = le32dec(work->data + i); - for (i = 0; i < atarget_sz; i++) + for(i = 0; i < atarget_sz; i++) work->target[i] = le32dec(work->target + i); json_t *jr = json_object_get(val, "noncerange"); - if (jr) { + if(jr) + { const char * hexstr = json_string_value(jr); - if (likely(hexstr)) { + if(likely(hexstr)) + { // never seen yet... 
hex2bin((uchar*)work->noncerange.u64, hexstr, 8); applog(LOG_DEBUG, "received noncerange: %08x-%08x", - work->noncerange.u32[0], work->noncerange.u32[1]); + work->noncerange.u32[0], work->noncerange.u32[1]); } } @@ -513,9 +541,9 @@ static bool work_decode(const json_t *val, struct work *work) } /** - * Calculate the work difficulty as double - * Not sure it works with pools - */ +* Calculate the work difficulty as double +* Not sure it works with pools +*/ static void calc_diff(struct work *work, int known) { // sample for diff 32.53 : 00000007de5f0000 @@ -526,52 +554,52 @@ static void calc_diff(struct work *work, int known) swab256(rtarget, work->target); data64 = (uint64_t *)(rtarget + 3); /* todo: index (3) can be tuned here */ - if (opt_algo == ALGO_HEAVY) { - data64 = (uint64_t *)(rtarget + 2); - } - d64 = swab64(*data64); - if (unlikely(!d64)) + if(unlikely(!d64)) d64 = 1; work->difficulty = (double)diffone / d64; - if (opt_difficulty > 0.) { + if(opt_difficulty > 0.) + { work->difficulty /= opt_difficulty; } } static int share_result(int result, const char *reason) { - char s[345]; + char s[32] = { 0 }; double hashrate = 0.; pthread_mutex_lock(&stats_lock); - for (int i = 0; i < opt_n_threads; i++) { + for(int i = 0; i < opt_n_threads; i++) + { hashrate += stats_get_speed(i, thr_hashrates[i]); } - result ? accepted_count++ : rejected_count++; pthread_mutex_unlock(&stats_lock); global_hashrate = llround(hashrate); - sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate); - applog(LOG_NOTICE, "accepted: %lu/%lu (%.2f%%), %s khash/s %s", - accepted_count, - accepted_count + rejected_count, - 100. * accepted_count / (accepted_count + rejected_count), - s, - use_colors ? - (result ? CL_GRN "yay!!!" : CL_RED "booooo") - : (result ? "(yay!!!)" : "(booooo)")); - - if (reason) { + format_hashrate(hashrate, s); + applog(LOG_NOTICE, "accepted: %lu/%lu (%.2f%%), %s %s", + accepted_count, + accepted_count + rejected_count, + 100. 
* accepted_count / (accepted_count + rejected_count), + s, + use_colors ? + (result ? CL_GRN "yay!!!" : CL_RED "booooo") + : (result ? "(yay!!!)" : "(booooo)")); + + if(reason) + { applog(LOG_WARNING, "reject reason: %s", reason); - return 0; - if (strncmp(reason, "Duplicate share", 15) == 0 && !check_dups) { + if(strncmp(reason, "Duplicate share", 15) == 0 && !check_dups) + { applog(LOG_WARNING, "enabling duplicates check feature"); check_dups = true; } + return 0; + } return 1; } @@ -582,113 +610,133 @@ static bool submit_upstream_work(CURL *curl, struct work *work) bool stale_work = false; char s[384]; + /* discard if a newer block was received */ + /* + stale_work = work->height && work->height < g_work.height; + if (have_stratum && !stale_work) { + pthread_mutex_lock(&g_work_lock); + if (strlen(work->job_id + 8)) + stale_work = strcmp(work->job_id + 8, g_work.job_id + 8); + pthread_mutex_unlock(&g_work_lock); + } + */ + if(!have_stratum && !stale_work && allow_gbt) + { + struct work wheight = { 0 }; + if(get_blocktemplate(curl, &wheight)) + { + if(work->height && work->height < wheight.height) + { + if(opt_debug) + applog(LOG_WARNING, "block %u was already solved", work->height, wheight.height); + return true; + } + } + } + + if(stale_work) + { + if(opt_debug) + applog(LOG_WARNING, "stale work detected, discarding"); + return true; + } calc_diff(work, 0); - if (have_stratum) { + if(have_stratum) + { uint32_t sent = 0; uint32_t ntime, nonce; - uint16_t nvote; - char *ntimestr, *noncestr, *xnonce2str, *nvotestr; + char *ntimestr, *noncestr, *xnonce2str; - le32enc(&ntime, work->data[17]); - le32enc(&nonce, work->data[19]); + if(opt_algo != ALGO_SIA) + { + le32enc(&ntime, work->data[17]); + le32enc(&nonce, work->data[19]); + noncestr = bin2hex((const uchar*)(&nonce), 4); + ntimestr = bin2hex((const uchar*)(&ntime), 4); + } + else + { + le32enc(&ntime, work->data[10]); + uint64_t ntime64 = ntime; + le32enc(&nonce, work->data[8]); + uint64_t nonce64 = nonce; + 
le32enc(&nonce, work->data[9]); + nonce64 += (uint64_t)nonce << 32; + noncestr = bin2hex((const uchar*)(&nonce64), 8); + ntimestr = bin2hex((const uchar*)(&ntime64), 8); + } - noncestr = bin2hex((const uchar*)(&nonce), 4); - if (check_dups) + if(check_dups) sent = hashlog_already_submittted(work->job_id, nonce); - if (sent > 0) { - sent = (uint32_t) time(NULL) - sent; - if (!opt_quiet) { + if(sent > 0) + { + sent = (uint32_t)time(NULL) - sent; + if(!opt_quiet) + { applog(LOG_WARNING, "nonce %s was already sent %u seconds ago", noncestr, sent); hashlog_dump_job(work->job_id); } free(noncestr); // prevent useless computing on some pools - stratum_need_reset = true; - for (int i = 0; i < opt_n_threads; i++) - work_restart[i].restart = 1; + g_work_time = 0; + restart_threads(); return true; } - ntimestr = bin2hex((const uchar*)(&ntime), 4); xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len); - if (opt_algo == ALGO_HEAVY) { - be16enc(&nvote, *((uint16_t*)&work->data[20])); - nvotestr = bin2hex((const uchar*)(&nvote), 2); - sprintf(s, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id + 8, xnonce2str, ntimestr, noncestr, nvotestr); - free(nvotestr); - } else { - sprintf(s, + sprintf(s, "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", rpc_user, work->job_id + 8, xnonce2str, ntimestr, noncestr); - } free(xnonce2str); free(ntimestr); free(noncestr); gettimeofday(&stratum.tv_submit, NULL); - -/* pthread_mutex_lock(&g_work_lock); - stale_work = work->height != g_work.height; - pthread_mutex_unlock(&g_work_lock); - if (stale_work) + if(unlikely(!stratum_send_line(&stratum, s))) { - applog(LOG_WARNING, "stale work detected, discarding"); - return true; - } - */ - if (unlikely(!stratum_send_line(&stratum, s))) { applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); return false; } - if (check_dups) + if(check_dups) 
hashlog_remember_submit(work, nonce); - } else + } + else { - /* - stale_work = work->height != g_work.height; - if (stale_work) - { - applog(LOG_WARNING, "stale work detected, discarding"); - return true; - } - */ /* build hex string */ char *str = NULL; - - if (opt_algo != ALGO_HEAVY && opt_algo != ALGO_MJOLLNIR) { - for (int i = 0; i < ARRAY_SIZE(work->data); i++) - le32enc(work->data + i, work->data[i]); - } - str = bin2hex((uchar*)work->data, sizeof(work->data)); - if (unlikely(!str)) { + for(int i = 0; i < (work->datasize >> 2); i++) + le32enc(work->data + i, work->data[i]); + str = bin2hex((uchar*)work->data, work->datasize); + if(unlikely(!str)) + { applog(LOG_ERR, "submit_upstream_work OOM"); return false; } /* build JSON-RPC request */ sprintf(s, - "{\"method\": \"getwork\", \"params\": [\"%s\"], \"id\":4}\r\n", - str); + "{\"method\": \"getwork\", \"params\": [\"%s\"], \"id\":4}\r\n", + str); /* issue JSON-RPC request */ val = json_rpc_call(curl, rpc_url, rpc_userpass, s, false, false, NULL); - if (unlikely(!val)) { + if(unlikely(!val)) + { applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); return false; } res = json_object_get(val, "result"); reason = json_object_get(val, "reject-reason"); - if (!share_result(json_is_true(res), reason ? json_string_value(reason) : NULL)) { - if (check_dups) + if(!share_result(json_is_true(res), reason ? 
json_string_value(reason) : NULL)) + { + if(check_dups) hashlog_purge_job(work->job_id); } @@ -704,21 +752,49 @@ static bool submit_upstream_work(CURL *curl, struct work *work) static bool gbt_work_decode(const json_t *val, struct work *work) { json_t *err = json_object_get(val, "error"); - if (err && !json_is_null(err)) { + if(err && !json_is_null(err)) + { allow_gbt = false; - applog(LOG_INFO, "GBT not supported, bloc height unavailable"); + applog(LOG_INFO, "GBT not supported, block height unavailable"); return false; } - if (!work->height) { + if(!work->height) + { // complete missing data from getwork json_t *key = json_object_get(val, "height"); - if (key && json_is_integer(key)) { - work->height = (uint32_t) json_integer_value(key); - if (!opt_quiet && work->height > g_work.height) { - applog(LOG_BLUE, "%s %s block %d", short_url, - algo_names[opt_algo], work->height); + if(key && json_is_integer(key)) + { + work->height = (uint32_t)json_integer_value(key); + if(!opt_quiet && work->height > g_work.height) + { + if(!have_stratum && allow_mininginfo && global_diff > 0) + { + char netinfo[64] = { 0 }; + char srate[32] = { 0 }; + sprintf(netinfo, "diff %.2f", global_diff); + if(net_hashrate) + { + format_hashrate((double)net_hashrate, srate); + strcat(netinfo, ", net "); + strcat(netinfo, srate); + } + applog(LOG_BLUE, "%s block %d, %s", + algo_names[opt_algo], work->height, netinfo); + } + else + { + applog(LOG_BLUE, "%s %s block %d", short_url, + algo_names[opt_algo], work->height); + } g_work.height = work->height; + if(!have_stratum) + { + double x = expectedblocktime(work->target); + if(x != 0.0) + applog(LOG_BLUE, "50%% chance to find a block in about %.2f days", x); + } + } } } @@ -728,20 +804,29 @@ static bool gbt_work_decode(const json_t *val, struct work *work) #define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]" static const char *gbt_req = - "{\"method\": \"getblocktemplate\", \"params\": [" - // 
"{\"capabilities\": " GBT_CAPABILITIES "}" - "], \"id\":0}\r\n"; +"{\"method\": \"getblocktemplate\", \"params\": [" +// "{\"capabilities\": " GBT_CAPABILITIES "}" +"], \"id\":9}\r\n"; static bool get_blocktemplate(CURL *curl, struct work *work) { - if (!allow_gbt) + if(!allow_gbt) return false; + int curl_err = 0; json_t *val = json_rpc_call(curl, rpc_url, rpc_userpass, gbt_req, - want_longpoll, false, NULL); + want_longpoll, have_longpoll, &curl_err); - if (!val) + if(!val && curl_err == -1) + { + // when getblocktemplate is not supported, disable it + allow_gbt = false; + if(!opt_quiet) + { + applog(LOG_BLUE, "gbt not supported, block height notices disabled"); + } return false; + } bool rc = gbt_work_decode(json_object_get(val, "result"), work); @@ -750,8 +835,84 @@ static bool get_blocktemplate(CURL *curl, struct work *work) return rc; } +// good alternative for wallet mining, difficulty and net hashrate +static const char *info_req = +"{\"method\": \"getmininginfo\", \"params\": [], \"id\":8}\r\n"; + +static bool get_mininginfo(CURL *curl, struct work *work) +{ + if(have_stratum || !allow_mininginfo) + return false; + + int curl_err = 0; + json_t *val = json_rpc_call(curl, rpc_url, rpc_userpass, info_req, + want_longpoll, have_longpoll, &curl_err); + + if(!val && curl_err == -1) + { + allow_mininginfo = false; + if(opt_debug) + { + applog(LOG_DEBUG, "getmininginfo not supported"); + } + return false; + } + else + { + json_t *res = json_object_get(val, "result"); + // "blocks": 491493 (= current work height - 1) + // "difficulty": 0.99607860999999998 + // "networkhashps": 56475980 + if(res) + { + json_t *key = json_object_get(res, "powdifficulty"); + if(key && json_is_real(key)) + { + global_diff = json_real_value(key); + } + key = json_object_get(res, "difficulty"); + if(key && json_is_real(key)) + { + global_diff = json_real_value(key); + } + key = json_object_get(res, "networkhashps"); + if(key && json_is_integer(key)) + { + net_hashrate = 
json_integer_value(key); + } + key = json_object_get(res, "blocks"); + if(key && json_is_integer(key)) + { + net_blocks = json_integer_value(key); + } + } + } + json_decref(val); + return true; +} + +// time (in days) for a 50% chance to find a block +double expectedblocktime(const uint32_t *target) +{ + double x = 0.0; + if(global_hashrate == 0) + return 0; + else + { + for(int i = 0; i < 8; i++) + { + x *= 4294967296.0; + x += target[7 - i]; + } + if(x != 0.0) + return 115792089237316195423570985008687907853269984665640564039457584007913129639935.0 / x / (double)global_hashrate / 86400.0; + else + return 0.0; + } +} + static const char *rpc_req = - "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; +"{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; static bool get_upstream_work(CURL *curl, struct work *work) { @@ -761,29 +922,32 @@ static bool get_upstream_work(CURL *curl, struct work *work) gettimeofday(&tv_start, NULL); val = json_rpc_call(curl, rpc_url, rpc_userpass, rpc_req, - want_longpoll, false, NULL); + want_longpoll, false, NULL); gettimeofday(&tv_end, NULL); - if (have_stratum) { - if (val) + if(have_stratum) + { + if(val) json_decref(val); return true; } - if (!val) + if(!val) return false; rc = work_decode(json_object_get(val, "result"), work); - if (opt_protocol && rc) { + if(opt_protocol && rc) + { timeval_subtract(&diff, &tv_end, &tv_start); /* show time because curl can be slower against versions/config */ applog(LOG_DEBUG, "got new work in %.2f ms", - (1000.0 * diff.tv_sec) + (0.001 * diff.tv_usec)); + (1000.0 * diff.tv_sec) + (0.001 * diff.tv_usec)); } json_decref(val); + get_mininginfo(curl, work); get_blocktemplate(curl, work); return rc; @@ -791,10 +955,11 @@ static bool get_upstream_work(CURL *curl, struct work *work) static void workio_cmd_free(struct workio_cmd *wc) { - if (!wc) + if(!wc) return; - switch (wc->cmd) { + switch(wc->cmd) + { case WC_SUBMIT_WORK: aligned_free(wc->u.work); break; @@ -812,12 +977,14 @@ static 
bool workio_get_work(struct workio_cmd *wc, CURL *curl) int failures = 0; ret_work = (struct work*)aligned_calloc(sizeof(*ret_work)); - if (!ret_work) + if(!ret_work) return false; /* obtain new work from bitcoin via JSON-RPC */ - while (!get_upstream_work(curl, ret_work)) { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + while(!get_upstream_work(curl, ret_work)) + { + if(unlikely((opt_retries >= 0) && (++failures > opt_retries))) + { applog(LOG_ERR, "json_rpc_call failed, terminating workio thread"); aligned_free(ret_work); return false; @@ -825,12 +992,12 @@ static bool workio_get_work(struct workio_cmd *wc, CURL *curl) /* pause, then restart work-request loop */ applog(LOG_ERR, "json_rpc_call failed, retry after %d seconds", - opt_fail_pause); + opt_fail_pause); sleep(opt_fail_pause); } /* send work to requesting thread */ - if (!tq_push(wc->thr->q, ret_work)) + if(!tq_push(wc->thr->q, ret_work)) aligned_free(ret_work); return true; @@ -841,14 +1008,16 @@ static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) int failures = 0; /* submit solution to bitcoin via JSON-RPC */ - while (!submit_upstream_work(curl, wc->u.work)) { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + while(!submit_upstream_work(curl, wc->u.work)) + { + if(unlikely((opt_retries >= 0) && (++failures > opt_retries))) + { applog(LOG_ERR, "...terminating workio thread"); return false; } /* pause, then restart work-request loop */ - if (!opt_benchmark) + if(!opt_benchmark) applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); sleep(opt_fail_pause); @@ -864,23 +1033,27 @@ static void *workio_thread(void *userdata) bool ok = true; curl = curl_easy_init(); - if (unlikely(!curl)) { + if(unlikely(!curl)) + { applog(LOG_ERR, "CURL initialization failed"); return NULL; } - while (ok) { + while(ok) + { struct workio_cmd *wc; /* wait for workio_cmd sent to us, on our queue */ wc = (struct workio_cmd *)tq_pop(mythr->q, NULL); - if (!wc) { + 
if(!wc) + { ok = false; break; } /* process workio_cmd */ - switch (wc->cmd) { + switch(wc->cmd) + { case WC_GET_WORK: ok = workio_get_work(wc, curl); break; @@ -907,33 +1080,54 @@ static bool get_work(struct thr_info *thr, struct work *work) struct workio_cmd *wc; struct work *work_heap; - if (opt_benchmark) { - memset(work->data, 0x55, 76); - //work->data[17] = swab32((uint32_t)time(NULL)); - memset(work->data + 19, 0x00, 52); - work->data[20] = 0x80000000; - work->data[31] = 0x00000280; - memset(work->target, 0x00, sizeof(work->target)); + if(opt_benchmark) + { + if(opt_algo != ALGO_SIA) + { + memset(work->data, 0x55, 76); + memset(work->data + 19, 0x00, 52); + work->data[1] = (uint32_t)((double)rand() / (1ULL + RAND_MAX) * 0xffffffffu); + work->data[20] = 0x80000000; + work->data[31] = 0x00000280; + memset(work->target, 0x00, sizeof(work->target)); + work->datasize = 128; + } + else + { + memset(work->data, 0, 4); + work->data[1] = (uint32_t)((double)rand() / (1ULL + RAND_MAX) * 0xffffffffu); + memset(work->data+2, 0x55, 24); + memset(work->data + 8, 0, 8); + memset(work->data + 10, 0, 4); + memset(work->data + 11, 0x55, 4); + memset(work->data + 12, 0x55, 32); + memset(work->target, 0x00, sizeof(work->target)); + work->datasize = 128; + } return true; } /* fill out work request message */ wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); - if (!wc) - return false; + if(wc == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } wc->cmd = WC_GET_WORK; wc->thr = thr; /* send work request to workio thread */ - if (!tq_push(thr_info[work_thr_id].q, wc)) { + if(!tq_push(thr_info[work_thr_id].q, wc)) + { workio_cmd_free(wc); return false; } /* wait for response, a unit of work */ work_heap = (struct work *)tq_pop(thr->q, NULL); - if (!work_heap) + if(!work_heap) return false; /* copy returned work into storage provided by caller */ @@ -948,19 +1142,25 @@ static bool submit_work(struct thr_info *thr, const struct work *work_in) struct workio_cmd *wc; 
/* fill out work request message */ wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); - if (!wc) - return false; + if(wc == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } wc->u.work = (struct work *)aligned_calloc(sizeof(*work_in)); - if (!wc->u.work) - goto err_out; + if(wc->u.work == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } wc->cmd = WC_SUBMIT_WORK; wc->thr = thr; memcpy(wc->u.work, work_in, sizeof(*work_in)); /* send solution to workio thread */ - if (!tq_push(thr_info[work_thr_id].q, wc)) + if(!tq_push(thr_info[work_thr_id].q, wc)) goto err_out; return true; @@ -970,33 +1170,32 @@ static bool submit_work(struct thr_info *thr, const struct work *work_in) return false; } -static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) +static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) { - uchar merkle_root[64]; + extern void siahash(const void *data, unsigned int len, void *hash); + uchar merkle_root[1024]; int i; - if (!sctx->job.job_id) { + if(!sctx->job.job_id) + { // applog(LOG_WARNING, "stratum_gen_work: job not yet retrieved"); - return; + return false; } pthread_mutex_lock(&sctx->work_lock); // store the job ntime as high part of jobid snprintf(work->job_id, sizeof(work->job_id), "%07x %s", - be32dec(sctx->job.ntime) & 0xfffffff, sctx->job.job_id); + be32dec(sctx->job.ntime) & 0xfffffff, sctx->job.job_id); work->xnonce2_len = sctx->xnonce2_size; memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); - // also store the bloc number + // also store the block number work->height = sctx->job.height; /* Generate merkle root */ - switch (opt_algo) { - case ALGO_HEAVY: - case ALGO_MJOLLNIR: - heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); - break; + switch(opt_algo) + { case ALGO_FUGUE256: case ALGO_GROESTL: case ALGO_KECCAK: @@ -1004,87 +1203,112 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_WHC: 
SHA256((uchar*)sctx->job.coinbase, sctx->job.coinbase_size, (uchar*)merkle_root); break; + case ALGO_SIA: + { + merkle_root[0] = (uchar)0; + memcpy(merkle_root + 1, sctx->job.coinbase, sctx->job.coinbase_size); + siahash(merkle_root, (unsigned int)sctx->job.coinbase_size + 1, merkle_root + 33); + break; + } default: sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); } + if(opt_algo == ALGO_SIA) + merkle_root[0] = (uchar)1; - for (i = 0; i < sctx->job.merkle_count; i++) { - memcpy(merkle_root + 32, sctx->job.merkle[i], 32); - if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR) - heavycoin_hash(merkle_root, merkle_root, 64); + for(i = 0; i < sctx->job.merkle_count; i++) + { + if(opt_algo == ALGO_SIA) + { + memcpy(merkle_root + 1, sctx->job.merkle[i], 32); + siahash(merkle_root, 65, merkle_root + 33); + } else + { + memcpy(merkle_root + 32, sctx->job.merkle[i], 32); sha256d(merkle_root, merkle_root, 64); + } } - + /* Increment extranonce2 */ - for (i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); + if(opt_extranonce) + { + for(i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); + { + sctx->job.xnonce2[i]++; + } + } + static uint32_t highnonce = 0; + if(opt_algo == ALGO_SIA) + highnonce++; /* Assemble block header */ memset(work->data, 0, sizeof(work->data)); - work->data[0] = le32dec(sctx->job.version); - for (i = 0; i < 8; i++) - work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i); - for (i = 0; i < 8; i++) - work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); - work->data[17] = le32dec(sctx->job.ntime); - work->data[18] = le32dec(sctx->job.nbits); - if (opt_algo == ALGO_MJOLLNIR || opt_algo == ALGO_HEAVY) + if(opt_algo != ALGO_SIA) { - for (i = 0; i < 20; i++) - work->data[i] = be32dec((uint32_t *)&work->data[i]); + work->data[0] = le32dec(sctx->job.version); + for(i = 0; i < 8; i++) + work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i); + for(i = 0; i < 8; i++) + 
work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); + work->data[17] = le32dec(sctx->job.ntime); + work->data[18] = le32dec(sctx->job.nbits); + work->data[20] = 0x80000000; + work->data[31] = 0x00000280; } - - work->data[20] = 0x80000000; - work->data[31] = (opt_algo == ALGO_MJOLLNIR) ? 0x000002A0 : 0x00000280; - - // HeavyCoin (vote / reward) - if (opt_algo == ALGO_HEAVY) { - work->maxvote = 2048; - uint16_t *ext = (uint16_t*)(&work->data[20]); - ext[0] = opt_vote; - ext[1] = be16dec(sctx->job.nreward); - // applog(LOG_DEBUG, "DEBUG: vote=%hx reward=%hx", ext[0], ext[1]); + else + { + for(i = 0; i < 8; i++) + work->data[i] = le32dec((uint32_t *)sctx->job.prevhash + i); + work->data[8] = 0; // nonce + work->data[9] = highnonce; + work->data[10] = le32dec(sctx->job.ntime); + work->data[11] = 0; + for(i = 0; i < 8; i++) + work->data[12 + i] = le32dec((uint32_t *)(merkle_root + 33) + i); } pthread_mutex_unlock(&sctx->work_lock); - - if (opt_debug) { - char *tm = atime2str(swab32(work->data[17]) - sctx->srvtime_diff); + if(opt_debug) + { + char *tm; + if(opt_algo != ALGO_SIA) + tm = atime2str(swab32(work->data[17]) - sctx->srvtime_diff); + else + tm = atime2str(work->data[10] - sctx->srvtime_diff); char *xnonce2str = bin2hex(work->xnonce2, sctx->xnonce2_size); applog(LOG_DEBUG, "DEBUG: job_id=%s xnonce2=%s time=%s", - work->job_id, xnonce2str, tm); + work->job_id, xnonce2str, tm); free(tm); free(xnonce2str); } - switch (opt_algo) { + switch(opt_algo) + { case ALGO_JACKPOT: + case ALGO_NEO: diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); break; case ALGO_DMD_GR: case ALGO_FRESH: case ALGO_FUGUE256: case ALGO_GROESTL: - diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty)); - break; case ALGO_KECCAK: + case ALGO_LYRA2v2: diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty)); break; - case ALGO_LYRA2: - case ALGO_QUBIT: - diff_to_target(work->target, sctx->job.diff / (128.0 * opt_difficulty)); - break; 
default: diff_to_target(work->target, sctx->job.diff / opt_difficulty); } + return true; } -static void restart_threads(void) +void restart_threads(void) { - if (opt_debug && !opt_quiet) - applog(LOG_DEBUG,"%s", __FUNCTION__); + if(opt_debug && !opt_quiet) + applog(LOG_DEBUG, "%s", __FUNCTION__); - for (int i = 0; i < opt_n_threads; i++) + for(int i = 0; i < opt_n_threads; i++) work_restart[i].restart = 1; } @@ -1095,110 +1319,117 @@ static void *miner_thread(void *userdata) struct work work; uint64_t loopcnt = 0; uint32_t max_nonce; - uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1); - bool work_done = false; + uint32_t end_nonce = UINT32_MAX / opt_n_threads * (thr_id + 1) - (thr_id + 1); bool extrajob = false; char s[16]; int rc = 0; memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized - /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE - * and if that fails, then SCHED_BATCH. No need for this to be an - * error if it fails */ - if (!opt_benchmark && opt_priority == 0) { - setpriority(PRIO_PROCESS, 0, 18); - drop_policy(); - } else { - int prio = 0; + if(opt_priority > 0) + { + int prio = 2; // default to normal #ifndef WIN32 - prio = 18; + prio = 0; // note: different behavior on linux (-19 to 19) - switch (opt_priority) { - case 1: - prio = 5; - break; - case 2: - prio = 0; - break; - case 3: - prio = -5; - break; - case 4: - prio = -10; - break; - case 5: - prio = -15; + switch(opt_priority) + { + case 0: + prio = 15; + break; + case 1: + prio = 5; + break; + case 2: + prio = 0; // normal process + break; + case 3: + prio = -1; // above + break; + case 4: + prio = -10; + break; + case 5: + prio = -15; } - applog(LOG_DEBUG, "Thread %d priority %d (set to %d)", thr_id, - opt_priority, prio); + if(opt_debug) + applog(LOG_DEBUG, "Thread %d priority %d (nice %d)", + thr_id, opt_priority, prio); #endif - int ret = setpriority(PRIO_PROCESS, 0, prio); - if (opt_priority == 0) { - 
drop_policy(); - } + setpriority(PRIO_PROCESS, 0, prio); + drop_policy(); } + /* Cpu thread affinity */ - if (num_cpus > 1) { - if (opt_affinity == -1 && opt_n_threads > 1) { - if (!opt_quiet) + if(num_cpus > 1) + { + if(opt_affinity == -1) + { + if(opt_debug) applog(LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)", thr_id, - thr_id % num_cpus, (1 << (thr_id % num_cpus))); - affine_to_cpu_mask(thr_id, 1 << (thr_id % num_cpus)); - } else if (opt_affinity != -1) { - if (!opt_quiet) - applog(LOG_DEBUG, "Binding thread %d to cpu mask %x", thr_id, - opt_affinity); + thr_id%num_cpus, (1 << (thr_id))); + affine_to_cpu_mask(thr_id, 1 << (thr_id)); + } + else if(opt_affinity != -1) + { + if(opt_debug) + applog(LOG_DEBUG, "Binding thread %d to gpu mask %x", thr_id, + opt_affinity); affine_to_cpu_mask(thr_id, opt_affinity); } } - while (1) + while(1) { - if (opt_benchmark) + // &work.data[19] + int wcmplen; + switch(opt_algo) { - work.data[19] = work.data[19] & 0xfffffffU; //reset Hashcounters - work.data[21] = work.data[21] & 0xfffffffU; + case ALGO_SIA: + wcmplen = 80; + break; + default: + wcmplen = 76; } + uint32_t *nonceptr; + if(opt_algo!=ALGO_SIA) + nonceptr = (uint32_t*)(((char*)work.data) + wcmplen); + else + nonceptr = (uint32_t*)(((char*)work.data) + 8*4); + struct timeval tv_start, tv_end, diff; - unsigned long hashes_done=0; + uint32_t hashes_done = 0; uint32_t start_nonce; uint32_t scan_time = have_longpoll ? 
LP_SCANTIME : opt_scantime; - uint64_t max64, minmax = 0x100000; + uint64_t max64, minmax; - // &work.data[19] - int wcmplen = 76; - uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); - - if (have_stratum) { - uint32_t sleeptime = 0; - while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) { - usleep(100*1000); - if (sleeptime > 4) { - extrajob = true; - break; - } - sleeptime++; - } - if (sleeptime && opt_debug && !opt_quiet) - applog(LOG_DEBUG, "sleeptime: %u ms", sleeptime*100); - nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); + if(have_stratum) + { + if(loopcnt == 0 || time(NULL) >= (g_work_time + opt_scantime)) + extrajob = true; pthread_mutex_lock(&g_work_lock); - extrajob |= work_done; - if (nonceptr[0] >= end_nonce || extrajob) { - work_done = false; + if(nonceptr[0] >= end_nonce - 0x00010000 || extrajob) + { extrajob = false; - stratum_gen_work(&stratum, &g_work); + while(!stratum_gen_work(&stratum, &g_work)) + { + applog(LOG_WARNING, "GPU #%d: waiting for data", device_map[thr_id]); + sleep(3); + } } - } else { + } + else + { pthread_mutex_lock(&g_work_lock); - if ((time(NULL) - g_work_time) >= scan_time || nonceptr[0] >= (end_nonce - 0x100)) { - if (opt_debug && g_work_time && !opt_quiet) + if((time(NULL) - g_work_time) >= scan_time || nonceptr[0] >= (end_nonce - 0x10000)) + { + if(opt_debug && g_work_time && !opt_quiet) applog(LOG_DEBUG, "work time %u/%us nonce %x/%x", time(NULL) - g_work_time, - scan_time, nonceptr[0], end_nonce); + scan_time, nonceptr[0], end_nonce); /* obtain new work from internal workio thread */ - if (unlikely(!get_work(mythr, &g_work))) { + if(unlikely(!get_work(mythr, &g_work))) + { pthread_mutex_unlock(&g_work_lock); applog(LOG_ERR, "work retrieval failed, exiting mining thread %d", mythr->id); goto out; @@ -1206,242 +1437,301 @@ static void *miner_thread(void *userdata) g_work_time = time(NULL); } } - - if (!opt_benchmark && (g_work.height != work.height || memcmp(work.target, g_work.target, 
sizeof(work.target)))) + if(!opt_benchmark && (g_work.height != work.height || memcmp(work.target, g_work.target, sizeof(work.target)))) { calc_diff(&g_work, 0); - if (!have_stratum) + if(!have_stratum && !allow_mininginfo) global_diff = g_work.difficulty; - if (opt_debug) { + if(opt_debug) + { uint64_t target64 = g_work.target[7] * 0x100000000ULL + g_work.target[6]; applog(LOG_DEBUG, "job %s target change: %llx (%.1f)", g_work.job_id, target64, g_work.difficulty); } memcpy(work.target, g_work.target, sizeof(work.target)); work.difficulty = g_work.difficulty; work.height = g_work.height; - nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr - /* on new target, ignoring nonce, clear sent data (hashlog) */ - if (memcmp(work.target, g_work.target, sizeof(work.target))) { - if (check_dups) - hashlog_purge_job(work.job_id); - } } - if (memcmp(work.data, g_work.data, wcmplen)) { - #if 0 - if (opt_debug) { - for (int n=0; n <= (wcmplen-8); n+=8) { - if (memcmp(work.data + n, g_work.data + n, 8)) { + + int different; + if(opt_algo != ALGO_SIA) + different = memcmp(work.data, g_work.data, wcmplen); + else + different = memcmp(work.data, g_work.data, 7*4) || memcmp(work.data + 9, g_work.data + 9, 44); + if(different) + { + if(opt_debug) + applog(LOG_DEBUG, "thread %d: new work", thr_id); +#if 0 + if(opt_debug) + { + for(int n = 0; n <= (wcmplen - 8); n += 8) + { + if(memcmp(work.data + n, g_work.data + n, 8)) + { applog(LOG_DEBUG, "job %s work updated at offset %d:", g_work.job_id, n); - applog_hash((uchar*) &work.data[n]); - applog_compare_hash((uchar*) &g_work.data[n], (uchar*) &work.data[n]); + applog_hash((uchar*)&work.data[n]); + applog_compare_hash((uchar*)&g_work.data[n], (uchar*)&work.data[n]); } } } - #endif +#endif + if(opt_debug && opt_algo == ALGO_SIA) + applog(LOG_DEBUG, "thread %d: high nonce = %08X", thr_id, work.data[9]); memcpy(&work, &g_work, sizeof(struct work)); nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr - 
} else - nonceptr[0]++; //?? - + } + else + { + if(opt_debug) + applog(LOG_DEBUG, "thread %d: continue with old work", thr_id); + } work_restart[thr_id].restart = 0; pthread_mutex_unlock(&g_work_lock); - /* adjust max_nonce to meet target scan time */ - if (have_stratum) - max64 = LP_SCANTIME; + uint32_t max64time; + if(have_stratum) + max64time = LP_SCANTIME; else - max64 = max(1, scan_time + g_work_time - time(NULL)); + max64time = (uint32_t)max(1, scan_time + g_work_time - time(NULL)); - max64 *= (uint32_t)thr_hashrates[thr_id]; + max64 = max64time * (uint32_t)thr_hashrates[thr_id]; /* on start, max64 should not be 0, - * before hashrate is computed */ - if (max64 < minmax) { - switch (opt_algo) { - case ALGO_BLAKECOIN: + * before hashrate is computed */ + switch(opt_algo) + { + case ALGO_KECCAK: + minmax = 83000000 * max64time; + break; case ALGO_BLAKE: - minmax = 0x80000000U; + case ALGO_SIA: + minmax = 260000000 * max64time; + break; + case ALGO_BLAKECOIN: + case ALGO_VANILLA: + minmax = 470000000 * max64time; break; case ALGO_BITCOIN: - case ALGO_KECCAK: - minmax = 0x40000000U; + minmax = 100000000 * max64time; + break; + case ALGO_QUBIT: + case ALGO_QUARK: + minmax = 3100000 * max64time; + break; + case ALGO_JACKPOT: + minmax = 2800000 * max64time; break; + case ALGO_SKEIN: + case ALGO_WHCX: case ALGO_DOOM: - case ALGO_JACKPOT: case ALGO_LUFFA_DOOM: - minmax = 0x2000000; + minmax = 38000000 * max64time; break; + case ALGO_NIST5: case ALGO_S3: + minmax = 4600000 * max64time; + break; case ALGO_X11: + case ALGO_C11: + minmax = 1500000 * max64time; + break; case ALGO_X13: - minmax = 0x400000; + minmax = 1200000 * max64time; break; - case ALGO_LYRA2: - minmax = 0x100000; + case ALGO_X17: + case ALGO_X15: + minmax = 1000000 * max64time; break; - } - max64 = max(minmax-1, max64); + case ALGO_LYRA2v2: + minmax = 1900000 * max64time; + break; + case ALGO_NEO: + minmax = 90000 * max64time; + break; + default: + minmax = 4000 * max64time; } + max64 = max(minmax, 
max64); // we can't scan more than uint capacity max64 = min(UINT32_MAX, max64); - start_nonce = nonceptr[0]; /* never let small ranges at end */ - if (end_nonce >= UINT32_MAX - 256) + if(end_nonce >= UINT32_MAX - 256) end_nonce = UINT32_MAX; - if ((max64 + start_nonce) >= end_nonce) + if((max64 + start_nonce) >= end_nonce) max_nonce = end_nonce; else - max_nonce = (uint32_t) (max64 + start_nonce); + max_nonce = (uint32_t)(max64 + start_nonce); // todo: keep it rounded for gpu threads ? work.scanned_from = start_nonce; - nonceptr[0] = start_nonce; - if (opt_debug) + if(opt_debug) applog(LOG_DEBUG, "GPU #%d: start=%08x end=%08x range=%08x", - device_map[thr_id], start_nonce, max_nonce, (max_nonce-start_nonce)); + device_map[thr_id], start_nonce, max_nonce, (max_nonce - start_nonce + 1)); hashes_done = 0; gettimeofday(&tv_start, NULL); - + uint32_t databackup; + if(opt_algo != ALGO_SIA) + databackup = nonceptr[2]; + else + databackup = nonceptr[12]; /* scan nonces for a proof-of-work hash */ - switch (opt_algo) { - - case ALGO_HEAVY: - rc = scanhash_heavy(thr_id, work.data, work.target, - max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ); - break; + switch(opt_algo) + { case ALGO_KECCAK: rc = scanhash_keccak256(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_MJOLLNIR: - rc = scanhash_heavy(thr_id, work.data, work.target, - max_nonce, &hashes_done, 0, MNR_BLKHDR_SZ); + max_nonce, &hashes_done); break; case ALGO_DEEP: rc = scanhash_deep(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_DOOM: case ALGO_LUFFA_DOOM: rc = scanhash_doom(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); + break; + + case ALGO_C11: + rc = scanhash_c11(thr_id, work.data, work.target, + max_nonce, &hashes_done); break; case ALGO_FUGUE256: rc = scanhash_fugue256(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case 
ALGO_GROESTL: case ALGO_DMD_GR: rc = scanhash_groestlcoin(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_MYR_GR: rc = scanhash_myriad(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_JACKPOT: rc = scanhash_jackpot(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_QUARK: rc = scanhash_quark(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_QUBIT: rc = scanhash_qubit(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; - case ALGO_ANIME: - rc = scanhash_anime(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; case ALGO_BITCOIN: rc = scanhash_bitcoin(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); + break; + + case ALGO_VANILLA: + rc = scanhash_blake256(thr_id, work.data, work.target, + max_nonce, &hashes_done, 8); break; case ALGO_BLAKECOIN: rc = scanhash_blake256(thr_id, work.data, work.target, - max_nonce, &hashes_done, 8); + max_nonce, &hashes_done, 8); break; case ALGO_BLAKE: rc = scanhash_blake256(thr_id, work.data, work.target, - max_nonce, &hashes_done, 14); + max_nonce, &hashes_done, 14); break; case ALGO_FRESH: rc = scanhash_fresh(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; - case ALGO_LYRA2: - rc = scanhash_lyra2(thr_id, work.data, work.target, - max_nonce, &hashes_done); + case ALGO_LYRA2v2: + rc = scanhash_lyra2v2(thr_id, work.data, work.target, + max_nonce, &hashes_done); break; case ALGO_NIST5: rc = scanhash_nist5(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_PENTABLAKE: rc = scanhash_pentablake(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); + break; + + case ALGO_SKEIN: + rc = 
scanhash_skeincoin(thr_id, work.data, work.target, + max_nonce, &hashes_done); break; case ALGO_S3: rc = scanhash_s3(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_WHC: rc = scanhash_whc(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); + break; + + case ALGO_WHCX: + rc = scanhash_whirlpoolx(thr_id, work.data, work.target, + max_nonce, &hashes_done); break; case ALGO_X11: rc = scanhash_x11(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_X13: rc = scanhash_x13(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_X14: rc = scanhash_x14(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_X15: rc = scanhash_x15(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); break; case ALGO_X17: rc = scanhash_x17(thr_id, work.data, work.target, - max_nonce, &hashes_done); + max_nonce, &hashes_done); + break; + + case ALGO_NEO: + if(!have_stratum && work.datasize == 128) + rc = scanhash_neoscrypt(true, thr_id, work.data, work.target, max_nonce, &hashes_done); + else + rc = scanhash_neoscrypt(have_stratum, thr_id, work.data, work.target, max_nonce, &hashes_done); + break; + + case ALGO_SIA: + rc = scanhash_sia(thr_id, work.data, work.target, max_nonce, &hashes_done); break; default: @@ -1451,67 +1741,78 @@ static void *miner_thread(void *userdata) /* record scanhash elapsed time */ gettimeofday(&tv_end, NULL); - - if (rc && opt_debug) + if(rc && opt_debug) applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", nonceptr[0], swab32(nonceptr[0])); // data[19] - if (rc > 1 && opt_debug) - applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", nonceptr[2], swab32(nonceptr[2])); // data[21] - + if(opt_algo != ALGO_SIA) + { + if(rc > 1 && opt_debug) + applog(LOG_NOTICE, CL_CYN "found => %08x" 
CL_GRN " %08x", nonceptr[2], swab32(nonceptr[2])); // data[21] + } + else + { + if(rc > 1 && opt_debug) + applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", nonceptr[12], swab32(nonceptr[12])); // data[21] + } timeval_subtract(&diff, &tv_end, &tv_start); - if (diff.tv_sec > 0 || (diff.tv_sec==0 && diff.tv_usec>2000)) // avoid totally wrong hash rates + if(diff.tv_sec > 0 || (diff.tv_sec == 0 && diff.tv_usec>2000)) // avoid totally wrong hash rates { - double dtime = (double) diff.tv_sec + 1e-6 * diff.tv_usec; + double dtime = (double)diff.tv_sec + 1e-6 * diff.tv_usec; /* hashrate factors for some algos */ double rate_factor = 1.0; - switch (opt_algo) { - case ALGO_JACKPOT: - case ALGO_QUARK: - // to stay comparable to other ccminer forks or pools - rate_factor = 0.5; - break; + switch(opt_algo) + { + case ALGO_JACKPOT: + case ALGO_QUARK: + // to stay comparable to other ccminer forks or pools + rate_factor = 0.5; + break; } /* store thread hashrate */ - if (dtime > 0.0) { + if(dtime > 0.0) + { pthread_mutex_lock(&stats_lock); thr_hashrates[thr_id] = hashes_done / dtime; thr_hashrates[thr_id] *= rate_factor; - stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t) rc, work.height); + stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t)rc, work.height); pthread_mutex_unlock(&stats_lock); } } work.scanned_to = start_nonce + hashes_done - 1; - if (opt_debug && opt_benchmark) + if(opt_debug && opt_benchmark) { // to debug nonce ranges - applog(LOG_DEBUG, "GPU #%d: ends=%08x range=%llx", device_map[thr_id], - start_nonce + hashes_done - 1, hashes_done); + applog(LOG_DEBUG, "GPU #%d: ends=%08x range=%08x", device_map[thr_id], + start_nonce + hashes_done - 1, hashes_done); } - if (check_dups) + if(check_dups) hashlog_remember_scan_range(&work); - /* output */ - if (!opt_quiet && (loopcnt > 0)) { - sprintf(s, thr_hashrates[thr_id] >= 1e6 ? 
"%.0f" : "%.2f", - 1e-3 * thr_hashrates[thr_id]); - applog(LOG_INFO, "GPU #%d: %s, %s kH/s", - device_map[thr_id], device_name[device_map[thr_id]], s); + if(!opt_quiet && loopcnt > 0) + { + double hashrate; + + hashrate = thr_hashrates[thr_id]; + format_hashrate(hashrate, s); + applog(LOG_INFO, "GPU #%d: %s, %s", device_map[thr_id], device_name[device_map[thr_id]], s); } /* loopcnt: ignore first loop hashrate */ - if ((loopcnt>0) && thr_id == (opt_n_threads - 1)) { + if((loopcnt>0) && thr_id == (opt_n_threads - 1)) + { double hashrate = 0.; pthread_mutex_lock(&stats_lock); - for (int i = 0; i < opt_n_threads && thr_hashrates[i]; i++) + for(int i = 0; i < opt_n_threads; i++) hashrate += stats_get_speed(i, thr_hashrates[i]); pthread_mutex_unlock(&stats_lock); - if (opt_benchmark) { - sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", hashrate / 1000.); - applog(LOG_NOTICE, "Total: %s kH/s", s); + if(opt_benchmark) + { + format_hashrate(hashrate, s); + applog(LOG_NOTICE, "Total: %s", s); } // X-Mining-Hashrate @@ -1519,13 +1820,26 @@ static void *miner_thread(void *userdata) } /* if nonce found, submit work */ - if (rc && !opt_benchmark) { - if (!submit_work(mythr, &work)) + if(rc && !opt_benchmark) + { + uint32_t found2; + if(opt_algo != ALGO_SIA) + { + found2 = nonceptr[2]; + nonceptr[2] = databackup; + } + else + { + found2 = nonceptr[12]; + nonceptr[12] = databackup; + } + if(!submit_work(mythr, &work)) break; // prevent stale work in solo // we can't submit twice a block! - if (!have_stratum) { + if(!have_stratum && !have_longpoll) + { pthread_mutex_lock(&g_work_lock); // will force getwork g_work_time = 0; @@ -1534,14 +1848,26 @@ static void *miner_thread(void *userdata) } // second nonce found, submit too (on pool only!) 
- if (rc > 1 && work.data[21]) { - work.data[19] = work.data[21]; - work.data[21] = 0; - if (!submit_work(mythr, &work)) - break; + if(opt_algo != ALGO_SIA) + { + if(rc > 1 && nonceptr[2]) + { + nonceptr[0] = found2; + if(!submit_work(mythr, &work)) + break; + } } + else + if(rc > 1 && nonceptr[12]) + { + nonceptr[0] = found2; + if(!submit_work(mythr, &work)) + break; + } + } - work.data[19] = start_nonce + hashes_done; + nonceptr[0] = start_nonce + hashes_done; + loopcnt++; } @@ -1559,67 +1885,79 @@ static void *longpoll_thread(void *userdata) bool need_slash = false; curl = curl_easy_init(); - if (unlikely(!curl)) { + if(unlikely(!curl)) + { applog(LOG_ERR, "CURL initialization failed"); goto out; } start: hdr_path = (char*)tq_pop(mythr->q, NULL); - if (!hdr_path) + if(!hdr_path) goto out; /* full URL */ - if (strstr(hdr_path, "://")) { + if(strstr(hdr_path, "://")) + { lp_url = hdr_path; hdr_path = NULL; } - + /* absolute path, on current server */ - else { + else + { copy_start = (*hdr_path == '/') ? (hdr_path + 1) : hdr_path; - if (rpc_url[strlen(rpc_url) - 1] != '/') + if(rpc_url[strlen(rpc_url) - 1] != '/') need_slash = true; lp_url = (char*)malloc(strlen(rpc_url) + strlen(copy_start) + 2); - if (!lp_url) - goto out; + if(lp_url == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } sprintf(lp_url, "%s%s%s", rpc_url, need_slash ? "/" : "", copy_start); } - applog(LOG_INFO, "Long-polling activated for %s", lp_url); + applog(LOG_INFO, "Long-polling enabled on %s", lp_url); - while (1) { + while(1) + { json_t *val, *soval; int err; val = json_rpc_call(curl, lp_url, rpc_userpass, rpc_req, - false, true, &err); - if (have_stratum) { - if (val) + false, true, &err); + if(have_stratum) + { + if(val) json_decref(val); goto out; } - if (likely(val)) { - if (!opt_quiet) applog(LOG_INFO, "LONGPOLL detected new block"); + if(likely(val)) + { soval = json_object_get(json_object_get(val, "result"), "submitold"); submit_old = soval ? 
json_is_true(soval) : false; pthread_mutex_lock(&g_work_lock); - if (work_decode(json_object_get(val, "result"), &g_work)) { - if (opt_debug) - applog(LOG_BLUE, "LONGPOLL pushed new work"); + if(work_decode(json_object_get(val, "result"), &g_work)) + { + if(!opt_quiet) + applog(LOG_BLUE, "%s detected new block", short_url); g_work_time = time(NULL); restart_threads(); } pthread_mutex_unlock(&g_work_lock); json_decref(val); - } else { + } + else + { pthread_mutex_lock(&g_work_lock); g_work_time -= LP_SCANTIME; pthread_mutex_unlock(&g_work_lock); restart_threads(); - if (err != CURLE_OPERATION_TIMEDOUT) { + if(err != CURLE_OPERATION_TIMEDOUT) + { have_longpoll = false; free(hdr_path); free(lp_url); @@ -1634,7 +1972,7 @@ static void *longpoll_thread(void *userdata) free(hdr_path); free(lp_url); tq_freeze(mythr->q); - if (curl) + if(curl) curl_easy_cleanup(curl); return NULL; @@ -1648,7 +1986,8 @@ static bool stratum_handle_response(char *buf) bool ret = false; val = JSON_LOADS(buf, &err); - if (!val) { + if(!val) + { applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); goto out; } @@ -1657,24 +1996,24 @@ static bool stratum_handle_response(char *buf) err_val = json_object_get(val, "error"); id_val = json_object_get(val, "id"); - if (!id_val || json_is_null(id_val) || !res_val) + if(!id_val || json_is_null(id_val) || !res_val) goto out; // ignore subscribe late answer (yaamp) - if (json_integer_value(id_val) < 4) + if(json_integer_value(id_val) < 4) goto out; gettimeofday(&tv_answer, NULL); timeval_subtract(&diff, &tv_answer, &stratum.tv_submit); // store time required to the pool to answer to a submit - stratum.answer_msec = (1000 * diff.tv_sec) + (uint32_t) (0.001 * diff.tv_usec); + stratum.answer_msec = (1000 * diff.tv_sec) + (uint32_t)(0.001 * diff.tv_usec); share_result(json_is_true(res_val), - err_val ? json_string_value(json_array_get(err_val, 1)) : NULL); + err_val ? 
json_string_value(json_array_get(err_val, 1)) : NULL); ret = true; out: - if (val) + if(val) json_decref(val); return ret; @@ -1686,72 +2025,83 @@ static void *stratum_thread(void *userdata) char *s; stratum.url = (char*)tq_pop(mythr->q, NULL); - if (!stratum.url) + if(!stratum.url) goto out; applog(LOG_BLUE, "Starting Stratum on %s", stratum.url); - - while (1) { + stratum.curl = NULL; + while(1) + { int failures = 0; - if (stratum_need_reset) { + if(stratum_need_reset) + { stratum_need_reset = false; stratum_disconnect(&stratum); applog(LOG_DEBUG, "stratum connection reset"); } - while (!stratum.curl) { + while(!stratum.curl) + { pthread_mutex_lock(&g_work_lock); g_work_time = 0; pthread_mutex_unlock(&g_work_lock); restart_threads(); - if (!stratum_connect(&stratum, stratum.url) || - !stratum_subscribe(&stratum) || - !stratum_authorize(&stratum, rpc_user, rpc_pass)) { + if(!stratum_connect(&stratum, stratum.url) || + !stratum_subscribe(&stratum) || + !stratum_authorize(&stratum, rpc_user, rpc_pass, opt_extranonce)) + { stratum_disconnect(&stratum); - if (opt_retries >= 0 && ++failures > opt_retries) { + if(opt_retries >= 0 && ++failures > opt_retries) + { applog(LOG_ERR, "...terminating workio thread"); tq_push(thr_info[work_thr_id].q, NULL); goto out; } - if (!opt_benchmark) + if(!opt_benchmark) applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); sleep(opt_fail_pause); } } - if (stratum.job.job_id && - (!g_work_time || strncmp(stratum.job.job_id, g_work.job_id + 8, 120))) { + if(stratum.job.job_id && + (!g_work_time || strncmp(stratum.job.job_id, g_work.job_id + 8, 120))) + { pthread_mutex_lock(&g_work_lock); stratum_gen_work(&stratum, &g_work); g_work_time = time(NULL); - if (stratum.job.clean) + if(stratum.job.clean) { - if (!opt_quiet) + if(!opt_quiet) applog(LOG_BLUE, "%s %s block %d", short_url, algo_names[opt_algo], - stratum.job.height); + stratum.job.height); restart_threads(); - if (check_dups) + if(check_dups) hashlog_purge_old(); 
stats_purge_old(); - } else if (opt_debug && !opt_quiet) { - applog(LOG_BLUE, "%s asks job %d for block %d", short_url, - strtoul(stratum.job.job_id, NULL, 16), stratum.job.height); + } + else if(opt_debug && !opt_quiet) + { + applog(LOG_BLUE, "%s asks job %s for block %d", short_url, + stratum.job.job_id, stratum.job.height); } pthread_mutex_unlock(&g_work_lock); } - - if (!stratum_socket_full(&stratum, 120)) { + + if(!stratum_socket_full(&stratum, 120)) + { applog(LOG_ERR, "Stratum connection timed out"); s = NULL; - } else + } + else s = stratum_recv_line(&stratum); - if (!s) { + if(!s) + { stratum_disconnect(&stratum); applog(LOG_ERR, "Stratum connection interrupted"); continue; } - if (!stratum_handle_method(&stratum, s)) + if(!stratum_handle_method(&stratum, s)) stratum_handle_response(s); free(s); } @@ -1764,20 +2114,20 @@ static void show_version_and_exit(void) { printf("%s v%s\n" #ifdef WIN32 - "pthreads static %s\n" + "pthreads static %s\n" #endif - "%s\n", - PACKAGE_NAME, PACKAGE_VERSION, + "%s\n", + PACKAGE_NAME, PACKAGE_VERSION, #ifdef WIN32 - PTW32_VERSION_STRING, + PTW32_VERSION_STRING, #endif - curl_version()); + curl_version()); proper_exit(0); } static void show_usage_and_exit(int status) { - if (status) + if(status) fprintf(stderr, "Try `" PROGRAM_NAME " --help' for more information.\n"); else printf(usage); @@ -1790,35 +2140,42 @@ static void parse_arg(int key, char *arg) int v, i; double d; - switch(key) { + switch(key) + { case 'a': - for (i = 0; i < ARRAY_SIZE(algo_names); i++) { - if (algo_names[i] && - !strcmp(arg, algo_names[i])) { + for(i = 0; i < ARRAY_SIZE(algo_names); i++) + { + if(algo_names[i] && + !strcasecmp(arg, algo_names[i])) + { opt_algo = (enum sha_algos)i; break; } } - if (i == ARRAY_SIZE(algo_names)) + if(i == ARRAY_SIZE(algo_names)) show_usage_and_exit(1); break; case 'b': p = strstr(arg, ":"); - if (p) { + if(p) + { /* ip:port */ - if (p - arg > 0) { + if(p - arg > 0) + { free(opt_api_allow); opt_api_allow = strdup(arg); 
opt_api_allow[p - arg] = '\0'; } opt_api_listen = atoi(p + 1); } - else if (arg && strstr(arg, ".")) { + else if(arg && strstr(arg, ".")) + { /* ip only */ free(opt_api_allow); opt_api_allow = strdup(arg); } - else if (arg) { + else if(arg) + { /* port or 0 to disable */ opt_api_listen = atoi(arg); } @@ -1828,14 +2185,15 @@ static void parse_arg(int key, char *arg) break; case 'c': { json_error_t err; - if (opt_config) + if(opt_config) json_decref(opt_config); #if JANSSON_VERSION_HEX >= 0x020000 opt_config = json_load_file(arg, 0, &err); #else opt_config = json_load_file(arg, &err); #endif - if (!json_is_object(opt_config)) { + if(!json_is_object(opt_config)) + { applog(LOG_ERR, "JSON decode of %s failed", arg); proper_exit(1); } @@ -1843,36 +2201,43 @@ static void parse_arg(int key, char *arg) } case 'i': d = atof(arg); - v = (uint32_t) d; - if (v < 0 || v > 31) + v = (uint32_t)d; + if(v < 0 || v > 31) show_usage_and_exit(1); + else { - int n = 0, adds = 0; + int n = 0; int ngpus = cuda_num_devices(); - char * pch = strtok(arg,","); - if (pch == NULL) { - for (n=0; n < ngpus; n++) - gpus_intensity[n] = (1 << v); - break; - } - while (pch != NULL) { + uint32_t last = 0; + char *pch = arg; + do + { d = atof(pch); - v = (uint32_t) d; - if (v > 7) { /* 0 = default */ - gpus_intensity[n] = (1 << v); - if ((d - v) > 0.0) { - adds = (uint32_t) floor((d - v) * (1 << (v-8))) * 256; - gpus_intensity[n] += adds; + v = (uint32_t)d; + if(v > 7) + { /* 0 = default */ + if((d - v) > 0.0) + { + uint32_t adds = (uint32_t)floor((d - v) * (1 << (v - 8))) * 256; + gpus_intensity[n] = (1 << v) + adds; applog(LOG_INFO, "Adding %u threads to intensity %u, %u cuda threads", - adds, v, gpus_intensity[n]); - } else { + adds, v, gpus_intensity[n]); + } + else if(gpus_intensity[n] != (1 << v)) + { + gpus_intensity[n] = (1 << v); applog(LOG_INFO, "Intensity set to %u, %u cuda threads", - v, gpus_intensity[n]); + v, gpus_intensity[n]); } } + last = gpus_intensity[n]; n++; - pch = strtok(NULL, 
","); - } + pch = strpbrk(pch, ","); + if(pch != NULL) + pch++; + } while(pch != NULL); + while(n < MAX_GPUS) + gpus_intensity[n++] = last; } break; case 'D': @@ -1880,10 +2245,14 @@ static void parse_arg(int key, char *arg) break; case 'N': v = atoi(arg); - if (v < 1) + if(v < 1) opt_statsavg = INT_MAX; opt_statsavg = v; break; + case 'n': /* --ndevs */ + cuda_print_devices(); + proper_exit(0); + break; case 'q': opt_quiet = true; break; @@ -1896,79 +2265,87 @@ static void parse_arg(int key, char *arg) break; case 'r': v = atoi(arg); - if (v < -1 || v > 9999) /* sanity check */ + if(v < -1 || v > 9999) /* sanity check */ show_usage_and_exit(1); opt_retries = v; break; case 'R': v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ + if(v < 1 || v > 9999) /* sanity check */ show_usage_and_exit(1); opt_fail_pause = v; break; case 's': v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ + if(v < 1 || v > 9999) /* sanity check */ show_usage_and_exit(1); opt_scantime = v; break; case 'T': v = atoi(arg); - if (v < 1 || v > 99999) /* sanity check */ + if(v < 1 || v > 99999) /* sanity check */ show_usage_and_exit(1); opt_timeout = v; break; case 't': v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ + if(v < 1 || v > 9999) /* sanity check */ show_usage_and_exit(1); opt_n_threads = v; break; - case 'v': - v = atoi(arg); - if (v < 0 || v > 8192) /* sanity check */ - show_usage_and_exit(1); - opt_vote = (uint16_t)v; - break; - case 'm': - opt_trust_pool = true; - break; case 'u': free(rpc_user); rpc_user = strdup(arg); break; case 'o': /* --url */ p = strstr(arg, "://"); - if (p) { - if (strncasecmp(arg, "http://", 7) && strncasecmp(arg, "https://", 8) && - strncasecmp(arg, "stratum+tcp://", 14)) - show_usage_and_exit(1); + if(p) + { + if(strncasecmp(arg, "http://", 7) && strncasecmp(arg, "https://", 8) && + strncasecmp(arg, "stratum+tcp://", 14)) + show_usage_and_exit(1); free(rpc_url); rpc_url = strdup(arg); short_url = &rpc_url[(p - arg) + 3]; - } 
else { - if (!strlen(arg) || *arg == '/') + } + else + { + if(!strlen(arg) || *arg == '/') show_usage_and_exit(1); free(rpc_url); rpc_url = (char*)malloc(strlen(arg) + 8); + if(rpc_url == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } sprintf(rpc_url, "http://%s", arg); short_url = &rpc_url[7]; } p = strrchr(rpc_url, '@'); - if (p) { + if(p) + { char *sp, *ap; *p = '\0'; ap = strstr(rpc_url, "://") + 3; sp = strchr(ap, ':'); - if (sp) { + if(sp) + { free(rpc_userpass); rpc_userpass = strdup(ap); free(rpc_user); rpc_user = (char*)calloc(sp - ap + 1, 1); + if(rpc_user == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } strncpy(rpc_user, ap, sp - ap); free(rpc_pass); rpc_pass = strdup(sp + 1); - } else { + } + else + { free(rpc_user); rpc_user = strdup(ap); } @@ -1979,25 +2356,30 @@ static void parse_arg(int key, char *arg) break; case 'O': /* --userpass */ p = strchr(arg, ':'); - if (!p) + if(!p) show_usage_and_exit(1); free(rpc_userpass); rpc_userpass = strdup(arg); free(rpc_user); rpc_user = (char*)calloc(p - arg + 1, 1); + if(rpc_user == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } strncpy(rpc_user, arg, p - arg); free(rpc_pass); rpc_pass = strdup(p + 1); break; case 'x': /* --proxy */ - if (!strncasecmp(arg, "socks4://", 9)) + if(!strncasecmp(arg, "socks4://", 9)) opt_proxy_type = CURLPROXY_SOCKS4; - else if (!strncasecmp(arg, "socks5://", 9)) + else if(!strncasecmp(arg, "socks5://", 9)) opt_proxy_type = CURLPROXY_SOCKS5; #if LIBCURL_VERSION_NUM >= 0x071200 - else if (!strncasecmp(arg, "socks4a://", 10)) + else if(!strncasecmp(arg, "socks4a://", 10)) opt_proxy_type = CURLPROXY_SOCKS4A; - else if (!strncasecmp(arg, "socks5h://", 10)) + else if(!strncasecmp(arg, "socks5h://", 10)) opt_proxy_type = CURLPROXY_SOCKS5_HOSTNAME; #endif else @@ -2035,118 +2417,189 @@ static void parse_arg(int key, char *arg) case 1008: applog(LOG_INFO, "Now logging to syslog..."); use_syslog = true; - if (arg && 
strlen(arg)) { + if(arg && strlen(arg)) + { free(opt_syslog_pfx); opt_syslog_pfx = strdup(arg); } break; case 1020: v = atoi(arg); - if (v < -1) + if(v < -1) v = -1; - if (v > (1<(1 << num_cpus) - 1) v = -1; opt_affinity = v; break; case 1021: v = atoi(arg); - if (v < 0 || v > 5) /* sanity check */ + if(v < 0 || v > 5) /* sanity check */ show_usage_and_exit(1); opt_priority = v; break; + case 1022: + opt_verify = false; + break; case 'd': // CB + { + int ngpus = cuda_num_devices(); + char * pch = strtok(arg, ","); + opt_n_threads = 0; + while(pch != NULL) { - int ngpus = cuda_num_devices(); - char * pch = strtok (arg,","); - opt_n_threads = 0; - while (pch != NULL) { - if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0') + if(pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0') + { + if(atoi(pch) < ngpus) + device_map[opt_n_threads++] = atoi(pch); + else { - if (atoi(pch) < ngpus) - device_map[opt_n_threads++] = atoi(pch); - else { - applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch)); - proper_exit(1); - } - } else { - int device = cuda_finddevice(pch); - if (device >= 0 && device < ngpus) - device_map[opt_n_threads++] = device; - else { - applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch); - proper_exit(1); - } + applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch)); + proper_exit(1); + } + } + else + { + int device = cuda_finddevice(pch); + if(device >= 0 && device < ngpus) + device_map[opt_n_threads++] = device; + else + { + applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch); + proper_exit(1); } - // set number of active gpus - active_gpus = opt_n_threads; - pch = strtok (NULL, ","); } + // set number of active gpus + active_gpus = opt_n_threads; + pch = strtok(NULL, ","); } - break; + } + break; case 'f': // CH - Divisor for Difficulty d = atof(arg); - if (d == 0) /* sanity check */ + if(d == 0) /* sanity check */ show_usage_and_exit(1); opt_difficulty = 
d; break; + case 'm': // --diff-multiplier + d = atof(arg); + if(d <= 0.) + show_usage_and_exit(1); + opt_difficulty = 1.0/d; + break; + case 'e': + opt_extranonce = false; + break; case 'V': show_version_and_exit(); case 'h': show_usage_and_exit(0); + case 1070: /* --gpu-clock */ + { + char *pch = strtok(arg, ","); + int n = 0; + while(pch != NULL && n < MAX_GPUS) + { + int dev_id = device_map[n++]; + device_gpu_clocks[dev_id] = atoi(pch); + pch = strtok(NULL, ","); + } + } + break; + case 1071: /* --mem-clock */ + { + char *pch = strtok(arg, ","); + int n = 0; + while(pch != NULL && n < MAX_GPUS) + { + int dev_id = device_map[n++]; + device_mem_clocks[dev_id] = atoi(pch); + pch = strtok(NULL, ","); + } + } + break; + case 1072: /* --pstate */ + { + char *pch = strtok(arg, ","); + int n = 0; + while(pch != NULL && n < MAX_GPUS) + { + int dev_id = device_map[n++]; + device_pstate[dev_id] = (int8_t)atoi(pch); + pch = strtok(NULL, ","); + } + } + break; + case 1073: /* --plimit */ + { + char *pch = strtok(arg, ","); + int n = 0; + while(pch != NULL && n < MAX_GPUS) + { + int dev_id = device_map[n++]; + device_plimit[dev_id] = atoi(pch); + pch = strtok(NULL, ","); + } + } + break; default: show_usage_and_exit(1); } - if (use_syslog) + if(use_syslog) use_colors = false; } /** - * Parse json config file - */ +* Parse json config file +*/ static void parse_config(void) { int i; json_t *val; - if (!json_is_object(opt_config)) + if(!json_is_object(opt_config)) return; - for (i = 0; i < ARRAY_SIZE(options); i++) { + for(i = 0; i < ARRAY_SIZE(options); i++) + { - if (!options[i].name) + if(!options[i].name) break; - if (!strcmp(options[i].name, "config")) + if(!strcmp(options[i].name, "config")) continue; val = json_object_get(opt_config, options[i].name); - if (!val) + if(!val) continue; - if (options[i].has_arg && json_is_string(val)) { + if(options[i].has_arg && json_is_string(val)) + { char *s = strdup(json_string_value(val)); - if (!s) + if(!s) continue; 
parse_arg(options[i].val, s); free(s); } - else if (options[i].has_arg && json_is_integer(val)) { + else if(options[i].has_arg && json_is_integer(val)) + { char buf[16]; - sprintf(buf, "%d", (int) json_integer_value(val)); + sprintf(buf, "%d", (int)json_integer_value(val)); parse_arg(options[i].val, buf); } - else if (options[i].has_arg && json_is_real(val)) { + else if(options[i].has_arg && json_is_real(val)) + { char buf[16]; sprintf(buf, "%f", json_real_value(val)); parse_arg(options[i].val, buf); } - else if (!options[i].has_arg) { - if (json_is_true(val)) + else if(!options[i].has_arg) + { + if(json_is_true(val)) parse_arg(options[i].val, (char*) ""); } else applog(LOG_ERR, "JSON option %s invalid", - options[i].name); + options[i].name); } } @@ -2154,61 +2607,60 @@ static void parse_cmdline(int argc, char *argv[]) { int key; - while (1) { + while(1) + { #if HAVE_GETOPT_LONG key = getopt_long(argc, argv, short_options, options, NULL); #else key = getopt(argc, argv, short_options); #endif - if (key < 0) + if(key < 0) break; parse_arg(key, optarg); } - if (optind < argc) { + if(optind < argc) + { fprintf(stderr, "%s: unsupported non-option argument '%s'\n", - argv[0], argv[optind]); + argv[0], argv[optind]); show_usage_and_exit(1); } parse_config(); - if (opt_algo == ALGO_HEAVY && opt_vote == 9999) { - fprintf(stderr, "%s: Heavycoin hash requires block reward vote parameter (see --vote)\n", - argv[0]); - show_usage_and_exit(1); - } } #ifndef WIN32 static void signal_handler(int sig) { - switch (sig) { + switch(sig) + { case SIGHUP: applog(LOG_INFO, "SIGHUP received"); break; case SIGINT: signal(sig, SIG_IGN); applog(LOG_INFO, "SIGINT received, exiting"); - proper_exit(0); + proper_exit(2); break; case SIGTERM: applog(LOG_INFO, "SIGTERM received, exiting"); - proper_exit(0); + proper_exit(2); break; } } #else BOOL WINAPI ConsoleHandler(DWORD dwType) { - switch (dwType) { + switch(dwType) + { case CTRL_C_EVENT: applog(LOG_INFO, "CTRL_C_EVENT received, exiting"); - 
proper_exit(0); + proper_exit(2); break; case CTRL_BREAK_EVENT: applog(LOG_INFO, "CTRL_BREAK_EVENT received, exiting"); - proper_exit(0); + proper_exit(2); break; default: return false; @@ -2217,26 +2669,62 @@ BOOL WINAPI ConsoleHandler(DWORD dwType) } #endif +static int msver(void) +{ + int version; +#ifdef _MSC_VER + switch(_MSC_VER) + { + case 1500: version = 2008; break; + case 1600: version = 2010; break; + case 1700: version = 2012; break; + case 1800: version = 2013; break; + case 1900: version = 2015; break; + default: version = _MSC_VER / 100; + } +#else + version = 0; +#endif + return version; +} + int main(int argc, char *argv[]) { struct thr_info *thr; - long flags; int i; + + // strdup on char* to allow a common free() if used + opt_syslog_pfx = strdup(PROGRAM_NAME); + opt_api_allow = strdup("127.0.0.1"); /* 0.0.0.0 for all ips */ - printf("*** ccminer " PACKAGE_VERSION " for nVidia GPUs by sp-hash@github ***\n"); -#ifdef WIN32 - printf("\tBuilt with VC++ 2013 and nVidia CUDA SDK 6.5\n\n"); +#if defined _WIN64 || defined _LP64 + printf("ccminer " PACKAGE_VERSION " (64bit) for nVidia GPUs\n"); +#else + printf("ccminer " PACKAGE_VERSION " (32bit) for nVidia GPUs\n"); +#endif +#ifdef _MSC_VER + printf("Compiled with Visual Studio %d ", msver()); +#else +#ifdef __clang__ + printf("Compiled with Clang %s ", __clang_version__); +#else +#ifdef __GNUC__ + printf("Compiled with GCC %d.%d ", __GNUC__, __GNUC_MINOR__); #else - printf("\tBuilt with the nVidia CUDA SDK 6.5\n\n"); + printf("Compiled with an unusual compiler "); #endif - printf(" Based on pooler cpuminer 2.3.2 and the tpruvot@github fork\n "); - printf(" CUDA support by Christian Buchner, Christian H. and DJM34\n"); - printf(" Includes optimizations implemented by sp , klaust, tpruvot and tsiv. 
\n\n"); +#endif +#endif + printf("using Nvidia CUDA Toolkit %d.%d\n\n", CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10); + printf("Based on pooler cpuminer 2.3.2 and the tpruvot@github fork\n"); + printf("CUDA support by Christian Buchner, Christian H. and DJM34\n"); + printf("Includes optimizations implemented by sp-hash, klaust, tpruvot and tsiv.\n\n"); rpc_user = strdup(""); rpc_pass = strdup(""); - pthread_mutex_init(&applog_lock, NULL); + for(int i = 0; i < MAX_GPUS; i++) + device_pstate[i] = -1; // number of cpus for thread affinity #if defined(WIN32) @@ -2248,63 +2736,83 @@ int main(int argc, char *argv[]) #elif defined(CTL_HW) && defined(HW_NCPU) int req[] = { CTL_HW, HW_NCPU }; size_t len = sizeof(num_cpus); - sysctl(req, 2, &num_cpus, &len, NULL, 0); + sysctl(req, 2, &num_cpus, &len, NULL, 0); #else num_cpus = 1; #endif - if (num_cpus < 1) - num_cpus = 1; + // number of gpus + active_gpus = cuda_num_devices(); +// cuda_devicereset(); - // default thread to device map - for (i = 0; i < MAX_GPUS; i++) { - device_map[i] = i; + if(active_gpus > 1) + { + // default thread to device map + for(i = 0; i < MAX_GPUS; i++) + { + device_map[i] = i; + } } - // number of gpus - active_gpus = cuda_num_devices(); cuda_devicenames(); /* parse command line */ parse_cmdline(argc, argv); - if (!opt_benchmark && !rpc_url) { + if(opt_protocol) + { + curl_version_info_data *info; + + info = curl_version_info(CURLVERSION_NOW); + applog(LOG_DEBUG, "using libcurl %s", info->version); + int features = info->features; + if(features&CURL_VERSION_IPV6) + applog(LOG_DEBUG, "libcurl supports IPv6"); + if(features&CURL_VERSION_SSL) + applog(LOG_DEBUG, "libcurl supports SSL"); + if(features&CURL_VERSION_IDN) + applog(LOG_DEBUG, "libcurl supports international domain names"); + } + if(!opt_benchmark && !rpc_url) + { fprintf(stderr, "%s: no URL supplied\n", argv[0]); show_usage_and_exit(1); } + cuda_devicereset(); - if (!rpc_userpass) { + if(!rpc_userpass) + { rpc_userpass = 
(char*)malloc(strlen(rpc_user) + strlen(rpc_pass) + 2); - if (!rpc_userpass) - return 1; + if(rpc_userpass == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass); } /* init stratum data.. */ memset(&stratum.url, 0, sizeof(stratum)); - pthread_mutex_init(&stats_lock, NULL); - pthread_mutex_init(&g_work_lock, NULL); pthread_mutex_init(&stratum.sock_lock, NULL); pthread_mutex_init(&stratum.work_lock, NULL); - flags = !opt_benchmark && strncmp(rpc_url, "https:", 6) - ? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) - : CURL_GLOBAL_ALL; - if (curl_global_init(flags)) { + if(curl_global_init(CURL_GLOBAL_ALL)) + { applog(LOG_ERR, "CURL initialization failed"); return 1; } #ifndef WIN32 - if (opt_background) { + if(opt_background) + { i = fork(); - if (i < 0) exit(1); - if (i > 0) exit(0); + if(i < 0) exit(1); + if(i > 0) exit(0); i = setsid(); - if (i < 0) + if(i < 0) applog(LOG_ERR, "setsid() failed (errno = %d)", errno); i = chdir("/"); - if (i < 0) + if(i < 0) applog(LOG_ERR, "chdir() failed (errno = %d)", errno); signal(SIGHUP, signal_handler); signal(SIGTERM, signal_handler); @@ -2313,10 +2821,12 @@ int main(int argc, char *argv[]) signal(SIGINT, signal_handler); #else SetConsoleCtrlHandler((PHANDLER_ROUTINE)ConsoleHandler, TRUE); - if (opt_priority > 0) { + if(opt_priority > 0) + { DWORD prio = NORMAL_PRIORITY_CLASS; - SetPriorityClass(NULL, prio); - switch (opt_priority) { + // SetPriorityClass(NULL, prio); + switch(opt_priority) + { case 1: prio = BELOW_NORMAL_PRIORITY_CLASS; break; @@ -2329,36 +2839,64 @@ int main(int argc, char *argv[]) case 5: prio = REALTIME_PRIORITY_CLASS; } - SetPriorityClass(GetCurrentProcess(), prio); + if(SetPriorityClass(GetCurrentProcess(), prio) == 0) + { + LPSTR messageBuffer = nullptr; + size_t size = FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), 
(LPSTR)&messageBuffer, 0, NULL); + applog(LOG_ERR, "Error while trying to set the priority:"); + applog(LOG_ERR, "%s", messageBuffer); + LocalFree(messageBuffer); + } + prio = GetPriorityClass(GetCurrentProcess()); + switch(prio) + { + case NORMAL_PRIORITY_CLASS: + applog(LOG_INFO, "CPU priority: %s", "normal"); + break; + case BELOW_NORMAL_PRIORITY_CLASS: + applog(LOG_INFO, "CPU priority: %s", "below normal"); + break; + case ABOVE_NORMAL_PRIORITY_CLASS: + applog(LOG_INFO, "CPU priority: %s", "above normal"); + break; + case HIGH_PRIORITY_CLASS: + applog(LOG_INFO, "CPU priority: %s", "high"); + break; + case REALTIME_PRIORITY_CLASS: + applog(LOG_INFO, "CPU priority: %s", "realtime"); + break; + case IDLE_PRIORITY_CLASS: + applog(LOG_INFO, "CPU priority: %s", "idle"); + break; + default: + applog(LOG_INFO, "CPU priority class: %d", prio); + } } #endif - if (opt_affinity != -1) { - if (!opt_quiet) + if(opt_affinity != -1) + { + if(!opt_quiet) applog(LOG_DEBUG, "Binding process to cpu mask %x", opt_affinity); affine_to_cpu_mask(-1, opt_affinity); } - if (active_gpus == 0) { - applog(LOG_ERR, "No CUDA devices found! 
terminating."); - exit(1); - } - if (!opt_n_threads) + if(!opt_n_threads) opt_n_threads = active_gpus; #ifdef HAVE_SYSLOG_H - if (use_syslog) + if(use_syslog) openlog(opt_syslog_pfx, LOG_PID, LOG_USER); #endif work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart)); - if (!work_restart) - return 1; + if(work_restart == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } thr_info = (struct thr_info *)calloc(opt_n_threads + 4, sizeof(*thr)); - if (!thr_info) - return 1; - - thr_hashrates = (double *) calloc(opt_n_threads, sizeof(double)); - if (!thr_hashrates) + if(!thr_info) return 1; /* init workio thread info */ @@ -2366,102 +2904,134 @@ int main(int argc, char *argv[]) thr = &thr_info[work_thr_id]; thr->id = work_thr_id; thr->q = tq_new(); - if (!thr->q) + if(!thr->q) return 1; + for(int i = 0; i < MAX_GPUS; i++) + mining_has_stopped[i] = true; + /* start work I/O thread */ - if (pthread_create(&thr->pth, NULL, workio_thread, thr)) { + if(pthread_create(&thr->pth, NULL, workio_thread, thr)) + { applog(LOG_ERR, "workio thread create failed"); return 1; } - if (want_longpoll && !have_stratum) { + if(want_longpoll && !have_stratum) + { /* init longpoll thread info */ longpoll_thr_id = opt_n_threads + 1; thr = &thr_info[longpoll_thr_id]; thr->id = longpoll_thr_id; thr->q = tq_new(); - if (!thr->q) + if(!thr->q) return 1; /* start longpoll thread */ - if (unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) { + if(unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) + { applog(LOG_ERR, "longpoll thread create failed"); return 1; } } - if (want_stratum) { + if(want_stratum) + { /* init stratum thread info */ stratum_thr_id = opt_n_threads + 2; thr = &thr_info[stratum_thr_id]; thr->id = stratum_thr_id; thr->q = tq_new(); - if (!thr->q) + if(!thr->q) return 1; /* start stratum thread */ - if (unlikely(pthread_create(&thr->pth, NULL, stratum_thread, thr))) { + if(unlikely(pthread_create(&thr->pth, NULL, 
stratum_thread, thr))) + { applog(LOG_ERR, "stratum thread create failed"); return 1; } - if (have_stratum) + if(have_stratum) tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); } #ifdef USE_WRAPNVML -#ifndef WIN32 +#if defined(__linux__) || defined(_WIN64) /* nvml is currently not the best choice on Windows (only in x64) */ hnvml = nvml_create(); - if (hnvml) + if (hnvml) { + bool gpu_reinit = false;// (opt_cudaschedule >= 0); + cuda_devicenames(); // refresh gpu vendor name applog(LOG_INFO, "NVML GPU monitoring enabled."); -#else - if (nvapi_init() == 0) + for(int n = 0; n < active_gpus; n++) + { + if(nvml_set_pstate(hnvml, device_map[n]) == 1) + gpu_reinit = true; + if(nvml_set_plimit(hnvml, device_map[n]) == 1) + gpu_reinit = true; + if(nvml_set_clocks(hnvml, device_map[n]) == 1) + gpu_reinit = true; + if(gpu_reinit) + { +// cuda_reset_device(n, NULL); + } + } + } +#endif +#ifdef WIN32 + if(!hnvml && nvapi_init() == 0) + { applog(LOG_INFO, "NVAPI GPU monitoring enabled."); + cuda_devicenames(); // refresh gpu vendor name + } #endif - else + else if(!hnvml) applog(LOG_INFO, "GPU monitoring is not available."); #endif - if (opt_api_listen) { + if(opt_api_listen) + { /* api thread */ api_thr_id = opt_n_threads + 3; thr = &thr_info[api_thr_id]; thr->id = api_thr_id; thr->q = tq_new(); - if (!thr->q) + if(!thr->q) return 1; /* start stratum thread */ - if (unlikely(pthread_create(&thr->pth, NULL, api_thread, thr))) { + if(unlikely(pthread_create(&thr->pth, NULL, api_thread, thr))) + { applog(LOG_ERR, "api thread create failed"); return 1; } } /* start mining threads */ - for (i = 0; i < opt_n_threads; i++) { + for(i = 0; i < opt_n_threads; i++) + { thr = &thr_info[i]; thr->id = i; thr->gpu.thr_id = i; - thr->gpu.gpu_id = (uint8_t) device_map[i]; - thr->gpu.gpu_arch = (uint16_t) device_sm[device_map[i]]; + thr->gpu.gpu_id = (uint8_t)device_map[i]; + thr->gpu.gpu_arch = (uint16_t)device_sm[device_map[i]]; thr->q = tq_new(); - if (!thr->q) + if(!thr->q) return 1; - 
if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) { + if(unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) + { applog(LOG_ERR, "thread %d create failed", i); return 1; } } applog(LOG_INFO, "%d miner thread%s started, " - "using '%s' algorithm.", - opt_n_threads, opt_n_threads > 1 ? "s":"", - algo_names[opt_algo]); + "using '%s' algorithm.", + opt_n_threads, opt_n_threads > 1 ? "s" : "", + algo_names[opt_algo]); #ifdef WIN32 timeBeginPeriod(1); // enable high timer precision (similar to Google Chrome Trick) @@ -2470,10 +3040,6 @@ int main(int argc, char *argv[]) /* main loop - simply wait for workio thread to exit */ pthread_join(thr_info[work_thr_id].pth, NULL); -#ifdef WIN32 - timeEndPeriod(1); // be nice and forego high timer precision -#endif - applog(LOG_INFO, "workio thread dead, exiting."); proper_exit(0); diff --git a/ccminer.opensdf b/ccminer.opensdf deleted file mode 100644 index 303c7000d4..0000000000 Binary files a/ccminer.opensdf and /dev/null differ diff --git a/ccminer.v12.suo b/ccminer.v12.suo deleted file mode 100644 index 3f464a9307..0000000000 Binary files a/ccminer.v12.suo and /dev/null differ diff --git a/ccminer.vcxproj b/ccminer.vcxproj index ccd4af33b6..f96d83ffb8 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -1,5 +1,5 @@  - + Release @@ -27,7 +27,7 @@ Application false MultiByte - v120 + v140 false true @@ -35,26 +35,27 @@ Application true MultiByte - v120 + v140 false Application false MultiByte - v120 + v140 false + true Application true MultiByte - v120 + v140 false - + @@ -77,9 +78,15 @@ false + false + $(SolutionDir)$(Configuration)\temp\$(PlatformTarget)\ + $(SolutionDir)$(Configuration)\$(PlatformTarget)\ false + $(SolutionDir)$(Configuration)\$(PlatformTarget)\ + $(SolutionDir)$(Configuration)\temp\$(PlatformTarget)\ + .exe @@ -94,7 +101,7 @@ true Console - 
cudart_static.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;pthreadVC2.lib;libcurl.x86.lib;openssl.x86.lib;zlib.x86.lib;ws2_32.lib;Wldap32.lib;nvapi.lib;%(AdditionalDependencies) + normaliz.lib;cudart_static.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;pthreadVC2.lib;libcurl.x86.lib;openssl.x86.lib;zlib.x86.lib;ws2_32.lib;Wldap32.lib;nvapi.lib;%(AdditionalDependencies) libcmt.lib;msvcrt.lib compat\pthreads\x86;compat\curl-for-windows\out\x86\Release\lib;compat\nvapi\x86;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) /NODEFAULTLIB:LIBCMT %(AdditionalOptions) @@ -122,7 +129,7 @@ true Console - cudart_static.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;pthreadVC2.lib;libcurl.x64.lib;openssl.x64.lib;zlib.x64.lib;ws2_32.lib;Wldap32.lib;nvapi64.lib;%(AdditionalDependencies) + normaliz.lib;cudart_static.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;pthreadVC2.lib;libcurl.x64.lib;openssl.x64.lib;zlib.x64.lib;ws2_32.lib;Wldap32.lib;nvapi64.lib;%(AdditionalDependencies) libcmt.lib compat\pthreads\x64;compat\curl-for-windows\out\x64\Release\lib;compat\nvapi\amd64;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) /NODEFAULTLIB:LIBCMTD %(AdditionalOptions) @@ -143,26 +150,33 @@ Level3 - MaxSpeed + Full MultiThreaded Speed - StreamingSIMDExtensions2 false true true - WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - 
.;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + HAVE_STRUCT_TIMESPEC;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\includes;compat\includes-x86;compat\getopt;compat\includes\pthreads;compat\includes\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) true SyncCThrow + true + Precise + false + true + false + false + true + true false true true Console - cudart_static.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.x86.lib;openssl.x86.lib;zlib.x86.lib;ws2_32.lib;Wldap32.lib;nvapi.lib;%(AdditionalDependencies) + normaliz.lib;cudart.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;libcrypto.lib;zlibstat.lib;ws2_32.lib;Wldap32.lib;nvapi.lib;jansson.lib;%(AdditionalDependencies) libcmt.lib - compat\pthreads\x86;compat\curl-for-windows\out\x86\Release\lib;compat\nvapi\x86;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + compat\libs\x86;compat\nvapi\x86;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) /NODEFAULTLIB:LIBCMT %(AdditionalOptions) false false @@ -173,40 +187,55 @@ 80 true true - compute_52,sm_52;compute_50,sm_50 + compute_62,sm_62;compute_60,sm_60;compute_61,sm_61;compute_53,sm_53;compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;compute_37,sm_37; --ptxas-options="-O2" %(AdditionalOptions) O3 + MT + Default + false + $(CudaIntDir)\temp\$(PlatformTarget) + Shared false - O2 + O3 + false Level3 - MaxSpeed + Full MultiThreaded Speed - AdvancedVectorExtensions false true true - 
WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - .;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + HAVE_STRUCT_TIMESPEC;WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\includes;compat\includes-x64;compat\getopt;compat\includes\pthreads;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + true + Precise + false + true + SyncCThrow + true + false + true + true + false + true - false true true Console - kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.x64.lib;openssl.x64.lib;zlib.x64.lib;ws2_32.lib;Wldap32.lib;cudart_static.lib;cuda.lib;nvapi64.lib;%(AdditionalDependencies) + normaliz.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;libcrypto.lib;zlibstat.lib;ws2_32.lib;Wldap32.lib;cudart.lib;nvapi64.lib;jansson.lib;%(AdditionalDependencies) libcmt.lib - compat\pthreads\x64;compat\curl-for-windows\out\x64\Release\lib;compat\nvapi\amd64;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + compat\libs\x64;compat\nvapi\amd64;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) /NODEFAULTLIB:LIBCMT %(AdditionalOptions) false true @@ -215,32 +244,34 @@ false 80 true - false - compute_50,sm_50 + true + compute_60,sm_60;compute_61,sm_61;compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;compute_37,sm_37; + 64 + O3 + MT + Default + false + $(CudaIntDir)\temp\$(PlatformTarget) + 
Shared + -Wno-deprecated-gpu-targets %(AdditionalOptions) false + false + O3 + + CppCode - - - - - - - - - - - + false Full @@ -248,6 +279,14 @@ CppCode + + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_37,sm_37; + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_37,sm_37; + + + + + @@ -259,10 +298,6 @@ - - Full - /Tp %(AdditionalOptions) - @@ -293,6 +328,7 @@ + @@ -310,14 +346,20 @@ - + - + + + + + + + @@ -336,7 +378,7 @@ - + @@ -353,24 +395,15 @@ + + + + true + true + true - - - - - - - - 80 - - - - - - - -Xptxas "-abi=yes" %(AdditionalOptions) -Xptxas "-abi=yes" %(AdditionalOptions) @@ -392,15 +425,22 @@ 92 + + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_37,sm_37; + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_37,sm_37; + + + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_37,sm_37; + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_37,sm_37; + 80 --ptxas-options="-dlcm=cg" %(AdditionalOptions) true - - 128 + 80 80 @@ -433,15 +473,17 @@ - - + + + 64 + + true - 80 @@ -466,10 +508,12 @@ + + @@ -486,7 +530,7 @@ - + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 665a14b16c..a5138399e1 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -40,9 +40,6 @@ {7c2a98c6-064c-4a69-b803-d6f6ff5edd0b} - - {c3222908-22ba-4586-a637-6363f455b06d} - {f3ed23a2-8ce7-41a5-b051-6da56047dc35} @@ -58,38 +55,29 @@ {85dfae6a-66ca-4332-8cec-98ee70cbdf2f} - - {17b56151-79ec-4a32-bac3-9d94ae7f68fe} - {ef6f9983-bda5-4fb2-adfa-ac4f29b74f25} {9762c92c-9677-4044-8292-ff6ba4bfdd89} - + + {03b56ddb-6ebb-40b7-9a62-0a22c8c2865f} + + + {0e14317b-d054-4f9e-8f6f-3bd91b3aa160} + + {2ff6e4ce-7c92-4cb2-a3ad-c331e94fd81d} + + {62428d9b-4cac-44ca-a0c9-4b91f6c249d0} + + + {85b17b96-98a4-4fc5-baa8-c1a6b10f2d99} + - - Source Files\jansson - - - Source Files\jansson - - - Source Files\jansson - - - Source Files\jansson - - - Source Files\jansson - - - Source Files\jansson - Source Files\getopt @@ 
-108,15 +96,6 @@ Source Files - - Source Files - - - Source Files - - - Source Files - Source Files\sph @@ -162,9 +141,6 @@ Source Files\sph - - Source Files - Source Files\sph @@ -186,42 +162,48 @@ Source Files\sph - - Source Files - Source Files Source Files - - Source Files - Source Files Source Files - - Source Files\jansson + + Source Files\sph - - Source Files\jansson + + Source Files\sph - - Source Files\jansson + + Source Files\CUDA\neoscrypt - - Source Files\jansson + + Source Files\CUDA - - Source Files\sph + + Source Files\CUDA - + + Source Files\CUDA + + + Source Files + + Source Files\sph + + Source Files + + + Source Files\CUDA\neoscrypt + @@ -248,15 +230,9 @@ Header Files\compat - - Header Files - Header Files - - Header Files - Header Files\CUDA @@ -299,9 +275,6 @@ Header Files\sph - - Header Files\CUDA - Header Files\CUDA @@ -357,10 +330,40 @@ Header Files\compat - Header Files\lyra2 + Header Files\CUDA\lyra2 - Header Files\lyra2 + Header Files\CUDA\lyra2 + + + Header Files + + + Header Files + + + Header Files + + + Header Files\CUDA + + + Header Files\CUDA\lyra2 + + + Header Files\CUDA\lyra2 + + + Header Files\CUDA\sia + + + Header Files + + + Header Files\CUDA + + + Header Files\CUDA @@ -400,30 +403,6 @@ Source Files\CUDA\quark - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\quark - Source Files\CUDA\quark @@ -460,9 +439,6 @@ Source Files\CUDA\x11 - - Source Files\CUDA\x11 - Source Files\CUDA\x11 @@ -544,12 +520,6 @@ Source Files\CUDA\Algo256 - - Source Files\CUDA - - - Source Files\CUDA - Source Files\CUDA\quark @@ -559,5 +529,47 @@ Source Files\CUDA + + Source Files\CUDA + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Lyra2 + + + Source Files\CUDA\Lyra2 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source 
Files\CUDA\neoscrypt + + + Source Files\CUDA\neoscrypt + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA\Sia + + + Source Files\CUDA\Sia + + + Source Files\CUDA\neoscrypt + \ No newline at end of file diff --git a/compat/curl-for-windows/openssl/config/opensslconf.h b/compat/curl-for-windows/openssl/config/opensslconf.h deleted file mode 100644 index 9bf23692d6..0000000000 --- a/compat/curl-for-windows/openssl/config/opensslconf.h +++ /dev/null @@ -1,333 +0,0 @@ -/* opensslconf.h */ -/* WARNING: Generated automatically from opensslconf.h.in by Configure. */ - -/* OpenSSL was configured with the following options: */ -#undef OPENSSL_SYSNAME_WIN32 -#if defined(_WIN32) -# define OPENSSL_SYSNAME_WIN32 -#endif - -#ifndef OPENSSL_DOING_MAKEDEPEND -# ifndef OPENSSL_NO_CAPIENG -# define OPENSSL_NO_CAPIENG -# endif -# ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 -# define OPENSSL_NO_EC_NISTP_64_GCC_128 -# endif -# ifndef OPENSSL_NO_GMP -# define OPENSSL_NO_GMP -# endif -# ifndef OPENSSL_NO_GOST -# define OPENSSL_NO_GOST -# endif -# ifndef OPENSSL_NO_HW_PADLOCK -# define OPENSSL_NO_HW_PADLOCK -# endif -# ifndef OPENSSL_NO_JPAKE -# define OPENSSL_NO_JPAKE -# endif -# ifndef OPENSSL_NO_KRB5 -# define OPENSSL_NO_KRB5 -# endif -# ifndef OPENSSL_NO_MD2 -# define OPENSSL_NO_MD2 -# endif -# ifndef OPENSSL_NO_RC5 -# define OPENSSL_NO_RC5 -# endif -# ifndef OPENSSL_NO_RFC3779 -# define OPENSSL_NO_RFC3779 -# endif -# ifndef OPENSSL_NO_SCTP -# define OPENSSL_NO_SCTP -# endif -# ifndef OPENSSL_NO_STORE -# define OPENSSL_NO_STORE -# endif -#endif /* OPENSSL_DOING_MAKEDEPEND */ - -#ifndef OPENSSL_THREADS -# define OPENSSL_THREADS -#endif -#ifndef OPENSSL_NO_DYNAMIC_ENGINE -# define OPENSSL_NO_DYNAMIC_ENGINE -#endif - -/* The OPENSSL_NO_* macros are also defined as NO_* if the application - asks for it. This is a transient feature that is provided for those - who haven't had the time to do the appropriate changes in their - applications. 
*/ -#ifdef OPENSSL_ALGORITHM_DEFINES -# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA) -# define NO_CAMELLIA -# endif -# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG) -# define NO_CAPIENG -# endif -# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST) -# define NO_CAST -# endif -# if defined(OPENSSL_NO_CMS) && !defined(NO_CMS) -# define NO_CMS -# endif -# if defined(OPENSSL_NO_FIPS) && !defined(NO_FIPS) -# define NO_FIPS -# endif -# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP) -# define NO_GMP -# endif -# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA) -# define NO_IDEA -# endif -# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE) -# define NO_JPAKE -# endif -# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5) -# define NO_KRB5 -# endif -# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2) -# define NO_MD2 -# endif -# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2) -# define NO_MDC2 -# endif -# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5) -# define NO_RC5 -# endif -# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779) -# define NO_RFC3779 -# endif -# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED) -# define NO_SEED -# endif -# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0) -# define NO_SHA0 -# endif -# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE) -# define NO_STORE -# endif -# if defined(OPENSSL_NO_WHRLPOOL) && !defined(NO_WHRLPOOL) -# define NO_WHRLPOOL -# endif -# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2) -# define NO_MDC2 -# endif -#endif - -/* crypto/opensslconf.h.in */ - -#ifdef OPENSSL_DOING_MAKEDEPEND - /* Include any symbols here that have to be explicitly set to enable a feature - * that should be visible to makedepend. - * - * [Our "make depend" doesn't actually look at this, we use actual build settings - * instead; we want to make it easy to remove subdirectories with disabled algorithms.] - */ -# ifndef OPENSSL_FIPS -# define OPENSSL_FIPS -# endif -#endif - -/* Generate 80386 code? 
*/ -#undef I386_ONLY - -#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */ -# if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR) -# if defined(_WIN32) -# define ENGINESDIR "ssl/lib/engines" -# define OPENSSLDIR "ssl" -# else -# define ENGINESDIR "/usr/local/ssl/lib/engines" -# define OPENSSLDIR "/usr/local/ssl" -# endif -# endif -#endif - -#undef OPENSSL_UNISTD -#define OPENSSL_UNISTD -#if !defined(_WIN32) && !defined(__arm__) && !defined(__mips__) && !defined(SWIG) -# include -#endif - -#undef OPENSSL_EXPORT_VAR_AS_FUNCTION -#if defined(_WIN32) -# define OPENSSL_EXPORT_VAR_AS_FUNCTION -#endif - -#if defined(HEADER_IDEA_H) -# undef IDEA_INT -# define IDEA_INT unsigned int -#endif - -#if defined(HEADER_MD2_H) -# undef MD2_INT -# define MD2_INT unsigned int -#endif - -#if defined(HEADER_RC2_H) -/* I need to put in a mod for the alpha - eay */ -# undef RC2_INT -# define RC2_INT unsigned int -#endif - -#if defined(HEADER_RC4_H) - /* using int types make the structure larger but make the code faster - * on most boxes I have tested - up to %20 faster. */ - /* - * I don't know what does "most" mean, but declaring "int" is a must on: - * - Intel P6 because partial register stalls are very expensive; - * - elder Alpha because it lacks byte load/store instructions; - */ -# undef RC4_INT -# if defined(__arm__) -# define RC4_INT unsigned char -# else -# define RC4_INT unsigned int -# endif - - /* - * This enables code handling data aligned at natural CPU word - * boundary. See crypto/rc4/rc4_enc.c for further details. 
- */ -# undef RC4_CHUNK -# if (defined(_M_X64) || defined(__x86_64__)) && defined(_WIN32) -# define RC4_CHUNK unsigned long long -# elif (defined(_M_X64) || defined(__x86_64__)) && !defined(_WIN32) -# define RC4_CHUNK unsigned long -# elif defined(__arm__) -# define RC4_CHUNK unsigned long -# else - /* On x86 RC4_CHUNK is not defined */ -# endif -#endif - -#if defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H) - /* If this is set to 'unsigned int' on a DEC Alpha, this gives about a - * %20 speed up (longs are 8 bytes, int's are 4). */ -# undef DES_LONG -# if defined(_M_X64) || defined(__x86_64__) || defined(__arm__) || defined(__mips__) -# define DES_LONG unsigned int -# elif defined(_M_IX86) || defined(__i386__) -# define DES_LONG unsigned long -# endif -#endif - -#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H) -# define CONFIG_HEADER_BN_H - -# undef BL_LLONG -# if defined(_M_IX86) || defined(__i386__) || defined(__arm__) -# define BL_LLONG -# endif - - /* Should we define BN_DIV2W here? 
*/ - - /* Only one for the following should be defined */ - /* The prime number generation stuff may not work when - * EIGHT_BIT but I don't care since I've only used this mode - * for debuging the bignum libraries */ -# undef SIXTY_FOUR_BIT_LONG -# undef SIXTY_FOUR_BIT -# undef THIRTY_TWO_BIT -# undef SIXTEEN_BIT -# undef EIGHT_BIT -# if (defined(_M_X64) || defined(__x86_64__)) && defined(_WIN32) -# define SIXTY_FOUR_BIT -# elif (defined(_M_X64) || defined(__x86_64__)) && !defined(_WIN32) -# define SIXTY_FOUR_BIT_LONG -# elif defined(_M_IX86) || defined(__i386__) || defined(__arm__) || defined(__mips__) -# define THIRTY_TWO_BIT -# endif -#endif - -#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H) -# define CONFIG_HEADER_RC4_LOCL_H - /* if this is defined data[i] is used instead of *data, this is a %20 - * speedup on x86 */ -# undef RC4_INDEX -# if defined(_M_IX86) || defined(__i386__) -# define RC4_INDEX -# endif -#endif - -#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H) -# define CONFIG_HEADER_BF_LOCL_H -# undef BF_PTR -# if defined(__arm__) -# define BF_PTR -# endif -#endif /* HEADER_BF_LOCL_H */ - -#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H) -# define CONFIG_HEADER_DES_LOCL_H - -# ifndef DES_DEFAULT_OPTIONS - /* the following is tweaked from a config script, that is why it is a - * protected undef/define */ -# undef DES_PTR -# if !defined(_WIN32) && (defined(_M_IX86) || defined(__i386__)) -# define DES_PTR -# endif - - /* This helps C compiler generate the correct code for multiple functional - * units. It reduces register dependancies at the expense of 2 more - * registers */ -# undef DES_RISC1 -# if !defined(_WIN32) && (defined(_M_IX86) || defined(__i386__)) -# define DES_RISC1 -# endif - -# undef DES_RISC2 - -# if defined(DES_RISC1) && defined(DES_RISC2) -# error YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!! 
-# endif - - /* Unroll the inner loop, this sometimes helps, sometimes hinders. - * Very mucy CPU dependant */ -# undef DES_UNROLL -# if !defined(_WIN32) -# define DES_UNROLL -# endif - - /* These default values were supplied by - * Peter Gutman - * They are only used if nothing else has been defined */ -# if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL) - /* Special defines which change the way the code is built depending on the - CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to find - even newer MIPS CPU's, but at the moment one size fits all for - optimization options. Older Sparc's work better with only UNROLL, but - there's no way to tell at compile time what it is you're running on */ -# if defined( sun ) /* Newer Sparc's */ -# define DES_PTR -# define DES_RISC1 -# define DES_UNROLL -# elif defined( __ultrix ) /* Older MIPS */ -# define DES_PTR -# define DES_RISC2 -# define DES_UNROLL -# elif defined( __osf1__ ) /* Alpha */ -# define DES_PTR -# define DES_RISC2 -# elif defined ( _AIX ) /* RS6000 */ - /* Unknown */ -# elif defined( __hpux ) /* HP-PA */ - /* Unknown */ -# elif defined( __aux ) /* 68K */ - /* Unknown */ -# elif defined( __dgux ) /* 88K (but P6 in latest boxes) */ -# define DES_UNROLL -# elif defined( __sgi ) /* Newer MIPS */ -# define DES_PTR -# define DES_RISC2 -# define DES_UNROLL -# elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */ -# define DES_PTR -# define DES_RISC1 -# define DES_UNROLL -# endif /* Systems-specific speed defines */ -# endif - -# endif /* DES_DEFAULT_OPTIONS */ -#endif /* HEADER_DES_LOCL_H */ diff --git a/compat/curl-for-windows/openssl/openssl/crypto/opensslconf.h b/compat/curl-for-windows/openssl/openssl/crypto/opensslconf.h deleted file mode 100644 index 76c99d433a..0000000000 --- a/compat/curl-for-windows/openssl/openssl/crypto/opensslconf.h +++ /dev/null @@ -1 +0,0 @@ -#include "../../config/opensslconf.h" diff --git 
a/compat/curl-for-windows/openssl/openssl/crypto/sha/sha.h b/compat/curl-for-windows/openssl/openssl/crypto/sha/sha.h deleted file mode 100644 index 8a6bf4bbbb..0000000000 --- a/compat/curl-for-windows/openssl/openssl/crypto/sha/sha.h +++ /dev/null @@ -1,214 +0,0 @@ -/* crypto/sha/sha.h */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -#ifndef HEADER_SHA_H -#define HEADER_SHA_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#if defined(OPENSSL_NO_SHA) || (defined(OPENSSL_NO_SHA0) && defined(OPENSSL_NO_SHA1)) -#error SHA is disabled. -#endif - -#if defined(OPENSSL_FIPS) -#define FIPS_SHA_SIZE_T size_t -#endif - -/* - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - * ! 
SHA_LONG has to be at least 32 bits wide. If it's wider, then ! - * ! SHA_LONG_LOG2 has to be defined along. ! - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - */ - -#if defined(__LP32__) -#define SHA_LONG unsigned long -#elif defined(OPENSSL_SYS_CRAY) || defined(__ILP64__) -#define SHA_LONG unsigned long -#define SHA_LONG_LOG2 3 -#else -#define SHA_LONG unsigned int -#endif - -#define SHA_LBLOCK 16 -#define SHA_CBLOCK (SHA_LBLOCK*4) /* SHA treats input data as a - * contiguous array of 32 bit - * wide big-endian values. */ -#define SHA_LAST_BLOCK (SHA_CBLOCK-8) -#define SHA_DIGEST_LENGTH 20 - -typedef struct SHAstate_st - { - SHA_LONG h0,h1,h2,h3,h4; - SHA_LONG Nl,Nh; - SHA_LONG data[SHA_LBLOCK]; - unsigned int num; - } SHA_CTX; - -#ifndef OPENSSL_NO_SHA0 -#ifdef OPENSSL_FIPS -int private_SHA_Init(SHA_CTX *c); -#endif -int SHA_Init(SHA_CTX *c); -int SHA_Update(SHA_CTX *c, const void *data, size_t len); -int SHA_Final(unsigned char *md, SHA_CTX *c); -unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md); -void SHA_Transform(SHA_CTX *c, const unsigned char *data); -#endif -#ifndef OPENSSL_NO_SHA1 -#ifdef OPENSSL_FIPS -int private_SHA1_Init(SHA_CTX *c); -#endif -int SHA1_Init(SHA_CTX *c); -int SHA1_Update(SHA_CTX *c, const void *data, size_t len); -int SHA1_Final(unsigned char *md, SHA_CTX *c); -unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md); -void SHA1_Transform(SHA_CTX *c, const unsigned char *data); -#endif - -#define SHA256_CBLOCK (SHA_LBLOCK*4) /* SHA-256 treats input data as a - * contiguous array of 32 bit - * wide big-endian values. 
*/ -#define SHA224_DIGEST_LENGTH 28 -#define SHA256_DIGEST_LENGTH 32 - -typedef struct SHA256state_st - { - SHA_LONG h[8]; - SHA_LONG Nl,Nh; - SHA_LONG data[SHA_LBLOCK]; - unsigned int num,md_len; - } SHA256_CTX; - -#ifndef OPENSSL_NO_SHA256 -#ifdef OPENSSL_FIPS -int private_SHA224_Init(SHA256_CTX *c); -int private_SHA256_Init(SHA256_CTX *c); -#endif -int SHA224_Init(SHA256_CTX *c); -int SHA224_Update(SHA256_CTX *c, const void *data, size_t len); -int SHA224_Final(unsigned char *md, SHA256_CTX *c); -unsigned char *SHA224(const unsigned char *d, size_t n,unsigned char *md); -int SHA256_Init(SHA256_CTX *c); -int SHA256_Update(SHA256_CTX *c, const void *data, size_t len); -int SHA256_Final(unsigned char *md, SHA256_CTX *c); -unsigned char *SHA256(const unsigned char *d, size_t n,unsigned char *md); -void SHA256_Transform(SHA256_CTX *c, const unsigned char *data); -#endif - -#define SHA384_DIGEST_LENGTH 48 -#define SHA512_DIGEST_LENGTH 64 - -#ifndef OPENSSL_NO_SHA512 -/* - * Unlike 32-bit digest algorithms, SHA-512 *relies* on SHA_LONG64 - * being exactly 64-bit wide. See Implementation Notes in sha512.c - * for further details. - */ -#define SHA512_CBLOCK (SHA_LBLOCK*8) /* SHA-512 treats input data as a - * contiguous array of 64 bit - * wide big-endian values. 
*/ -#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) -#define SHA_LONG64 unsigned __int64 -#define U64(C) C##UI64 -#elif defined(__arch64__) -#define SHA_LONG64 unsigned long -#define U64(C) C##UL -#else -#define SHA_LONG64 unsigned long long -#define U64(C) C##ULL -#endif - -typedef struct SHA512state_st - { - SHA_LONG64 h[8]; - SHA_LONG64 Nl,Nh; - union { - SHA_LONG64 d[SHA_LBLOCK]; - unsigned char p[SHA512_CBLOCK]; - } u; - unsigned int num,md_len; - } SHA512_CTX; -#endif - -#ifndef OPENSSL_NO_SHA512 -#ifdef OPENSSL_FIPS -int private_SHA384_Init(SHA512_CTX *c); -int private_SHA512_Init(SHA512_CTX *c); -#endif -int SHA384_Init(SHA512_CTX *c); -int SHA384_Update(SHA512_CTX *c, const void *data, size_t len); -int SHA384_Final(unsigned char *md, SHA512_CTX *c); -unsigned char *SHA384(const unsigned char *d, size_t n,unsigned char *md); -int SHA512_Init(SHA512_CTX *c); -int SHA512_Update(SHA512_CTX *c, const void *data, size_t len); -int SHA512_Final(unsigned char *md, SHA512_CTX *c); -unsigned char *SHA512(const unsigned char *d, size_t n,unsigned char *md); -void SHA512_Transform(SHA512_CTX *c, const unsigned char *data); -#endif - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/compat/curl-for-windows/openssl/openssl/e_os2.h b/compat/curl-for-windows/openssl/openssl/e_os2.h deleted file mode 100644 index d22c0368f8..0000000000 --- a/compat/curl-for-windows/openssl/openssl/e_os2.h +++ /dev/null @@ -1,315 +0,0 @@ -/* e_os2.h */ -/* ==================================================================== - * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. 
- * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ - -#include - -#ifndef HEADER_E_OS2_H -#define HEADER_E_OS2_H - -#ifdef __cplusplus -extern "C" { -#endif - -/****************************************************************************** - * Detect operating systems. This probably needs completing. - * The result is that at least one OPENSSL_SYS_os macro should be defined. - * However, if none is defined, Unix is assumed. - **/ - -#define OPENSSL_SYS_UNIX - -/* ----------------------- Macintosh, before MacOS X ----------------------- */ -#if defined(__MWERKS__) && defined(macintosh) || defined(OPENSSL_SYSNAME_MAC) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_MACINTOSH_CLASSIC -#endif - -/* ----------------------- NetWare ----------------------------------------- */ -#if defined(NETWARE) || defined(OPENSSL_SYSNAME_NETWARE) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_NETWARE -#endif - -/* ---------------------- Microsoft operating systems ---------------------- */ - -/* Note that MSDOS actually denotes 32-bit environments running on top of - MS-DOS, such as DJGPP one. */ -#if defined(OPENSSL_SYSNAME_MSDOS) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_MSDOS -#endif - -/* For 32 bit environment, there seems to be the CygWin environment and then - all the others that try to do the same thing Microsoft does... 
*/ -#if defined(OPENSSL_SYSNAME_UWIN) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_WIN32_UWIN -#else -# if defined(__CYGWIN32__) || defined(OPENSSL_SYSNAME_CYGWIN32) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_WIN32_CYGWIN -# else -# if defined(_WIN32) || defined(OPENSSL_SYSNAME_WIN32) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_WIN32 -# endif -# if defined(OPENSSL_SYSNAME_WINNT) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_WINNT -# endif -# if defined(OPENSSL_SYSNAME_WINCE) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_WINCE -# endif -# endif -#endif - -/* Anything that tries to look like Microsoft is "Windows" */ -#if defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_WINNT) || defined(OPENSSL_SYS_WINCE) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_WINDOWS -# ifndef OPENSSL_SYS_MSDOS -# define OPENSSL_SYS_MSDOS -# endif -#endif - -/* DLL settings. This part is a bit tough, because it's up to the application - implementor how he or she will link the application, so it requires some - macro to be used. 
*/ -#ifdef OPENSSL_SYS_WINDOWS -# ifndef OPENSSL_OPT_WINDLL -# if defined(_WINDLL) /* This is used when building OpenSSL to indicate that - DLL linkage should be used */ -# define OPENSSL_OPT_WINDLL -# endif -# endif -#endif - -/* -------------------------------- OpenVMS -------------------------------- */ -#if defined(__VMS) || defined(VMS) || defined(OPENSSL_SYSNAME_VMS) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_VMS -# if defined(__DECC) -# define OPENSSL_SYS_VMS_DECC -# elif defined(__DECCXX) -# define OPENSSL_SYS_VMS_DECC -# define OPENSSL_SYS_VMS_DECCXX -# else -# define OPENSSL_SYS_VMS_NODECC -# endif -#endif - -/* --------------------------------- OS/2 ---------------------------------- */ -#if defined(__EMX__) || defined(__OS2__) -# undef OPENSSL_SYS_UNIX -# define OPENSSL_SYS_OS2 -#endif - -/* --------------------------------- Unix ---------------------------------- */ -#ifdef OPENSSL_SYS_UNIX -# if defined(linux) || defined(__linux__) || defined(OPENSSL_SYSNAME_LINUX) -# define OPENSSL_SYS_LINUX -# endif -# ifdef OPENSSL_SYSNAME_MPE -# define OPENSSL_SYS_MPE -# endif -# ifdef OPENSSL_SYSNAME_SNI -# define OPENSSL_SYS_SNI -# endif -# ifdef OPENSSL_SYSNAME_ULTRASPARC -# define OPENSSL_SYS_ULTRASPARC -# endif -# ifdef OPENSSL_SYSNAME_NEWS4 -# define OPENSSL_SYS_NEWS4 -# endif -# ifdef OPENSSL_SYSNAME_MACOSX -# define OPENSSL_SYS_MACOSX -# endif -# ifdef OPENSSL_SYSNAME_MACOSX_RHAPSODY -# define OPENSSL_SYS_MACOSX_RHAPSODY -# define OPENSSL_SYS_MACOSX -# endif -# ifdef OPENSSL_SYSNAME_SUNOS -# define OPENSSL_SYS_SUNOS -#endif -# if defined(_CRAY) || defined(OPENSSL_SYSNAME_CRAY) -# define OPENSSL_SYS_CRAY -# endif -# if defined(_AIX) || defined(OPENSSL_SYSNAME_AIX) -# define OPENSSL_SYS_AIX -# endif -#endif - -/* --------------------------------- VOS ----------------------------------- */ -#if defined(__VOS__) || defined(OPENSSL_SYSNAME_VOS) -# define OPENSSL_SYS_VOS -#ifdef __HPPA__ -# define OPENSSL_SYS_VOS_HPPA -#endif -#ifdef __IA32__ -# define 
OPENSSL_SYS_VOS_IA32 -#endif -#endif - -/* ------------------------------- VxWorks --------------------------------- */ -#ifdef OPENSSL_SYSNAME_VXWORKS -# define OPENSSL_SYS_VXWORKS -#endif - -/* --------------------------------- BeOS ---------------------------------- */ -#if defined(__BEOS__) -# define OPENSSL_SYS_BEOS -# include -# if defined(BONE_VERSION) -# define OPENSSL_SYS_BEOS_BONE -# else -# define OPENSSL_SYS_BEOS_R5 -# endif -#endif - -/** - * That's it for OS-specific stuff - *****************************************************************************/ - - -/* Specials for I/O an exit */ -#ifdef OPENSSL_SYS_MSDOS -# define OPENSSL_UNISTD_IO -# define OPENSSL_DECLARE_EXIT extern void exit(int); -#else -# define OPENSSL_UNISTD_IO OPENSSL_UNISTD -# define OPENSSL_DECLARE_EXIT /* declared in unistd.h */ -#endif - -/* Definitions of OPENSSL_GLOBAL and OPENSSL_EXTERN, to define and declare - certain global symbols that, with some compilers under VMS, have to be - defined and declared explicitely with globaldef and globalref. - Definitions of OPENSSL_EXPORT and OPENSSL_IMPORT, to define and declare - DLL exports and imports for compilers under Win32. These are a little - more complicated to use. Basically, for any library that exports some - global variables, the following code must be present in the header file - that declares them, before OPENSSL_EXTERN is used: - - #ifdef SOME_BUILD_FLAG_MACRO - # undef OPENSSL_EXTERN - # define OPENSSL_EXTERN OPENSSL_EXPORT - #endif - - The default is to have OPENSSL_EXPORT, OPENSSL_IMPORT and OPENSSL_GLOBAL - have some generally sensible values, and for OPENSSL_EXTERN to have the - value OPENSSL_IMPORT. 
-*/ - -#if defined(OPENSSL_SYS_VMS_NODECC) -# define OPENSSL_EXPORT globalref -# define OPENSSL_IMPORT globalref -# define OPENSSL_GLOBAL globaldef -#elif defined(OPENSSL_SYS_WINDOWS) && defined(OPENSSL_OPT_WINDLL) -# define OPENSSL_EXPORT extern __declspec(dllexport) -# define OPENSSL_IMPORT extern __declspec(dllimport) -# define OPENSSL_GLOBAL -#else -# define OPENSSL_EXPORT extern -# define OPENSSL_IMPORT extern -# define OPENSSL_GLOBAL -#endif -#define OPENSSL_EXTERN OPENSSL_IMPORT - -/* Macros to allow global variables to be reached through function calls when - required (if a shared library version requires it, for example. - The way it's done allows definitions like this: - - // in foobar.c - OPENSSL_IMPLEMENT_GLOBAL(int,foobar,0) - // in foobar.h - OPENSSL_DECLARE_GLOBAL(int,foobar); - #define foobar OPENSSL_GLOBAL_REF(foobar) -*/ -#ifdef OPENSSL_EXPORT_VAR_AS_FUNCTION -# define OPENSSL_IMPLEMENT_GLOBAL(type,name,value) \ - type *_shadow_##name(void) \ - { static type _hide_##name=value; return &_hide_##name; } -# define OPENSSL_DECLARE_GLOBAL(type,name) type *_shadow_##name(void) -# define OPENSSL_GLOBAL_REF(name) (*(_shadow_##name())) -#else -# define OPENSSL_IMPLEMENT_GLOBAL(type,name,value) OPENSSL_GLOBAL type _shadow_##name=value; -# define OPENSSL_DECLARE_GLOBAL(type,name) OPENSSL_EXPORT type _shadow_##name -# define OPENSSL_GLOBAL_REF(name) _shadow_##name -#endif - -#if defined(OPENSSL_SYS_MACINTOSH_CLASSIC) && macintosh==1 && !defined(MAC_OS_GUSI_SOURCE) -# define ossl_ssize_t long -#endif - -#ifdef OPENSSL_SYS_MSDOS -# define ossl_ssize_t long -#endif - -#if defined(NeXT) || defined(OPENSSL_SYS_NEWS4) || defined(OPENSSL_SYS_SUNOS) -# define ssize_t int -#endif - -#if defined(__ultrix) && !defined(ssize_t) -# define ossl_ssize_t int -#endif - -#ifndef ossl_ssize_t -# define ossl_ssize_t ssize_t -#endif - -#ifdef __cplusplus -} -#endif -#endif diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/e_os2.h 
b/compat/curl-for-windows/openssl/openssl/include/openssl/e_os2.h deleted file mode 100644 index ab3f1ee44f..0000000000 --- a/compat/curl-for-windows/openssl/openssl/include/openssl/e_os2.h +++ /dev/null @@ -1 +0,0 @@ -#include "../../e_os2.h" diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/opensslconf.h b/compat/curl-for-windows/openssl/openssl/include/openssl/opensslconf.h deleted file mode 100644 index 221be629b7..0000000000 --- a/compat/curl-for-windows/openssl/openssl/include/openssl/opensslconf.h +++ /dev/null @@ -1 +0,0 @@ -#include "../../crypto/opensslconf.h" diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/sha.h b/compat/curl-for-windows/openssl/openssl/include/openssl/sha.h deleted file mode 100644 index ab9d94c386..0000000000 --- a/compat/curl-for-windows/openssl/openssl/include/openssl/sha.h +++ /dev/null @@ -1 +0,0 @@ -#include "../../crypto/sha/sha.h" diff --git a/compat/curl-for-windows/out/x64/Release/lib/libcurl.x64.lib b/compat/curl-for-windows/out/x64/Release/lib/libcurl.x64.lib deleted file mode 100644 index 1cb88e7bd0..0000000000 Binary files a/compat/curl-for-windows/out/x64/Release/lib/libcurl.x64.lib and /dev/null differ diff --git a/compat/curl-for-windows/out/x64/Release/lib/openssl.x64.lib b/compat/curl-for-windows/out/x64/Release/lib/openssl.x64.lib deleted file mode 100644 index 26ed21b4dd..0000000000 Binary files a/compat/curl-for-windows/out/x64/Release/lib/openssl.x64.lib and /dev/null differ diff --git a/compat/curl-for-windows/out/x64/Release/lib/zlib.x64.lib b/compat/curl-for-windows/out/x64/Release/lib/zlib.x64.lib deleted file mode 100644 index 22947086cb..0000000000 Binary files a/compat/curl-for-windows/out/x64/Release/lib/zlib.x64.lib and /dev/null differ diff --git a/compat/curl-for-windows/out/x86/Release/lib/libcurl.x86.lib b/compat/curl-for-windows/out/x86/Release/lib/libcurl.x86.lib deleted file mode 100644 index e86173f881..0000000000 Binary files 
a/compat/curl-for-windows/out/x86/Release/lib/libcurl.x86.lib and /dev/null differ diff --git a/compat/curl-for-windows/out/x86/Release/lib/openssl.x86.lib b/compat/curl-for-windows/out/x86/Release/lib/openssl.x86.lib deleted file mode 100644 index 1f75262911..0000000000 Binary files a/compat/curl-for-windows/out/x86/Release/lib/openssl.x86.lib and /dev/null differ diff --git a/compat/curl-for-windows/out/x86/Release/lib/zlib.x86.lib b/compat/curl-for-windows/out/x86/Release/lib/zlib.x86.lib deleted file mode 100644 index 10dc5baca1..0000000000 Binary files a/compat/curl-for-windows/out/x86/Release/lib/zlib.x86.lib and /dev/null differ diff --git a/compat/includes-x64/jansson.h b/compat/includes-x64/jansson.h new file mode 100644 index 0000000000..a5927bd630 --- /dev/null +++ b/compat/includes-x64/jansson.h @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2009-2016 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#ifndef JANSSON_H +#define JANSSON_H + +#include +#include /* for size_t */ +#include + +#include "jansson_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* version */ + +#define JANSSON_MAJOR_VERSION 2 +#define JANSSON_MINOR_VERSION 10 +#define JANSSON_MICRO_VERSION 0 + +/* Micro version is omitted if it's 0 */ +#define JANSSON_VERSION "2.10" + +/* Version as a 3-byte hex number, e.g. 0x010201 == 1.2.1. Use this + for numeric comparisons, e.g. #if JANSSON_VERSION_HEX >= ... 
*/ +#define JANSSON_VERSION_HEX ((JANSSON_MAJOR_VERSION << 16) | \ + (JANSSON_MINOR_VERSION << 8) | \ + (JANSSON_MICRO_VERSION << 0)) + + +/* types */ + +typedef enum { + JSON_OBJECT, + JSON_ARRAY, + JSON_STRING, + JSON_INTEGER, + JSON_REAL, + JSON_TRUE, + JSON_FALSE, + JSON_NULL +} json_type; + +typedef struct json_t { + json_type type; + size_t refcount; +} json_t; + +#ifndef JANSSON_USING_CMAKE /* disabled if using cmake */ +#if JSON_INTEGER_IS_LONG_LONG +#ifdef _WIN32 +#define JSON_INTEGER_FORMAT "I64d" +#else +#define JSON_INTEGER_FORMAT "lld" +#endif +typedef long long json_int_t; +#else +#define JSON_INTEGER_FORMAT "ld" +typedef long json_int_t; +#endif /* JSON_INTEGER_IS_LONG_LONG */ +#endif + +#define json_typeof(json) ((json)->type) +#define json_is_object(json) ((json) && json_typeof(json) == JSON_OBJECT) +#define json_is_array(json) ((json) && json_typeof(json) == JSON_ARRAY) +#define json_is_string(json) ((json) && json_typeof(json) == JSON_STRING) +#define json_is_integer(json) ((json) && json_typeof(json) == JSON_INTEGER) +#define json_is_real(json) ((json) && json_typeof(json) == JSON_REAL) +#define json_is_number(json) (json_is_integer(json) || json_is_real(json)) +#define json_is_true(json) ((json) && json_typeof(json) == JSON_TRUE) +#define json_is_false(json) ((json) && json_typeof(json) == JSON_FALSE) +#define json_boolean_value json_is_true +#define json_is_boolean(json) (json_is_true(json) || json_is_false(json)) +#define json_is_null(json) ((json) && json_typeof(json) == JSON_NULL) + +/* construction, destruction, reference counting */ + +json_t *json_object(void); +json_t *json_array(void); +json_t *json_string(const char *value); +json_t *json_stringn(const char *value, size_t len); +json_t *json_string_nocheck(const char *value); +json_t *json_stringn_nocheck(const char *value, size_t len); +json_t *json_integer(json_int_t value); +json_t *json_real(double value); +json_t *json_true(void); +json_t *json_false(void); +#define 
json_boolean(val) ((val) ? json_true() : json_false()) +json_t *json_null(void); + +static JSON_INLINE +json_t *json_incref(json_t *json) +{ + if(json && json->refcount != (size_t)-1) + ++json->refcount; + return json; +} + +/* do not call json_delete directly */ +void json_delete(json_t *json); + +static JSON_INLINE +void json_decref(json_t *json) +{ + if(json && json->refcount != (size_t)-1 && --json->refcount == 0) + json_delete(json); +} + +#if defined(__GNUC__) || defined(__clang__) +static JSON_INLINE +void json_decrefp(json_t **json) +{ + if(json) { + json_decref(*json); + *json = NULL; + } +} + +#define json_auto_t json_t __attribute__((cleanup(json_decrefp))) +#endif + + +/* error reporting */ + +#define JSON_ERROR_TEXT_LENGTH 160 +#define JSON_ERROR_SOURCE_LENGTH 80 + +typedef struct { + int line; + int column; + int position; + char source[JSON_ERROR_SOURCE_LENGTH]; + char text[JSON_ERROR_TEXT_LENGTH]; +} json_error_t; + + +/* getters, setters, manipulation */ + +void json_object_seed(size_t seed); +size_t json_object_size(const json_t *object); +json_t *json_object_get(const json_t *object, const char *key); +int json_object_set_new(json_t *object, const char *key, json_t *value); +int json_object_set_new_nocheck(json_t *object, const char *key, json_t *value); +int json_object_del(json_t *object, const char *key); +int json_object_clear(json_t *object); +int json_object_update(json_t *object, json_t *other); +int json_object_update_existing(json_t *object, json_t *other); +int json_object_update_missing(json_t *object, json_t *other); +void *json_object_iter(json_t *object); +void *json_object_iter_at(json_t *object, const char *key); +void *json_object_key_to_iter(const char *key); +void *json_object_iter_next(json_t *object, void *iter); +const char *json_object_iter_key(void *iter); +json_t *json_object_iter_value(void *iter); +int json_object_iter_set_new(json_t *object, void *iter, json_t *value); + +#define json_object_foreach(object, key, value) 
\ + for(key = json_object_iter_key(json_object_iter(object)); \ + key && (value = json_object_iter_value(json_object_key_to_iter(key))); \ + key = json_object_iter_key(json_object_iter_next(object, json_object_key_to_iter(key)))) + +#define json_object_foreach_safe(object, n, key, value) \ + for(key = json_object_iter_key(json_object_iter(object)), \ + n = json_object_iter_next(object, json_object_key_to_iter(key)); \ + key && (value = json_object_iter_value(json_object_key_to_iter(key))); \ + key = json_object_iter_key(n), \ + n = json_object_iter_next(object, json_object_key_to_iter(key))) + +#define json_array_foreach(array, index, value) \ + for(index = 0; \ + index < json_array_size(array) && (value = json_array_get(array, index)); \ + index++) + +static JSON_INLINE +int json_object_set(json_t *object, const char *key, json_t *value) +{ + return json_object_set_new(object, key, json_incref(value)); +} + +static JSON_INLINE +int json_object_set_nocheck(json_t *object, const char *key, json_t *value) +{ + return json_object_set_new_nocheck(object, key, json_incref(value)); +} + +static JSON_INLINE +int json_object_iter_set(json_t *object, void *iter, json_t *value) +{ + return json_object_iter_set_new(object, iter, json_incref(value)); +} + +size_t json_array_size(const json_t *array); +json_t *json_array_get(const json_t *array, size_t index); +int json_array_set_new(json_t *array, size_t index, json_t *value); +int json_array_append_new(json_t *array, json_t *value); +int json_array_insert_new(json_t *array, size_t index, json_t *value); +int json_array_remove(json_t *array, size_t index); +int json_array_clear(json_t *array); +int json_array_extend(json_t *array, json_t *other); + +static JSON_INLINE +int json_array_set(json_t *array, size_t ind, json_t *value) +{ + return json_array_set_new(array, ind, json_incref(value)); +} + +static JSON_INLINE +int json_array_append(json_t *array, json_t *value) +{ + return json_array_append_new(array, 
json_incref(value)); +} + +static JSON_INLINE +int json_array_insert(json_t *array, size_t ind, json_t *value) +{ + return json_array_insert_new(array, ind, json_incref(value)); +} + +const char *json_string_value(const json_t *string); +size_t json_string_length(const json_t *string); +json_int_t json_integer_value(const json_t *integer); +double json_real_value(const json_t *real); +double json_number_value(const json_t *json); + +int json_string_set(json_t *string, const char *value); +int json_string_setn(json_t *string, const char *value, size_t len); +int json_string_set_nocheck(json_t *string, const char *value); +int json_string_setn_nocheck(json_t *string, const char *value, size_t len); +int json_integer_set(json_t *integer, json_int_t value); +int json_real_set(json_t *real, double value); + +/* pack, unpack */ + +json_t *json_pack(const char *fmt, ...); +json_t *json_pack_ex(json_error_t *error, size_t flags, const char *fmt, ...); +json_t *json_vpack_ex(json_error_t *error, size_t flags, const char *fmt, va_list ap); + +#define JSON_VALIDATE_ONLY 0x1 +#define JSON_STRICT 0x2 + +int json_unpack(json_t *root, const char *fmt, ...); +int json_unpack_ex(json_t *root, json_error_t *error, size_t flags, const char *fmt, ...); +int json_vunpack_ex(json_t *root, json_error_t *error, size_t flags, const char *fmt, va_list ap); + + +/* equality */ + +int json_equal(json_t *value1, json_t *value2); + + +/* copying */ + +json_t *json_copy(json_t *value); +json_t *json_deep_copy(const json_t *value); + + +/* decoding */ + +#define JSON_REJECT_DUPLICATES 0x1 +#define JSON_DISABLE_EOF_CHECK 0x2 +#define JSON_DECODE_ANY 0x4 +#define JSON_DECODE_INT_AS_REAL 0x8 +#define JSON_ALLOW_NUL 0x10 + +typedef size_t (*json_load_callback_t)(void *buffer, size_t buflen, void *data); + +json_t *json_loads(const char *input, size_t flags, json_error_t *error); +json_t *json_loadb(const char *buffer, size_t buflen, size_t flags, json_error_t *error); +json_t *json_loadf(FILE *input, 
size_t flags, json_error_t *error); +json_t *json_loadfd(int input, size_t flags, json_error_t *error); +json_t *json_load_file(const char *path, size_t flags, json_error_t *error); +json_t *json_load_callback(json_load_callback_t callback, void *data, size_t flags, json_error_t *error); + + +/* encoding */ + +#define JSON_MAX_INDENT 0x1F +#define JSON_INDENT(n) ((n) & JSON_MAX_INDENT) +#define JSON_COMPACT 0x20 +#define JSON_ENSURE_ASCII 0x40 +#define JSON_SORT_KEYS 0x80 +#define JSON_PRESERVE_ORDER 0x100 +#define JSON_ENCODE_ANY 0x200 +#define JSON_ESCAPE_SLASH 0x400 +#define JSON_REAL_PRECISION(n) (((n) & 0x1F) << 11) +#define JSON_EMBED 0x10000 + +typedef int (*json_dump_callback_t)(const char *buffer, size_t size, void *data); + +char *json_dumps(const json_t *json, size_t flags); +size_t json_dumpb(const json_t *json, char *buffer, size_t size, size_t flags); +int json_dumpf(const json_t *json, FILE *output, size_t flags); +int json_dumpfd(const json_t *json, int output, size_t flags); +int json_dump_file(const json_t *json, const char *path, size_t flags); +int json_dump_callback(const json_t *json, json_dump_callback_t callback, void *data, size_t flags); + +/* custom memory allocation */ + +typedef void *(*json_malloc_t)(size_t); +typedef void (*json_free_t)(void *); + +void json_set_alloc_funcs(json_malloc_t malloc_fn, json_free_t free_fn); +void json_get_alloc_funcs(json_malloc_t *malloc_fn, json_free_t *free_fn); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compat/includes-x64/jansson_config.h b/compat/includes-x64/jansson_config.h new file mode 100644 index 0000000000..35eee9381d --- /dev/null +++ b/compat/includes-x64/jansson_config.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2010-2016 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. 
+ * + * + * This file specifies a part of the site-specific configuration for + * Jansson, namely those things that affect the public API in + * jansson.h. + * + * The CMake system will generate the jansson_config.h file and + * copy it to the build and install directories. + */ + +#ifndef JANSSON_CONFIG_H +#define JANSSON_CONFIG_H + +/* Define this so that we can disable scattered automake configuration in source files */ +#ifndef JANSSON_USING_CMAKE +#define JANSSON_USING_CMAKE +#endif + +/* Note: when using cmake, JSON_INTEGER_IS_LONG_LONG is not defined nor used, + * as we will also check for __int64 etc types. + * (the definition was used in the automake system) */ + +/* Bring in the cmake-detected defines */ +#define HAVE_STDINT_H 1 +/* #undef HAVE_INTTYPES_H */ +/* #undef HAVE_SYS_TYPES_H */ + +/* Include our standard type header for the integer typedef */ + +#if defined(HAVE_STDINT_H) +# include +#elif defined(HAVE_INTTYPES_H) +# include +#elif defined(HAVE_SYS_TYPES_H) +# include +#endif + + +/* If your compiler supports the inline keyword in C, JSON_INLINE is + defined to `inline', otherwise empty. In C++, the inline is always + supported. */ +#ifdef __cplusplus +#define JSON_INLINE inline +#else +#define JSON_INLINE inline +#endif + + +#define json_int_t int64_t +#define json_strtoint strtoll +#define JSON_INTEGER_FORMAT "I64d" + + +/* If locale.h and localeconv() are available, define to 1, otherwise to 0. */ +#define JSON_HAVE_LOCALECONV 1 + + +/* Maximum recursion depth for parsing JSON input. + This limits the depth of e.g. array-within-array constructions. */ +#define JSON_PARSER_MAX_DEPTH 2048 + + +#endif diff --git a/compat/includes-x64/openssl/e_os2.h b/compat/includes-x64/openssl/e_os2.h new file mode 100644 index 0000000000..99ea3477d7 --- /dev/null +++ b/compat/includes-x64/openssl/e_os2.h @@ -0,0 +1,311 @@ +/* + * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). 
You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifndef HEADER_E_OS2_H +# define HEADER_E_OS2_H + +# include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * Detect operating systems. This probably needs completing. + * The result is that at least one OPENSSL_SYS_os macro should be defined. + * However, if none is defined, Unix is assumed. + **/ + +# define OPENSSL_SYS_UNIX + +/* --------------------- Microsoft operating systems ---------------------- */ + +/* + * Note that MSDOS actually denotes 32-bit environments running on top of + * MS-DOS, such as DJGPP one. + */ +# if defined(OPENSSL_SYS_MSDOS) +# undef OPENSSL_SYS_UNIX +# endif + +/* + * For 32 bit environment, there seems to be the CygWin environment and then + * all the others that try to do the same thing Microsoft does... + */ +/* + * UEFI lives here because it might be built with a Microsoft toolchain and + * we need to avoid the false positive match on Windows. 
+ */ +# if defined(OPENSSL_SYS_UEFI) +# undef OPENSSL_SYS_UNIX +# elif defined(OPENSSL_SYS_UWIN) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WIN32_UWIN +# else +# if defined(__CYGWIN__) || defined(OPENSSL_SYS_CYGWIN) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WIN32_CYGWIN +# else +# if defined(_WIN32) || defined(OPENSSL_SYS_WIN32) +# undef OPENSSL_SYS_UNIX +# if !defined(OPENSSL_SYS_WIN32) +# define OPENSSL_SYS_WIN32 +# endif +# endif +# if defined(_WIN64) || defined(OPENSSL_SYS_WIN64) +# undef OPENSSL_SYS_UNIX +# if !defined(OPENSSL_SYS_WIN64) +# define OPENSSL_SYS_WIN64 +# endif +# endif +# if defined(OPENSSL_SYS_WINNT) +# undef OPENSSL_SYS_UNIX +# endif +# if defined(OPENSSL_SYS_WINCE) +# undef OPENSSL_SYS_UNIX +# endif +# endif +# endif + +/* Anything that tries to look like Microsoft is "Windows" */ +# if defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_WIN64) || defined(OPENSSL_SYS_WINNT) || defined(OPENSSL_SYS_WINCE) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WINDOWS +# ifndef OPENSSL_SYS_MSDOS +# define OPENSSL_SYS_MSDOS +# endif +# endif + +/* + * DLL settings. This part is a bit tough, because it's up to the + * application implementor how he or she will link the application, so it + * requires some macro to be used. 
+ */ +# ifdef OPENSSL_SYS_WINDOWS +# ifndef OPENSSL_OPT_WINDLL +# if defined(_WINDLL) /* This is used when building OpenSSL to + * indicate that DLL linkage should be used */ +# define OPENSSL_OPT_WINDLL +# endif +# endif +# endif + +/* ------------------------------- OpenVMS -------------------------------- */ +# if defined(__VMS) || defined(VMS) || defined(OPENSSL_SYS_VMS) +# if !defined(OPENSSL_SYS_VMS) +# undef OPENSSL_SYS_UNIX +# endif +# define OPENSSL_SYS_VMS +# if defined(__DECC) +# define OPENSSL_SYS_VMS_DECC +# elif defined(__DECCXX) +# define OPENSSL_SYS_VMS_DECC +# define OPENSSL_SYS_VMS_DECCXX +# else +# define OPENSSL_SYS_VMS_NODECC +# endif +# endif + +/* -------------------------------- Unix ---------------------------------- */ +# ifdef OPENSSL_SYS_UNIX +# if defined(linux) || defined(__linux__) && !defined(OPENSSL_SYS_LINUX) +# define OPENSSL_SYS_LINUX +# endif +# if defined(_AIX) && !defined(OPENSSL_SYS_AIX) +# define OPENSSL_SYS_AIX +# endif +# endif + +/* -------------------------------- VOS ----------------------------------- */ +# if defined(__VOS__) && !defined(OPENSSL_SYS_VOS) +# define OPENSSL_SYS_VOS +# ifdef __HPPA__ +# define OPENSSL_SYS_VOS_HPPA +# endif +# ifdef __IA32__ +# define OPENSSL_SYS_VOS_IA32 +# endif +# endif + +/** + * That's it for OS-specific stuff + *****************************************************************************/ + +/* Specials for I/O an exit */ +# ifdef OPENSSL_SYS_MSDOS +# define OPENSSL_UNISTD_IO +# define OPENSSL_DECLARE_EXIT extern void exit(int); +# else +# define OPENSSL_UNISTD_IO OPENSSL_UNISTD +# define OPENSSL_DECLARE_EXIT /* declared in unistd.h */ +# endif + +/*- + * Definitions of OPENSSL_GLOBAL and OPENSSL_EXTERN, to define and declare + * certain global symbols that, with some compilers under VMS, have to be + * defined and declared explicitly with globaldef and globalref. 
+ * Definitions of OPENSSL_EXPORT and OPENSSL_IMPORT, to define and declare + * DLL exports and imports for compilers under Win32. These are a little + * more complicated to use. Basically, for any library that exports some + * global variables, the following code must be present in the header file + * that declares them, before OPENSSL_EXTERN is used: + * + * #ifdef SOME_BUILD_FLAG_MACRO + * # undef OPENSSL_EXTERN + * # define OPENSSL_EXTERN OPENSSL_EXPORT + * #endif + * + * The default is to have OPENSSL_EXPORT, OPENSSL_EXTERN and OPENSSL_GLOBAL + * have some generally sensible values. + */ + +# if defined(OPENSSL_SYS_VMS_NODECC) +# define OPENSSL_EXPORT globalref +# define OPENSSL_EXTERN globalref +# define OPENSSL_GLOBAL globaldef +# elif defined(OPENSSL_SYS_WINDOWS) && defined(OPENSSL_OPT_WINDLL) +# define OPENSSL_EXPORT extern __declspec(dllexport) +# define OPENSSL_EXTERN extern __declspec(dllimport) +# define OPENSSL_GLOBAL +# else +# define OPENSSL_EXPORT extern +# define OPENSSL_EXTERN extern +# define OPENSSL_GLOBAL +# endif + +/*- + * Macros to allow global variables to be reached through function calls when + * required (if a shared library version requires it, for example. 
+ * The way it's done allows definitions like this: + * + * // in foobar.c + * OPENSSL_IMPLEMENT_GLOBAL(int,foobar,0) + * // in foobar.h + * OPENSSL_DECLARE_GLOBAL(int,foobar); + * #define foobar OPENSSL_GLOBAL_REF(foobar) + */ +# ifdef OPENSSL_EXPORT_VAR_AS_FUNCTION +# define OPENSSL_IMPLEMENT_GLOBAL(type,name,value) \ + type *_shadow_##name(void) \ + { static type _hide_##name=value; return &_hide_##name; } +# define OPENSSL_DECLARE_GLOBAL(type,name) type *_shadow_##name(void) +# define OPENSSL_GLOBAL_REF(name) (*(_shadow_##name())) +# else +# define OPENSSL_IMPLEMENT_GLOBAL(type,name,value) OPENSSL_GLOBAL type _shadow_##name=value; +# define OPENSSL_DECLARE_GLOBAL(type,name) OPENSSL_EXPORT type _shadow_##name +# define OPENSSL_GLOBAL_REF(name) _shadow_##name +# endif + +# ifdef _WIN32 +# ifdef _WIN64 +# define ossl_ssize_t __int64 +# define OSSL_SSIZE_MAX _I64_MAX +# else +# define ossl_ssize_t int +# define OSSL_SSIZE_MAX INT_MAX +# endif +# endif + +# if defined(OPENSSL_SYS_UEFI) && !defined(ssize_t) +# define ossl_ssize_t int +# define OSSL_SSIZE_MAX INT_MAX +# endif + +# ifndef ossl_ssize_t +# define ossl_ssize_t ssize_t +# if defined(SSIZE_MAX) +# define OSSL_SSIZE_MAX SSIZE_MAX +# elif defined(_POSIX_SSIZE_MAX) +# define OSSL_SSIZE_MAX _POSIX_SSIZE_MAX +# endif +# endif + +# ifdef DEBUG_UNUSED +# define __owur __attribute__((__warn_unused_result__)) +# else +# define __owur +# endif + +/* Standard integer types */ +# if defined(OPENSSL_SYS_UEFI) +typedef INT8 int8_t; +typedef UINT8 uint8_t; +typedef INT16 int16_t; +typedef UINT16 uint16_t; +typedef INT32 int32_t; +typedef UINT32 uint32_t; +typedef INT64 int64_t; +typedef UINT64 uint64_t; +# define PRIu64 "%Lu" +# elif (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + defined(__osf__) || defined(__sgi) || defined(__hpux) || \ + defined(OPENSSL_SYS_VMS) || defined (__OpenBSD__) +# include +# elif defined(_MSC_VER) && _MSC_VER<=1500 +/* + * minimally required typdefs for systems not supporting 
inttypes.h or + * stdint.h: currently just older VC++ + */ +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef short int16_t; +typedef unsigned short uint16_t; +typedef int int32_t; +typedef unsigned int uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +# else +# include +# endif + +/* + * We need a format operator for some client tools for uint64_t. If inttypes.h + * isn't available or did not define it, just go with hard-coded. + */ +# ifndef PRIu64 +# ifdef SIXTY_FOUR_BIT_LONG +# define PRIu64 "lu" +# else +# define PRIu64 "llu" +# endif +# endif + +/* ossl_inline: portable inline definition usable in public headers */ +# if !defined(inline) && !defined(__cplusplus) +# if defined(__STDC_VERSION__) && __STDC_VERSION__>=199901L + /* just use inline */ +# define ossl_inline inline +# elif defined(__GNUC__) && __GNUC__>=2 +# define ossl_inline __inline__ +# elif defined(_MSC_VER) + /* + * Visual Studio: inline is available in C++ only, however + * __inline is available for C, see + * http://msdn.microsoft.com/en-us/library/z8y1yy88.aspx + */ +# define ossl_inline __inline +# else +# define ossl_inline +# endif +# else +# define ossl_inline inline +# endif + +# if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +# define ossl_noreturn _Noreturn +# elif defined(__GNUC__) && __GNUC__ >= 2 +# define ossl_noreturn __attribute__((noreturn)) +# else +# define ossl_noreturn +# endif + +#ifdef __cplusplus +} +#endif +#endif diff --git a/compat/includes-x64/openssl/opensslconf.h b/compat/includes-x64/openssl/opensslconf.h new file mode 100644 index 0000000000..8c82f7d9d5 --- /dev/null +++ b/compat/includes-x64/openssl/opensslconf.h @@ -0,0 +1,172 @@ +/* + * WARNING: do not edit! + * Generated by makefile from include\openssl\opensslconf.h.in + * + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). 
You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef OPENSSL_ALGORITHM_DEFINES +# error OPENSSL_ALGORITHM_DEFINES no longer supported +#endif + +/* + * OpenSSL was configured with the following options: + */ + +#ifndef OPENSSL_SYS_WIN64A +# define OPENSSL_SYS_WIN64A 1 +#endif +#ifndef OPENSSL_NO_MD2 +# define OPENSSL_NO_MD2 +#endif +#ifndef OPENSSL_NO_RC5 +# define OPENSSL_NO_RC5 +#endif +#ifndef OPENSSL_THREADS +# define OPENSSL_THREADS +#endif +#ifndef OPENSSL_NO_ASAN +# define OPENSSL_NO_ASAN +#endif +#ifndef OPENSSL_NO_CRYPTO_MDEBUG +# define OPENSSL_NO_CRYPTO_MDEBUG +#endif +#ifndef OPENSSL_NO_CRYPTO_MDEBUG_BACKTRACE +# define OPENSSL_NO_CRYPTO_MDEBUG_BACKTRACE +#endif +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 +# define OPENSSL_NO_EC_NISTP_64_GCC_128 +#endif +#ifndef OPENSSL_NO_EGD +# define OPENSSL_NO_EGD +#endif +#ifndef OPENSSL_NO_FUZZ_AFL +# define OPENSSL_NO_FUZZ_AFL +#endif +#ifndef OPENSSL_NO_FUZZ_LIBFUZZER +# define OPENSSL_NO_FUZZ_LIBFUZZER +#endif +#ifndef OPENSSL_NO_HEARTBEATS +# define OPENSSL_NO_HEARTBEATS +#endif +#ifndef OPENSSL_NO_MSAN +# define OPENSSL_NO_MSAN +#endif +#ifndef OPENSSL_NO_SCTP +# define OPENSSL_NO_SCTP +#endif +#ifndef OPENSSL_NO_SSL_TRACE +# define OPENSSL_NO_SSL_TRACE +#endif +#ifndef OPENSSL_NO_SSL3 +# define OPENSSL_NO_SSL3 +#endif +#ifndef OPENSSL_NO_SSL3_METHOD +# define OPENSSL_NO_SSL3_METHOD +#endif +#ifndef OPENSSL_NO_UBSAN +# define OPENSSL_NO_UBSAN +#endif +#ifndef OPENSSL_NO_UNIT_TEST +# define OPENSSL_NO_UNIT_TEST +#endif +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS +# define OPENSSL_NO_WEAK_SSL_CIPHERS +#endif +#ifndef OPENSSL_NO_AFALGENG +# define OPENSSL_NO_AFALGENG +#endif + + +/* + * Sometimes OPENSSSL_NO_xxx ends up with an empty file and some compilers + * don't like that. This will hopefully silence them. 
+ */ +#define NON_EMPTY_TRANSLATION_UNIT static void *dummy = &dummy; + +/* + * Applications should use -DOPENSSL_API_COMPAT= to suppress the + * declarations of functions deprecated in or before . Otherwise, they + * still won't see them if the library has been built to disable deprecated + * functions. + */ +#if defined(OPENSSL_NO_DEPRECATED) +# define DECLARE_DEPRECATED(f) +#elif __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 0) +# define DECLARE_DEPRECATED(f) f __attribute__ ((deprecated)); +#else +# define DECLARE_DEPRECATED(f) f; +#endif + +#ifndef OPENSSL_FILE +# ifdef OPENSSL_NO_FILENAMES +# define OPENSSL_FILE "" +# define OPENSSL_LINE 0 +# else +# define OPENSSL_FILE __FILE__ +# define OPENSSL_LINE __LINE__ +# endif +#endif + +#ifndef OPENSSL_MIN_API +# define OPENSSL_MIN_API 0 +#endif + +#if !defined(OPENSSL_API_COMPAT) || OPENSSL_API_COMPAT < OPENSSL_MIN_API +# undef OPENSSL_API_COMPAT +# define OPENSSL_API_COMPAT OPENSSL_MIN_API +#endif + +#if OPENSSL_API_COMPAT < 0x10100000L +# define DEPRECATEDIN_1_1_0(f) DECLARE_DEPRECATED(f) +#else +# define DEPRECATEDIN_1_1_0(f) +#endif + +#if OPENSSL_API_COMPAT < 0x10000000L +# define DEPRECATEDIN_1_0_0(f) DECLARE_DEPRECATED(f) +#else +# define DEPRECATEDIN_1_0_0(f) +#endif + +#if OPENSSL_API_COMPAT < 0x00908000L +# define DEPRECATEDIN_0_9_8(f) DECLARE_DEPRECATED(f) +#else +# define DEPRECATEDIN_0_9_8(f) +#endif + +#define OPENSSL_CPUID_OBJ + +/* Generate 80386 code? */ +#undef I386_ONLY + +#undef OPENSSL_UNISTD +#define OPENSSL_UNISTD + +#define OPENSSL_EXPORT_VAR_AS_FUNCTION + +/* + * The following are cipher-specific, but are part of the public API. 
+ */ +#if !defined(OPENSSL_SYS_UEFI) +# undef BN_LLONG +/* Only one for the following should be defined */ +# undef SIXTY_FOUR_BIT_LONG +# define SIXTY_FOUR_BIT +# undef THIRTY_TWO_BIT +#endif + +#define RC4_INT unsigned int + +#ifdef __cplusplus +} +#endif diff --git a/compat/includes-x64/openssl/sha.h b/compat/includes-x64/openssl/sha.h new file mode 100644 index 0000000000..6a1eb0de8b --- /dev/null +++ b/compat/includes-x64/openssl/sha.h @@ -0,0 +1,119 @@ +/* + * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifndef HEADER_SHA_H +# define HEADER_SHA_H + +# include +# include + +#ifdef __cplusplus +extern "C" { +#endif + +/*- + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + * ! SHA_LONG has to be at least 32 bits wide. ! + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + */ +# define SHA_LONG unsigned int + +# define SHA_LBLOCK 16 +# define SHA_CBLOCK (SHA_LBLOCK*4)/* SHA treats input data as a + * contiguous array of 32 bit wide + * big-endian values. */ +# define SHA_LAST_BLOCK (SHA_CBLOCK-8) +# define SHA_DIGEST_LENGTH 20 + +typedef struct SHAstate_st { + SHA_LONG h0, h1, h2, h3, h4; + SHA_LONG Nl, Nh; + SHA_LONG data[SHA_LBLOCK]; + unsigned int num; +} SHA_CTX; + +int SHA1_Init(SHA_CTX *c); +int SHA1_Update(SHA_CTX *c, const void *data, size_t len); +int SHA1_Final(unsigned char *md, SHA_CTX *c); +unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md); +void SHA1_Transform(SHA_CTX *c, const unsigned char *data); + +# define SHA256_CBLOCK (SHA_LBLOCK*4)/* SHA-256 treats input data as a + * contiguous array of 32 bit wide + * big-endian values. 
*/ + +typedef struct SHA256state_st { + SHA_LONG h[8]; + SHA_LONG Nl, Nh; + SHA_LONG data[SHA_LBLOCK]; + unsigned int num, md_len; +} SHA256_CTX; + +int SHA224_Init(SHA256_CTX *c); +int SHA224_Update(SHA256_CTX *c, const void *data, size_t len); +int SHA224_Final(unsigned char *md, SHA256_CTX *c); +unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md); +int SHA256_Init(SHA256_CTX *c); +int SHA256_Update(SHA256_CTX *c, const void *data, size_t len); +int SHA256_Final(unsigned char *md, SHA256_CTX *c); +unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md); +void SHA256_Transform(SHA256_CTX *c, const unsigned char *data); + +# define SHA224_DIGEST_LENGTH 28 +# define SHA256_DIGEST_LENGTH 32 +# define SHA384_DIGEST_LENGTH 48 +# define SHA512_DIGEST_LENGTH 64 + +/* + * Unlike 32-bit digest algorithms, SHA-512 *relies* on SHA_LONG64 + * being exactly 64-bit wide. See Implementation Notes in sha512.c + * for further details. + */ +/* + * SHA-512 treats input data as a + * contiguous array of 64 bit + * wide big-endian values. 
+ */ +# define SHA512_CBLOCK (SHA_LBLOCK*8) +# if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) +# define SHA_LONG64 unsigned __int64 +# define U64(C) C##UI64 +# elif defined(__arch64__) +# define SHA_LONG64 unsigned long +# define U64(C) C##UL +# else +# define SHA_LONG64 unsigned long long +# define U64(C) C##ULL +# endif + +typedef struct SHA512state_st { + SHA_LONG64 h[8]; + SHA_LONG64 Nl, Nh; + union { + SHA_LONG64 d[SHA_LBLOCK]; + unsigned char p[SHA512_CBLOCK]; + } u; + unsigned int num, md_len; +} SHA512_CTX; + +int SHA384_Init(SHA512_CTX *c); +int SHA384_Update(SHA512_CTX *c, const void *data, size_t len); +int SHA384_Final(unsigned char *md, SHA512_CTX *c); +unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md); +int SHA512_Init(SHA512_CTX *c); +int SHA512_Update(SHA512_CTX *c, const void *data, size_t len); +int SHA512_Final(unsigned char *md, SHA512_CTX *c); +unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md); +void SHA512_Transform(SHA512_CTX *c, const unsigned char *data); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compat/includes-x86/jansson.h b/compat/includes-x86/jansson.h new file mode 100644 index 0000000000..a5927bd630 --- /dev/null +++ b/compat/includes-x86/jansson.h @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2009-2016 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#ifndef JANSSON_H +#define JANSSON_H + +#include +#include /* for size_t */ +#include + +#include "jansson_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* version */ + +#define JANSSON_MAJOR_VERSION 2 +#define JANSSON_MINOR_VERSION 10 +#define JANSSON_MICRO_VERSION 0 + +/* Micro version is omitted if it's 0 */ +#define JANSSON_VERSION "2.10" + +/* Version as a 3-byte hex number, e.g. 0x010201 == 1.2.1. Use this + for numeric comparisons, e.g. #if JANSSON_VERSION_HEX >= ... 
*/ +#define JANSSON_VERSION_HEX ((JANSSON_MAJOR_VERSION << 16) | \ + (JANSSON_MINOR_VERSION << 8) | \ + (JANSSON_MICRO_VERSION << 0)) + + +/* types */ + +typedef enum { + JSON_OBJECT, + JSON_ARRAY, + JSON_STRING, + JSON_INTEGER, + JSON_REAL, + JSON_TRUE, + JSON_FALSE, + JSON_NULL +} json_type; + +typedef struct json_t { + json_type type; + size_t refcount; +} json_t; + +#ifndef JANSSON_USING_CMAKE /* disabled if using cmake */ +#if JSON_INTEGER_IS_LONG_LONG +#ifdef _WIN32 +#define JSON_INTEGER_FORMAT "I64d" +#else +#define JSON_INTEGER_FORMAT "lld" +#endif +typedef long long json_int_t; +#else +#define JSON_INTEGER_FORMAT "ld" +typedef long json_int_t; +#endif /* JSON_INTEGER_IS_LONG_LONG */ +#endif + +#define json_typeof(json) ((json)->type) +#define json_is_object(json) ((json) && json_typeof(json) == JSON_OBJECT) +#define json_is_array(json) ((json) && json_typeof(json) == JSON_ARRAY) +#define json_is_string(json) ((json) && json_typeof(json) == JSON_STRING) +#define json_is_integer(json) ((json) && json_typeof(json) == JSON_INTEGER) +#define json_is_real(json) ((json) && json_typeof(json) == JSON_REAL) +#define json_is_number(json) (json_is_integer(json) || json_is_real(json)) +#define json_is_true(json) ((json) && json_typeof(json) == JSON_TRUE) +#define json_is_false(json) ((json) && json_typeof(json) == JSON_FALSE) +#define json_boolean_value json_is_true +#define json_is_boolean(json) (json_is_true(json) || json_is_false(json)) +#define json_is_null(json) ((json) && json_typeof(json) == JSON_NULL) + +/* construction, destruction, reference counting */ + +json_t *json_object(void); +json_t *json_array(void); +json_t *json_string(const char *value); +json_t *json_stringn(const char *value, size_t len); +json_t *json_string_nocheck(const char *value); +json_t *json_stringn_nocheck(const char *value, size_t len); +json_t *json_integer(json_int_t value); +json_t *json_real(double value); +json_t *json_true(void); +json_t *json_false(void); +#define 
json_boolean(val) ((val) ? json_true() : json_false()) +json_t *json_null(void); + +static JSON_INLINE +json_t *json_incref(json_t *json) +{ + if(json && json->refcount != (size_t)-1) + ++json->refcount; + return json; +} + +/* do not call json_delete directly */ +void json_delete(json_t *json); + +static JSON_INLINE +void json_decref(json_t *json) +{ + if(json && json->refcount != (size_t)-1 && --json->refcount == 0) + json_delete(json); +} + +#if defined(__GNUC__) || defined(__clang__) +static JSON_INLINE +void json_decrefp(json_t **json) +{ + if(json) { + json_decref(*json); + *json = NULL; + } +} + +#define json_auto_t json_t __attribute__((cleanup(json_decrefp))) +#endif + + +/* error reporting */ + +#define JSON_ERROR_TEXT_LENGTH 160 +#define JSON_ERROR_SOURCE_LENGTH 80 + +typedef struct { + int line; + int column; + int position; + char source[JSON_ERROR_SOURCE_LENGTH]; + char text[JSON_ERROR_TEXT_LENGTH]; +} json_error_t; + + +/* getters, setters, manipulation */ + +void json_object_seed(size_t seed); +size_t json_object_size(const json_t *object); +json_t *json_object_get(const json_t *object, const char *key); +int json_object_set_new(json_t *object, const char *key, json_t *value); +int json_object_set_new_nocheck(json_t *object, const char *key, json_t *value); +int json_object_del(json_t *object, const char *key); +int json_object_clear(json_t *object); +int json_object_update(json_t *object, json_t *other); +int json_object_update_existing(json_t *object, json_t *other); +int json_object_update_missing(json_t *object, json_t *other); +void *json_object_iter(json_t *object); +void *json_object_iter_at(json_t *object, const char *key); +void *json_object_key_to_iter(const char *key); +void *json_object_iter_next(json_t *object, void *iter); +const char *json_object_iter_key(void *iter); +json_t *json_object_iter_value(void *iter); +int json_object_iter_set_new(json_t *object, void *iter, json_t *value); + +#define json_object_foreach(object, key, value) 
\ + for(key = json_object_iter_key(json_object_iter(object)); \ + key && (value = json_object_iter_value(json_object_key_to_iter(key))); \ + key = json_object_iter_key(json_object_iter_next(object, json_object_key_to_iter(key)))) + +#define json_object_foreach_safe(object, n, key, value) \ + for(key = json_object_iter_key(json_object_iter(object)), \ + n = json_object_iter_next(object, json_object_key_to_iter(key)); \ + key && (value = json_object_iter_value(json_object_key_to_iter(key))); \ + key = json_object_iter_key(n), \ + n = json_object_iter_next(object, json_object_key_to_iter(key))) + +#define json_array_foreach(array, index, value) \ + for(index = 0; \ + index < json_array_size(array) && (value = json_array_get(array, index)); \ + index++) + +static JSON_INLINE +int json_object_set(json_t *object, const char *key, json_t *value) +{ + return json_object_set_new(object, key, json_incref(value)); +} + +static JSON_INLINE +int json_object_set_nocheck(json_t *object, const char *key, json_t *value) +{ + return json_object_set_new_nocheck(object, key, json_incref(value)); +} + +static JSON_INLINE +int json_object_iter_set(json_t *object, void *iter, json_t *value) +{ + return json_object_iter_set_new(object, iter, json_incref(value)); +} + +size_t json_array_size(const json_t *array); +json_t *json_array_get(const json_t *array, size_t index); +int json_array_set_new(json_t *array, size_t index, json_t *value); +int json_array_append_new(json_t *array, json_t *value); +int json_array_insert_new(json_t *array, size_t index, json_t *value); +int json_array_remove(json_t *array, size_t index); +int json_array_clear(json_t *array); +int json_array_extend(json_t *array, json_t *other); + +static JSON_INLINE +int json_array_set(json_t *array, size_t ind, json_t *value) +{ + return json_array_set_new(array, ind, json_incref(value)); +} + +static JSON_INLINE +int json_array_append(json_t *array, json_t *value) +{ + return json_array_append_new(array, 
json_incref(value)); +} + +static JSON_INLINE +int json_array_insert(json_t *array, size_t ind, json_t *value) +{ + return json_array_insert_new(array, ind, json_incref(value)); +} + +const char *json_string_value(const json_t *string); +size_t json_string_length(const json_t *string); +json_int_t json_integer_value(const json_t *integer); +double json_real_value(const json_t *real); +double json_number_value(const json_t *json); + +int json_string_set(json_t *string, const char *value); +int json_string_setn(json_t *string, const char *value, size_t len); +int json_string_set_nocheck(json_t *string, const char *value); +int json_string_setn_nocheck(json_t *string, const char *value, size_t len); +int json_integer_set(json_t *integer, json_int_t value); +int json_real_set(json_t *real, double value); + +/* pack, unpack */ + +json_t *json_pack(const char *fmt, ...); +json_t *json_pack_ex(json_error_t *error, size_t flags, const char *fmt, ...); +json_t *json_vpack_ex(json_error_t *error, size_t flags, const char *fmt, va_list ap); + +#define JSON_VALIDATE_ONLY 0x1 +#define JSON_STRICT 0x2 + +int json_unpack(json_t *root, const char *fmt, ...); +int json_unpack_ex(json_t *root, json_error_t *error, size_t flags, const char *fmt, ...); +int json_vunpack_ex(json_t *root, json_error_t *error, size_t flags, const char *fmt, va_list ap); + + +/* equality */ + +int json_equal(json_t *value1, json_t *value2); + + +/* copying */ + +json_t *json_copy(json_t *value); +json_t *json_deep_copy(const json_t *value); + + +/* decoding */ + +#define JSON_REJECT_DUPLICATES 0x1 +#define JSON_DISABLE_EOF_CHECK 0x2 +#define JSON_DECODE_ANY 0x4 +#define JSON_DECODE_INT_AS_REAL 0x8 +#define JSON_ALLOW_NUL 0x10 + +typedef size_t (*json_load_callback_t)(void *buffer, size_t buflen, void *data); + +json_t *json_loads(const char *input, size_t flags, json_error_t *error); +json_t *json_loadb(const char *buffer, size_t buflen, size_t flags, json_error_t *error); +json_t *json_loadf(FILE *input, 
size_t flags, json_error_t *error); +json_t *json_loadfd(int input, size_t flags, json_error_t *error); +json_t *json_load_file(const char *path, size_t flags, json_error_t *error); +json_t *json_load_callback(json_load_callback_t callback, void *data, size_t flags, json_error_t *error); + + +/* encoding */ + +#define JSON_MAX_INDENT 0x1F +#define JSON_INDENT(n) ((n) & JSON_MAX_INDENT) +#define JSON_COMPACT 0x20 +#define JSON_ENSURE_ASCII 0x40 +#define JSON_SORT_KEYS 0x80 +#define JSON_PRESERVE_ORDER 0x100 +#define JSON_ENCODE_ANY 0x200 +#define JSON_ESCAPE_SLASH 0x400 +#define JSON_REAL_PRECISION(n) (((n) & 0x1F) << 11) +#define JSON_EMBED 0x10000 + +typedef int (*json_dump_callback_t)(const char *buffer, size_t size, void *data); + +char *json_dumps(const json_t *json, size_t flags); +size_t json_dumpb(const json_t *json, char *buffer, size_t size, size_t flags); +int json_dumpf(const json_t *json, FILE *output, size_t flags); +int json_dumpfd(const json_t *json, int output, size_t flags); +int json_dump_file(const json_t *json, const char *path, size_t flags); +int json_dump_callback(const json_t *json, json_dump_callback_t callback, void *data, size_t flags); + +/* custom memory allocation */ + +typedef void *(*json_malloc_t)(size_t); +typedef void (*json_free_t)(void *); + +void json_set_alloc_funcs(json_malloc_t malloc_fn, json_free_t free_fn); +void json_get_alloc_funcs(json_malloc_t *malloc_fn, json_free_t *free_fn); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compat/includes-x86/jansson_config.h b/compat/includes-x86/jansson_config.h new file mode 100644 index 0000000000..35eee9381d --- /dev/null +++ b/compat/includes-x86/jansson_config.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2010-2016 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. 
+ * + * + * This file specifies a part of the site-specific configuration for + * Jansson, namely those things that affect the public API in + * jansson.h. + * + * The CMake system will generate the jansson_config.h file and + * copy it to the build and install directories. + */ + +#ifndef JANSSON_CONFIG_H +#define JANSSON_CONFIG_H + +/* Define this so that we can disable scattered automake configuration in source files */ +#ifndef JANSSON_USING_CMAKE +#define JANSSON_USING_CMAKE +#endif + +/* Note: when using cmake, JSON_INTEGER_IS_LONG_LONG is not defined nor used, + * as we will also check for __int64 etc types. + * (the definition was used in the automake system) */ + +/* Bring in the cmake-detected defines */ +#define HAVE_STDINT_H 1 +/* #undef HAVE_INTTYPES_H */ +/* #undef HAVE_SYS_TYPES_H */ + +/* Include our standard type header for the integer typedef */ + +#if defined(HAVE_STDINT_H) +# include +#elif defined(HAVE_INTTYPES_H) +# include +#elif defined(HAVE_SYS_TYPES_H) +# include +#endif + + +/* If your compiler supports the inline keyword in C, JSON_INLINE is + defined to `inline', otherwise empty. In C++, the inline is always + supported. */ +#ifdef __cplusplus +#define JSON_INLINE inline +#else +#define JSON_INLINE inline +#endif + + +#define json_int_t int64_t +#define json_strtoint strtoll +#define JSON_INTEGER_FORMAT "I64d" + + +/* If locale.h and localeconv() are available, define to 1, otherwise to 0. */ +#define JSON_HAVE_LOCALECONV 1 + + +/* Maximum recursion depth for parsing JSON input. + This limits the depth of e.g. array-within-array constructions. */ +#define JSON_PARSER_MAX_DEPTH 2048 + + +#endif diff --git a/compat/includes-x86/openssl/e_os2.h b/compat/includes-x86/openssl/e_os2.h new file mode 100644 index 0000000000..99ea3477d7 --- /dev/null +++ b/compat/includes-x86/openssl/e_os2.h @@ -0,0 +1,311 @@ +/* + * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). 
You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifndef HEADER_E_OS2_H +# define HEADER_E_OS2_H + +# include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * Detect operating systems. This probably needs completing. + * The result is that at least one OPENSSL_SYS_os macro should be defined. + * However, if none is defined, Unix is assumed. + **/ + +# define OPENSSL_SYS_UNIX + +/* --------------------- Microsoft operating systems ---------------------- */ + +/* + * Note that MSDOS actually denotes 32-bit environments running on top of + * MS-DOS, such as DJGPP one. + */ +# if defined(OPENSSL_SYS_MSDOS) +# undef OPENSSL_SYS_UNIX +# endif + +/* + * For 32 bit environment, there seems to be the CygWin environment and then + * all the others that try to do the same thing Microsoft does... + */ +/* + * UEFI lives here because it might be built with a Microsoft toolchain and + * we need to avoid the false positive match on Windows. 
+ */ +# if defined(OPENSSL_SYS_UEFI) +# undef OPENSSL_SYS_UNIX +# elif defined(OPENSSL_SYS_UWIN) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WIN32_UWIN +# else +# if defined(__CYGWIN__) || defined(OPENSSL_SYS_CYGWIN) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WIN32_CYGWIN +# else +# if defined(_WIN32) || defined(OPENSSL_SYS_WIN32) +# undef OPENSSL_SYS_UNIX +# if !defined(OPENSSL_SYS_WIN32) +# define OPENSSL_SYS_WIN32 +# endif +# endif +# if defined(_WIN64) || defined(OPENSSL_SYS_WIN64) +# undef OPENSSL_SYS_UNIX +# if !defined(OPENSSL_SYS_WIN64) +# define OPENSSL_SYS_WIN64 +# endif +# endif +# if defined(OPENSSL_SYS_WINNT) +# undef OPENSSL_SYS_UNIX +# endif +# if defined(OPENSSL_SYS_WINCE) +# undef OPENSSL_SYS_UNIX +# endif +# endif +# endif + +/* Anything that tries to look like Microsoft is "Windows" */ +# if defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_WIN64) || defined(OPENSSL_SYS_WINNT) || defined(OPENSSL_SYS_WINCE) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WINDOWS +# ifndef OPENSSL_SYS_MSDOS +# define OPENSSL_SYS_MSDOS +# endif +# endif + +/* + * DLL settings. This part is a bit tough, because it's up to the + * application implementor how he or she will link the application, so it + * requires some macro to be used. 
+ */ +# ifdef OPENSSL_SYS_WINDOWS +# ifndef OPENSSL_OPT_WINDLL +# if defined(_WINDLL) /* This is used when building OpenSSL to + * indicate that DLL linkage should be used */ +# define OPENSSL_OPT_WINDLL +# endif +# endif +# endif + +/* ------------------------------- OpenVMS -------------------------------- */ +# if defined(__VMS) || defined(VMS) || defined(OPENSSL_SYS_VMS) +# if !defined(OPENSSL_SYS_VMS) +# undef OPENSSL_SYS_UNIX +# endif +# define OPENSSL_SYS_VMS +# if defined(__DECC) +# define OPENSSL_SYS_VMS_DECC +# elif defined(__DECCXX) +# define OPENSSL_SYS_VMS_DECC +# define OPENSSL_SYS_VMS_DECCXX +# else +# define OPENSSL_SYS_VMS_NODECC +# endif +# endif + +/* -------------------------------- Unix ---------------------------------- */ +# ifdef OPENSSL_SYS_UNIX +# if defined(linux) || defined(__linux__) && !defined(OPENSSL_SYS_LINUX) +# define OPENSSL_SYS_LINUX +# endif +# if defined(_AIX) && !defined(OPENSSL_SYS_AIX) +# define OPENSSL_SYS_AIX +# endif +# endif + +/* -------------------------------- VOS ----------------------------------- */ +# if defined(__VOS__) && !defined(OPENSSL_SYS_VOS) +# define OPENSSL_SYS_VOS +# ifdef __HPPA__ +# define OPENSSL_SYS_VOS_HPPA +# endif +# ifdef __IA32__ +# define OPENSSL_SYS_VOS_IA32 +# endif +# endif + +/** + * That's it for OS-specific stuff + *****************************************************************************/ + +/* Specials for I/O an exit */ +# ifdef OPENSSL_SYS_MSDOS +# define OPENSSL_UNISTD_IO +# define OPENSSL_DECLARE_EXIT extern void exit(int); +# else +# define OPENSSL_UNISTD_IO OPENSSL_UNISTD +# define OPENSSL_DECLARE_EXIT /* declared in unistd.h */ +# endif + +/*- + * Definitions of OPENSSL_GLOBAL and OPENSSL_EXTERN, to define and declare + * certain global symbols that, with some compilers under VMS, have to be + * defined and declared explicitly with globaldef and globalref. 
+ * Definitions of OPENSSL_EXPORT and OPENSSL_IMPORT, to define and declare + * DLL exports and imports for compilers under Win32. These are a little + * more complicated to use. Basically, for any library that exports some + * global variables, the following code must be present in the header file + * that declares them, before OPENSSL_EXTERN is used: + * + * #ifdef SOME_BUILD_FLAG_MACRO + * # undef OPENSSL_EXTERN + * # define OPENSSL_EXTERN OPENSSL_EXPORT + * #endif + * + * The default is to have OPENSSL_EXPORT, OPENSSL_EXTERN and OPENSSL_GLOBAL + * have some generally sensible values. + */ + +# if defined(OPENSSL_SYS_VMS_NODECC) +# define OPENSSL_EXPORT globalref +# define OPENSSL_EXTERN globalref +# define OPENSSL_GLOBAL globaldef +# elif defined(OPENSSL_SYS_WINDOWS) && defined(OPENSSL_OPT_WINDLL) +# define OPENSSL_EXPORT extern __declspec(dllexport) +# define OPENSSL_EXTERN extern __declspec(dllimport) +# define OPENSSL_GLOBAL +# else +# define OPENSSL_EXPORT extern +# define OPENSSL_EXTERN extern +# define OPENSSL_GLOBAL +# endif + +/*- + * Macros to allow global variables to be reached through function calls when + * required (if a shared library version requires it, for example. 
+ * The way it's done allows definitions like this: + * + * // in foobar.c + * OPENSSL_IMPLEMENT_GLOBAL(int,foobar,0) + * // in foobar.h + * OPENSSL_DECLARE_GLOBAL(int,foobar); + * #define foobar OPENSSL_GLOBAL_REF(foobar) + */ +# ifdef OPENSSL_EXPORT_VAR_AS_FUNCTION +# define OPENSSL_IMPLEMENT_GLOBAL(type,name,value) \ + type *_shadow_##name(void) \ + { static type _hide_##name=value; return &_hide_##name; } +# define OPENSSL_DECLARE_GLOBAL(type,name) type *_shadow_##name(void) +# define OPENSSL_GLOBAL_REF(name) (*(_shadow_##name())) +# else +# define OPENSSL_IMPLEMENT_GLOBAL(type,name,value) OPENSSL_GLOBAL type _shadow_##name=value; +# define OPENSSL_DECLARE_GLOBAL(type,name) OPENSSL_EXPORT type _shadow_##name +# define OPENSSL_GLOBAL_REF(name) _shadow_##name +# endif + +# ifdef _WIN32 +# ifdef _WIN64 +# define ossl_ssize_t __int64 +# define OSSL_SSIZE_MAX _I64_MAX +# else +# define ossl_ssize_t int +# define OSSL_SSIZE_MAX INT_MAX +# endif +# endif + +# if defined(OPENSSL_SYS_UEFI) && !defined(ssize_t) +# define ossl_ssize_t int +# define OSSL_SSIZE_MAX INT_MAX +# endif + +# ifndef ossl_ssize_t +# define ossl_ssize_t ssize_t +# if defined(SSIZE_MAX) +# define OSSL_SSIZE_MAX SSIZE_MAX +# elif defined(_POSIX_SSIZE_MAX) +# define OSSL_SSIZE_MAX _POSIX_SSIZE_MAX +# endif +# endif + +# ifdef DEBUG_UNUSED +# define __owur __attribute__((__warn_unused_result__)) +# else +# define __owur +# endif + +/* Standard integer types */ +# if defined(OPENSSL_SYS_UEFI) +typedef INT8 int8_t; +typedef UINT8 uint8_t; +typedef INT16 int16_t; +typedef UINT16 uint16_t; +typedef INT32 int32_t; +typedef UINT32 uint32_t; +typedef INT64 int64_t; +typedef UINT64 uint64_t; +# define PRIu64 "%Lu" +# elif (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + defined(__osf__) || defined(__sgi) || defined(__hpux) || \ + defined(OPENSSL_SYS_VMS) || defined (__OpenBSD__) +# include +# elif defined(_MSC_VER) && _MSC_VER<=1500 +/* + * minimally required typdefs for systems not supporting 
inttypes.h or + * stdint.h: currently just older VC++ + */ +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef short int16_t; +typedef unsigned short uint16_t; +typedef int int32_t; +typedef unsigned int uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +# else +# include +# endif + +/* + * We need a format operator for some client tools for uint64_t. If inttypes.h + * isn't available or did not define it, just go with hard-coded. + */ +# ifndef PRIu64 +# ifdef SIXTY_FOUR_BIT_LONG +# define PRIu64 "lu" +# else +# define PRIu64 "llu" +# endif +# endif + +/* ossl_inline: portable inline definition usable in public headers */ +# if !defined(inline) && !defined(__cplusplus) +# if defined(__STDC_VERSION__) && __STDC_VERSION__>=199901L + /* just use inline */ +# define ossl_inline inline +# elif defined(__GNUC__) && __GNUC__>=2 +# define ossl_inline __inline__ +# elif defined(_MSC_VER) + /* + * Visual Studio: inline is available in C++ only, however + * __inline is available for C, see + * http://msdn.microsoft.com/en-us/library/z8y1yy88.aspx + */ +# define ossl_inline __inline +# else +# define ossl_inline +# endif +# else +# define ossl_inline inline +# endif + +# if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +# define ossl_noreturn _Noreturn +# elif defined(__GNUC__) && __GNUC__ >= 2 +# define ossl_noreturn __attribute__((noreturn)) +# else +# define ossl_noreturn +# endif + +#ifdef __cplusplus +} +#endif +#endif diff --git a/compat/includes-x86/openssl/opensslconf.h b/compat/includes-x86/openssl/opensslconf.h new file mode 100644 index 0000000000..1cb04a5a42 --- /dev/null +++ b/compat/includes-x86/openssl/opensslconf.h @@ -0,0 +1,172 @@ +/* + * WARNING: do not edit! + * Generated by makefile from include\openssl\opensslconf.h.in + * + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). 
You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef OPENSSL_ALGORITHM_DEFINES +# error OPENSSL_ALGORITHM_DEFINES no longer supported +#endif + +/* + * OpenSSL was configured with the following options: + */ + +#ifndef OPENSSL_SYS_WIN32 +# define OPENSSL_SYS_WIN32 1 +#endif +#ifndef OPENSSL_NO_MD2 +# define OPENSSL_NO_MD2 +#endif +#ifndef OPENSSL_NO_RC5 +# define OPENSSL_NO_RC5 +#endif +#ifndef OPENSSL_THREADS +# define OPENSSL_THREADS +#endif +#ifndef OPENSSL_NO_ASAN +# define OPENSSL_NO_ASAN +#endif +#ifndef OPENSSL_NO_CRYPTO_MDEBUG +# define OPENSSL_NO_CRYPTO_MDEBUG +#endif +#ifndef OPENSSL_NO_CRYPTO_MDEBUG_BACKTRACE +# define OPENSSL_NO_CRYPTO_MDEBUG_BACKTRACE +#endif +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 +# define OPENSSL_NO_EC_NISTP_64_GCC_128 +#endif +#ifndef OPENSSL_NO_EGD +# define OPENSSL_NO_EGD +#endif +#ifndef OPENSSL_NO_FUZZ_AFL +# define OPENSSL_NO_FUZZ_AFL +#endif +#ifndef OPENSSL_NO_FUZZ_LIBFUZZER +# define OPENSSL_NO_FUZZ_LIBFUZZER +#endif +#ifndef OPENSSL_NO_HEARTBEATS +# define OPENSSL_NO_HEARTBEATS +#endif +#ifndef OPENSSL_NO_MSAN +# define OPENSSL_NO_MSAN +#endif +#ifndef OPENSSL_NO_SCTP +# define OPENSSL_NO_SCTP +#endif +#ifndef OPENSSL_NO_SSL_TRACE +# define OPENSSL_NO_SSL_TRACE +#endif +#ifndef OPENSSL_NO_SSL3 +# define OPENSSL_NO_SSL3 +#endif +#ifndef OPENSSL_NO_SSL3_METHOD +# define OPENSSL_NO_SSL3_METHOD +#endif +#ifndef OPENSSL_NO_UBSAN +# define OPENSSL_NO_UBSAN +#endif +#ifndef OPENSSL_NO_UNIT_TEST +# define OPENSSL_NO_UNIT_TEST +#endif +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS +# define OPENSSL_NO_WEAK_SSL_CIPHERS +#endif +#ifndef OPENSSL_NO_AFALGENG +# define OPENSSL_NO_AFALGENG +#endif + + +/* + * Sometimes OPENSSSL_NO_xxx ends up with an empty file and some compilers + * don't like that. This will hopefully silence them. 
+ */ +#define NON_EMPTY_TRANSLATION_UNIT static void *dummy = &dummy; + +/* + * Applications should use -DOPENSSL_API_COMPAT= to suppress the + * declarations of functions deprecated in or before . Otherwise, they + * still won't see them if the library has been built to disable deprecated + * functions. + */ +#if defined(OPENSSL_NO_DEPRECATED) +# define DECLARE_DEPRECATED(f) +#elif __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 0) +# define DECLARE_DEPRECATED(f) f __attribute__ ((deprecated)); +#else +# define DECLARE_DEPRECATED(f) f; +#endif + +#ifndef OPENSSL_FILE +# ifdef OPENSSL_NO_FILENAMES +# define OPENSSL_FILE "" +# define OPENSSL_LINE 0 +# else +# define OPENSSL_FILE __FILE__ +# define OPENSSL_LINE __LINE__ +# endif +#endif + +#ifndef OPENSSL_MIN_API +# define OPENSSL_MIN_API 0 +#endif + +#if !defined(OPENSSL_API_COMPAT) || OPENSSL_API_COMPAT < OPENSSL_MIN_API +# undef OPENSSL_API_COMPAT +# define OPENSSL_API_COMPAT OPENSSL_MIN_API +#endif + +#if OPENSSL_API_COMPAT < 0x10100000L +# define DEPRECATEDIN_1_1_0(f) DECLARE_DEPRECATED(f) +#else +# define DEPRECATEDIN_1_1_0(f) +#endif + +#if OPENSSL_API_COMPAT < 0x10000000L +# define DEPRECATEDIN_1_0_0(f) DECLARE_DEPRECATED(f) +#else +# define DEPRECATEDIN_1_0_0(f) +#endif + +#if OPENSSL_API_COMPAT < 0x00908000L +# define DEPRECATEDIN_0_9_8(f) DECLARE_DEPRECATED(f) +#else +# define DEPRECATEDIN_0_9_8(f) +#endif + +#define OPENSSL_CPUID_OBJ + +/* Generate 80386 code? */ +#undef I386_ONLY + +#undef OPENSSL_UNISTD +#define OPENSSL_UNISTD + +#define OPENSSL_EXPORT_VAR_AS_FUNCTION + +/* + * The following are cipher-specific, but are part of the public API. 
+ */ +#if !defined(OPENSSL_SYS_UEFI) +# define BN_LLONG +/* Only one for the following should be defined */ +# undef SIXTY_FOUR_BIT_LONG +# undef SIXTY_FOUR_BIT +# define THIRTY_TWO_BIT +#endif + +#define RC4_INT unsigned int + +#ifdef __cplusplus +} +#endif diff --git a/compat/includes-x86/openssl/sha.h b/compat/includes-x86/openssl/sha.h new file mode 100644 index 0000000000..6a1eb0de8b --- /dev/null +++ b/compat/includes-x86/openssl/sha.h @@ -0,0 +1,119 @@ +/* + * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifndef HEADER_SHA_H +# define HEADER_SHA_H + +# include +# include + +#ifdef __cplusplus +extern "C" { +#endif + +/*- + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + * ! SHA_LONG has to be at least 32 bits wide. ! + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + */ +# define SHA_LONG unsigned int + +# define SHA_LBLOCK 16 +# define SHA_CBLOCK (SHA_LBLOCK*4)/* SHA treats input data as a + * contiguous array of 32 bit wide + * big-endian values. */ +# define SHA_LAST_BLOCK (SHA_CBLOCK-8) +# define SHA_DIGEST_LENGTH 20 + +typedef struct SHAstate_st { + SHA_LONG h0, h1, h2, h3, h4; + SHA_LONG Nl, Nh; + SHA_LONG data[SHA_LBLOCK]; + unsigned int num; +} SHA_CTX; + +int SHA1_Init(SHA_CTX *c); +int SHA1_Update(SHA_CTX *c, const void *data, size_t len); +int SHA1_Final(unsigned char *md, SHA_CTX *c); +unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md); +void SHA1_Transform(SHA_CTX *c, const unsigned char *data); + +# define SHA256_CBLOCK (SHA_LBLOCK*4)/* SHA-256 treats input data as a + * contiguous array of 32 bit wide + * big-endian values. 
*/ + +typedef struct SHA256state_st { + SHA_LONG h[8]; + SHA_LONG Nl, Nh; + SHA_LONG data[SHA_LBLOCK]; + unsigned int num, md_len; +} SHA256_CTX; + +int SHA224_Init(SHA256_CTX *c); +int SHA224_Update(SHA256_CTX *c, const void *data, size_t len); +int SHA224_Final(unsigned char *md, SHA256_CTX *c); +unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md); +int SHA256_Init(SHA256_CTX *c); +int SHA256_Update(SHA256_CTX *c, const void *data, size_t len); +int SHA256_Final(unsigned char *md, SHA256_CTX *c); +unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md); +void SHA256_Transform(SHA256_CTX *c, const unsigned char *data); + +# define SHA224_DIGEST_LENGTH 28 +# define SHA256_DIGEST_LENGTH 32 +# define SHA384_DIGEST_LENGTH 48 +# define SHA512_DIGEST_LENGTH 64 + +/* + * Unlike 32-bit digest algorithms, SHA-512 *relies* on SHA_LONG64 + * being exactly 64-bit wide. See Implementation Notes in sha512.c + * for further details. + */ +/* + * SHA-512 treats input data as a + * contiguous array of 64 bit + * wide big-endian values. 
+ */ +# define SHA512_CBLOCK (SHA_LBLOCK*8) +# if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) +# define SHA_LONG64 unsigned __int64 +# define U64(C) C##UI64 +# elif defined(__arch64__) +# define SHA_LONG64 unsigned long +# define U64(C) C##UL +# else +# define SHA_LONG64 unsigned long long +# define U64(C) C##ULL +# endif + +typedef struct SHA512state_st { + SHA_LONG64 h[8]; + SHA_LONG64 Nl, Nh; + union { + SHA_LONG64 d[SHA_LBLOCK]; + unsigned char p[SHA512_CBLOCK]; + } u; + unsigned int num, md_len; +} SHA512_CTX; + +int SHA384_Init(SHA512_CTX *c); +int SHA384_Update(SHA512_CTX *c, const void *data, size_t len); +int SHA384_Final(unsigned char *md, SHA512_CTX *c); +unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md); +int SHA512_Init(SHA512_CTX *c); +int SHA512_Update(SHA512_CTX *c, const void *data, size_t len); +int SHA512_Final(unsigned char *md, SHA512_CTX *c); +unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md); +void SHA512_Transform(SHA512_CTX *c, const unsigned char *data); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compat/curl-for-windows/curl/include/curl/curl.h b/compat/includes/curl/curl.h similarity index 87% rename from compat/curl-for-windows/curl/include/curl/curl.h rename to compat/includes/curl/curl.h index 44b1b7e391..1030712648 100644 --- a/compat/curl-for-windows/curl/include/curl/curl.h +++ b/compat/includes/curl/curl.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2014, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2017, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. 
* * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -24,13 +24,18 @@ /* * If you have libcurl problems, all docs and details are found here: - * http://curl.haxx.se/libcurl/ + * https://curl.haxx.se/libcurl/ * * curl-library mailing list subscription and unsubscription web interface: - * http://cool.haxx.se/mailman/listinfo/curl-library/ + * https://cool.haxx.se/mailman/listinfo/curl-library/ */ +#ifdef CURL_NO_OLDIES +#define CURL_STRICTER +#endif + #include "curlver.h" /* libcurl version defines */ +#include "system.h" /* determine things run-time */ #include "curlbuild.h" /* libcurl build definitions */ #include "curlrules.h" /* libcurl rules enforcement */ @@ -56,7 +61,8 @@ #include #if defined(WIN32) && !defined(_WIN32_WCE) && !defined(__CYGWIN__) -#if !(defined(_WINSOCKAPI_) || defined(_WINSOCK_H) || defined(__LWIP_OPT_H__)) +#if !(defined(_WINSOCKAPI_) || defined(_WINSOCK_H) || \ + defined(__LWIP_OPT_H__) || defined(LWIP_HDR_OPT_H)) /* The check above prevents the winsock2 inclusion if winsock.h already was included, since they can't co-exist without problems */ #include @@ -90,7 +96,13 @@ extern "C" { #endif +#if defined(BUILDING_LIBCURL) || defined(CURL_STRICTER) +typedef struct Curl_easy CURL; +typedef struct Curl_share CURLSH; +#else typedef void CURL; +typedef void CURLSH; +#endif /* * libcurl external API function linkage decorations. 
@@ -112,7 +124,7 @@ typedef void CURL; #ifndef curl_socket_typedef /* socket typedef */ -#if defined(WIN32) && !defined(__LWIP_OPT_H__) +#if defined(WIN32) && !defined(__LWIP_OPT_H__) && !defined(LWIP_HDR_OPT_H) typedef SOCKET curl_socket_t; #define CURL_SOCKET_BAD INVALID_SOCKET #else @@ -127,33 +139,43 @@ struct curl_httppost { char *name; /* pointer to allocated name */ long namelength; /* length of name length */ char *contents; /* pointer to allocated data contents */ - long contentslength; /* length of contents field */ + long contentslength; /* length of contents field, see also + CURL_HTTPPOST_LARGE */ char *buffer; /* pointer to allocated buffer contents */ long bufferlength; /* length of buffer field */ char *contenttype; /* Content-Type */ - struct curl_slist* contentheader; /* list of extra headers for this form */ + struct curl_slist *contentheader; /* list of extra headers for this form */ struct curl_httppost *more; /* if one field name has more than one file, this link should link to following files */ long flags; /* as defined below */ -#define HTTPPOST_FILENAME (1<<0) /* specified content is a file name */ -#define HTTPPOST_READFILE (1<<1) /* specified content is a file name */ -#define HTTPPOST_PTRNAME (1<<2) /* name is only stored pointer - do not free in formfree */ -#define HTTPPOST_PTRCONTENTS (1<<3) /* contents is only stored pointer - do not free in formfree */ -#define HTTPPOST_BUFFER (1<<4) /* upload file from buffer */ -#define HTTPPOST_PTRBUFFER (1<<5) /* upload file from pointer contents */ -#define HTTPPOST_CALLBACK (1<<6) /* upload file contents by using the - regular read callback to get the data - and pass the given pointer as custom - pointer */ + +/* specified content is a file name */ +#define CURL_HTTPPOST_FILENAME (1<<0) +/* specified content is a file name */ +#define CURL_HTTPPOST_READFILE (1<<1) +/* name is only stored pointer do not free in formfree */ +#define CURL_HTTPPOST_PTRNAME (1<<2) +/* contents is only stored 
pointer do not free in formfree */ +#define CURL_HTTPPOST_PTRCONTENTS (1<<3) +/* upload file from buffer */ +#define CURL_HTTPPOST_BUFFER (1<<4) +/* upload file from pointer contents */ +#define CURL_HTTPPOST_PTRBUFFER (1<<5) +/* upload file contents by using the regular read callback to get the data and + pass the given pointer as custom pointer */ +#define CURL_HTTPPOST_CALLBACK (1<<6) +/* use size in 'contentlen', added in 7.46.0 */ +#define CURL_HTTPPOST_LARGE (1<<7) char *showfilename; /* The file name to show. If not set, the actual file name will be used (if this is a file part) */ void *userp; /* custom pointer used for HTTPPOST_CALLBACK posts */ + curl_off_t contentlen; /* alternative length of contents + field. Used if CURL_HTTPPOST_LARGE is + set. Added in 7.46.0 */ }; /* This is the CURLOPT_PROGRESSFUNCTION callback proto. It is now considered @@ -172,6 +194,11 @@ typedef int (*curl_xferinfo_callback)(void *clientp, curl_off_t ultotal, curl_off_t ulnow); +#ifndef CURL_MAX_READ_SIZE + /* The maximum receive buffer size configurable via CURLOPT_BUFFERSIZE. */ +#define CURL_MAX_READ_SIZE 524288 +#endif + #ifndef CURL_MAX_WRITE_SIZE /* Tests have proven that 20K is a very bad buffer size for uploads on Windows, while 16K for some odd reason performed a lot better. @@ -249,7 +276,7 @@ struct curl_fileinfo { unsigned int flags; /* used internally */ - char * b_data; + char *b_data; size_t b_size; size_t b_used; }; @@ -362,6 +389,7 @@ typedef curlioerr (*curl_ioctl_callback)(CURL *handle, int cmd, void *clientp); +#ifndef CURL_DID_MEMORY_FUNC_TYPEDEFS /* * The following typedef's are signatures of malloc, free, realloc, strdup and * calloc respectively. 
Function pointers of these types can be passed to the @@ -374,6 +402,9 @@ typedef void *(*curl_realloc_callback)(void *ptr, size_t size); typedef char *(*curl_strdup_callback)(const char *str); typedef void *(*curl_calloc_callback)(size_t nmemb, size_t size); +#define CURL_DID_MEMORY_FUNC_TYPEDEFS +#endif + /* the kind of data that is passed to information_callback*/ typedef enum { CURLINFO_TEXT = 0, @@ -410,7 +441,7 @@ typedef enum { CURLE_COULDNT_RESOLVE_PROXY, /* 5 */ CURLE_COULDNT_RESOLVE_HOST, /* 6 */ CURLE_COULDNT_CONNECT, /* 7 */ - CURLE_FTP_WEIRD_SERVER_REPLY, /* 8 */ + CURLE_WEIRD_SERVER_REPLY, /* 8 */ CURLE_REMOTE_ACCESS_DENIED, /* 9 a service was denied by the server due to lack of access - when login fails this is not returned. */ @@ -454,15 +485,15 @@ typedef enum { CURLE_LDAP_CANNOT_BIND, /* 38 */ CURLE_LDAP_SEARCH_FAILED, /* 39 */ CURLE_OBSOLETE40, /* 40 - NOT USED */ - CURLE_FUNCTION_NOT_FOUND, /* 41 */ + CURLE_FUNCTION_NOT_FOUND, /* 41 - NOT USED starting with 7.53.0 */ CURLE_ABORTED_BY_CALLBACK, /* 42 */ CURLE_BAD_FUNCTION_ARGUMENT, /* 43 */ CURLE_OBSOLETE44, /* 44 - NOT USED */ CURLE_INTERFACE_FAILED, /* 45 - CURLOPT_INTERFACE failed */ CURLE_OBSOLETE46, /* 46 - NOT USED */ - CURLE_TOO_MANY_REDIRECTS , /* 47 - catch endless re-direct loops */ + CURLE_TOO_MANY_REDIRECTS, /* 47 - catch endless re-direct loops */ CURLE_UNKNOWN_OPTION, /* 48 - User specified an unknown option */ - CURLE_TELNET_OPTION_SYNTAX , /* 49 - Malformed telnet option */ + CURLE_TELNET_OPTION_SYNTAX, /* 49 - Malformed telnet option */ CURLE_OBSOLETE50, /* 50 - NOT USED */ CURLE_PEER_FAILED_VERIFICATION, /* 51 - peer's certificate or fingerprint wasn't verified fine */ @@ -523,6 +554,9 @@ typedef enum { session will be queued */ CURLE_SSL_PINNEDPUBKEYNOTMATCH, /* 90 - specified pinned public key did not match */ + CURLE_SSL_INVALIDCERTSTATUS, /* 91 - invalid certificate status */ + CURLE_HTTP2_STREAM, /* 92 - stream error in HTTP/2 framing layer + */ CURL_LAST /* never use! 
*/ } CURLcode; @@ -538,6 +572,7 @@ typedef enum { /* compatibility with older names */ #define CURLOPT_ENCODING CURLOPT_ACCEPT_ENCODING +#define CURLE_FTP_WEIRD_SERVER_REPLY CURLE_WEIRD_SERVER_REPLY /* The following were added in 7.21.5, April 2011 */ #define CURLE_UNKNOWN_TELNET_OPTION CURLE_UNKNOWN_OPTION @@ -611,6 +646,7 @@ typedef enum { CONNECT HTTP/1.1 */ CURLPROXY_HTTP_1_0 = 1, /* added in 7.19.4, force to use CONNECT HTTP/1.0 */ + CURLPROXY_HTTPS = 2, /* added in 7.52.0 */ CURLPROXY_SOCKS4 = 4, /* support added in 7.15.2, enum existed already in 7.10 */ CURLPROXY_SOCKS5 = 5, /* added in 7.10 */ @@ -724,6 +760,10 @@ typedef enum { servers, a user can this way allow the vulnerability back. */ #define CURLSSLOPT_ALLOW_BEAST (1<<0) +/* - NO_REVOKE tells libcurl to disable certificate revocation checks for those + SSL backends where such behavior is present. */ +#define CURLSSLOPT_NO_REVOKE (1<<1) + #ifndef CURL_NO_OLDIES /* define this to test if your app builds with all the obsolete stuff removed! */ @@ -813,9 +853,13 @@ typedef enum { but 32 */ #define CURLOPTTYPE_LONG 0 #define CURLOPTTYPE_OBJECTPOINT 10000 +#define CURLOPTTYPE_STRINGPOINT 10000 #define CURLOPTTYPE_FUNCTIONPOINT 20000 #define CURLOPTTYPE_OFF_T 30000 +/* *STRINGPOINT is an alias for OBJECTPOINT to allow tools to extract the + string options from the header file */ + /* name is uppercase CURLOPT_, type is one of the defined CURLOPTTYPE_ number is unique identifier */ @@ -829,6 +873,7 @@ typedef enum { /* The macro "##" is ISO C, we assume pre-ISO C doesn't support it. 
*/ #define LONG CURLOPTTYPE_LONG #define OBJECTPOINT CURLOPTTYPE_OBJECTPOINT +#define STRINGPOINT CURLOPTTYPE_OBJECTPOINT #define FUNCTIONPOINT CURLOPTTYPE_FUNCTIONPOINT #define OFF_T CURLOPTTYPE_OFF_T #define CINIT(name,type,number) CURLOPT_/**/name = type + number @@ -845,22 +890,22 @@ typedef enum { CINIT(WRITEDATA, OBJECTPOINT, 1), /* The full URL to get/put */ - CINIT(URL, OBJECTPOINT, 2), + CINIT(URL, STRINGPOINT, 2), /* Port number to connect to, if other than default. */ CINIT(PORT, LONG, 3), /* Name of proxy to use. */ - CINIT(PROXY, OBJECTPOINT, 4), + CINIT(PROXY, STRINGPOINT, 4), /* "user:password;options" to use when fetching. */ - CINIT(USERPWD, OBJECTPOINT, 5), + CINIT(USERPWD, STRINGPOINT, 5), /* "user:password" to use with proxy. */ - CINIT(PROXYUSERPWD, OBJECTPOINT, 6), + CINIT(PROXYUSERPWD, STRINGPOINT, 6), /* Range to get, specified as an ASCII string. */ - CINIT(RANGE, OBJECTPOINT, 7), + CINIT(RANGE, STRINGPOINT, 7), /* not used */ @@ -897,14 +942,14 @@ typedef enum { CINIT(POSTFIELDS, OBJECTPOINT, 15), /* Set the referrer page (needed by some CGIs) */ - CINIT(REFERER, OBJECTPOINT, 16), + CINIT(REFERER, STRINGPOINT, 16), /* Set the FTP PORT string (interface name, named or numerical IP address) Use i.e '-' to use default address. */ - CINIT(FTPPORT, OBJECTPOINT, 17), + CINIT(FTPPORT, STRINGPOINT, 17), /* Set the User-Agent string (examined by some CGIs) */ - CINIT(USERAGENT, OBJECTPOINT, 18), + CINIT(USERAGENT, STRINGPOINT, 18), /* If the download receives less than "low speed limit" bytes/second * during "low speed time" seconds, the operations is aborted. @@ -927,7 +972,7 @@ typedef enum { CINIT(RESUME_FROM, LONG, 21), /* Set cookie in request: */ - CINIT(COOKIE, OBJECTPOINT, 22), + CINIT(COOKIE, STRINGPOINT, 22), /* This points to a linked list of headers, struct curl_slist kind. 
This list is also used for RTSP (in spite of its name) */ @@ -937,10 +982,10 @@ typedef enum { CINIT(HTTPPOST, OBJECTPOINT, 24), /* name of the file keeping your private SSL-certificate */ - CINIT(SSLCERT, OBJECTPOINT, 25), + CINIT(SSLCERT, STRINGPOINT, 25), /* password for the SSL or SSH private key */ - CINIT(KEYPASSWD, OBJECTPOINT, 26), + CINIT(KEYPASSWD, STRINGPOINT, 26), /* send TYPE parameter? */ CINIT(CRLF, LONG, 27), @@ -954,7 +999,7 @@ typedef enum { /* point to a file to read the initial cookies from, also enables "cookie awareness" */ - CINIT(COOKIEFILE, OBJECTPOINT, 31), + CINIT(COOKIEFILE, STRINGPOINT, 31), /* What version to specifically try to use. See CURL_SSLVERSION defines below. */ @@ -973,9 +1018,9 @@ typedef enum { HTTP: DELETE, TRACE and others FTP: to use a different list command */ - CINIT(CUSTOMREQUEST, OBJECTPOINT, 36), + CINIT(CUSTOMREQUEST, STRINGPOINT, 36), - /* HTTP request, for odd commands like DELETE, TRACE and others */ + /* FILE handle to use instead of stderr */ CINIT(STDERR, OBJECTPOINT, 37), /* 38 is not used */ @@ -1032,19 +1077,19 @@ typedef enum { CINIT(HTTPPROXYTUNNEL, LONG, 61), /* Set the interface string to use as outgoing network interface */ - CINIT(INTERFACE, OBJECTPOINT, 62), + CINIT(INTERFACE, STRINGPOINT, 62), /* Set the krb4/5 security level, this also enables krb4/5 awareness. This * is a string, 'clear', 'safe', 'confidential' or 'private'. If the string * is set but doesn't match one of these, 'private' will be used. */ - CINIT(KRBLEVEL, OBJECTPOINT, 63), + CINIT(KRBLEVEL, STRINGPOINT, 63), /* Set if we should verify the peer in ssl handshake, set 1 to verify. 
*/ CINIT(SSL_VERIFYPEER, LONG, 64), /* The CApath or CAfile used to validate the peer certificate this option is used only if SSL_VERIFYPEER is true */ - CINIT(CAINFO, OBJECTPOINT, 65), + CINIT(CAINFO, STRINGPOINT, 65), /* 66 = OBSOLETE */ /* 67 = OBSOLETE */ @@ -1078,10 +1123,10 @@ typedef enum { /* Set to a file name that contains random data for libcurl to use to seed the random engine when doing SSL connects. */ - CINIT(RANDOM_FILE, OBJECTPOINT, 76), + CINIT(RANDOM_FILE, STRINGPOINT, 76), /* Set to the Entropy Gathering Daemon socket pathname */ - CINIT(EGDSOCKET, OBJECTPOINT, 77), + CINIT(EGDSOCKET, STRINGPOINT, 77), /* Time-out connect operations after this amount of seconds, if connects are OK within this time, then fine... This only aborts the connect phase. */ @@ -1103,10 +1148,10 @@ typedef enum { /* Specify which file name to write all known cookies in after completed operation. Set file name to "-" (dash) to make it go to stdout. */ - CINIT(COOKIEJAR, OBJECTPOINT, 82), + CINIT(COOKIEJAR, STRINGPOINT, 82), /* Specify which SSL ciphers to use */ - CINIT(SSL_CIPHER_LIST, OBJECTPOINT, 83), + CINIT(SSL_CIPHER_LIST, STRINGPOINT, 83), /* Specify which HTTP version to use! This must be set to one of the CURL_HTTP_VERSION* enums set below. */ @@ -1118,16 +1163,16 @@ typedef enum { CINIT(FTP_USE_EPSV, LONG, 85), /* type of the file keeping your SSL-certificate ("DER", "PEM", "ENG") */ - CINIT(SSLCERTTYPE, OBJECTPOINT, 86), + CINIT(SSLCERTTYPE, STRINGPOINT, 86), /* name of the file keeping your private SSL-key */ - CINIT(SSLKEY, OBJECTPOINT, 87), + CINIT(SSLKEY, STRINGPOINT, 87), /* type of the file keeping your private SSL-key ("DER", "PEM", "ENG") */ - CINIT(SSLKEYTYPE, OBJECTPOINT, 88), + CINIT(SSLKEYTYPE, STRINGPOINT, 88), /* crypto engine for the SSL-sub system */ - CINIT(SSLENGINE, OBJECTPOINT, 89), + CINIT(SSLENGINE, STRINGPOINT, 89), /* set the crypto engine for the SSL-sub system as default the param has no meaning... 
@@ -1154,7 +1199,7 @@ typedef enum { /* The CApath directory used to validate the peer certificate this option is used only if SSL_VERIFYPEER is true */ - CINIT(CAPATH, OBJECTPOINT, 97), + CINIT(CAPATH, STRINGPOINT, 97), /* Instruct libcurl to use a smaller receive buffer */ CINIT(BUFFERSIZE, LONG, 98), @@ -1168,13 +1213,14 @@ typedef enum { CINIT(SHARE, OBJECTPOINT, 100), /* indicates type of proxy. accepted values are CURLPROXY_HTTP (default), - CURLPROXY_SOCKS4, CURLPROXY_SOCKS4A and CURLPROXY_SOCKS5. */ + CURLPROXY_HTTPS, CURLPROXY_SOCKS4, CURLPROXY_SOCKS4A and + CURLPROXY_SOCKS5. */ CINIT(PROXYTYPE, LONG, 101), /* Set the Accept-Encoding string. Use this to tell a server you would like the response to be compressed. Before 7.21.6, this was known as CURLOPT_ENCODING */ - CINIT(ACCEPT_ENCODING, OBJECTPOINT, 102), + CINIT(ACCEPT_ENCODING, STRINGPOINT, 102), /* Set pointer to private data */ CINIT(PRIVATE, OBJECTPOINT, 103), @@ -1255,7 +1301,7 @@ typedef enum { to parse (using the CURLOPT_NETRC option). If not set, libcurl will do a poor attempt to find the user's home directory and check for a .netrc file in there. */ - CINIT(NETRC_FILE, OBJECTPOINT, 118), + CINIT(NETRC_FILE, STRINGPOINT, 118), /* Enable SSL/TLS for FTP, pick one of: CURLUSESSL_TRY - try using SSL, proceed anyway otherwise @@ -1298,10 +1344,10 @@ typedef enum { /* zero terminated string for pass on to the FTP server when asked for "account" info */ - CINIT(FTP_ACCOUNT, OBJECTPOINT, 134), + CINIT(FTP_ACCOUNT, STRINGPOINT, 134), - /* feed cookies into cookie engine */ - CINIT(COOKIELIST, OBJECTPOINT, 135), + /* feed cookie into cookie engine */ + CINIT(COOKIELIST, STRINGPOINT, 135), /* ignore Content-Length */ CINIT(IGNORE_CONTENT_LENGTH, LONG, 136), @@ -1347,7 +1393,7 @@ typedef enum { CINIT(MAX_RECV_SPEED_LARGE, OFF_T, 146), /* Pointer to command string to send if USER/PASS fails. 
*/ - CINIT(FTP_ALTERNATIVE_TO_USER, OBJECTPOINT, 147), + CINIT(FTP_ALTERNATIVE_TO_USER, STRINGPOINT, 147), /* callback function for setting socket options */ CINIT(SOCKOPTFUNCTION, FUNCTIONPOINT, 148), @@ -1361,8 +1407,8 @@ typedef enum { CINIT(SSH_AUTH_TYPES, LONG, 151), /* Used by scp/sftp to do public/private key authentication */ - CINIT(SSH_PUBLIC_KEYFILE, OBJECTPOINT, 152), - CINIT(SSH_PRIVATE_KEYFILE, OBJECTPOINT, 153), + CINIT(SSH_PUBLIC_KEYFILE, STRINGPOINT, 152), + CINIT(SSH_PRIVATE_KEYFILE, STRINGPOINT, 153), /* Send CCC (Clear Command Channel) after authentication */ CINIT(FTP_SSL_CCC, LONG, 154), @@ -1386,7 +1432,7 @@ typedef enum { CINIT(POSTREDIR, LONG, 161), /* used by scp/sftp to verify the host's public key */ - CINIT(SSH_HOST_PUBLIC_KEY_MD5, OBJECTPOINT, 162), + CINIT(SSH_HOST_PUBLIC_KEY_MD5, STRINGPOINT, 162), /* Callback function for opening socket (instead of socket(2)). Optionally, callback is able change the address or refuse to connect returning @@ -1406,10 +1452,10 @@ typedef enum { CINIT(SEEKDATA, OBJECTPOINT, 168), /* CRL file */ - CINIT(CRLFILE, OBJECTPOINT, 169), + CINIT(CRLFILE, STRINGPOINT, 169), /* Issuer certificate */ - CINIT(ISSUERCERT, OBJECTPOINT, 170), + CINIT(ISSUERCERT, STRINGPOINT, 170), /* (IPv6) Address scope */ CINIT(ADDRESS_SCOPE, LONG, 171), @@ -1419,12 +1465,12 @@ typedef enum { CINIT(CERTINFO, LONG, 172), /* "name" and "pwd" to use when fetching. */ - CINIT(USERNAME, OBJECTPOINT, 173), - CINIT(PASSWORD, OBJECTPOINT, 174), + CINIT(USERNAME, STRINGPOINT, 173), + CINIT(PASSWORD, STRINGPOINT, 174), /* "name" and "pwd" to use with Proxy when fetching. */ - CINIT(PROXYUSERNAME, OBJECTPOINT, 175), - CINIT(PROXYPASSWORD, OBJECTPOINT, 176), + CINIT(PROXYUSERNAME, STRINGPOINT, 175), + CINIT(PROXYPASSWORD, STRINGPOINT, 176), /* Comma separated list of hostnames defining no-proxy zones. These should match both hostnames directly, and hostnames within a domain. 
For @@ -1433,13 +1479,13 @@ typedef enum { implementations of this, .local.com will be considered to be the same as local.com. A single * is the only valid wildcard, and effectively disables the use of proxy. */ - CINIT(NOPROXY, OBJECTPOINT, 177), + CINIT(NOPROXY, STRINGPOINT, 177), /* block size for TFTP transfers */ CINIT(TFTP_BLKSIZE, LONG, 178), /* Socks Service */ - CINIT(SOCKS5_GSSAPI_SERVICE, OBJECTPOINT, 179), + CINIT(SOCKS5_GSSAPI_SERVICE, STRINGPOINT, 179), /* DEPRECATED, do not use! */ /* Socks Service */ CINIT(SOCKS5_GSSAPI_NEC, LONG, 180), @@ -1457,7 +1503,7 @@ typedef enum { CINIT(REDIR_PROTOCOLS, LONG, 182), /* set the SSH knownhost file name to use */ - CINIT(SSH_KNOWNHOSTS, OBJECTPOINT, 183), + CINIT(SSH_KNOWNHOSTS, STRINGPOINT, 183), /* set the SSH host key callback, must point to a curl_sshkeycallback function */ @@ -1467,9 +1513,9 @@ typedef enum { CINIT(SSH_KEYDATA, OBJECTPOINT, 185), /* set the SMTP mail originator */ - CINIT(MAIL_FROM, OBJECTPOINT, 186), + CINIT(MAIL_FROM, STRINGPOINT, 186), - /* set the SMTP mail receiver(s) */ + /* set the list of SMTP mail receiver(s) */ CINIT(MAIL_RCPT, OBJECTPOINT, 187), /* FTP: send PRET before PASV */ @@ -1479,13 +1525,13 @@ typedef enum { CINIT(RTSP_REQUEST, LONG, 189), /* The RTSP session identifier */ - CINIT(RTSP_SESSION_ID, OBJECTPOINT, 190), + CINIT(RTSP_SESSION_ID, STRINGPOINT, 190), /* The RTSP stream URI */ - CINIT(RTSP_STREAM_URI, OBJECTPOINT, 191), + CINIT(RTSP_STREAM_URI, STRINGPOINT, 191), /* The Transport: header to use in RTSP requests */ - CINIT(RTSP_TRANSPORT, OBJECTPOINT, 192), + CINIT(RTSP_TRANSPORT, STRINGPOINT, 192), /* Manually initialize the client RTSP CSeq for this handle */ CINIT(RTSP_CLIENT_CSEQ, LONG, 193), @@ -1523,13 +1569,13 @@ typedef enum { CINIT(RESOLVE, OBJECTPOINT, 203), /* Set a username for authenticated TLS */ - CINIT(TLSAUTH_USERNAME, OBJECTPOINT, 204), + CINIT(TLSAUTH_USERNAME, STRINGPOINT, 204), /* Set a password for authenticated TLS */ - 
CINIT(TLSAUTH_PASSWORD, OBJECTPOINT, 205), + CINIT(TLSAUTH_PASSWORD, STRINGPOINT, 205), /* Set authentication type for authenticated TLS */ - CINIT(TLSAUTH_TYPE, OBJECTPOINT, 206), + CINIT(TLSAUTH_TYPE, STRINGPOINT, 206), /* Set to 1 to enable the "TE:" header in HTTP requests to ask for compressed transfer-encoded responses. Set to 0 to disable the use of TE: @@ -1552,10 +1598,10 @@ typedef enum { CINIT(GSSAPI_DELEGATION, LONG, 210), /* Set the name servers to use for DNS resolution */ - CINIT(DNS_SERVERS, OBJECTPOINT, 211), + CINIT(DNS_SERVERS, STRINGPOINT, 211), /* Time-out accept operations (currently for FTP only) after this amount - of miliseconds. */ + of milliseconds. */ CINIT(ACCEPTTIMEOUT_MS, LONG, 212), /* Set TCP keepalive */ @@ -1569,7 +1615,7 @@ typedef enum { CINIT(SSL_OPTIONS, LONG, 216), /* Set the SMTP auth originator */ - CINIT(MAIL_AUTH, OBJECTPOINT, 217), + CINIT(MAIL_AUTH, STRINGPOINT, 217), /* Enable/disable SASL initial response */ CINIT(SASL_IR, LONG, 218), @@ -1580,23 +1626,23 @@ typedef enum { CINIT(XFERINFOFUNCTION, FUNCTIONPOINT, 219), /* The XOAUTH2 bearer token */ - CINIT(XOAUTH2_BEARER, OBJECTPOINT, 220), + CINIT(XOAUTH2_BEARER, STRINGPOINT, 220), /* Set the interface string to use as outgoing network * interface for DNS requests. * Only supported by the c-ares DNS backend */ - CINIT(DNS_INTERFACE, OBJECTPOINT, 221), + CINIT(DNS_INTERFACE, STRINGPOINT, 221), /* Set the local IPv4 address to use for outgoing DNS requests. * Only supported by the c-ares DNS backend */ - CINIT(DNS_LOCAL_IP4, OBJECTPOINT, 222), + CINIT(DNS_LOCAL_IP4, STRINGPOINT, 222), /* Set the local IPv4 address to use for outgoing DNS requests. 
* Only supported by the c-ares DNS backend */ - CINIT(DNS_LOCAL_IP6, OBJECTPOINT, 223), + CINIT(DNS_LOCAL_IP6, STRINGPOINT, 223), /* Set authentication options directly */ - CINIT(LOGIN_OPTIONS, OBJECTPOINT, 224), + CINIT(LOGIN_OPTIONS, STRINGPOINT, 224), /* Enable/disable TLS NPN extension (http2 over ssl might fail without) */ CINIT(SSL_ENABLE_NPN, LONG, 225), @@ -1617,10 +1663,124 @@ typedef enum { /* The public key in DER form used to validate the peer public key this option is used only if SSL_VERIFYPEER is true */ - CINIT(PINNEDPUBLICKEY, OBJECTPOINT, 230), + CINIT(PINNEDPUBLICKEY, STRINGPOINT, 230), /* Path to Unix domain socket */ - CINIT(UNIX_SOCKET_PATH, OBJECTPOINT, 231), + CINIT(UNIX_SOCKET_PATH, STRINGPOINT, 231), + + /* Set if we should verify the certificate status. */ + CINIT(SSL_VERIFYSTATUS, LONG, 232), + + /* Set if we should enable TLS false start. */ + CINIT(SSL_FALSESTART, LONG, 233), + + /* Do not squash dot-dot sequences */ + CINIT(PATH_AS_IS, LONG, 234), + + /* Proxy Service Name */ + CINIT(PROXY_SERVICE_NAME, STRINGPOINT, 235), + + /* Service Name */ + CINIT(SERVICE_NAME, STRINGPOINT, 236), + + /* Wait/don't wait for pipe/mutex to clarify */ + CINIT(PIPEWAIT, LONG, 237), + + /* Set the protocol used when curl is given a URL without a protocol */ + CINIT(DEFAULT_PROTOCOL, STRINGPOINT, 238), + + /* Set stream weight, 1 - 256 (default is 16) */ + CINIT(STREAM_WEIGHT, LONG, 239), + + /* Set stream dependency on another CURL handle */ + CINIT(STREAM_DEPENDS, OBJECTPOINT, 240), + + /* Set E-xclusive stream dependency on another CURL handle */ + CINIT(STREAM_DEPENDS_E, OBJECTPOINT, 241), + + /* Do not send any tftp option requests to the server */ + CINIT(TFTP_NO_OPTIONS, LONG, 242), + + /* Linked-list of host:port:connect-to-host:connect-to-port, + overrides the URL's host:port (only for the network layer) */ + CINIT(CONNECT_TO, OBJECTPOINT, 243), + + /* Set TCP Fast Open */ + CINIT(TCP_FASTOPEN, LONG, 244), + + /* Continue to send data if the 
server responds early with an + * HTTP status code >= 300 */ + CINIT(KEEP_SENDING_ON_ERROR, LONG, 245), + + /* The CApath or CAfile used to validate the proxy certificate + this option is used only if PROXY_SSL_VERIFYPEER is true */ + CINIT(PROXY_CAINFO, STRINGPOINT, 246), + + /* The CApath directory used to validate the proxy certificate + this option is used only if PROXY_SSL_VERIFYPEER is true */ + CINIT(PROXY_CAPATH, STRINGPOINT, 247), + + /* Set if we should verify the proxy in ssl handshake, + set 1 to verify. */ + CINIT(PROXY_SSL_VERIFYPEER, LONG, 248), + + /* Set if we should verify the Common name from the proxy certificate in ssl + * handshake, set 1 to check existence, 2 to ensure that it matches + * the provided hostname. */ + CINIT(PROXY_SSL_VERIFYHOST, LONG, 249), + + /* What version to specifically try to use for proxy. + See CURL_SSLVERSION defines below. */ + CINIT(PROXY_SSLVERSION, LONG, 250), + + /* Set a username for authenticated TLS for proxy */ + CINIT(PROXY_TLSAUTH_USERNAME, STRINGPOINT, 251), + + /* Set a password for authenticated TLS for proxy */ + CINIT(PROXY_TLSAUTH_PASSWORD, STRINGPOINT, 252), + + /* Set authentication type for authenticated TLS for proxy */ + CINIT(PROXY_TLSAUTH_TYPE, STRINGPOINT, 253), + + /* name of the file keeping your private SSL-certificate for proxy */ + CINIT(PROXY_SSLCERT, STRINGPOINT, 254), + + /* type of the file keeping your SSL-certificate ("DER", "PEM", "ENG") for + proxy */ + CINIT(PROXY_SSLCERTTYPE, STRINGPOINT, 255), + + /* name of the file keeping your private SSL-key for proxy */ + CINIT(PROXY_SSLKEY, STRINGPOINT, 256), + + /* type of the file keeping your private SSL-key ("DER", "PEM", "ENG") for + proxy */ + CINIT(PROXY_SSLKEYTYPE, STRINGPOINT, 257), + + /* password for the SSL private key for proxy */ + CINIT(PROXY_KEYPASSWD, STRINGPOINT, 258), + + /* Specify which SSL ciphers to use for proxy */ + CINIT(PROXY_SSL_CIPHER_LIST, STRINGPOINT, 259), + + /* CRL file for proxy */ + CINIT(PROXY_CRLFILE, 
STRINGPOINT, 260), + + /* Enable/disable specific SSL features with a bitmask for proxy, see + CURLSSLOPT_* */ + CINIT(PROXY_SSL_OPTIONS, LONG, 261), + + /* Name of pre proxy to use. */ + CINIT(PRE_PROXY, STRINGPOINT, 262), + + /* The public key in DER form used to validate the proxy public key + this option is used only if PROXY_SSL_VERIFYPEER is true */ + CINIT(PROXY_PINNEDPUBLICKEY, STRINGPOINT, 263), + + /* Path to an abstract Unix domain socket */ + CINIT(ABSTRACT_UNIX_SOCKET, STRINGPOINT, 264), + + /* Suppress proxy CONNECT response headers from user callbacks */ + CINIT(SUPPRESS_CONNECT_HEADERS, LONG, 265), CURLOPT_LASTENTRY /* the last unused */ } CURLoption; @@ -1671,11 +1831,19 @@ enum { for us! */ CURL_HTTP_VERSION_1_0, /* please use HTTP 1.0 in the request */ CURL_HTTP_VERSION_1_1, /* please use HTTP 1.1 in the request */ - CURL_HTTP_VERSION_2_0, /* please use HTTP 2.0 in the request */ + CURL_HTTP_VERSION_2_0, /* please use HTTP 2 in the request */ + CURL_HTTP_VERSION_2TLS, /* use version 2 for HTTPS, version 1.1 for HTTP */ + CURL_HTTP_VERSION_2_PRIOR_KNOWLEDGE, /* please use HTTP 2 without HTTP/1.1 + Upgrade */ CURL_HTTP_VERSION_LAST /* *ILLEGAL* http version */ }; +/* Convenience definition simple because the name of the version is HTTP/2 and + not 2.0. The 2_0 version of the enum name was set while the version was + still planned to be 2.0 and we stick to it for compatibility. 
*/ +#define CURL_HTTP_VERSION_2 CURL_HTTP_VERSION_2_0 + /* * Public API enums for RTSP requests */ @@ -1715,10 +1883,23 @@ enum { CURL_SSLVERSION_TLSv1_0, CURL_SSLVERSION_TLSv1_1, CURL_SSLVERSION_TLSv1_2, + CURL_SSLVERSION_TLSv1_3, CURL_SSLVERSION_LAST /* never use, keep last */ }; +enum { + CURL_SSLVERSION_MAX_NONE = 0, + CURL_SSLVERSION_MAX_DEFAULT = (CURL_SSLVERSION_TLSv1 << 16), + CURL_SSLVERSION_MAX_TLSv1_0 = (CURL_SSLVERSION_TLSv1_0 << 16), + CURL_SSLVERSION_MAX_TLSv1_1 = (CURL_SSLVERSION_TLSv1_1 << 16), + CURL_SSLVERSION_MAX_TLSv1_2 = (CURL_SSLVERSION_TLSv1_2 << 16), + CURL_SSLVERSION_MAX_TLSv1_3 = (CURL_SSLVERSION_TLSv1_3 << 16), + + /* never use, keep last */ + CURL_SSLVERSION_MAX_LAST = (CURL_SSLVERSION_LAST << 16) +}; + enum CURL_TLSAUTH { CURL_TLSAUTH_NONE, CURL_TLSAUTH_SRP, @@ -1749,7 +1930,10 @@ typedef enum { /* curl_strequal() and curl_strnequal() are subject for removal in a future - libcurl, see lib/README.curlx for details */ + libcurl, see lib/README.curlx for details + + !checksrc! disable SPACEBEFOREPAREN 2 +*/ CURL_EXTERN int (curl_strequal)(const char *s1, const char *s2); CURL_EXTERN int (curl_strnequal)(const char *s1, const char *s2, size_t n); @@ -1791,6 +1975,7 @@ typedef enum { CFINIT(OBSOLETE2), CFINIT(STREAM), + CFINIT(CONTENTLEN), /* added in 7.46.0, provide a curl_off_t length */ CURLFORM_LASTENTRY /* the last unused */ } CURLformoption; @@ -2045,12 +2230,18 @@ typedef enum { CURLSSLBACKEND_CYASSL = 7, CURLSSLBACKEND_SCHANNEL = 8, CURLSSLBACKEND_DARWINSSL = 9, - CURLSSLBACKEND_AXTLS = 10 + CURLSSLBACKEND_AXTLS = 10, + CURLSSLBACKEND_MBEDTLS = 11 } curl_sslbackend; +/* aliases for library clones and renames */ +#define CURLSSLBACKEND_LIBRESSL 1 +#define CURLSSLBACKEND_BORINGSSL 1 +#define CURLSSLBACKEND_WOLFSSL 6 + /* Information about the SSL library used and the respective internal SSL handle, which can be used to obtain further information regarding the - connection. Asked for with CURLINFO_TLS_SESSION. */ + connection. 
Asked for with CURLINFO_TLS_SSL_PTR or CURLINFO_TLS_SESSION. */ struct curl_tlssessioninfo { curl_sslbackend backend; void *internals; @@ -2060,6 +2251,7 @@ struct curl_tlssessioninfo { #define CURLINFO_LONG 0x200000 #define CURLINFO_DOUBLE 0x300000 #define CURLINFO_SLIST 0x400000 +#define CURLINFO_SOCKET 0x500000 #define CURLINFO_MASK 0x0fffff #define CURLINFO_TYPEMASK 0xf00000 @@ -2108,9 +2300,15 @@ typedef enum { CURLINFO_LOCAL_IP = CURLINFO_STRING + 41, CURLINFO_LOCAL_PORT = CURLINFO_LONG + 42, CURLINFO_TLS_SESSION = CURLINFO_SLIST + 43, + CURLINFO_ACTIVESOCKET = CURLINFO_SOCKET + 44, + CURLINFO_TLS_SSL_PTR = CURLINFO_SLIST + 45, + CURLINFO_HTTP_VERSION = CURLINFO_LONG + 46, + CURLINFO_PROXY_SSL_VERIFYRESULT = CURLINFO_LONG + 47, + CURLINFO_PROTOCOL = CURLINFO_LONG + 48, + CURLINFO_SCHEME = CURLINFO_STRING + 49, /* Fill in new entries below here! */ - CURLINFO_LASTONE = 43 + CURLINFO_LASTONE = 49 } CURLINFO; /* CURLINFO_RESPONSE_CODE is the new name for the option previously known as @@ -2172,7 +2370,6 @@ typedef void (*curl_unlock_function)(CURL *handle, curl_lock_data data, void *userptr); -typedef void CURLSH; typedef enum { CURLSHE_OK, /* all is fine */ @@ -2265,11 +2462,14 @@ typedef struct { #define CURL_VERSION_CURLDEBUG (1<<13) /* Debug memory tracking supported */ #define CURL_VERSION_TLSAUTH_SRP (1<<14) /* TLS-SRP auth is supported */ #define CURL_VERSION_NTLM_WB (1<<15) /* NTLM delegation to winbind helper - is suported */ + is supported */ #define CURL_VERSION_HTTP2 (1<<16) /* HTTP2 support built-in */ #define CURL_VERSION_GSSAPI (1<<17) /* Built against a GSS-API library */ #define CURL_VERSION_KERBEROS5 (1<<18) /* Kerberos V5 auth is supported */ #define CURL_VERSION_UNIX_SOCKETS (1<<19) /* Unix domain sockets support */ +#define CURL_VERSION_PSL (1<<20) /* Mozilla's Public Suffix List, used + for cookie domain verification */ +#define CURL_VERSION_HTTPS_PROXY (1<<21) /* HTTPS-proxy support built-in */ /* * NAME curl_version_info() diff --git 
a/compat/curl-for-windows/curl/include/curl/curlbuild.h b/compat/includes/curl/curlbuild.h similarity index 98% rename from compat/curl-for-windows/curl/include/curl/curlbuild.h rename to compat/includes/curl/curlbuild.h index f09419a843..ae95095fa5 100644 --- a/compat/curl-for-windows/curl/include/curl/curlbuild.h +++ b/compat/includes/curl/curlbuild.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2013, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2016, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. * * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -49,7 +49,7 @@ * * If you think that something actually needs to be changed, adjusted * or fixed in this file, then, report it on the libcurl development - * mailing list: http://cool.haxx.se/mailman/listinfo/curl-library/ + * mailing list: https://cool.haxx.se/mailman/listinfo/curl-library/ * * Try to keep one section per platform, compiler and architecture, * otherwise, if an existing section is reused for a different one and @@ -527,8 +527,9 @@ /* ===================================== */ #elif defined(__GNUC__) -# if defined(__ILP32__) || \ - defined(__i386__) || defined(__ppc__) || defined(__arm__) || defined(__sparc__) +# if !defined(__LP64__) && (defined(__ILP32__) || \ + defined(__i386__) || defined(__ppc__) || defined(__arm__) || \ + defined(__sparc__) || defined(__mips__) || defined(__sh__)) # define CURL_SIZEOF_LONG 4 # define CURL_TYPEOF_CURL_OFF_T long long # define CURL_FORMAT_CURL_OFF_T "lld" diff --git a/compat/curl-for-windows/curl/include/curl/curlrules.h b/compat/includes/curl/curlrules.h similarity 
index 91% rename from compat/curl-for-windows/curl/include/curl/curlrules.h rename to compat/includes/curl/curlrules.h index 7c2ede35b6..0abd9f71d8 100644 --- a/compat/curl-for-windows/curl/include/curl/curlrules.h +++ b/compat/includes/curl/curlrules.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2012, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2017, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. * * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -47,7 +47,7 @@ * library is properly built and used. * * You can find further help on the libcurl development mailing list: - * http://cool.haxx.se/mailman/listinfo/curl-library/ + * https://cool.haxx.se/mailman/listinfo/curl-library/ * * NOTE 2 * ------ @@ -105,11 +105,6 @@ Error Compilation_aborted_CURL_FORMAT_CURL_OFF_TU_is_missing #endif -#ifndef CURL_FORMAT_OFF_T -# error "CURL_FORMAT_OFF_T definition is missing!" - Error Compilation_aborted_CURL_FORMAT_OFF_T_is_missing -#endif - #ifndef CURL_SIZEOF_CURL_OFF_T # error "CURL_SIZEOF_CURL_OFF_T definition is missing!" Error Compilation_aborted_CURL_SIZEOF_CURL_OFF_T_is_missing @@ -241,22 +236,4 @@ typedef char #undef CurlchkszEQ #undef CurlchkszGE -/* - * Get rid of macros not intended to exist beyond this point. 
- */ - -#undef CURL_PULL_WS2TCPIP_H -#undef CURL_PULL_SYS_TYPES_H -#undef CURL_PULL_SYS_SOCKET_H -#undef CURL_PULL_SYS_POLL_H -#undef CURL_PULL_STDINT_H -#undef CURL_PULL_INTTYPES_H - -#undef CURL_TYPEOF_CURL_SOCKLEN_T -#undef CURL_TYPEOF_CURL_OFF_T - -#ifdef CURL_NO_OLDIES -#undef CURL_FORMAT_OFF_T /* not required since 7.19.0 - obsoleted in 7.20.0 */ -#endif - #endif /* __CURL_CURLRULES_H */ diff --git a/compat/curl-for-windows/curl/include/curl/curlver.h b/compat/includes/curl/curlver.h similarity index 77% rename from compat/curl-for-windows/curl/include/curl/curlver.h rename to compat/includes/curl/curlver.h index ccdafc1de1..95a2cbbe78 100644 --- a/compat/curl-for-windows/curl/include/curl/curlver.h +++ b/compat/includes/curl/curlver.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2015, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2017, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. * * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -26,16 +26,16 @@ a script at release-time. This was made its own header file in 7.11.2 */ /* This is the global package copyright */ -#define LIBCURL_COPYRIGHT "1996 - 2015 Daniel Stenberg, ." +#define LIBCURL_COPYRIGHT "1996 - 2017 Daniel Stenberg, ." 
/* This is the version number of the libcurl package from which this header file origins: */ -#define LIBCURL_VERSION "7.40.0" +#define LIBCURL_VERSION "7.54.0" /* The numeric version number is also available "in parts" by using these defines: */ #define LIBCURL_VERSION_MAJOR 7 -#define LIBCURL_VERSION_MINOR 40 +#define LIBCURL_VERSION_MINOR 54 #define LIBCURL_VERSION_PATCH 0 /* This is the numeric version of the libcurl version number, meant for easier @@ -52,8 +52,12 @@ This 6-digit (24 bits) hexadecimal number does not show pre-release number, and it is always a greater number in a more recent release. It makes comparisons with greater than and less than work. + + Note: This define is the full hex number and _does not_ use the + CURL_VERSION_BITS() macro since curl's own configure script greps for it + and needs it to contain the full number. */ -#define LIBCURL_VERSION_NUM 0x072800 +#define LIBCURL_VERSION_NUM 0x073600 /* * This is the date and time when the full source package was created. The @@ -64,6 +68,10 @@ * * "Mon Feb 12 11:35:33 UTC 2007" */ -#define LIBCURL_TIMESTAMP "Thu Jan 8 08:17:17 UTC 2015" +#define LIBCURL_TIMESTAMP "Wed Apr 19 05:43:55 UTC 2017" + +#define CURL_VERSION_BITS(x,y,z) ((x)<<16|(y)<<8|z) +#define CURL_AT_LEAST_VERSION(x,y,z) \ + (LIBCURL_VERSION_NUM >= CURL_VERSION_BITS(x, y, z)) #endif /* __CURL_CURLVER_H */ diff --git a/compat/curl-for-windows/curl/include/curl/easy.h b/compat/includes/curl/easy.h similarity index 94% rename from compat/curl-for-windows/curl/include/curl/easy.h rename to compat/includes/curl/easy.h index c1e3e76096..752c5049f8 100644 --- a/compat/curl-for-windows/curl/include/curl/easy.h +++ b/compat/includes/curl/easy.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2008, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2016, Daniel Stenberg, , et al. 
* * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. * * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -58,7 +58,7 @@ CURL_EXTERN CURLcode curl_easy_getinfo(CURL *curl, CURLINFO info, ...); * curl_easy_duphandle() for each new thread to avoid a series of identical * curl_easy_setopt() invokes in every thread. */ -CURL_EXTERN CURL* curl_easy_duphandle(CURL *curl); +CURL_EXTERN CURL *curl_easy_duphandle(CURL *curl); /* * NAME curl_easy_reset() diff --git a/compat/curl-for-windows/curl/include/curl/mprintf.h b/compat/includes/curl/mprintf.h similarity index 68% rename from compat/curl-for-windows/curl/include/curl/mprintf.h rename to compat/includes/curl/mprintf.h index cc9e7f5d1f..e20f546e19 100644 --- a/compat/curl-for-windows/curl/include/curl/mprintf.h +++ b/compat/includes/curl/mprintf.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2013, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2016, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. 
* * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -24,8 +24,7 @@ #include #include /* needed for FILE */ - -#include "curl.h" +#include "curl.h" /* for CURL_EXTERN */ #ifdef __cplusplus extern "C" { @@ -44,36 +43,6 @@ CURL_EXTERN int curl_mvsnprintf(char *buffer, size_t maxlength, CURL_EXTERN char *curl_maprintf(const char *format, ...); CURL_EXTERN char *curl_mvaprintf(const char *format, va_list args); -#ifdef _MPRINTF_REPLACE -# undef printf -# undef fprintf -# undef sprintf -# undef vsprintf -# undef snprintf -# undef vprintf -# undef vfprintf -# undef vsnprintf -# undef aprintf -# undef vaprintf -# define printf curl_mprintf -# define fprintf curl_mfprintf -#ifdef CURLDEBUG -/* When built with CURLDEBUG we define away the sprintf functions since we - don't want internal code to be using them */ -# define sprintf sprintf_was_used -# define vsprintf vsprintf_was_used -#else -# define sprintf curl_msprintf -# define vsprintf curl_mvsprintf -#endif -# define snprintf curl_msnprintf -# define vprintf curl_mvprintf -# define vfprintf curl_mvfprintf -# define vsnprintf curl_mvsnprintf -# define aprintf curl_maprintf -# define vaprintf curl_mvaprintf -#endif - #ifdef __cplusplus } #endif diff --git a/compat/curl-for-windows/curl/include/curl/multi.h b/compat/includes/curl/multi.h similarity index 90% rename from compat/curl-for-windows/curl/include/curl/multi.h rename to compat/includes/curl/multi.h index 3c4acb0f6e..f93e511be0 100644 --- a/compat/curl-for-windows/curl/include/curl/multi.h +++ b/compat/includes/curl/multi.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2013, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2016, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. 
The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. * * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -52,7 +52,11 @@ extern "C" { #endif +#if defined(BUILDING_LIBCURL) || defined(CURL_STRICTER) +typedef struct Curl_multi CURLM; +#else typedef void CURLM; +#endif typedef enum { CURLM_CALL_MULTI_PERFORM = -1, /* please call curl_multi_perform() or @@ -74,6 +78,11 @@ typedef enum { curl_multi_perform() and CURLM_CALL_MULTI_PERFORM */ #define CURLM_CALL_MULTI_SOCKET CURLM_CALL_MULTI_PERFORM +/* bitmask bits for CURLMOPT_PIPELINING */ +#define CURLPIPE_NOTHING 0L +#define CURLPIPE_HTTP1 1L +#define CURLPIPE_MULTIPLEX 2L + typedef enum { CURLMSG_NONE, /* first, not used */ CURLMSG_DONE, /* This easy handle has completed. 'result' contains @@ -209,7 +218,7 @@ CURL_EXTERN CURLMcode curl_multi_cleanup(CURLM *multi_handle); * curl_multi_cleanup(). * * The 'CURLMsg' struct is meant to be very simple and only contain - * very basic informations. If more involved information is wanted, + * very basic information. If more involved information is wanted, * we will provide the particular "transfer handle" in that struct * and that should/could/would be used in subsequent * curl_easy_getinfo() calls (or similar). 
The point being that we @@ -365,6 +374,12 @@ typedef enum { /* maximum number of open connections in total */ CINIT(MAX_TOTAL_CONNECTIONS, LONG, 13), + /* This is the server push callback function pointer */ + CINIT(PUSHFUNCTION, FUNCTIONPOINT, 14), + + /* This is the argument passed to the server push callback */ + CINIT(PUSHDATA, OBJECTPOINT, 15), + CURLMOPT_LASTENTRY /* the last unused */ } CURLMoption; @@ -392,6 +407,31 @@ CURL_EXTERN CURLMcode curl_multi_setopt(CURLM *multi_handle, CURL_EXTERN CURLMcode curl_multi_assign(CURLM *multi_handle, curl_socket_t sockfd, void *sockp); + +/* + * Name: curl_push_callback + * + * Desc: This callback gets called when a new stream is being pushed by the + * server. It approves or denies the new stream. + * + * Returns: CURL_PUSH_OK or CURL_PUSH_DENY. + */ +#define CURL_PUSH_OK 0 +#define CURL_PUSH_DENY 1 + +struct curl_pushheaders; /* forward declaration only */ + +CURL_EXTERN char *curl_pushheader_bynum(struct curl_pushheaders *h, + size_t num); +CURL_EXTERN char *curl_pushheader_byname(struct curl_pushheaders *h, + const char *name); + +typedef int (*curl_push_callback)(CURL *parent, + CURL *easy, + size_t num_headers, + struct curl_pushheaders *headers, + void *userp); + #ifdef __cplusplus } /* end of extern "C" */ #endif diff --git a/compat/curl-for-windows/curl/include/curl/stdcheaders.h b/compat/includes/curl/stdcheaders.h similarity index 82% rename from compat/curl-for-windows/curl/include/curl/stdcheaders.h rename to compat/includes/curl/stdcheaders.h index ad82ef6335..027b6f4211 100644 --- a/compat/curl-for-windows/curl/include/curl/stdcheaders.h +++ b/compat/includes/curl/stdcheaders.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2010, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2016, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. 
The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. * * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -24,8 +24,8 @@ #include -size_t fread (void *, size_t, size_t, FILE *); -size_t fwrite (const void *, size_t, size_t, FILE *); +size_t fread(void *, size_t, size_t, FILE *); +size_t fwrite(const void *, size_t, size_t, FILE *); int strcasecmp(const char *, const char *); int strncasecmp(const char *, const char *, size_t); diff --git a/compat/includes/curl/system.h b/compat/includes/curl/system.h new file mode 100644 index 0000000000..ed3a55c954 --- /dev/null +++ b/compat/includes/curl/system.h @@ -0,0 +1,484 @@ +#ifndef __CURL_SYSTEM_H +#define __CURL_SYSTEM_H +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 1998 - 2017, Daniel Stenberg, , et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at https://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +/* + * This header is supposed to eventually replace curlbuild.h. This little one + * is still learning. During the experimental phase, this header files + * defines symbols using the prefixes CURLSYS_ or curlsys_. 
When we feel + * confident enough, we replace curlbuild.h with this file and rename all + * prefixes to CURL_ and curl_. + */ + +/* + * Try to keep one section per platform, compiler and architecture, otherwise, + * if an existing section is reused for a different one and later on the + * original is adjusted, probably the piggybacking one can be adversely + * changed. + * + * In order to differentiate between platforms/compilers/architectures use + * only compiler built in predefined preprocessor symbols. + * + * curl_off_t + * ---------- + * + * For any given platform/compiler curl_off_t must be typedef'ed to a 64-bit + * wide signed integral data type. The width of this data type must remain + * constant and independent of any possible large file support settings. + * + * As an exception to the above, curl_off_t shall be typedef'ed to a 32-bit + * wide signed integral data type if there is no 64-bit type. + * + * As a general rule, curl_off_t shall not be mapped to off_t. This rule shall + * only be violated if off_t is the only 64-bit data type available and the + * size of off_t is independent of large file support settings. Keep your + * build on the safe side avoiding an off_t gating. If you have a 64-bit + * off_t then take for sure that another 64-bit data type exists, dig deeper + * and you will find it. 
+ * + */ + +#if defined(__DJGPP__) || defined(__GO32__) +# if defined(__DJGPP__) && (__DJGPP__ > 1) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# else +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__SALFORDC__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__BORLANDC__) +# if (__BORLANDC__ < 0x520) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# else +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T __int64 +# define CURLSYS_FORMAT_CURL_OFF_T "I64d" +# define CURLSYS_FORMAT_CURL_OFF_TU "I64u" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T i64 +# define CURLSYS_SUFFIX_CURL_OFF_TU ui64 +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__TURBOC__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define 
CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__WATCOMC__) +# if defined(__386__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T __int64 +# define CURLSYS_FORMAT_CURL_OFF_T "I64d" +# define CURLSYS_FORMAT_CURL_OFF_TU "I64u" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T i64 +# define CURLSYS_SUFFIX_CURL_OFF_TU ui64 +# else +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__POCC__) +# if (__POCC__ < 280) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# elif defined(_MSC_VER) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T __int64 +# define CURLSYS_FORMAT_CURL_OFF_T "I64d" +# define CURLSYS_FORMAT_CURL_OFF_TU "I64u" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T i64 +# define CURLSYS_SUFFIX_CURL_OFF_TU ui64 +# else +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 
4 + +#elif defined(__LCC__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__SYMBIAN32__) +# if defined(__EABI__) /* Treat all ARM compilers equally */ +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# elif defined(__CW32__) +# pragma longlong on +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# elif defined(__VC32__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T __int64 +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T unsigned int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__MWERKS__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(_WIN32_WCE) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T __int64 
+# define CURLSYS_FORMAT_CURL_OFF_T "I64d" +# define CURLSYS_FORMAT_CURL_OFF_TU "I64u" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T i64 +# define CURLSYS_SUFFIX_CURL_OFF_TU ui64 +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__MINGW32__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "I64d" +# define CURLSYS_FORMAT_CURL_OFF_TU "I64u" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T socklen_t +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 +# define CURLSYS_PULL_SYS_TYPES_H 1 +# define CURLSYS_PULL_WS2TCPIP_H 1 + +#elif defined(__VMS) +# if defined(__VAX) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# else +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T unsigned int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__OS400__) +# if defined(__ILEC400__) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T socklen_t +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 +# define CURLSYS_PULL_SYS_TYPES_H 1 +# define CURLSYS_PULL_SYS_SOCKET_H 1 +# endif + 
+#elif defined(__MVS__) +# if defined(__IBMC__) || defined(__IBMCPP__) +# if defined(_ILP32) +# define CURLSYS_SIZEOF_LONG 4 +# elif defined(_LP64) +# define CURLSYS_SIZEOF_LONG 8 +# endif +# if defined(_LONG_LONG) +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# elif defined(_LP64) +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# else +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T socklen_t +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 +# define CURLSYS_PULL_SYS_TYPES_H 1 +# define CURLSYS_PULL_SYS_SOCKET_H 1 +# endif + +#elif defined(__370__) +# if defined(__IBMC__) || defined(__IBMCPP__) +# if defined(_ILP32) +# define CURLSYS_SIZEOF_LONG 4 +# elif defined(_LP64) +# define CURLSYS_SIZEOF_LONG 8 +# endif +# if defined(_LONG_LONG) +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# elif defined(_LP64) +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# else +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define 
CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T socklen_t +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 +# define CURLSYS_PULL_SYS_TYPES_H 1 +# define CURLSYS_PULL_SYS_SOCKET_H 1 +# endif + +#elif defined(TPF) +# define CURLSYS_SIZEOF_LONG 8 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +#elif defined(__TINYC__) /* also known as tcc */ + +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T socklen_t +# define CURLSYS_PULL_SYS_TYPES_H 1 +# define CURLSYS_PULL_SYS_SOCKET_H 1 + +/* ===================================== */ +/* KEEP MSVC THE PENULTIMATE ENTRY */ +/* ===================================== */ + +#elif defined(_MSC_VER) +# if (_MSC_VER >= 900) && (_INTEGRAL_MAX_BITS >= 64) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T __int64 +# define CURLSYS_FORMAT_CURL_OFF_T "I64d" +# define CURLSYS_FORMAT_CURL_OFF_TU "I64u" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T i64 +# define CURLSYS_SUFFIX_CURL_OFF_TU ui64 +# else +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T 
int +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 + +/* ===================================== */ +/* KEEP GENERIC GCC THE LAST ENTRY */ +/* ===================================== */ + +#elif defined(__GNUC__) +# if !defined(__LP64__) && (defined(__ILP32__) || \ + defined(__i386__) || defined(__ppc__) || defined(__arm__) || \ + defined(__sparc__) || defined(__mips__) || defined(__sh__)) +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long long +# define CURLSYS_FORMAT_CURL_OFF_T "lld" +# define CURLSYS_FORMAT_CURL_OFF_TU "llu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T LL +# define CURLSYS_SUFFIX_CURL_OFF_TU ULL +# elif defined(__LP64__) || \ + defined(__x86_64__) || defined(__ppc64__) || defined(__sparc64__) +# define CURLSYS_SIZEOF_LONG 8 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SIZEOF_CURL_OFF_T 8 +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# endif +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T socklen_t +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 +# define CURLSYS_PULL_SYS_TYPES_H 1 +# define CURLSYS_PULL_SYS_SOCKET_H 1 + +#else +/* generic "safe guess" on old 32 bit style */ +# define CURLSYS_SIZEOF_LONG 4 +# define CURLSYS_SIZEOF_CURL_SOCKLEN_T 4 +# define CURLSYS_SIZEOF_CURL_OFF_T 4 +# define CURLSYS_TYPEOF_CURL_OFF_T long +# define CURLSYS_FORMAT_CURL_OFF_T "ld" +# define CURLSYS_FORMAT_CURL_OFF_TU "lu" +# define CURLSYS_SUFFIX_CURL_OFF_T L +# define CURLSYS_SUFFIX_CURL_OFF_TU UL +# define CURLSYS_TYPEOF_CURL_SOCKLEN_T int +#endif + +/* CURLSYS_PULL_WS2TCPIP_H is defined above when inclusion of header file */ +/* ws2tcpip.h is required here to properly make type definitions below. 
*/ +#ifdef CURLSYS_PULL_WS2TCPIP_H +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# include +# include +# include +#endif + +/* CURLSYS_PULL_SYS_TYPES_H is defined above when inclusion of header file */ +/* sys/types.h is required here to properly make type definitions below. */ +#ifdef CURLSYS_PULL_SYS_TYPES_H +# include +#endif + +/* CURLSYS_PULL_SYS_SOCKET_H is defined above when inclusion of header file */ +/* sys/socket.h is required here to properly make type definitions below. */ +#ifdef CURLSYS_PULL_SYS_SOCKET_H +# include +#endif + +/* Data type definition of curl_socklen_t. */ +#ifdef CURLSYS_TYPEOF_CURL_SOCKLEN_T + typedef CURLSYS_TYPEOF_CURL_SOCKLEN_T curlsys_socklen_t; +#endif + +/* Data type definition of curl_off_t. */ + +#ifdef CURLSYS_TYPEOF_CURL_OFF_T + typedef CURLSYS_TYPEOF_CURL_OFF_T curlsys_off_t; +#endif + +#endif /* __CURL_SYSTEM_H */ + diff --git a/compat/curl-for-windows/curl/include/curl/typecheck-gcc.h b/compat/includes/curl/typecheck-gcc.h similarity index 93% rename from compat/curl-for-windows/curl/include/curl/typecheck-gcc.h rename to compat/includes/curl/typecheck-gcc.h index 69d41a20d1..3d683152b6 100644 --- a/compat/curl-for-windows/curl/include/curl/typecheck-gcc.h +++ b/compat/includes/curl/typecheck-gcc.h @@ -7,11 +7,11 @@ * | (__| |_| | _ <| |___ * \___|\___/|_| \_\_____| * - * Copyright (C) 1998 - 2014, Daniel Stenberg, , et al. + * Copyright (C) 1998 - 2016, Daniel Stenberg, , et al. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. + * are also available at https://curl.haxx.se/docs/copyright.html. 
* * You may opt to use, copy, modify, merge, publish, distribute and/or sell * copies of the Software, and permit persons to whom the Software is @@ -40,7 +40,7 @@ */ #define curl_easy_setopt(handle, option, value) \ __extension__ ({ \ - __typeof__ (option) _curl_opt = option; \ + __typeof__(option) _curl_opt = option; \ if(__builtin_constant_p(_curl_opt)) { \ if(_curl_is_long_option(_curl_opt)) \ if(!_curl_is_long(value)) \ @@ -110,7 +110,7 @@ __extension__ ({ \ /* FIXME: don't allow const pointers */ #define curl_easy_getinfo(handle, info, arg) \ __extension__ ({ \ - __typeof__ (info) _curl_info = info; \ + __typeof__(info) _curl_info = info; \ if(__builtin_constant_p(_curl_info)) { \ if(_curl_is_string_info(_curl_info)) \ if(!_curl_is_arr((arg), char *)) \ @@ -151,7 +151,7 @@ _CURL_WARNING(_curl_easy_setopt_err_curl_off_t, "curl_easy_setopt expects a curl_off_t argument for this option") _CURL_WARNING(_curl_easy_setopt_err_string, "curl_easy_setopt expects a " - "string (char* or char[]) argument for this option" + "string ('char *' or char[]) argument for this option" ) _CURL_WARNING(_curl_easy_setopt_err_write_callback, "curl_easy_setopt expects a curl_write_callback argument for this option") @@ -182,24 +182,25 @@ _CURL_WARNING(_curl_easy_setopt_err_error_buffer, "curl_easy_setopt expects a " "char buffer of CURL_ERROR_SIZE as argument for this option") _CURL_WARNING(_curl_easy_setopt_err_FILE, - "curl_easy_setopt expects a FILE* argument for this option") + "curl_easy_setopt expects a 'FILE *' argument for this option") _CURL_WARNING(_curl_easy_setopt_err_postfields, - "curl_easy_setopt expects a void* or char* argument for this option") + "curl_easy_setopt expects a 'void *' or 'char *' argument for this option") _CURL_WARNING(_curl_easy_setopt_err_curl_httpost, - "curl_easy_setopt expects a struct curl_httppost* argument for this option") + "curl_easy_setopt expects a 'struct curl_httppost *' " + "argument for this option") 
_CURL_WARNING(_curl_easy_setopt_err_curl_slist, - "curl_easy_setopt expects a struct curl_slist* argument for this option") + "curl_easy_setopt expects a 'struct curl_slist *' argument for this option") _CURL_WARNING(_curl_easy_setopt_err_CURLSH, "curl_easy_setopt expects a CURLSH* argument for this option") _CURL_WARNING(_curl_easy_getinfo_err_string, - "curl_easy_getinfo expects a pointer to char * for this info") + "curl_easy_getinfo expects a pointer to 'char *' for this info") _CURL_WARNING(_curl_easy_getinfo_err_long, "curl_easy_getinfo expects a pointer to long for this info") _CURL_WARNING(_curl_easy_getinfo_err_double, "curl_easy_getinfo expects a pointer to double for this info") _CURL_WARNING(_curl_easy_getinfo_err_curl_slist, - "curl_easy_getinfo expects a pointer to struct curl_slist * for this info") + "curl_easy_getinfo expects a pointer to 'struct curl_slist *' for this info") /* groups of curl_easy_setops options that take the same type of argument */ @@ -218,58 +219,68 @@ _CURL_WARNING(_curl_easy_getinfo_err_curl_slist, /* evaluates to true if option takes a char* argument */ #define _curl_is_string_option(option) \ - ((option) == CURLOPT_URL || \ - (option) == CURLOPT_PROXY || \ - (option) == CURLOPT_INTERFACE || \ - (option) == CURLOPT_NETRC_FILE || \ - (option) == CURLOPT_USERPWD || \ - (option) == CURLOPT_USERNAME || \ - (option) == CURLOPT_PASSWORD || \ - (option) == CURLOPT_PROXYUSERPWD || \ - (option) == CURLOPT_PROXYUSERNAME || \ - (option) == CURLOPT_PROXYPASSWORD || \ - (option) == CURLOPT_NOPROXY || \ + ((option) == CURLOPT_ABSTRACT_UNIX_SOCKET || \ (option) == CURLOPT_ACCEPT_ENCODING || \ - (option) == CURLOPT_REFERER || \ - (option) == CURLOPT_USERAGENT || \ + (option) == CURLOPT_CAINFO || \ + (option) == CURLOPT_CAPATH || \ (option) == CURLOPT_COOKIE || \ (option) == CURLOPT_COOKIEFILE || \ (option) == CURLOPT_COOKIEJAR || \ (option) == CURLOPT_COOKIELIST || \ + (option) == CURLOPT_CRLFILE || \ + (option) == CURLOPT_CUSTOMREQUEST || 
\ + (option) == CURLOPT_DEFAULT_PROTOCOL || \ + (option) == CURLOPT_DNS_INTERFACE || \ + (option) == CURLOPT_DNS_LOCAL_IP4 || \ + (option) == CURLOPT_DNS_LOCAL_IP6 || \ + (option) == CURLOPT_DNS_SERVERS || \ + (option) == CURLOPT_EGDSOCKET || \ (option) == CURLOPT_FTPPORT || \ - (option) == CURLOPT_FTP_ALTERNATIVE_TO_USER || \ (option) == CURLOPT_FTP_ACCOUNT || \ - (option) == CURLOPT_RANGE || \ - (option) == CURLOPT_CUSTOMREQUEST || \ - (option) == CURLOPT_SSLCERT || \ - (option) == CURLOPT_SSLCERTTYPE || \ - (option) == CURLOPT_SSLKEY || \ - (option) == CURLOPT_SSLKEYTYPE || \ + (option) == CURLOPT_FTP_ALTERNATIVE_TO_USER || \ + (option) == CURLOPT_INTERFACE || \ + (option) == CURLOPT_ISSUERCERT || \ (option) == CURLOPT_KEYPASSWD || \ - (option) == CURLOPT_SSLENGINE || \ - (option) == CURLOPT_CAINFO || \ - (option) == CURLOPT_CAPATH || \ - (option) == CURLOPT_RANDOM_FILE || \ - (option) == CURLOPT_EGDSOCKET || \ - (option) == CURLOPT_SSL_CIPHER_LIST || \ (option) == CURLOPT_KRBLEVEL || \ - (option) == CURLOPT_SSH_HOST_PUBLIC_KEY_MD5 || \ - (option) == CURLOPT_SSH_PUBLIC_KEYFILE || \ - (option) == CURLOPT_SSH_PRIVATE_KEYFILE || \ - (option) == CURLOPT_CRLFILE || \ - (option) == CURLOPT_ISSUERCERT || \ - (option) == CURLOPT_SOCKS5_GSSAPI_SERVICE || \ - (option) == CURLOPT_SSH_KNOWNHOSTS || \ + (option) == CURLOPT_LOGIN_OPTIONS || \ + (option) == CURLOPT_MAIL_AUTH || \ (option) == CURLOPT_MAIL_FROM || \ + (option) == CURLOPT_NETRC_FILE || \ + (option) == CURLOPT_NOPROXY || \ + (option) == CURLOPT_PASSWORD || \ + (option) == CURLOPT_PINNEDPUBLICKEY || \ + (option) == CURLOPT_PROXY || \ + (option) == CURLOPT_PROXYPASSWORD || \ + (option) == CURLOPT_PROXYUSERNAME || \ + (option) == CURLOPT_PROXYUSERPWD || \ + (option) == CURLOPT_PROXY_SERVICE_NAME || \ + (option) == CURLOPT_RANDOM_FILE || \ + (option) == CURLOPT_RANGE || \ + (option) == CURLOPT_REFERER || \ (option) == CURLOPT_RTSP_SESSION_ID || \ (option) == CURLOPT_RTSP_STREAM_URI || \ (option) == 
CURLOPT_RTSP_TRANSPORT || \ + (option) == CURLOPT_SERVICE_NAME || \ + (option) == CURLOPT_SOCKS5_GSSAPI_SERVICE || \ + (option) == CURLOPT_SSH_HOST_PUBLIC_KEY_MD5 || \ + (option) == CURLOPT_SSH_KNOWNHOSTS || \ + (option) == CURLOPT_SSH_PRIVATE_KEYFILE || \ + (option) == CURLOPT_SSH_PUBLIC_KEYFILE || \ + (option) == CURLOPT_SSLCERT || \ + (option) == CURLOPT_SSLCERTTYPE || \ + (option) == CURLOPT_SSLENGINE || \ + (option) == CURLOPT_SSLKEY || \ + (option) == CURLOPT_SSLKEYTYPE || \ + (option) == CURLOPT_SSL_CIPHER_LIST || \ + (option) == CURLOPT_TLSAUTH_PASSWORD || \ + (option) == CURLOPT_TLSAUTH_TYPE || \ + (option) == CURLOPT_TLSAUTH_USERNAME || \ + (option) == CURLOPT_UNIX_SOCKET_PATH || \ + (option) == CURLOPT_URL || \ + (option) == CURLOPT_USERAGENT || \ + (option) == CURLOPT_USERNAME || \ + (option) == CURLOPT_USERPWD || \ (option) == CURLOPT_XOAUTH2_BEARER || \ - (option) == CURLOPT_DNS_SERVERS || \ - (option) == CURLOPT_DNS_INTERFACE || \ - (option) == CURLOPT_DNS_LOCAL_IP4 || \ - (option) == CURLOPT_DNS_LOCAL_IP6 || \ - (option) == CURLOPT_LOGIN_OPTIONS || \ 0) /* evaluates to true if option takes a curl_write_callback argument */ @@ -285,21 +296,22 @@ _CURL_WARNING(_curl_easy_getinfo_err_curl_slist, /* evaluates to true if option takes a data argument to pass to a callback */ #define _curl_is_cb_data_option(option) \ - ((option) == CURLOPT_WRITEDATA || \ - (option) == CURLOPT_READDATA || \ + ((option) == CURLOPT_CHUNK_DATA || \ + (option) == CURLOPT_CLOSESOCKETDATA || \ + (option) == CURLOPT_DEBUGDATA || \ + (option) == CURLOPT_FNMATCH_DATA || \ + (option) == CURLOPT_HEADERDATA || \ + (option) == CURLOPT_INTERLEAVEDATA || \ (option) == CURLOPT_IOCTLDATA || \ - (option) == CURLOPT_SOCKOPTDATA || \ (option) == CURLOPT_OPENSOCKETDATA || \ + (option) == CURLOPT_PRIVATE || \ (option) == CURLOPT_PROGRESSDATA || \ - (option) == CURLOPT_HEADERDATA || \ - (option) == CURLOPT_DEBUGDATA || \ - (option) == CURLOPT_SSL_CTX_DATA || \ + (option) == CURLOPT_READDATA || \ 
(option) == CURLOPT_SEEKDATA || \ - (option) == CURLOPT_PRIVATE || \ + (option) == CURLOPT_SOCKOPTDATA || \ (option) == CURLOPT_SSH_KEYDATA || \ - (option) == CURLOPT_INTERLEAVEDATA || \ - (option) == CURLOPT_CHUNK_DATA || \ - (option) == CURLOPT_FNMATCH_DATA || \ + (option) == CURLOPT_SSL_CTX_DATA || \ + (option) == CURLOPT_WRITEDATA || \ 0) /* evaluates to true if option takes a POST data argument (void* or char*) */ @@ -310,13 +322,15 @@ _CURL_WARNING(_curl_easy_getinfo_err_curl_slist, /* evaluates to true if option takes a struct curl_slist * argument */ #define _curl_is_slist_option(option) \ - ((option) == CURLOPT_HTTPHEADER || \ - (option) == CURLOPT_HTTP200ALIASES || \ - (option) == CURLOPT_QUOTE || \ + ((option) == CURLOPT_HTTP200ALIASES || \ + (option) == CURLOPT_HTTPHEADER || \ + (option) == CURLOPT_MAIL_RCPT || \ (option) == CURLOPT_POSTQUOTE || \ (option) == CURLOPT_PREQUOTE || \ + (option) == CURLOPT_PROXYHEADER || \ + (option) == CURLOPT_QUOTE || \ + (option) == CURLOPT_RESOLVE || \ (option) == CURLOPT_TELNETOPTIONS || \ - (option) == CURLOPT_MAIL_RCPT || \ 0) /* groups of curl_easy_getinfo infos that take the same type of argument */ @@ -351,7 +365,7 @@ _CURL_WARNING(_curl_easy_getinfo_err_curl_slist, /* XXX: should evaluate to true iff expr is a pointer */ #define _curl_is_any_ptr(expr) \ - (sizeof(expr) == sizeof(void*)) + (sizeof(expr) == sizeof(void *)) /* evaluates to true if expr is NULL */ /* XXX: must not evaluate expr, so this check is not accurate */ @@ -443,12 +457,12 @@ _CURL_WARNING(_curl_easy_getinfo_err_curl_slist, _curl_callback_compatible((expr), _curl_read_callback4) || \ _curl_callback_compatible((expr), _curl_read_callback5) || \ _curl_callback_compatible((expr), _curl_read_callback6)) -typedef size_t (_curl_read_callback1)(char *, size_t, size_t, void*); -typedef size_t (_curl_read_callback2)(char *, size_t, size_t, const void*); -typedef size_t (_curl_read_callback3)(char *, size_t, size_t, FILE*); -typedef size_t 
(_curl_read_callback4)(void *, size_t, size_t, void*); -typedef size_t (_curl_read_callback5)(void *, size_t, size_t, const void*); -typedef size_t (_curl_read_callback6)(void *, size_t, size_t, FILE*); +typedef size_t (_curl_read_callback1)(char *, size_t, size_t, void *); +typedef size_t (_curl_read_callback2)(char *, size_t, size_t, const void *); +typedef size_t (_curl_read_callback3)(char *, size_t, size_t, FILE *); +typedef size_t (_curl_read_callback4)(void *, size_t, size_t, void *); +typedef size_t (_curl_read_callback5)(void *, size_t, size_t, const void *); +typedef size_t (_curl_read_callback6)(void *, size_t, size_t, FILE *); /* evaluates to true if expr is of type curl_write_callback or "similar" */ #define _curl_is_write_cb(expr) \ @@ -461,14 +475,14 @@ typedef size_t (_curl_read_callback6)(void *, size_t, size_t, FILE*); _curl_callback_compatible((expr), _curl_write_callback4) || \ _curl_callback_compatible((expr), _curl_write_callback5) || \ _curl_callback_compatible((expr), _curl_write_callback6)) -typedef size_t (_curl_write_callback1)(const char *, size_t, size_t, void*); +typedef size_t (_curl_write_callback1)(const char *, size_t, size_t, void *); typedef size_t (_curl_write_callback2)(const char *, size_t, size_t, - const void*); -typedef size_t (_curl_write_callback3)(const char *, size_t, size_t, FILE*); -typedef size_t (_curl_write_callback4)(const void *, size_t, size_t, void*); + const void *); +typedef size_t (_curl_write_callback3)(const char *, size_t, size_t, FILE *); +typedef size_t (_curl_write_callback4)(const void *, size_t, size_t, void *); typedef size_t (_curl_write_callback5)(const void *, size_t, size_t, - const void*); -typedef size_t (_curl_write_callback6)(const void *, size_t, size_t, FILE*); + const void *); +typedef size_t (_curl_write_callback6)(const void *, size_t, size_t, FILE *); /* evaluates to true if expr is of type curl_ioctl_callback or "similar" */ #define _curl_is_ioctl_cb(expr) \ @@ -478,10 +492,10 @@ 
typedef size_t (_curl_write_callback6)(const void *, size_t, size_t, FILE*); _curl_callback_compatible((expr), _curl_ioctl_callback2) || \ _curl_callback_compatible((expr), _curl_ioctl_callback3) || \ _curl_callback_compatible((expr), _curl_ioctl_callback4)) -typedef curlioerr (_curl_ioctl_callback1)(CURL *, int, void*); -typedef curlioerr (_curl_ioctl_callback2)(CURL *, int, const void*); -typedef curlioerr (_curl_ioctl_callback3)(CURL *, curliocmd, void*); -typedef curlioerr (_curl_ioctl_callback4)(CURL *, curliocmd, const void*); +typedef curlioerr (_curl_ioctl_callback1)(CURL *, int, void *); +typedef curlioerr (_curl_ioctl_callback2)(CURL *, int, const void *); +typedef curlioerr (_curl_ioctl_callback3)(CURL *, curliocmd, void *); +typedef curlioerr (_curl_ioctl_callback4)(CURL *, curliocmd, const void *); /* evaluates to true if expr is of type curl_sockopt_callback or "similar" */ #define _curl_is_sockopt_cb(expr) \ diff --git a/compat/pthreads/pthread.h b/compat/includes/pthreads/pthread.h similarity index 100% rename from compat/pthreads/pthread.h rename to compat/includes/pthreads/pthread.h diff --git a/compat/pthreads/sched.h b/compat/includes/pthreads/sched.h similarity index 100% rename from compat/pthreads/sched.h rename to compat/includes/pthreads/sched.h diff --git a/compat/curl-for-windows/zlib/zconf.h b/compat/includes/zlib/zconf.h similarity index 100% rename from compat/curl-for-windows/zlib/zconf.h rename to compat/includes/zlib/zconf.h diff --git a/compat/curl-for-windows/zlib/zlib.h b/compat/includes/zlib/zlib.h similarity index 100% rename from compat/curl-for-windows/zlib/zlib.h rename to compat/includes/zlib/zlib.h diff --git a/compat/jansson/config.h b/compat/jansson/config.h deleted file mode 100644 index 43858aa61f..0000000000 --- a/compat/jansson/config.h +++ /dev/null @@ -1,73 +0,0 @@ -/* config.h. Generated from config.h.in by configure. */ -/* config.h.in. Generated from configure.ac by autoheader. 
*/ - -/* Define to 1 if you have the header file. */ -#define HAVE_DLFCN_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_INTTYPES_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_MEMORY_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDINT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDLIB_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRINGS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRING_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_STAT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_TYPES_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_UNISTD_H 1 - -/* Define to the sub-directory in which libtool stores uninstalled libraries. - */ -#define LT_OBJDIR ".libs/" - -/* Name of package */ -#define PACKAGE "jansson" - -/* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "petri@digip.org" - -/* Define to the full name of this package. */ -#define PACKAGE_NAME "jansson" - -/* Define to the full name and version of this package. */ -#define PACKAGE_STRING "jansson 1.3" - -/* Define to the one symbol short name of this package. */ -#define PACKAGE_TARNAME "jansson" - -/* Define to the home page for this package. */ -#define PACKAGE_URL "" - -/* Define to the version of this package. */ -#define PACKAGE_VERSION "1.3" - -/* Define to 1 if you have the ANSI C header files. */ -#define STDC_HEADERS 1 - -/* Version number of package */ -#define VERSION "1.3" - -/* Define to `__inline__' or `__inline' if that's what the C compiler - calls it, or to nothing if 'inline' is not supported under any name. */ -#ifndef __cplusplus -/* #undef inline */ -#endif - -/* Define to the type of a signed integer type of width exactly 32 bits if - such a type exists and the standard includes do not define it. 
*/ -/* #undef int32_t */ diff --git a/compat/jansson/jansson_config.h b/compat/jansson/jansson_config.h index 90ca129281..42421e8a86 100644 --- a/compat/jansson/jansson_config.h +++ b/compat/jansson/jansson_config.h @@ -22,8 +22,10 @@ supported. */ #ifdef _MSC_VER +#ifndef __cplusplus #define inline __inline #endif +#endif #ifdef __cplusplus #define JSON_INLINE inline diff --git a/compat/libs/x64/jansson.lib b/compat/libs/x64/jansson.lib new file mode 100644 index 0000000000..925e87c850 Binary files /dev/null and b/compat/libs/x64/jansson.lib differ diff --git a/compat/libs/x64/libcrypto.lib b/compat/libs/x64/libcrypto.lib new file mode 100644 index 0000000000..a364a4d822 Binary files /dev/null and b/compat/libs/x64/libcrypto.lib differ diff --git a/compat/libs/x64/libcurl.lib b/compat/libs/x64/libcurl.lib new file mode 100644 index 0000000000..ded35f14e4 Binary files /dev/null and b/compat/libs/x64/libcurl.lib differ diff --git a/compat/pthreads/x64/pthreadVC2.lib b/compat/libs/x64/pthreadVC2.lib similarity index 100% rename from compat/pthreads/x64/pthreadVC2.lib rename to compat/libs/x64/pthreadVC2.lib diff --git a/compat/libs/x64/zlibstat.lib b/compat/libs/x64/zlibstat.lib new file mode 100644 index 0000000000..5078caf7a1 Binary files /dev/null and b/compat/libs/x64/zlibstat.lib differ diff --git a/compat/libs/x86/jansson.lib b/compat/libs/x86/jansson.lib new file mode 100644 index 0000000000..10b32d1b30 Binary files /dev/null and b/compat/libs/x86/jansson.lib differ diff --git a/compat/libs/x86/libcrypto.lib b/compat/libs/x86/libcrypto.lib new file mode 100644 index 0000000000..6a7068f058 Binary files /dev/null and b/compat/libs/x86/libcrypto.lib differ diff --git a/compat/libs/x86/libcurl.lib b/compat/libs/x86/libcurl.lib new file mode 100644 index 0000000000..e227cfdfe3 Binary files /dev/null and b/compat/libs/x86/libcurl.lib differ diff --git a/compat/pthreads/x86/pthreadVC2.lib b/compat/libs/x86/pthreadVC2.lib similarity index 100% rename from 
compat/pthreads/x86/pthreadVC2.lib rename to compat/libs/x86/pthreadVC2.lib diff --git a/compat/libs/x86/zlibstat.lib b/compat/libs/x86/zlibstat.lib new file mode 100644 index 0000000000..387e902b03 Binary files /dev/null and b/compat/libs/x86/zlibstat.lib differ diff --git a/compat/winansi.c b/compat/winansi.cpp similarity index 96% rename from compat/winansi.c rename to compat/winansi.cpp index 50e8388ac1..802f93ced9 100644 --- a/compat/winansi.c +++ b/compat/winansi.cpp @@ -1,3 +1,15 @@ +extern void proper_exit(int reason); +enum +{ + LOG_ERR, + LOG_WARNING, + LOG_NOTICE, + LOG_INFO, + LOG_DEBUG, + /* custom notices */ + LOG_BLUE = 0x10, +}; +extern void applog(int prio, const char *fmt, ...); /** * Old Git implementation of windows terminal colors (2009) * before use of a threaded wrapper. @@ -345,9 +357,12 @@ int winansi_vfprintf(FILE *stream, const char *format, va_list list) va_end(cp); if (len > sizeof(small_buf) - 1) { - buf = malloc(len + 1); - if (!buf) - goto abort; + buf = (char*)malloc(len + 1); + if(buf == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } len = vsnprintf(buf, len + 1, format, list); #ifdef WIN32 diff --git a/compile b/compile deleted file mode 100644 index b1f4749152..0000000000 --- a/compile +++ /dev/null @@ -1,310 +0,0 @@ -#! /bin/sh -# Wrapper for compilers which do not understand '-c -o'. - -scriptversion=2012-01-04.17; # UTC - -# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2009, 2010, 2012 Free -# Software Foundation, Inc. -# Written by Tom Tromey . -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -# This file is maintained in Automake, please report -# bugs to or send patches to -# . - -nl=' -' - -# We need space, tab and new line, in precisely that order. Quoting is -# there to prevent tools from complaining about whitespace usage. -IFS=" "" $nl" - -file_conv= - -# func_file_conv build_file lazy -# Convert a $build file to $host form and store it in $file -# Currently only supports Windows hosts. If the determined conversion -# type is listed in (the comma separated) LAZY, no conversion will -# take place. -func_file_conv () -{ - file=$1 - case $file in - / | /[!/]*) # absolute file, and not a UNC file - if test -z "$file_conv"; then - # lazily determine how to convert abs files - case `uname -s` in - MINGW*) - file_conv=mingw - ;; - CYGWIN*) - file_conv=cygwin - ;; - *) - file_conv=wine - ;; - esac - fi - case $file_conv/,$2, in - *,$file_conv,*) - ;; - mingw/*) - file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` - ;; - cygwin/*) - file=`cygpath -m "$file" || echo "$file"` - ;; - wine/*) - file=`winepath -w "$file" || echo "$file"` - ;; - esac - ;; - esac -} - -# func_cl_wrapper cl arg... -# Adjust compile command to suit cl -func_cl_wrapper () -{ - # Assume a capable shell - lib_path= - shared=: - linker_opts= - for arg - do - if test -n "$eat"; then - eat= - else - case $1 in - -o) - # configure might choose to run compile as 'compile cc -o foo foo.c'. 
- eat=1 - case $2 in - *.o | *.[oO][bB][jJ]) - func_file_conv "$2" - set x "$@" -Fo"$file" - shift - ;; - *) - func_file_conv "$2" - set x "$@" -Fe"$file" - shift - ;; - esac - ;; - -I*) - func_file_conv "${1#-I}" mingw - set x "$@" -I"$file" - shift - ;; - -l*) - lib=${1#-l} - found=no - save_IFS=$IFS - IFS=';' - for dir in $lib_path $LIB - do - IFS=$save_IFS - if $shared && test -f "$dir/$lib.dll.lib"; then - found=yes - set x "$@" "$dir/$lib.dll.lib" - break - fi - if test -f "$dir/$lib.lib"; then - found=yes - set x "$@" "$dir/$lib.lib" - break - fi - done - IFS=$save_IFS - - test "$found" != yes && set x "$@" "$lib.lib" - shift - ;; - -L*) - func_file_conv "${1#-L}" - if test -z "$lib_path"; then - lib_path=$file - else - lib_path="$lib_path;$file" - fi - linker_opts="$linker_opts -LIBPATH:$file" - ;; - -static) - shared=false - ;; - -Wl,*) - arg=${1#-Wl,} - save_ifs="$IFS"; IFS=',' - for flag in $arg; do - IFS="$save_ifs" - linker_opts="$linker_opts $flag" - done - IFS="$save_ifs" - ;; - -Xlinker) - eat=1 - linker_opts="$linker_opts $2" - ;; - -*) - set x "$@" "$1" - shift - ;; - *.cc | *.CC | *.cxx | *.CXX | *.[cC]++) - func_file_conv "$1" - set x "$@" -Tp"$file" - shift - ;; - *.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO]) - func_file_conv "$1" mingw - set x "$@" "$file" - shift - ;; - *) - set x "$@" "$1" - shift - ;; - esac - fi - shift - done - if test -n "$linker_opts"; then - linker_opts="-link$linker_opts" - fi - exec "$@" $linker_opts - exit 1 -} - -eat= - -case $1 in - '') - echo "$0: No command. Try '$0 --help' for more information." 1>&2 - exit 1; - ;; - -h | --h*) - cat <<\EOF -Usage: compile [--help] [--version] PROGRAM [ARGS] - -Wrapper for compilers which do not understand '-c -o'. -Remove '-o dest.o' from ARGS, run PROGRAM with the remaining -arguments, and rename the output as expected. - -If you are trying to build a whole package this is not the -right script to run: please start by reading the file 'INSTALL'. 
- -Report bugs to . -EOF - exit $? - ;; - -v | --v*) - echo "compile $scriptversion" - exit $? - ;; - cl | *[/\\]cl | cl.exe | *[/\\]cl.exe ) - func_cl_wrapper "$@" # Doesn't return... - ;; -esac - -ofile= -cfile= - -for arg -do - if test -n "$eat"; then - eat= - else - case $1 in - -o) - # configure might choose to run compile as 'compile cc -o foo foo.c'. - # So we strip '-o arg' only if arg is an object. - eat=1 - case $2 in - *.o | *.obj) - ofile=$2 - ;; - *) - set x "$@" -o "$2" - shift - ;; - esac - ;; - *.c) - cfile=$1 - set x "$@" "$1" - shift - ;; - *) - set x "$@" "$1" - shift - ;; - esac - fi - shift -done - -if test -z "$ofile" || test -z "$cfile"; then - # If no '-o' option was seen then we might have been invoked from a - # pattern rule where we don't need one. That is ok -- this is a - # normal compilation that the losing compiler can handle. If no - # '.c' file was seen then we are probably linking. That is also - # ok. - exec "$@" -fi - -# Name of file we expect compiler to create. -cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'` - -# Create the lock directory. -# Note: use '[/\\:.-]' here to ensure that we don't use the same name -# that we are using for the .o file. Also, base the name on the expected -# object file name, since that is what matters with a parallel build. -lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d -while true; do - if mkdir "$lockdir" >/dev/null 2>&1; then - break - fi - sleep 1 -done -# FIXME: race condition here if user kills between mkdir and trap. -trap "rmdir '$lockdir'; exit 1" 1 2 15 - -# Run the compile. -"$@" -ret=$? 
- -if test -f "$cofile"; then - test "$cofile" = "$ofile" || mv "$cofile" "$ofile" -elif test -f "${cofile}bj"; then - test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile" -fi - -rmdir "$lockdir" -exit $ret - -# Local Variables: -# mode: shell-script -# sh-indentation: 2 -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" -# time-stamp-end: "; # UTC" -# End: diff --git a/config.guess b/config.guess deleted file mode 100644 index f32079abda..0000000000 --- a/config.guess +++ /dev/null @@ -1,1526 +0,0 @@ -#! /bin/sh -# Attempt to guess a canonical system name. -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, -# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 -# Free Software Foundation, Inc. - -timestamp='2008-01-23' - -# This file is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA -# 02110-1301, USA. -# -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - - -# Originally written by Per Bothner . -# Please send patches to . Submit a context -# diff and a properly formatted ChangeLog entry. 
-# -# This script attempts to guess a canonical system name similar to -# config.sub. If it succeeds, it prints the system name on stdout, and -# exits with 0. Otherwise, it exits with 1. -# -# The plan is that this can be called by configure scripts if you -# don't specify an explicit build system type. - -me=`echo "$0" | sed -e 's,.*/,,'` - -usage="\ -Usage: $0 [OPTION] - -Output the configuration name of the system \`$me' is run on. - -Operation modes: - -h, --help print this help, then exit - -t, --time-stamp print date of last modification, then exit - -v, --version print version number, then exit - -Report bugs and patches to ." - -version="\ -GNU config.guess ($timestamp) - -Originally written by Per Bothner. -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, -2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. - -This is free software; see the source for copying conditions. There is NO -warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." - -help=" -Try \`$me --help' for more information." - -# Parse command line -while test $# -gt 0 ; do - case $1 in - --time-stamp | --time* | -t ) - echo "$timestamp" ; exit ;; - --version | -v ) - echo "$version" ; exit ;; - --help | --h* | -h ) - echo "$usage"; exit ;; - -- ) # Stop option processing - shift; break ;; - - ) # Use stdin as input. - break ;; - -* ) - echo "$me: invalid option $1$help" >&2 - exit 1 ;; - * ) - break ;; - esac -done - -if test $# != 0; then - echo "$me: too many arguments$help" >&2 - exit 1 -fi - -trap 'exit 1' 1 2 15 - -# CC_FOR_BUILD -- compiler used by this script. Note that the use of a -# compiler to aid in system detection is discouraged as it requires -# temporary files to be created and, as you can see below, it is a -# headache to deal with in a portable fashion. - -# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still -# use `HOST_CC' if defined, but it is deprecated. 
- -# Portable tmp directory creation inspired by the Autoconf team. - -set_cc_for_build=' -trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; -trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; -: ${TMPDIR=/tmp} ; - { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || - { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || - { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || - { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; -dummy=$tmp/dummy ; -tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; -case $CC_FOR_BUILD,$HOST_CC,$CC in - ,,) echo "int x;" > $dummy.c ; - for c in cc gcc c89 c99 ; do - if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then - CC_FOR_BUILD="$c"; break ; - fi ; - done ; - if test x"$CC_FOR_BUILD" = x ; then - CC_FOR_BUILD=no_compiler_found ; - fi - ;; - ,,*) CC_FOR_BUILD=$CC ;; - ,*,*) CC_FOR_BUILD=$HOST_CC ;; -esac ; set_cc_for_build= ;' - -# This is needed to find uname on a Pyramid OSx when run in the BSD universe. -# (ghazi@noc.rutgers.edu 1994-08-24) -if (test -f /.attbin/uname) >/dev/null 2>&1 ; then - PATH=$PATH:/.attbin ; export PATH -fi - -UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown -UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown -UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown -UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown - -# Note: order is significant - the case branches are not exclusive. - -case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in - *:NetBSD:*:*) - # NetBSD (nbsd) targets should (where applicable) match one or - # more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*, - # *-*-netbsdecoff* and *-*-netbsd*. 
For targets that recently - # switched to ELF, *-*-netbsd* would select the old - # object file format. This provides both forward - # compatibility and a consistent mechanism for selecting the - # object file format. - # - # Note: NetBSD doesn't particularly care about the vendor - # portion of the name. We always set it to "unknown". - sysctl="sysctl -n hw.machine_arch" - UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ - /usr/sbin/$sysctl 2>/dev/null || echo unknown)` - case "${UNAME_MACHINE_ARCH}" in - armeb) machine=armeb-unknown ;; - arm*) machine=arm-unknown ;; - sh3el) machine=shl-unknown ;; - sh3eb) machine=sh-unknown ;; - sh5el) machine=sh5le-unknown ;; - *) machine=${UNAME_MACHINE_ARCH}-unknown ;; - esac - # The Operating System including object format, if it has switched - # to ELF recently, or will in the future. - case "${UNAME_MACHINE_ARCH}" in - arm*|i386|m68k|ns32k|sh3*|sparc|vax) - eval $set_cc_for_build - if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ - | grep __ELF__ >/dev/null - then - # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). - # Return netbsd for either. FIX? - os=netbsd - else - os=netbsdelf - fi - ;; - *) - os=netbsd - ;; - esac - # The OS release - # Debian GNU/NetBSD machines have a different userland, and - # thus, need a distinct triplet. However, they do not need - # kernel version information, so it can be replaced with a - # suitable tag, in the style of linux-gnu. - case "${UNAME_VERSION}" in - Debian*) - release='-gnu' - ;; - *) - release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` - ;; - esac - # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: - # contains redundant information, the shorter form: - # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. 
- echo "${machine}-${os}${release}" - exit ;; - *:OpenBSD:*:*) - UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` - echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} - exit ;; - *:ekkoBSD:*:*) - echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} - exit ;; - *:SolidBSD:*:*) - echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} - exit ;; - macppc:MirBSD:*:*) - echo powerpc-unknown-mirbsd${UNAME_RELEASE} - exit ;; - *:MirBSD:*:*) - echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} - exit ;; - alpha:OSF1:*:*) - case $UNAME_RELEASE in - *4.0) - UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` - ;; - *5.*) - UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` - ;; - esac - # According to Compaq, /usr/sbin/psrinfo has been available on - # OSF/1 and Tru64 systems produced since 1995. I hope that - # covers most systems running today. This code pipes the CPU - # types through head -n 1, so we only detect the type of CPU 0. - ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` - case "$ALPHA_CPU_TYPE" in - "EV4 (21064)") - UNAME_MACHINE="alpha" ;; - "EV4.5 (21064)") - UNAME_MACHINE="alpha" ;; - "LCA4 (21066/21068)") - UNAME_MACHINE="alpha" ;; - "EV5 (21164)") - UNAME_MACHINE="alphaev5" ;; - "EV5.6 (21164A)") - UNAME_MACHINE="alphaev56" ;; - "EV5.6 (21164PC)") - UNAME_MACHINE="alphapca56" ;; - "EV5.7 (21164PC)") - UNAME_MACHINE="alphapca57" ;; - "EV6 (21264)") - UNAME_MACHINE="alphaev6" ;; - "EV6.7 (21264A)") - UNAME_MACHINE="alphaev67" ;; - "EV6.8CB (21264C)") - UNAME_MACHINE="alphaev68" ;; - "EV6.8AL (21264B)") - UNAME_MACHINE="alphaev68" ;; - "EV6.8CX (21264D)") - UNAME_MACHINE="alphaev68" ;; - "EV6.9A (21264/EV69A)") - UNAME_MACHINE="alphaev69" ;; - "EV7 (21364)") - UNAME_MACHINE="alphaev7" ;; - "EV7.9 (21364A)") - UNAME_MACHINE="alphaev79" ;; - esac - # A Pn.n version is a patched version. - # A Vn.n version is a released version. - # A Tn.n version is a released field test version. 
- # A Xn.n version is an unreleased experimental baselevel. - # 1.2 uses "1.2" for uname -r. - echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` - exit ;; - Alpha\ *:Windows_NT*:*) - # How do we know it's Interix rather than the generic POSIX subsystem? - # Should we change UNAME_MACHINE based on the output of uname instead - # of the specific Alpha model? - echo alpha-pc-interix - exit ;; - 21064:Windows_NT:50:3) - echo alpha-dec-winnt3.5 - exit ;; - Amiga*:UNIX_System_V:4.0:*) - echo m68k-unknown-sysv4 - exit ;; - *:[Aa]miga[Oo][Ss]:*:*) - echo ${UNAME_MACHINE}-unknown-amigaos - exit ;; - *:[Mm]orph[Oo][Ss]:*:*) - echo ${UNAME_MACHINE}-unknown-morphos - exit ;; - *:OS/390:*:*) - echo i370-ibm-openedition - exit ;; - *:z/VM:*:*) - echo s390-ibm-zvmoe - exit ;; - *:OS400:*:*) - echo powerpc-ibm-os400 - exit ;; - arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) - echo arm-acorn-riscix${UNAME_RELEASE} - exit ;; - arm:riscos:*:*|arm:RISCOS:*:*) - echo arm-unknown-riscos - exit ;; - SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) - echo hppa1.1-hitachi-hiuxmpp - exit ;; - Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) - # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. 
- if test "`(/bin/universe) 2>/dev/null`" = att ; then - echo pyramid-pyramid-sysv3 - else - echo pyramid-pyramid-bsd - fi - exit ;; - NILE*:*:*:dcosx) - echo pyramid-pyramid-svr4 - exit ;; - DRS?6000:unix:4.0:6*) - echo sparc-icl-nx6 - exit ;; - DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) - case `/usr/bin/uname -p` in - sparc) echo sparc-icl-nx7; exit ;; - esac ;; - sun4H:SunOS:5.*:*) - echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) - echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) - echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - sun4*:SunOS:6*:*) - # According to config.sub, this is the proper way to canonicalize - # SunOS6. Hard to guess exactly what SunOS6 will be like, but - # it's likely to be more like Solaris than SunOS4. - echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - sun4*:SunOS:*:*) - case "`/usr/bin/arch -k`" in - Series*|S4*) - UNAME_RELEASE=`uname -v` - ;; - esac - # Japanese Language versions have a version number like `4.1.3-JL'. - echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` - exit ;; - sun3*:SunOS:*:*) - echo m68k-sun-sunos${UNAME_RELEASE} - exit ;; - sun*:*:4.2BSD:*) - UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` - test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 - case "`/bin/arch`" in - sun3) - echo m68k-sun-sunos${UNAME_RELEASE} - ;; - sun4) - echo sparc-sun-sunos${UNAME_RELEASE} - ;; - esac - exit ;; - aushp:SunOS:*:*) - echo sparc-auspex-sunos${UNAME_RELEASE} - exit ;; - # The situation for MiNT is a little confusing. The machine name - # can be virtually everything (everything which is not - # "atarist" or "atariste" at least should have a processor - # > m68000). The system name ranges from "MiNT" over "FreeMiNT" - # to the lowercase version "mint" (or "freemint"). 
Finally - # the system name "TOS" denotes a system which is actually not - # MiNT. But MiNT is downward compatible to TOS, so this should - # be no problem. - atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} - exit ;; - atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} - exit ;; - *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} - exit ;; - milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) - echo m68k-milan-mint${UNAME_RELEASE} - exit ;; - hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) - echo m68k-hades-mint${UNAME_RELEASE} - exit ;; - *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) - echo m68k-unknown-mint${UNAME_RELEASE} - exit ;; - m68k:machten:*:*) - echo m68k-apple-machten${UNAME_RELEASE} - exit ;; - powerpc:machten:*:*) - echo powerpc-apple-machten${UNAME_RELEASE} - exit ;; - RISC*:Mach:*:*) - echo mips-dec-mach_bsd4.3 - exit ;; - RISC*:ULTRIX:*:*) - echo mips-dec-ultrix${UNAME_RELEASE} - exit ;; - VAX*:ULTRIX*:*:*) - echo vax-dec-ultrix${UNAME_RELEASE} - exit ;; - 2020:CLIX:*:* | 2430:CLIX:*:*) - echo clipper-intergraph-clix${UNAME_RELEASE} - exit ;; - mips:*:*:UMIPS | mips:*:*:RISCos) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c -#ifdef __cplusplus -#include /* for printf() prototype */ - int main (int argc, char *argv[]) { -#else - int main (argc, argv) int argc; char *argv[]; { -#endif - #if defined (host_mips) && defined (MIPSEB) - #if defined (SYSTYPE_SYSV) - printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); - #endif - #if defined (SYSTYPE_SVR4) - printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); - #endif - #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) - printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); - #endif - #endif - exit (-1); - } -EOF - $CC_FOR_BUILD -o $dummy $dummy.c && - dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && - 
SYSTEM_NAME=`$dummy $dummyarg` && - { echo "$SYSTEM_NAME"; exit; } - echo mips-mips-riscos${UNAME_RELEASE} - exit ;; - Motorola:PowerMAX_OS:*:*) - echo powerpc-motorola-powermax - exit ;; - Motorola:*:4.3:PL8-*) - echo powerpc-harris-powermax - exit ;; - Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) - echo powerpc-harris-powermax - exit ;; - Night_Hawk:Power_UNIX:*:*) - echo powerpc-harris-powerunix - exit ;; - m88k:CX/UX:7*:*) - echo m88k-harris-cxux7 - exit ;; - m88k:*:4*:R4*) - echo m88k-motorola-sysv4 - exit ;; - m88k:*:3*:R3*) - echo m88k-motorola-sysv3 - exit ;; - AViiON:dgux:*:*) - # DG/UX returns AViiON for all architectures - UNAME_PROCESSOR=`/usr/bin/uname -p` - if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] - then - if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ - [ ${TARGET_BINARY_INTERFACE}x = x ] - then - echo m88k-dg-dgux${UNAME_RELEASE} - else - echo m88k-dg-dguxbcs${UNAME_RELEASE} - fi - else - echo i586-dg-dgux${UNAME_RELEASE} - fi - exit ;; - M88*:DolphinOS:*:*) # DolphinOS (SVR3) - echo m88k-dolphin-sysv3 - exit ;; - M88*:*:R3*:*) - # Delta 88k system running SVR3 - echo m88k-motorola-sysv3 - exit ;; - XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) - echo m88k-tektronix-sysv3 - exit ;; - Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) - echo m68k-tektronix-bsd - exit ;; - *:IRIX*:*:*) - echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` - exit ;; - ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. 
- echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id - exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' - i*86:AIX:*:*) - echo i386-ibm-aix - exit ;; - ia64:AIX:*:*) - if [ -x /usr/bin/oslevel ] ; then - IBM_REV=`/usr/bin/oslevel` - else - IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} - fi - echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} - exit ;; - *:AIX:2:3) - if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #include - - main() - { - if (!__power_pc()) - exit(1); - puts("powerpc-ibm-aix3.2.5"); - exit(0); - } -EOF - if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` - then - echo "$SYSTEM_NAME" - else - echo rs6000-ibm-aix3.2.5 - fi - elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then - echo rs6000-ibm-aix3.2.4 - else - echo rs6000-ibm-aix3.2 - fi - exit ;; - *:AIX:*:[456]) - IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` - if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then - IBM_ARCH=rs6000 - else - IBM_ARCH=powerpc - fi - if [ -x /usr/bin/oslevel ] ; then - IBM_REV=`/usr/bin/oslevel` - else - IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} - fi - echo ${IBM_ARCH}-ibm-aix${IBM_REV} - exit ;; - *:AIX:*:*) - echo rs6000-ibm-aix - exit ;; - ibmrt:4.4BSD:*|romp-ibm:BSD:*) - echo romp-ibm-bsd4.4 - exit ;; - ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and - echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to - exit ;; # report: romp-ibm BSD 4.3 - *:BOSX:*:*) - echo rs6000-bull-bosx - exit ;; - DPX/2?00:B.O.S.:*:*) - echo m68k-bull-sysv3 - exit ;; - 9000/[34]??:4.3bsd:1.*:*) - echo m68k-hp-bsd - exit ;; - hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) - echo m68k-hp-bsd4.4 - exit ;; - 9000/[34678]??:HP-UX:*:*) - HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` - case "${UNAME_MACHINE}" in - 9000/31? ) HP_ARCH=m68000 ;; - 9000/[34]?? 
) HP_ARCH=m68k ;; - 9000/[678][0-9][0-9]) - if [ -x /usr/bin/getconf ]; then - sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` - sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` - case "${sc_cpu_version}" in - 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 - 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 - 532) # CPU_PA_RISC2_0 - case "${sc_kernel_bits}" in - 32) HP_ARCH="hppa2.0n" ;; - 64) HP_ARCH="hppa2.0w" ;; - '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 - esac ;; - esac - fi - if [ "${HP_ARCH}" = "" ]; then - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - - #define _HPUX_SOURCE - #include - #include - - int main () - { - #if defined(_SC_KERNEL_BITS) - long bits = sysconf(_SC_KERNEL_BITS); - #endif - long cpu = sysconf (_SC_CPU_VERSION); - - switch (cpu) - { - case CPU_PA_RISC1_0: puts ("hppa1.0"); break; - case CPU_PA_RISC1_1: puts ("hppa1.1"); break; - case CPU_PA_RISC2_0: - #if defined(_SC_KERNEL_BITS) - switch (bits) - { - case 64: puts ("hppa2.0w"); break; - case 32: puts ("hppa2.0n"); break; - default: puts ("hppa2.0"); break; - } break; - #else /* !defined(_SC_KERNEL_BITS) */ - puts ("hppa2.0"); break; - #endif - default: puts ("hppa1.0"); break; - } - exit (0); - } -EOF - (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` - test -z "$HP_ARCH" && HP_ARCH=hppa - fi ;; - esac - if [ ${HP_ARCH} = "hppa2.0w" ] - then - eval $set_cc_for_build - - # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating - # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler - # generating 64-bit code. 
GNU and HP use different nomenclature: - # - # $ CC_FOR_BUILD=cc ./config.guess - # => hppa2.0w-hp-hpux11.23 - # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess - # => hppa64-hp-hpux11.23 - - if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | - grep __LP64__ >/dev/null - then - HP_ARCH="hppa2.0w" - else - HP_ARCH="hppa64" - fi - fi - echo ${HP_ARCH}-hp-hpux${HPUX_REV} - exit ;; - ia64:HP-UX:*:*) - HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` - echo ia64-hp-hpux${HPUX_REV} - exit ;; - 3050*:HI-UX:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #include - int - main () - { - long cpu = sysconf (_SC_CPU_VERSION); - /* The order matters, because CPU_IS_HP_MC68K erroneously returns - true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct - results, however. */ - if (CPU_IS_PA_RISC (cpu)) - { - switch (cpu) - { - case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; - case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; - case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; - default: puts ("hppa-hitachi-hiuxwe2"); break; - } - } - else if (CPU_IS_HP_MC68K (cpu)) - puts ("m68k-hitachi-hiuxwe2"); - else puts ("unknown-hitachi-hiuxwe2"); - exit (0); - } -EOF - $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && - { echo "$SYSTEM_NAME"; exit; } - echo unknown-hitachi-hiuxwe2 - exit ;; - 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) - echo hppa1.1-hp-bsd - exit ;; - 9000/8??:4.3bsd:*:*) - echo hppa1.0-hp-bsd - exit ;; - *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) - echo hppa1.0-hp-mpeix - exit ;; - hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) - echo hppa1.1-hp-osf - exit ;; - hp8??:OSF1:*:*) - echo hppa1.0-hp-osf - exit ;; - i*86:OSF1:*:*) - if [ -x /usr/sbin/sysversion ] ; then - echo ${UNAME_MACHINE}-unknown-osf1mk - else - echo ${UNAME_MACHINE}-unknown-osf1 - fi - exit ;; - parisc*:Lites*:*:*) - echo hppa1.1-hp-lites - exit ;; - C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) - echo c1-convex-bsd - exit ;; - 
C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) - if getsysinfo -f scalar_acc - then echo c32-convex-bsd - else echo c2-convex-bsd - fi - exit ;; - C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) - echo c34-convex-bsd - exit ;; - C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) - echo c38-convex-bsd - exit ;; - C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) - echo c4-convex-bsd - exit ;; - CRAY*Y-MP:*:*:*) - echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*[A-Z]90:*:*:*) - echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ - | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ - -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ - -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*TS:*:*:*) - echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*T3E:*:*:*) - echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*SV1:*:*:*) - echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - *:UNICOS/mp:*:*) - echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) - FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` - echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" - exit ;; - 5000:UNIX_System_V:4.*:*) - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` - echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" - exit ;; - i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) - echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} - exit ;; - sparc*:BSD/OS:*:*) - echo sparc-unknown-bsdi${UNAME_RELEASE} - exit ;; - *:BSD/OS:*:*) - echo 
${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} - exit ;; - *:FreeBSD:*:*) - case ${UNAME_MACHINE} in - pc98) - echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - amd64) - echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - *) - echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - esac - exit ;; - i*:CYGWIN*:*) - echo ${UNAME_MACHINE}-pc-cygwin - exit ;; - *:MINGW*:*) - echo ${UNAME_MACHINE}-pc-mingw32 - exit ;; - i*:windows32*:*) - # uname -m includes "-pc" on this system. - echo ${UNAME_MACHINE}-mingw32 - exit ;; - i*:PW*:*) - echo ${UNAME_MACHINE}-pc-pw32 - exit ;; - *:Interix*:[3456]*) - case ${UNAME_MACHINE} in - x86) - echo i586-pc-interix${UNAME_RELEASE} - exit ;; - EM64T | authenticamd) - echo x86_64-unknown-interix${UNAME_RELEASE} - exit ;; - IA64) - echo ia64-unknown-interix${UNAME_RELEASE} - exit ;; - esac ;; - [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) - echo i${UNAME_MACHINE}-pc-mks - exit ;; - i*:Windows_NT*:* | Pentium*:Windows_NT*:*) - # How do we know it's Interix rather than the generic POSIX subsystem? - # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we - # UNAME_MACHINE based on the output of uname instead of i386? 
- echo i586-pc-interix - exit ;; - i*:UWIN*:*) - echo ${UNAME_MACHINE}-pc-uwin - exit ;; - amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) - echo x86_64-unknown-cygwin - exit ;; - p*:CYGWIN*:*) - echo powerpcle-unknown-cygwin - exit ;; - prep*:SunOS:5.*:*) - echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - *:GNU:*:*) - # the GNU system - echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` - exit ;; - *:GNU/*:*:*) - # other systems with GNU libc and userland - echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu - exit ;; - i*86:Minix:*:*) - echo ${UNAME_MACHINE}-pc-minix - exit ;; - arm*:Linux:*:*) - eval $set_cc_for_build - if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ - | grep -q __ARM_EABI__ - then - echo ${UNAME_MACHINE}-unknown-linux-gnu - else - echo ${UNAME_MACHINE}-unknown-linux-gnueabi - fi - exit ;; - avr32*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - cris:Linux:*:*) - echo cris-axis-linux-gnu - exit ;; - crisv32:Linux:*:*) - echo crisv32-axis-linux-gnu - exit ;; - frv:Linux:*:*) - echo frv-unknown-linux-gnu - exit ;; - ia64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - m32r*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - m68*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - mips:Linux:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #undef CPU - #undef mips - #undef mipsel - #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) - CPU=mipsel - #else - #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) - CPU=mips - #else - CPU= - #endif - #endif -EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^CPU/{ - s: ::g - p - }'`" - test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } - ;; - mips64:Linux:*:*) - 
eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #undef CPU - #undef mips64 - #undef mips64el - #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) - CPU=mips64el - #else - #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) - CPU=mips64 - #else - CPU= - #endif - #endif -EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^CPU/{ - s: ::g - p - }'`" - test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } - ;; - or32:Linux:*:*) - echo or32-unknown-linux-gnu - exit ;; - ppc:Linux:*:*) - echo powerpc-unknown-linux-gnu - exit ;; - ppc64:Linux:*:*) - echo powerpc64-unknown-linux-gnu - exit ;; - alpha:Linux:*:*) - case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in - EV5) UNAME_MACHINE=alphaev5 ;; - EV56) UNAME_MACHINE=alphaev56 ;; - PCA56) UNAME_MACHINE=alphapca56 ;; - PCA57) UNAME_MACHINE=alphapca56 ;; - EV6) UNAME_MACHINE=alphaev6 ;; - EV67) UNAME_MACHINE=alphaev67 ;; - EV68*) UNAME_MACHINE=alphaev68 ;; - esac - objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null - if test "$?" 
= 0 ; then LIBC="libc1" ; else LIBC="" ; fi - echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} - exit ;; - parisc:Linux:*:* | hppa:Linux:*:*) - # Look for CPU level - case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in - PA7*) echo hppa1.1-unknown-linux-gnu ;; - PA8*) echo hppa2.0-unknown-linux-gnu ;; - *) echo hppa-unknown-linux-gnu ;; - esac - exit ;; - parisc64:Linux:*:* | hppa64:Linux:*:*) - echo hppa64-unknown-linux-gnu - exit ;; - s390:Linux:*:* | s390x:Linux:*:*) - echo ${UNAME_MACHINE}-ibm-linux - exit ;; - sh64*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - sh*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - sparc:Linux:*:* | sparc64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - vax:Linux:*:*) - echo ${UNAME_MACHINE}-dec-linux-gnu - exit ;; - x86_64:Linux:*:*) - echo x86_64-unknown-linux-gnu - exit ;; - xtensa*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - i*86:Linux:*:*) - # The BFD linker knows what the default object file format is, so - # first see if it will tell us. cd to the root directory to prevent - # problems with other programs or directories called `ld' in the path. - # Set LC_ALL=C to ensure ld outputs messages in English. - ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \ - | sed -ne '/supported targets:/!d - s/[ ][ ]*/ /g - s/.*supported targets: *// - s/ .*// - p'` - case "$ld_supported_targets" in - elf32-i386) - TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu" - ;; - a.out-i386-linux) - echo "${UNAME_MACHINE}-pc-linux-gnuaout" - exit ;; - coff-i386) - echo "${UNAME_MACHINE}-pc-linux-gnucoff" - exit ;; - "") - # Either a pre-BFD a.out linker (linux-gnuoldld) or - # one that does not give us useful --help. 
- echo "${UNAME_MACHINE}-pc-linux-gnuoldld" - exit ;; - esac - # Determine whether the default compiler is a.out or elf - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #include - #ifdef __ELF__ - # ifdef __GLIBC__ - # if __GLIBC__ >= 2 - LIBC=gnu - # else - LIBC=gnulibc1 - # endif - # else - LIBC=gnulibc1 - # endif - #else - #if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) - LIBC=gnu - #else - LIBC=gnuaout - #endif - #endif - #ifdef __dietlibc__ - LIBC=dietlibc - #endif -EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^LIBC/{ - s: ::g - p - }'`" - test x"${LIBC}" != x && { - echo "${UNAME_MACHINE}-pc-linux-${LIBC}" - exit - } - test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; } - ;; - i*86:DYNIX/ptx:4*:*) - # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. - # earlier versions are messed up and put the nodename in both - # sysname and nodename. - echo i386-sequent-sysv4 - exit ;; - i*86:UNIX_SV:4.2MP:2.*) - # Unixware is an offshoot of SVR4, but it has its own version - # number series starting with 2... - # I am not positive that other SVR4 systems won't match this, - # I just have to hope. -- rms. - # Use sysv4.2uw... so that sysv4* matches it. - echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} - exit ;; - i*86:OS/2:*:*) - # If we were able to find `uname', then EMX Unix compatibility - # is probably installed. 
- echo ${UNAME_MACHINE}-pc-os2-emx - exit ;; - i*86:XTS-300:*:STOP) - echo ${UNAME_MACHINE}-unknown-stop - exit ;; - i*86:atheos:*:*) - echo ${UNAME_MACHINE}-unknown-atheos - exit ;; - i*86:syllable:*:*) - echo ${UNAME_MACHINE}-pc-syllable - exit ;; - i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*) - echo i386-unknown-lynxos${UNAME_RELEASE} - exit ;; - i*86:*DOS:*:*) - echo ${UNAME_MACHINE}-pc-msdosdjgpp - exit ;; - i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) - UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` - if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then - echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} - else - echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} - fi - exit ;; - i*86:*:5:[678]*) - # UnixWare 7.x, OpenUNIX and OpenServer 6. - case `/bin/uname -X | grep "^Machine"` in - *486*) UNAME_MACHINE=i486 ;; - *Pentium) UNAME_MACHINE=i586 ;; - *Pent*|*Celeron) UNAME_MACHINE=i686 ;; - esac - echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} - exit ;; - i*86:*:3.2:*) - if test -f /usr/options/cb.name; then - UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then - UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` - (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 - (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ - && UNAME_MACHINE=i586 - (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ - && UNAME_MACHINE=i686 - (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ - && UNAME_MACHINE=i686 - echo ${UNAME_MACHINE}-pc-sco$UNAME_REL - else - echo ${UNAME_MACHINE}-pc-sysv32 - fi - exit ;; - pc:*:*:*) - # Left here for compatibility: - # uname -m prints for DJGPP always 'pc', but it prints nothing about - # the processor, so we play safe by assuming i386. 
- echo i386-pc-msdosdjgpp - exit ;; - Intel:Mach:3*:*) - echo i386-pc-mach3 - exit ;; - paragon:*:*:*) - echo i860-intel-osf1 - exit ;; - i860:*:4.*:*) # i860-SVR4 - if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then - echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 - else # Add other i860-SVR4 vendors below as they are discovered. - echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 - fi - exit ;; - mini*:CTIX:SYS*5:*) - # "miniframe" - echo m68010-convergent-sysv - exit ;; - mc68k:UNIX:SYSTEM5:3.51m) - echo m68k-convergent-sysv - exit ;; - M680?0:D-NIX:5.3:*) - echo m68k-diab-dnix - exit ;; - M68*:*:R3V[5678]*:*) - test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; - 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) - OS_REL='' - test -r /etc/.relid \ - && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` - /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4.3${OS_REL}; exit; } - /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ - && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; - 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) - /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4; exit; } ;; - m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) - echo m68k-unknown-lynxos${UNAME_RELEASE} - exit ;; - mc68030:UNIX_System_V:4.*:*) - echo m68k-atari-sysv4 - exit ;; - TSUNAMI:LynxOS:2.*:*) - echo sparc-unknown-lynxos${UNAME_RELEASE} - exit ;; - rs6000:LynxOS:2.*:*) - echo rs6000-unknown-lynxos${UNAME_RELEASE} - exit ;; - PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*) - echo powerpc-unknown-lynxos${UNAME_RELEASE} - exit ;; - SM[BE]S:UNIX_SV:*:*) - echo mips-dde-sysv${UNAME_RELEASE} - exit ;; - RM*:ReliantUNIX-*:*:*) - echo mips-sni-sysv4 - exit ;; - RM*:SINIX-*:*:*) - echo mips-sni-sysv4 - exit ;; - 
*:SINIX-*:*:*) - if uname -p 2>/dev/null >/dev/null ; then - UNAME_MACHINE=`(uname -p) 2>/dev/null` - echo ${UNAME_MACHINE}-sni-sysv4 - else - echo ns32k-sni-sysv - fi - exit ;; - PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort - # says - echo i586-unisys-sysv4 - exit ;; - *:UNIX_System_V:4*:FTX*) - # From Gerald Hewes . - # How about differentiating between stratus architectures? -djm - echo hppa1.1-stratus-sysv4 - exit ;; - *:*:*:FTX*) - # From seanf@swdc.stratus.com. - echo i860-stratus-sysv4 - exit ;; - i*86:VOS:*:*) - # From Paul.Green@stratus.com. - echo ${UNAME_MACHINE}-stratus-vos - exit ;; - *:VOS:*:*) - # From Paul.Green@stratus.com. - echo hppa1.1-stratus-vos - exit ;; - mc68*:A/UX:*:*) - echo m68k-apple-aux${UNAME_RELEASE} - exit ;; - news*:NEWS-OS:6*:*) - echo mips-sony-newsos6 - exit ;; - R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) - if [ -d /usr/nec ]; then - echo mips-nec-sysv${UNAME_RELEASE} - else - echo mips-unknown-sysv${UNAME_RELEASE} - fi - exit ;; - BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. - echo powerpc-be-beos - exit ;; - BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. - echo powerpc-apple-beos - exit ;; - BePC:BeOS:*:*) # BeOS running on Intel PC compatible. 
- echo i586-pc-beos - exit ;; - SX-4:SUPER-UX:*:*) - echo sx4-nec-superux${UNAME_RELEASE} - exit ;; - SX-5:SUPER-UX:*:*) - echo sx5-nec-superux${UNAME_RELEASE} - exit ;; - SX-6:SUPER-UX:*:*) - echo sx6-nec-superux${UNAME_RELEASE} - exit ;; - SX-7:SUPER-UX:*:*) - echo sx7-nec-superux${UNAME_RELEASE} - exit ;; - SX-8:SUPER-UX:*:*) - echo sx8-nec-superux${UNAME_RELEASE} - exit ;; - SX-8R:SUPER-UX:*:*) - echo sx8r-nec-superux${UNAME_RELEASE} - exit ;; - Power*:Rhapsody:*:*) - echo powerpc-apple-rhapsody${UNAME_RELEASE} - exit ;; - *:Rhapsody:*:*) - echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} - exit ;; - *:Darwin:*:*) - UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown - case $UNAME_PROCESSOR in - unknown) UNAME_PROCESSOR=powerpc ;; - esac - echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} - exit ;; - *:procnto*:*:* | *:QNX:[0123456789]*:*) - UNAME_PROCESSOR=`uname -p` - if test "$UNAME_PROCESSOR" = "x86"; then - UNAME_PROCESSOR=i386 - UNAME_MACHINE=pc - fi - echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} - exit ;; - *:QNX:*:4*) - echo i386-pc-qnx - exit ;; - NSE-?:NONSTOP_KERNEL:*:*) - echo nse-tandem-nsk${UNAME_RELEASE} - exit ;; - NSR-?:NONSTOP_KERNEL:*:*) - echo nsr-tandem-nsk${UNAME_RELEASE} - exit ;; - *:NonStop-UX:*:*) - echo mips-compaq-nonstopux - exit ;; - BS2000:POSIX*:*:*) - echo bs2000-siemens-sysv - exit ;; - DS/*:UNIX_System_V:*:*) - echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} - exit ;; - *:Plan9:*:*) - # "uname -m" is not consistent, so use $cputype instead. 386 - # is converted to i386 for consistency with other x86 - # operating systems. 
- if test "$cputype" = "386"; then - UNAME_MACHINE=i386 - else - UNAME_MACHINE="$cputype" - fi - echo ${UNAME_MACHINE}-unknown-plan9 - exit ;; - *:TOPS-10:*:*) - echo pdp10-unknown-tops10 - exit ;; - *:TENEX:*:*) - echo pdp10-unknown-tenex - exit ;; - KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) - echo pdp10-dec-tops20 - exit ;; - XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) - echo pdp10-xkl-tops20 - exit ;; - *:TOPS-20:*:*) - echo pdp10-unknown-tops20 - exit ;; - *:ITS:*:*) - echo pdp10-unknown-its - exit ;; - SEI:*:*:SEIUX) - echo mips-sei-seiux${UNAME_RELEASE} - exit ;; - *:DragonFly:*:*) - echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` - exit ;; - *:*VMS:*:*) - UNAME_MACHINE=`(uname -p) 2>/dev/null` - case "${UNAME_MACHINE}" in - A*) echo alpha-dec-vms ; exit ;; - I*) echo ia64-dec-vms ; exit ;; - V*) echo vax-dec-vms ; exit ;; - esac ;; - *:XENIX:*:SysV) - echo i386-pc-xenix - exit ;; - i*86:skyos:*:*) - echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' - exit ;; - i*86:rdos:*:*) - echo ${UNAME_MACHINE}-pc-rdos - exit ;; -esac - -#echo '(No uname command or uname output not recognized.)' 1>&2 -#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2 - -eval $set_cc_for_build -cat >$dummy.c < -# include -#endif -main () -{ -#if defined (sony) -#if defined (MIPSEB) - /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, - I don't know.... 
*/ - printf ("mips-sony-bsd\n"); exit (0); -#else -#include - printf ("m68k-sony-newsos%s\n", -#ifdef NEWSOS4 - "4" -#else - "" -#endif - ); exit (0); -#endif -#endif - -#if defined (__arm) && defined (__acorn) && defined (__unix) - printf ("arm-acorn-riscix\n"); exit (0); -#endif - -#if defined (hp300) && !defined (hpux) - printf ("m68k-hp-bsd\n"); exit (0); -#endif - -#if defined (NeXT) -#if !defined (__ARCHITECTURE__) -#define __ARCHITECTURE__ "m68k" -#endif - int version; - version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; - if (version < 4) - printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); - else - printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); - exit (0); -#endif - -#if defined (MULTIMAX) || defined (n16) -#if defined (UMAXV) - printf ("ns32k-encore-sysv\n"); exit (0); -#else -#if defined (CMU) - printf ("ns32k-encore-mach\n"); exit (0); -#else - printf ("ns32k-encore-bsd\n"); exit (0); -#endif -#endif -#endif - -#if defined (__386BSD__) - printf ("i386-pc-bsd\n"); exit (0); -#endif - -#if defined (sequent) -#if defined (i386) - printf ("i386-sequent-dynix\n"); exit (0); -#endif -#if defined (ns32000) - printf ("ns32k-sequent-dynix\n"); exit (0); -#endif -#endif - -#if defined (_SEQUENT_) - struct utsname un; - - uname(&un); - - if (strncmp(un.version, "V2", 2) == 0) { - printf ("i386-sequent-ptx2\n"); exit (0); - } - if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? 
*/ - printf ("i386-sequent-ptx1\n"); exit (0); - } - printf ("i386-sequent-ptx\n"); exit (0); - -#endif - -#if defined (vax) -# if !defined (ultrix) -# include -# if defined (BSD) -# if BSD == 43 - printf ("vax-dec-bsd4.3\n"); exit (0); -# else -# if BSD == 199006 - printf ("vax-dec-bsd4.3reno\n"); exit (0); -# else - printf ("vax-dec-bsd\n"); exit (0); -# endif -# endif -# else - printf ("vax-dec-bsd\n"); exit (0); -# endif -# else - printf ("vax-dec-ultrix\n"); exit (0); -# endif -#endif - -#if defined (alliant) && defined (i860) - printf ("i860-alliant-bsd\n"); exit (0); -#endif - - exit (1); -} -EOF - -$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && - { echo "$SYSTEM_NAME"; exit; } - -# Apollos put the system type in the environment. - -test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } - -# Convex versions that predate uname can use getsysinfo(1) - -if [ -x /usr/convex/getsysinfo ] -then - case `getsysinfo -f cpu_type` in - c1*) - echo c1-convex-bsd - exit ;; - c2*) - if getsysinfo -f scalar_acc - then echo c32-convex-bsd - else echo c2-convex-bsd - fi - exit ;; - c34*) - echo c34-convex-bsd - exit ;; - c38*) - echo c38-convex-bsd - exit ;; - c4*) - echo c4-convex-bsd - exit ;; - esac -fi - -cat >&2 < in order to provide the needed -information to handle your system. 
- -config.guess timestamp = $timestamp - -uname -m = `(uname -m) 2>/dev/null || echo unknown` -uname -r = `(uname -r) 2>/dev/null || echo unknown` -uname -s = `(uname -s) 2>/dev/null || echo unknown` -uname -v = `(uname -v) 2>/dev/null || echo unknown` - -/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` -/bin/uname -X = `(/bin/uname -X) 2>/dev/null` - -hostinfo = `(hostinfo) 2>/dev/null` -/bin/universe = `(/bin/universe) 2>/dev/null` -/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` -/bin/arch = `(/bin/arch) 2>/dev/null` -/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` -/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` - -UNAME_MACHINE = ${UNAME_MACHINE} -UNAME_RELEASE = ${UNAME_RELEASE} -UNAME_SYSTEM = ${UNAME_SYSTEM} -UNAME_VERSION = ${UNAME_VERSION} -EOF - -exit 1 - -# Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "timestamp='" -# time-stamp-format: "%:y-%02m-%02d" -# time-stamp-end: "'" -# End: diff --git a/configure.ac b/configure.ac index fc104296f0..134f4689f6 100644 --- a/configure.ac +++ b/configure.ac @@ -1,10 +1,10 @@ -AC_INIT([ccminer], [1.5.31-git(SP-MOD)]) +AC_INIT([ccminer], [8.12-KlausT]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM AC_CONFIG_SRCDIR([ccminer.cpp]) AM_INIT_AUTOMAKE([foreign subdir-objects]) -AC_CONFIG_HEADERS([cpuminer-config.h]) +AC_CONFIG_HEADERS([ccminer-config.h]) dnl Make sure anyone changing configure.ac/Makefile.am has a clue AM_MAINTAINER_MODE @@ -178,3 +178,4 @@ AC_SUBST(NVCC) AC_SUBST(OPENMP_CFLAGS) AC_OUTPUT + diff --git a/configure.sh b/configure.sh index 1084ba7441..fd5bb1d251 100755 --- a/configure.sh +++ b/configure.sh @@ -5,7 +5,6 @@ #--ptxas-options=\"-v -dlcm=cg\"" -extracflags="-march=native -D_REENTRANT -falign-functions=16 -falign-jumps=16 -falign-labels=16" - -CUDA_CFLAGS="-O3 -Xcompiler -Wall" ./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda --with-nvml=libnvidia-ml.so +extracflags="-march=native -std=c++11 -D_REENTRANT 
-falign-functions=16 -falign-jumps=16 -falign-labels=16" +CUDA_CFLAGS="-O3 -std=c++11 -Xcompiler -Wall -D_FORCE_INLINES" ./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda --with-nvml=libnvidia-ml.so diff --git a/cpu-miner.c b/cpu-miner.c deleted file mode 100644 index c5aee0faca..0000000000 --- a/cpu-miner.c +++ /dev/null @@ -1,2084 +0,0 @@ -/* - * Copyright 2010 Jeff Garzik - * Copyright 2012-2014 pooler - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. - */ - -#include "cpuminer-config.h" -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#ifdef WIN32 -#include -#include -#else -#include -#include -#if HAVE_SYS_SYSCTL_H -#include -#if HAVE_SYS_PARAM_H -#include -#endif -#include -#endif -#endif - -#include "compat.h" -#include "miner.h" - -#ifdef WIN32 -#include -#pragma comment(lib, "winmm.lib") -#include "compat/winansi.h" -BOOL WINAPI ConsoleHandler(DWORD); -#endif - -#define PROGRAM_NAME "ccminer" -#define LP_SCANTIME 60 -#define HEAVYCOIN_BLKHDR_SZ 84 -#define MNR_BLKHDR_SZ 80 - -// from heavy.cu -#ifdef __cplusplus -extern "C" -{ -#endif -int cuda_num_devices(); -void cuda_devicenames(); -void cuda_devicereset(); -int cuda_finddevice(char *name); -#ifdef __cplusplus -} -#endif - - -#ifdef __linux /* Linux specific policy and affinity management */ -#include -static inline void drop_policy(void) -{ - struct sched_param param; - param.sched_priority = 0; - -#ifdef SCHED_IDLE - if (unlikely(sched_setscheduler(0, SCHED_IDLE, ¶m) == -1)) -#endif -#ifdef SCHED_BATCH - sched_setscheduler(0, SCHED_BATCH, ¶m); -#endif -} - -static inline void affine_to_cpu(int id, int cpu) -{ - cpu_set_t set; - - 
CPU_ZERO(&set); - CPU_SET(cpu, &set); - sched_setaffinity(0, sizeof(&set), &set); -} -#elif defined(__FreeBSD__) /* FreeBSD specific policy and affinity management */ -#include -static inline void drop_policy(void) -{ -} - -static inline void affine_to_cpu(int id, int cpu) -{ - cpuset_t set; - CPU_ZERO(&set); - CPU_SET(cpu, &set); - cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(cpuset_t), &set); -} -#else -static inline void drop_policy(void) -{ -} - -static inline void affine_to_cpu(int id, int cpu) -{ -} -#endif - -enum workio_commands { - WC_GET_WORK, - WC_SUBMIT_WORK, -}; - -struct workio_cmd { - enum workio_commands cmd; - struct thr_info *thr; - union { - struct work *work; - } u; -}; - -typedef enum { - ALGO_ANIME, - ALGO_BLAKE, - ALGO_BLAKECOIN, - ALGO_DEEP, - ALGO_DOOM, - ALGO_FRESH, - ALGO_FUGUE256, /* Fugue256 */ - ALGO_GROESTL, - ALGO_HEAVY, /* Heavycoin hash */ - ALGO_KECCAK, - ALGO_JACKPOT, - ALGO_LUFFA_DOOM, - ALGO_MJOLLNIR, /* Mjollnir hash */ - ALGO_MYR_GR, - ALGO_NIST5, - ALGO_PENTABLAKE, - ALGO_QUARK, - ALGO_QUBIT, - ALGO_S3, - ALGO_WHC, - ALGO_X11, - ALGO_X13, - ALGO_X14, - ALGO_X15, - ALGO_X17, - ALGO_DMD_GR, -} sha256_algos; - -static const char *algo_names[] = { - "anime", - "blake", - "blakecoin", - "deep", - "doom", /* is luffa */ - "fresh", - "fugue256", - "groestl", - "heavy", - "keccak", - "jackpot", - "luffa", - "mjollnir", - "myr-gr", - "nist5", - "penta", - "quark", - "qubit", - "s3", - "whirl", - "x11", - "x13", - "x14", - "x15", - "x17", - "dmd-gr", -}; - -bool opt_debug = false; -bool opt_tracegpu = false; -bool opt_protocol = false; -bool opt_benchmark = false; -bool want_longpoll = true; -bool have_longpoll = false; -bool want_stratum = true; -bool have_stratum = false; -static bool submit_old = false; -bool use_syslog = false; -bool use_colors = true; -static bool opt_background = false; -bool opt_quiet = false; -static int opt_retries = -1; -static int opt_fail_pause = 30; -int opt_timeout = 270; -static int 
opt_scantime = 5; -static json_t *opt_config; -static const bool opt_time = true; -static sha256_algos opt_algo = ALGO_HEAVY; -int opt_n_threads = 0; -static double opt_difficulty = 1; // CH -bool opt_trust_pool = false; -uint16_t opt_vote = 9999; -static int num_processors; -int device_map[8] = {0,1,2,3,4,5,6,7}; // CB -char *device_name[8]; // CB -static char *rpc_url; -static char *rpc_userpass; -static char *rpc_user, *rpc_pass; -static char *short_url = NULL; -char *opt_cert; -char *opt_proxy; -long opt_proxy_type; -struct thr_info *thr_info; -static int work_thr_id; -int longpoll_thr_id = -1; -int stratum_thr_id = -1; -struct work_restart *work_restart = NULL; -static struct stratum_ctx stratum; - -pthread_mutex_t applog_lock; -static pthread_mutex_t stats_lock; -static unsigned long accepted_count = 0L; -static unsigned long rejected_count = 0L; -static double *thr_hashrates; -uint64_t global_hashrate = 0; - -#ifdef HAVE_GETOPT_LONG -#include -#else -struct option { - const char *name; - int has_arg; - int *flag; - int val; -}; -#endif - -static char const usage[] = "\ -Usage: " PROGRAM_NAME " [OPTIONS]\n\ -Options:\n\ - -a, --algo=ALGO specify the hash algorithm to use\n\ - anime Animecoin\n\ - blake Blake 256 (SFR/NEOS)\n\ - blakecoin Fast Blake 256 (8 rounds)\n\ - deep Deepcoin\n\ - dmd-gr Diamond-Groestl\n\ - fresh Freshcoin (shavite 80)\n\ - fugue256 Fuguecoin\n\ - groestl Groestlcoin\n\ - heavy Heavycoin\n\ - jackpot Jackpot\n\ - keccak Keccak-256 (Maxcoin)\n\ - luffa Doomcoin\n\ - mjollnir Mjollnircoin\n\ - myr-gr Myriad-Groestl\n\ - nist5 NIST5 (TalkCoin)\n\ - penta Pentablake hash (5x Blake 512)\n\ - quark Quark\n\ - qubit Qubit\n\ - s3 S3 (1Coin)\n\ - x11 X11 (DarkCoin)\n\ - x13 X13 (MaruCoin)\n\ - x14 X14\n\ - x15 X15\n\ - x17 X17 (peoplecurrency)\n\ - whirl Whirlcoin (old whirlpool)\n\ - -d, --devices Comma separated list of CUDA devices to use.\n\ - Device IDs start counting from 0! 
Alternatively takes\n\ - string names of your cards like gtx780ti or gt640#2\n\ - (matching 2nd gt640 in the PC)\n\ - -f, --diff Divide difficulty by this factor (std is 1) \n\ - -v, --vote=VOTE block reward vote (for HeavyCoin)\n\ - -m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\ - -o, --url=URL URL of mining server\n\ - -O, --userpass=U:P username:password pair for mining server\n\ - -u, --user=USERNAME username for mining server\n\ - -p, --pass=PASSWORD password for mining server\n\ - --cert=FILE certificate for mining server using SSL\n\ - -x, --proxy=[PROTOCOL://]HOST[:PORT] connect through a proxy\n\ - -t, --threads=N number of miner threads (default: number of nVidia GPUs)\n\ - -r, --retries=N number of times to retry if a network call fails\n\ - (default: retry indefinitely)\n\ - -R, --retry-pause=N time to pause between retries, in seconds (default: 30)\n\ - -T, --timeout=N network timeout, in seconds (default: 270)\n\ - -s, --scantime=N upper bound on time spent scanning current work when\n\ - long polling is unavailable, in seconds (default: 5)\n\ - --no-longpoll disable X-Long-Polling support\n\ - --no-stratum disable X-Stratum support\n\ - -q, --quiet disable per-thread hashmeter output\n\ - -K, --no-color disable colored output\n\ - -D, --debug enable debug output\n\ - -P, --protocol-dump verbose dump of protocol-level activities\n" -#ifdef HAVE_SYSLOG_H -"\ - -S, --syslog use system log for output messages\n" -#endif -#ifndef WIN32 -"\ - -B, --background run the miner in the background\n" -#endif -"\ - --benchmark run in offline benchmark mode\n\ - --cputest debug hashes from cpu algorithms\n\ - -c, --config=FILE load a JSON-format configuration file\n\ - -V, --version display version information and exit\n\ - -h, --help display this help text and exit\n\ -"; - -static char const short_options[] = -#ifndef WIN32 - "B" -#endif -#ifdef HAVE_SYSLOG_H - "S" -#endif - "a:c:CKDhp:Px:qr:R:s:t:T:o:u:O:Vd:f:mv:"; - -static 
struct option const options[] = { - { "algo", 1, NULL, 'a' }, -#ifndef WIN32 - { "background", 0, NULL, 'B' }, -#endif - { "benchmark", 0, NULL, 1005 }, - { "cputest", 0, NULL, 1006 }, - { "cert", 1, NULL, 1001 }, - { "config", 1, NULL, 'c' }, - { "no-color", 0, NULL, 'K' }, - { "debug", 0, NULL, 'D' }, - { "help", 0, NULL, 'h' }, - { "no-longpoll", 0, NULL, 1003 }, - { "no-stratum", 0, NULL, 1007 }, - { "pass", 1, NULL, 'p' }, - { "protocol-dump", 0, NULL, 'P' }, - { "proxy", 1, NULL, 'x' }, - { "quiet", 0, NULL, 'q' }, - { "retries", 1, NULL, 'r' }, - { "retry-pause", 1, NULL, 'R' }, - { "scantime", 1, NULL, 's' }, -#ifdef HAVE_SYSLOG_H - { "syslog", 0, NULL, 'S' }, -#endif - { "threads", 1, NULL, 't' }, - { "vote", 1, NULL, 'v' }, - { "trust-pool", 0, NULL, 'm' }, - { "timeout", 1, NULL, 'T' }, - { "url", 1, NULL, 'o' }, - { "user", 1, NULL, 'u' }, - { "userpass", 1, NULL, 'O' }, - { "version", 0, NULL, 'V' }, - { "devices", 1, NULL, 'd' }, - { "diff", 1, NULL, 'f' }, - { 0, 0, 0, 0 } -}; - -struct work { - uint32_t data[32]; - uint32_t target[8]; - uint32_t maxvote; - - char job_id[128]; - size_t xnonce2_len; - unsigned char xnonce2[32]; - - union { - uint32_t u32[2]; - uint64_t u64[1]; - } noncerange; - - double difficulty; - - uint32_t scanned_from; - uint32_t scanned_to; -}; - -static struct work _ALIGN(64) g_work; -static time_t g_work_time; -static pthread_mutex_t g_work_lock; - - -void proper_exit(int reason) -{ - cuda_devicereset(); - hashlog_purge_all(); - exit(reason); -} - -static bool jobj_binary(const json_t *obj, const char *key, - void *buf, size_t buflen) -{ - const char *hexstr; - json_t *tmp; - - tmp = json_object_get(obj, key); - if (unlikely(!tmp)) { - applog(LOG_ERR, "JSON key '%s' not found", key); - return false; - } - hexstr = json_string_value(tmp); - if (unlikely(!hexstr)) { - applog(LOG_ERR, "JSON key '%s' is not a string", key); - return false; - } - if (!hex2bin((unsigned char*)buf, hexstr, buflen)) - return false; - - return true; 
-} - -static bool work_decode(const json_t *val, struct work *work) -{ - int i; - - if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data)))) { - applog(LOG_ERR, "JSON inval data"); - goto err_out; - } - if (unlikely(!jobj_binary(val, "target", work->target, sizeof(work->target)))) { - applog(LOG_ERR, "JSON inval target"); - goto err_out; - } - if (opt_algo == ALGO_HEAVY) { - if (unlikely(!jobj_binary(val, "maxvote", &work->maxvote, sizeof(work->maxvote)))) { - work->maxvote = 1024; - } - } else work->maxvote = 0; - - for (i = 0; i < ARRAY_SIZE(work->data); i++) - work->data[i] = le32dec(work->data + i); - for (i = 0; i < ARRAY_SIZE(work->target); i++) - work->target[i] = le32dec(work->target + i); - - json_t *jr = json_object_get(val, "noncerange"); - if (jr) { - const char * hexstr = json_string_value(jr); - if (likely(hexstr)) { - // never seen yet... - hex2bin((unsigned char*)work->noncerange.u64, hexstr, 8); - applog(LOG_DEBUG, "received noncerange: %08x-%08x", work->noncerange.u32[0], work->noncerange.u32[1]); - } - } - - /* use work ntime as job id (solo-mining) */ - cbin2hex(work->job_id, (const char*)&work->data[17], 4); - - return true; - -err_out: - return false; -} - -/** - * Calculate the work difficulty as double - */ -static void calc_diff(struct work *work, int known) -{ - // sample for diff 32.53 : 00000007de5f0000 - const uint64_t diffone = 0xFFFF000000000000ull; - uint64_t *data64, d64; - char rtarget[32]; - - swab256(rtarget, work->target); - data64 = (uint64_t *)(rtarget + 3); /* todo: index (3) can be tuned here */ - d64 = swab64(*data64); - if (unlikely(!d64)) - d64 = 1; - work->difficulty = (double)diffone / d64; - if (opt_difficulty > 0.) { - work->difficulty /= opt_difficulty; - } -} - -static int share_result(int result, const char *reason) -{ - char s[345]; - double hashrate; - - hashrate = 0.; - pthread_mutex_lock(&stats_lock); - for (int i = 0; i < opt_n_threads; i++) - hashrate += thr_hashrates[i]; - result ? 
accepted_count++ : rejected_count++; - pthread_mutex_unlock(&stats_lock); - - global_hashrate = llround(hashrate); - - sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate); - applog(LOG_NOTICE, "accepted: %lu/%lu (%.2f%%), %s khash/s %s", - accepted_count, - accepted_count + rejected_count, - 100. * accepted_count / (accepted_count + rejected_count), - s, - use_colors ? - (result ? CL_GRN "yay!!!" : CL_RED "booooo") - : (result ? "(yay!!!)" : "(booooo)")); - - if (reason) { - applog(LOG_WARNING, "reject reason: %s", reason); - if (strncmp(reason, "low difficulty share", 20) == 0) { - opt_difficulty = (opt_difficulty * 2.0) / 3.0; - applog(LOG_WARNING, "factor reduced to : %0.2f", opt_difficulty); - return 0; - } - } - return 1; -} - -static bool submit_upstream_work(CURL *curl, struct work *work) -{ - char *str = NULL; - json_t *val, *res, *reason; - char s[345]; - int i; - bool rc = false; - - /* pass if the previous hash is not the current previous hash */ - pthread_mutex_lock(&g_work_lock); - if (memcmp(work->data + 1, g_work.data + 1, 32)) { - pthread_mutex_unlock(&g_work_lock); - if (opt_debug) - applog(LOG_DEBUG, "DEBUG: stale work detected, discarding"); - return true; - } - calc_diff(work, 0); - pthread_mutex_unlock(&g_work_lock); - - if (have_stratum) { - uint32_t sent; - uint32_t ntime, nonce; - uint16_t nvote; - char *ntimestr, *noncestr, *xnonce2str, *nvotestr; - - le32enc(&ntime, work->data[17]); - le32enc(&nonce, work->data[19]); - be16enc(&nvote, *((uint16_t*)&work->data[20])); - - ntimestr = bin2hex((const unsigned char *)(&ntime), 4); - noncestr = bin2hex((const unsigned char *)(&nonce), 4); - xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len); - nvotestr = bin2hex((const unsigned char *)(&nvote), 2); - - sent = hashlog_already_submittted(work->job_id, nonce); - if (sent > 0) { - sent = (uint32_t) time(NULL) - sent; - if (!opt_quiet) { - applog(LOG_WARNING, "skip submit, nonce %s was already sent %u seconds ago", noncestr, sent); - 
hashlog_dump_job(work->job_id); - } - rc = true; - goto out; - } - - if (opt_algo == ALGO_HEAVY) { - sprintf(s, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id + 8, xnonce2str, ntimestr, noncestr, nvotestr); - } else { - sprintf(s, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id + 8, xnonce2str, ntimestr, noncestr); - } - free(ntimestr); - free(noncestr); - free(xnonce2str); - free(nvotestr); - - if (unlikely(!stratum_send_line(&stratum, s))) { - applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); - goto out; - } - - hashlog_remember_submit(work->job_id, nonce, work->scanned_from); - - } else { - - /* build hex string */ - - if (opt_algo != ALGO_HEAVY && opt_algo != ALGO_MJOLLNIR) { - for (i = 0; i < ARRAY_SIZE(work->data); i++) - le32enc(work->data + i, work->data[i]); - } - str = bin2hex((unsigned char *)work->data, sizeof(work->data)); - if (unlikely(!str)) { - applog(LOG_ERR, "submit_upstream_work OOM"); - goto out; - } - - /* build JSON-RPC request */ - sprintf(s, - "{\"method\": \"getwork\", \"params\": [ \"%s\" ], \"id\":1}\r\n", - str); - - /* issue JSON-RPC request */ - val = json_rpc_call(curl, rpc_url, rpc_userpass, s, false, false, NULL); - if (unlikely(!val)) { - applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); - goto out; - } - - res = json_object_get(val, "result"); - reason = json_object_get(val, "reject-reason"); - if (!share_result(json_is_true(res), reason ? 
json_string_value(reason) : NULL)) - hashlog_purge_job(work->job_id); - - json_decref(val); - } - - rc = true; - -out: - free(str); - return rc; -} - -static const char *rpc_req = - "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; - -static bool get_upstream_work(CURL *curl, struct work *work) -{ - json_t *val; - bool rc; - struct timeval tv_start, tv_end, diff; - - gettimeofday(&tv_start, NULL); - val = json_rpc_call(curl, rpc_url, rpc_userpass, rpc_req, - want_longpoll, false, NULL); - gettimeofday(&tv_end, NULL); - - if (have_stratum) { - if (val) - json_decref(val); - return true; - } - - if (!val) - return false; - - rc = work_decode(json_object_get(val, "result"), work); - - if (opt_protocol && rc) { - timeval_subtract(&diff, &tv_end, &tv_start); - /* show time because curl can be slower against versions/config */ - applog(LOG_DEBUG, "got new work in %.2f ms", - (1000.0 * diff.tv_sec) + (0.001 * diff.tv_usec)); - } - - json_decref(val); - - return rc; -} - -static void workio_cmd_free(struct workio_cmd *wc) -{ - if (!wc) - return; - - switch (wc->cmd) { - case WC_SUBMIT_WORK: - aligned_free(wc->u.work); - break; - default: /* do nothing */ - break; - } - - memset(wc, 0, sizeof(*wc)); /* poison */ - free(wc); -} - -static bool workio_get_work(struct workio_cmd *wc, CURL *curl) -{ - struct work *ret_work; - int failures = 0; - - ret_work = (struct work*)aligned_calloc(sizeof(*ret_work)); - if (!ret_work) - return false; - - /* obtain new work from bitcoin via JSON-RPC */ - while (!get_upstream_work(curl, ret_work)) { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { - applog(LOG_ERR, "json_rpc_call failed, terminating workio thread"); - aligned_free(ret_work); - return false; - } - - /* pause, then restart work-request loop */ - applog(LOG_ERR, "json_rpc_call failed, retry after %d seconds", - opt_fail_pause); - sleep(opt_fail_pause); - } - - /* send work to requesting thread */ - if (!tq_push(wc->thr->q, ret_work)) - 
aligned_free(ret_work); - - return true; -} - -static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) -{ - int failures = 0; - - /* submit solution to bitcoin via JSON-RPC */ - while (!submit_upstream_work(curl, wc->u.work)) { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { - applog(LOG_ERR, "...terminating workio thread"); - return false; - } - - /* pause, then restart work-request loop */ - if (!opt_benchmark) - applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); - - sleep(opt_fail_pause); - } - - return true; -} - -static void *workio_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info*)userdata; - CURL *curl; - bool ok = true; - - curl = curl_easy_init(); - if (unlikely(!curl)) { - applog(LOG_ERR, "CURL initialization failed"); - return NULL; - } - - while (ok) { - struct workio_cmd *wc; - - /* wait for workio_cmd sent to us, on our queue */ - wc = (struct workio_cmd *)tq_pop(mythr->q, NULL); - if (!wc) { - ok = false; - break; - } - - /* process workio_cmd */ - switch (wc->cmd) { - case WC_GET_WORK: - ok = workio_get_work(wc, curl); - break; - case WC_SUBMIT_WORK: - ok = workio_submit_work(wc, curl); - break; - - default: /* should never happen */ - ok = false; - break; - } - - workio_cmd_free(wc); - } - - tq_freeze(mythr->q); - curl_easy_cleanup(curl); - - return NULL; -} - -static bool get_work(struct thr_info *thr, struct work *work) -{ - struct workio_cmd *wc; - struct work *work_heap; - - if (opt_benchmark) { - memset(work->data, 0x55, 76); - work->data[17] = swab32((uint32_t)time(NULL)); - memset(work->data + 19, 0x00, 52); - work->data[20] = 0x80000000; - work->data[31] = 0x00000280; - memset(work->target, 0x00, sizeof(work->target)); - return true; - } - - /* fill out work request message */ - wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); - if (!wc) - return false; - - wc->cmd = WC_GET_WORK; - wc->thr = thr; - - /* send work request to workio thread */ - if 
(!tq_push(thr_info[work_thr_id].q, wc)) { - workio_cmd_free(wc); - return false; - } - - /* wait for response, a unit of work */ - work_heap = (struct work *)tq_pop(thr->q, NULL); - if (!work_heap) - return false; - - /* copy returned work into storage provided by caller */ - memcpy(work, work_heap, sizeof(*work)); - free(work_heap); - - return true; -} - -static bool submit_work(struct thr_info *thr, const struct work *work_in) -{ - struct workio_cmd *wc; - /* fill out work request message */ - wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); - if (!wc) - return false; - - wc->u.work = (struct work *)aligned_calloc(sizeof(*work_in)); - if (!wc->u.work) - goto err_out; - - wc->cmd = WC_SUBMIT_WORK; - wc->thr = thr; - memcpy(wc->u.work, work_in, sizeof(*work_in)); - - /* send solution to workio thread */ - if (!tq_push(thr_info[work_thr_id].q, wc)) - goto err_out; - - return true; - -err_out: - workio_cmd_free(wc); - return false; -} - -static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) -{ - unsigned char merkle_root[64]; - int i; - - if (!sctx->job.job_id) { - /* job not yet retrieved */ - return; - } - - pthread_mutex_lock(&sctx->work_lock); - - // store the job ntime as high part of jobid - snprintf(work->job_id, sizeof(work->job_id), "%07x %s", - be32dec(sctx->job.ntime) & 0xfffffff, sctx->job.job_id); - work->xnonce2_len = sctx->xnonce2_size; - memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); - - /* Generate merkle root */ - switch (opt_algo) { - case ALGO_HEAVY: - case ALGO_MJOLLNIR: - heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); - break; - case ALGO_FUGUE256: - case ALGO_GROESTL: - case ALGO_KECCAK: - case ALGO_BLAKECOIN: - case ALGO_WHC: - SHA256((uint8_t*)sctx->job.coinbase, sctx->job.coinbase_size, (uint8_t*)merkle_root); - break; - default: - sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); - } - - for (i = 0; i < sctx->job.merkle_count; i++) { - 
memcpy(merkle_root + 32, sctx->job.merkle[i], 32); - if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR) - heavycoin_hash(merkle_root, merkle_root, 64); - else - sha256d(merkle_root, merkle_root, 64); - } - - /* Increment extranonce2 */ - for (i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); - - /* Assemble block header */ - memset(work->data, 0, sizeof(work->data)); - work->data[0] = le32dec(sctx->job.version); - for (i = 0; i < 8; i++) - work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i); - for (i = 0; i < 8; i++) - work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); - work->data[17] = le32dec(sctx->job.ntime); - work->data[18] = le32dec(sctx->job.nbits); - if (opt_algo == ALGO_MJOLLNIR) - { - for (i = 0; i < 20; i++) - work->data[i] = be32dec((uint32_t *)&work->data[i]); - } - - work->data[20] = 0x80000000; - work->data[31] = (opt_algo == ALGO_MJOLLNIR) ? 0x000002A0 : 0x00000280; - - // HeavyCoin - if (opt_algo == ALGO_HEAVY) { - uint16_t *ext; - work->maxvote = 1024; - ext = (uint16_t*)(&work->data[20]); - ext[0] = opt_vote; - ext[1] = be16dec(sctx->job.nreward); - - for (i = 0; i < 20; i++) - work->data[i] = be32dec((uint32_t *)&work->data[i]); - } - // - - pthread_mutex_unlock(&sctx->work_lock); - - if (opt_debug) { - char *tm = atime2str(swab32(work->data[17]) - sctx->srvtime_diff); - char *xnonce2str = bin2hex(work->xnonce2, sctx->xnonce2_size); - applog(LOG_DEBUG, "DEBUG: job_id=%s xnonce2=%s time=%s", - work->job_id, xnonce2str, tm); - free(tm); - free(xnonce2str); - } - - if (opt_algo == ALGO_JACKPOT) - diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); - else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH) - diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty)); - else if (opt_algo == ALGO_KECCAK) - diff_to_target(work->target, sctx->job.diff / (128.0 * opt_difficulty)); - else - 
diff_to_target(work->target, sctx->job.diff / opt_difficulty); -} - -static void *miner_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info *)userdata; - int thr_id = mythr->id; - struct work work; - uint32_t max_nonce; - uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1); - bool work_done = false; - bool extrajob = false; - char s[16]; - int rc = 0; - - memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized - - /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE - * and if that fails, then SCHED_BATCH. No need for this to be an - * error if it fails */ - if (!opt_benchmark) { - setpriority(PRIO_PROCESS, 0, 19); - drop_policy(); - } - - /* Cpu affinity only makes sense if the number of threads is a multiple - * of the number of CPUs */ - if (num_processors > 1 && opt_n_threads % num_processors == 0) { - if (!opt_quiet) - applog(LOG_DEBUG, "Binding thread %d to cpu %d", thr_id, - thr_id % num_processors); - affine_to_cpu(thr_id, thr_id % num_processors); - } - - while (1) { - unsigned long hashes_done; - uint32_t start_nonce; - struct timeval tv_start, tv_end, diff; - int64_t max64; - uint64_t umax64; - - // &work.data[19] - int wcmplen = 76; - uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); - - if (have_stratum) { - uint32_t sleeptime = 0; - while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) { - usleep(100*1000); - if (sleeptime > 4) { - extrajob = true; - break; - } - sleeptime++; - } - if (sleeptime && opt_debug && !opt_quiet) - applog(LOG_DEBUG, "sleeptime: %u ms", sleeptime*100); - nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); - pthread_mutex_lock(&g_work_lock); - extrajob |= work_done; - if ((*nonceptr) >= end_nonce || extrajob) { - work_done = false; - extrajob = false; - stratum_gen_work(&stratum, &g_work); - } - } else { - int min_scantime = have_longpoll ? 
LP_SCANTIME : opt_scantime; - /* obtain new work from internal workio thread */ - pthread_mutex_lock(&g_work_lock); - if (time(NULL) - g_work_time >= min_scantime || - (*nonceptr) >= end_nonce) { - if (unlikely(!get_work(mythr, &g_work))) { - applog(LOG_ERR, "work retrieval failed, exiting " - "mining thread %d", mythr->id); - pthread_mutex_unlock(&g_work_lock); - goto out; - } - g_work_time = time(NULL); - } - } -#if 0 - if (!opt_benchmark && g_work.job_id[0] == '\0') { - applog(LOG_ERR, "work data not read yet"); - extrajob = true; - work_done = true; - sleep(1); - //continue; - } -#endif - if (rc > 1) { - /* if we found more than one on last loop */ - /* todo: handle an array to get them directly */ - pthread_mutex_unlock(&g_work_lock); - goto continue_scan; - } - - if (memcmp(work.target, g_work.target, sizeof(work.target))) { - calc_diff(&g_work, 0); - if (opt_debug) { - uint64_t target64 = g_work.target[7] * 0x100000000ULL + g_work.target[6]; - applog(LOG_DEBUG, "job %s target change: %llx (%.1f)", g_work.job_id, target64, g_work.difficulty); - } - memcpy(work.target, g_work.target, sizeof(work.target)); - work.difficulty = g_work.difficulty; - (*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr - /* on new target, ignoring nonce, clear sent data (hashlog) */ - if (memcmp(work.target, g_work.target, sizeof(work.target))) { - hashlog_purge_job(work.job_id); - } - } - if (memcmp(work.data, g_work.data, wcmplen)) { - if (opt_debug) { -#if 0 - for (int n=0; n <= (wcmplen-8); n+=8) { - if (memcmp(work.data + n, g_work.data + n, 8)) { - applog(LOG_DEBUG, "job %s work updated at offset %d:", g_work.job_id, n); - applog_hash((uint8_t*) work.data + n); - applog_compare_hash((uint8_t*) g_work.data + n, (uint8_t*) work.data + n); - } - } -#endif - } - memcpy(&work, &g_work, sizeof(struct work)); - (*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr - } else - (*nonceptr)++; //?? 
- work_restart[thr_id].restart = 0; - - if (opt_debug) - applog(LOG_DEBUG, "job %s %08x", g_work.job_id, (*nonceptr)); - pthread_mutex_unlock(&g_work_lock); - - /* adjust max_nonce to meet target scan time */ - if (have_stratum) - max64 = LP_SCANTIME; - else - max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime) - - time(NULL); - - max64 *= (int64_t)thr_hashrates[thr_id]; - - if (max64 <= 0) { - /* should not be set too high, - else you can miss multiple nounces */ - switch (opt_algo) { - case ALGO_JACKPOT: - max64 = 0x1fffLL; - break; - case ALGO_BLAKECOIN: - max64 = 0x3ffffffLL; - break; - case ALGO_BLAKE: - /* based on the 750Ti hashrate (100kH) */ - max64 = 0x1ffffffLL; - break; - default: - max64 = 0xfffffLL; - break; - } - } - - start_nonce = *nonceptr; - - /* do not recompute something already scanned */ - if (opt_algo == ALGO_BLAKE && opt_n_threads == 1) { - union { - uint64_t data; - uint32_t scanned[2]; - } range; - - range.data = hashlog_get_scan_range(work.job_id); - if (range.data) { - bool stall = false; - if (range.scanned[0] == 1 && range.scanned[1] == 0xFFFFFFFFUL) { - applog(LOG_WARNING, "detected a rescan of fully scanned job!"); - } else if (range.scanned[0] > 0 && range.scanned[1] > 0 && range.scanned[1] < 0xFFFFFFF0UL) { - /* continue scan the end */ - start_nonce = range.scanned[1] + 1; - //applog(LOG_DEBUG, "scan the next part %x + 1 (%x-%x)", range.scanned[1], range.scanned[0], range.scanned[1]); - } - - stall = (start_nonce == work.scanned_from && end_nonce == work.scanned_to); - stall |= (start_nonce == work.scanned_from && start_nonce == range.scanned[1] + 1); - stall |= (start_nonce > range.scanned[0] && start_nonce < range.scanned[1]); - - if (stall) { - if (opt_debug && !opt_quiet) - applog(LOG_DEBUG, "job done, wait for a new one..."); - work_restart[thr_id].restart = 1; - hashlog_purge_old(); - // wait a bit for a new job... 
- usleep(500*1000); - (*nonceptr) = end_nonce + 1; - work_done = true; - continue; - } - } - } - - umax64 = (uint64_t) max64; - if ((umax64 + start_nonce) >= end_nonce) - max_nonce = end_nonce; - else - max_nonce = (uint32_t) umax64 + start_nonce; - - work.scanned_from = start_nonce; - (*nonceptr) = start_nonce; - - hashes_done = 0; -continue_scan: - gettimeofday(&tv_start, NULL); - - /* scan nonces for a proof-of-work hash */ - switch (opt_algo) { - - case ALGO_HEAVY: - rc = scanhash_heavy(thr_id, work.data, work.target, - max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ); - break; - - case ALGO_KECCAK: - rc = scanhash_keccak256(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_MJOLLNIR: - rc = scanhash_heavy(thr_id, work.data, work.target, - max_nonce, &hashes_done, 0, MNR_BLKHDR_SZ); - break; - - case ALGO_DEEP: - rc = scanhash_deep(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_DOOM: - case ALGO_LUFFA_DOOM: - rc = scanhash_doom(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_FUGUE256: - rc = scanhash_fugue256(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_GROESTL: - case ALGO_DMD_GR: - rc = scanhash_groestlcoin(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_MYR_GR: - rc = scanhash_myriad(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_JACKPOT: - rc = scanhash_jackpot(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_QUARK: - rc = scanhash_quark(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_QUBIT: - rc = scanhash_qubit(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_ANIME: - rc = scanhash_anime(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_BLAKECOIN: - rc = scanhash_blake256(thr_id, work.data, work.target, 
- max_nonce, &hashes_done, 8); - break; - - case ALGO_BLAKE: - rc = scanhash_blake256(thr_id, work.data, work.target, - max_nonce, &hashes_done, 14); - break; - - case ALGO_FRESH: - rc = scanhash_fresh(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_NIST5: - rc = scanhash_nist5(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_PENTABLAKE: - rc = scanhash_pentablake(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_S3: - rc = scanhash_s3(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_WHC: - rc = scanhash_whc(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_X11: - rc = scanhash_x11(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_X13: - rc = scanhash_x13(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_X14: - rc = scanhash_x14(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_X15: - rc = scanhash_x15(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_X17: - rc = scanhash_x17(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - default: - /* should never happen */ - goto out; - } - - /* record scanhash elapsed time */ - gettimeofday(&tv_end, NULL); - - if (rc && opt_debug) - applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", *nonceptr, swab32(*nonceptr)); - - timeval_subtract(&diff, &tv_end, &tv_start); - if (diff.tv_usec || diff.tv_sec) { - pthread_mutex_lock(&stats_lock); - if (diff.tv_sec + 1e-6 * diff.tv_usec > 0.0) { - thr_hashrates[thr_id] = hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec); - if (rc > 1) - thr_hashrates[thr_id] = (rc * hashes_done) / (diff.tv_sec + 1e-6 * diff.tv_usec); - } - pthread_mutex_unlock(&stats_lock); - } - if (!opt_quiet) { - sprintf(s, thr_hashrates[thr_id] >= 1e6 ? 
"%.0f" : "%.2f", - 1e-3 * thr_hashrates[thr_id]); - applog(LOG_INFO, "GPU #%d: %s, %s kH/s", - device_map[thr_id], device_name[thr_id], s); - } - if (thr_id == opt_n_threads - 1) { - double hashrate = 0.; - for (int i = 0; i < opt_n_threads && thr_hashrates[i]; i++) - hashrate += thr_hashrates[i]; - - global_hashrate = llround(hashrate); - if (opt_benchmark) { - sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", hashrate / 1000.); - applog(LOG_NOTICE, "Total: %s kH/s", s); - } - } - - if (rc) { - work.scanned_to = *nonceptr; - } else { - work.scanned_to = max_nonce; - } - - // could be used to store speeds too.. - hashlog_remember_scan_range(work.job_id, work.scanned_from, work.scanned_to); - - /* if nonce found, submit work */ - if (rc) { - if (!opt_benchmark && !submit_work(mythr, &work)) - break; - } - } - -out: - tq_freeze(mythr->q); - - return NULL; -} - -static void restart_threads(void) -{ - int i; - - for (i = 0; i < opt_n_threads; i++) - work_restart[i].restart = 1; -} - -static void *longpoll_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info *)userdata; - CURL *curl = NULL; - char *copy_start, *hdr_path = NULL, *lp_url = NULL; - bool need_slash = false; - - curl = curl_easy_init(); - if (unlikely(!curl)) { - applog(LOG_ERR, "CURL initialization failed"); - goto out; - } - -start: - hdr_path = (char*)tq_pop(mythr->q, NULL); - if (!hdr_path) - goto out; - - /* full URL */ - if (strstr(hdr_path, "://")) { - lp_url = hdr_path; - hdr_path = NULL; - } - - /* absolute path, on current server */ - else { - copy_start = (*hdr_path == '/') ? (hdr_path + 1) : hdr_path; - if (rpc_url[strlen(rpc_url) - 1] != '/') - need_slash = true; - - lp_url = (char*)malloc(strlen(rpc_url) + strlen(copy_start) + 2); - if (!lp_url) - goto out; - - sprintf(lp_url, "%s%s%s", rpc_url, need_slash ? 
"/" : "", copy_start); - } - - applog(LOG_INFO, "Long-polling activated for %s", lp_url); - - while (1) { - json_t *val, *soval; - int err; - - val = json_rpc_call(curl, lp_url, rpc_userpass, rpc_req, - false, true, &err); - if (have_stratum) { - if (val) - json_decref(val); - goto out; - } - if (likely(val)) { - if (!opt_quiet) applog(LOG_INFO, "LONGPOLL detected new block"); - soval = json_object_get(json_object_get(val, "result"), "submitold"); - submit_old = soval ? json_is_true(soval) : false; - pthread_mutex_lock(&g_work_lock); - if (work_decode(json_object_get(val, "result"), &g_work)) { - if (opt_debug) - applog(LOG_BLUE, "LONGPOLL pushed new work"); - time(&g_work_time); - restart_threads(); - } - pthread_mutex_unlock(&g_work_lock); - json_decref(val); - } else { - pthread_mutex_lock(&g_work_lock); - g_work_time -= LP_SCANTIME; - pthread_mutex_unlock(&g_work_lock); - if (err == CURLE_OPERATION_TIMEDOUT) { - restart_threads(); - } else { - have_longpoll = false; - restart_threads(); - free(hdr_path); - free(lp_url); - lp_url = NULL; - sleep(opt_fail_pause); - goto start; - } - } - } - -out: - free(hdr_path); - free(lp_url); - tq_freeze(mythr->q); - if (curl) - curl_easy_cleanup(curl); - - return NULL; -} - -static bool stratum_handle_response(char *buf) -{ - json_t *val, *err_val, *res_val, *id_val; - json_error_t err; - bool ret = false; - - val = JSON_LOADS(buf, &err); - if (!val) { - applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - id_val = json_object_get(val, "id"); - - if (!id_val || json_is_null(id_val) || !res_val) - goto out; - - share_result(json_is_true(res_val), - err_val ? 
json_string_value(json_array_get(err_val, 1)) : NULL); - - ret = true; -out: - if (val) - json_decref(val); - - return ret; -} - -static void *stratum_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info *)userdata; - char *s; - - stratum.url = (char*)tq_pop(mythr->q, NULL); - if (!stratum.url) - goto out; - applog(LOG_BLUE, "Starting Stratum on %s", stratum.url); - - while (1) { - int failures = 0; - - while (!stratum.curl) { - pthread_mutex_lock(&g_work_lock); - g_work_time = 0; - pthread_mutex_unlock(&g_work_lock); - restart_threads(); - - if (!stratum_connect(&stratum, stratum.url) || - !stratum_subscribe(&stratum) || - !stratum_authorize(&stratum, rpc_user, rpc_pass)) { - stratum_disconnect(&stratum); - if (opt_retries >= 0 && ++failures > opt_retries) { - applog(LOG_ERR, "...terminating workio thread"); - tq_push(thr_info[work_thr_id].q, NULL); - goto out; - } - if (!opt_benchmark) - applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); - sleep(opt_fail_pause); - } - } - - if (stratum.job.job_id && - (!g_work_time || strncmp(stratum.job.job_id, g_work.job_id + 8, 120))) { - pthread_mutex_lock(&g_work_lock); - stratum_gen_work(&stratum, &g_work); - time(&g_work_time); - if (stratum.job.clean) { - if (!opt_quiet) - applog(LOG_BLUE, "%s sent %s block %d", short_url, algo_names[opt_algo], - stratum.bloc_height); - restart_threads(); - hashlog_purge_old(); - } else if (!opt_quiet) { - applog(LOG_BLUE, "%s asks job %d for block %d", short_url, - strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height); - } - pthread_mutex_unlock(&g_work_lock); - } - - if (!stratum_socket_full(&stratum, 120)) { - applog(LOG_ERR, "Stratum connection timed out"); - s = NULL; - } else - s = stratum_recv_line(&stratum); - if (!s) { - stratum_disconnect(&stratum); - applog(LOG_ERR, "Stratum connection interrupted"); - continue; - } - if (!stratum_handle_method(&stratum, s)) - stratum_handle_response(s); - free(s); - } - -out: - return NULL; -} - -#define 
PROGRAM_VERSION "1.4.7.SP" -static void show_version_and_exit(void) -{ - printf("%s v%s\n" -#ifdef WIN32 - "pthreads static %s\n" -#endif - "%s\n", - PACKAGE_STRING, PROGRAM_VERSION, -#ifdef WIN32 - PTW32_VERSION_STRING, -#endif - curl_version()); - proper_exit(0); -} - -static void show_usage_and_exit(int status) -{ - if (status) - fprintf(stderr, "Try `" PROGRAM_NAME " --help' for more information.\n"); - else - printf(usage); - proper_exit(status); -} - -static void parse_arg (int key, char *arg) -{ - char *p; - int v, i; - double d; - - switch(key) { - case 'a': - for (i = 0; i < ARRAY_SIZE(algo_names); i++) { - if (algo_names[i] && - !strcmp(arg, algo_names[i])) { - opt_algo = (sha256_algos)i; - break; - } - } - if (i == ARRAY_SIZE(algo_names)) - show_usage_and_exit(1); - break; - case 'B': - opt_background = true; - break; - case 'c': { - json_error_t err; - if (opt_config) - json_decref(opt_config); -#if JANSSON_VERSION_HEX >= 0x020000 - opt_config = json_load_file(arg, 0, &err); -#else - opt_config = json_load_file(arg, &err); -#endif - if (!json_is_object(opt_config)) { - applog(LOG_ERR, "JSON decode of %s failed", arg); - proper_exit(1); - } - break; - } - case 'C': - /* color for compat */ - use_colors = true; - break; - case 'K': - use_colors = false; - break; - case 'D': - opt_debug = true; - break; - case 'q': - opt_quiet = true; - break; - case 'p': - free(rpc_pass); - rpc_pass = strdup(arg); - break; - case 'P': - opt_protocol = true; - break; - case 'r': - v = atoi(arg); - if (v < -1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_retries = v; - break; - case 'R': - v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_fail_pause = v; - break; - case 's': - v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_scantime = v; - break; - case 'T': - v = atoi(arg); - if (v < 1 || v > 99999) /* sanity check */ - show_usage_and_exit(1); - opt_timeout = v; - break; - 
case 't': - v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_n_threads = v; - break; - case 'v': - v = atoi(arg); - if (v < 0 || v > 1024) /* sanity check */ - show_usage_and_exit(1); - opt_vote = (uint16_t)v; - break; - case 'm': - opt_trust_pool = true; - break; - case 'u': - free(rpc_user); - rpc_user = strdup(arg); - break; - case 'o': /* --url */ - p = strstr(arg, "://"); - if (p) { - if (strncasecmp(arg, "http://", 7) && strncasecmp(arg, "https://", 8) && - strncasecmp(arg, "stratum+tcp://", 14)) - show_usage_and_exit(1); - free(rpc_url); - rpc_url = strdup(arg); - short_url = &rpc_url[(p - arg) + 3]; - } else { - if (!strlen(arg) || *arg == '/') - show_usage_and_exit(1); - free(rpc_url); - rpc_url = (char*)malloc(strlen(arg) + 8); - sprintf(rpc_url, "http://%s", arg); - short_url = &rpc_url[7]; - } - p = strrchr(rpc_url, '@'); - if (p) { - char *sp, *ap; - *p = '\0'; - ap = strstr(rpc_url, "://") + 3; - sp = strchr(ap, ':'); - if (sp) { - free(rpc_userpass); - rpc_userpass = strdup(ap); - free(rpc_user); - rpc_user = (char*)calloc(sp - ap + 1, 1); - strncpy(rpc_user, ap, sp - ap); - free(rpc_pass); - rpc_pass = strdup(sp + 1); - } else { - free(rpc_user); - rpc_user = strdup(ap); - } - memmove(ap, p + 1, strlen(p + 1) + 1); - short_url = p + 1; - } - have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7); - break; - case 'O': /* --userpass */ - p = strchr(arg, ':'); - if (!p) - show_usage_and_exit(1); - free(rpc_userpass); - rpc_userpass = strdup(arg); - free(rpc_user); - rpc_user = (char*)calloc(p - arg + 1, 1); - strncpy(rpc_user, arg, p - arg); - free(rpc_pass); - rpc_pass = strdup(p + 1); - break; - case 'x': /* --proxy */ - if (!strncasecmp(arg, "socks4://", 9)) - opt_proxy_type = CURLPROXY_SOCKS4; - else if (!strncasecmp(arg, "socks5://", 9)) - opt_proxy_type = CURLPROXY_SOCKS5; -#if LIBCURL_VERSION_NUM >= 0x071200 - else if (!strncasecmp(arg, "socks4a://", 10)) - opt_proxy_type = 
CURLPROXY_SOCKS4A; - else if (!strncasecmp(arg, "socks5h://", 10)) - opt_proxy_type = CURLPROXY_SOCKS5_HOSTNAME; -#endif - else - opt_proxy_type = CURLPROXY_HTTP; - free(opt_proxy); - opt_proxy = strdup(arg); - break; - case 1001: - free(opt_cert); - opt_cert = strdup(arg); - break; - case 1005: - opt_benchmark = true; - want_longpoll = false; - want_stratum = false; - have_stratum = false; - break; - case 1006: - print_hash_tests(); - proper_exit(0); - break; - case 1003: - want_longpoll = false; - break; - case 1007: - want_stratum = false; - break; - case 'S': - use_syslog = true; - break; - case 'd': // CB - { - char * pch = strtok (arg,","); - opt_n_threads = 0; - while (pch != NULL) { - if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0') - { - if (atoi(pch) < num_processors) - device_map[opt_n_threads++] = atoi(pch); - else { - applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch)); - proper_exit(1); - } - } else { - int device = cuda_finddevice(pch); - if (device >= 0 && device < num_processors) - device_map[opt_n_threads++] = device; - else { - applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch); - proper_exit(1); - } - } - pch = strtok (NULL, ","); - } - } - break; - case 'f': // CH - Divisor for Difficulty - d = atof(arg); - if (d == 0) /* sanity check */ - show_usage_and_exit(1); - opt_difficulty = d; - break; - case 'V': - show_version_and_exit(); - case 'h': - show_usage_and_exit(0); - default: - show_usage_and_exit(1); - } - - if (use_syslog) - use_colors = false; -} - -static void parse_config(void) -{ - int i; - json_t *val; - - if (!json_is_object(opt_config)) - return; - - for (i = 0; i < ARRAY_SIZE(options); i++) { - if (!options[i].name) - break; - if (!strcmp(options[i].name, "config")) - continue; - - val = json_object_get(opt_config, options[i].name); - if (!val) - continue; - - if (options[i].has_arg && json_is_string(val)) { - char *s = strdup(json_string_value(val)); - if (!s) - break; 
- parse_arg(options[i].val, s); - free(s); - } else if (!options[i].has_arg && json_is_true(val)) - parse_arg(options[i].val, ""); - else - applog(LOG_ERR, "JSON option %s invalid", - options[i].name); - } - - if (opt_algo == ALGO_HEAVY && opt_vote == 9999) { - fprintf(stderr, "Heavycoin hash requires block reward vote parameter (see --vote)\n"); - show_usage_and_exit(1); - } -} - -static void parse_cmdline(int argc, char *argv[]) -{ - int key; - - while (1) { -#if HAVE_GETOPT_LONG - key = getopt_long(argc, argv, short_options, options, NULL); -#else - key = getopt(argc, argv, short_options); -#endif - if (key < 0) - break; - - parse_arg(key, optarg); - } - if (optind < argc) { - fprintf(stderr, "%s: unsupported non-option argument '%s'\n", - argv[0], argv[optind]); - show_usage_and_exit(1); - } - - if (opt_algo == ALGO_HEAVY && opt_vote == 9999) { - fprintf(stderr, "%s: Heavycoin hash requires block reward vote parameter (see --vote)\n", - argv[0]); - show_usage_and_exit(1); - } - - parse_config(); -} - -#ifndef WIN32 -static void signal_handler(int sig) -{ - switch (sig) { - case SIGHUP: - applog(LOG_INFO, "SIGHUP received"); - break; - case SIGINT: - signal(sig, SIG_IGN); - applog(LOG_INFO, "SIGINT received, exiting"); - proper_exit(0); - break; - case SIGTERM: - applog(LOG_INFO, "SIGTERM received, exiting"); - proper_exit(0); - break; - } -} -#else -BOOL WINAPI ConsoleHandler(DWORD dwType) -{ - switch (dwType) { - case CTRL_C_EVENT: - applog(LOG_INFO, "CTRL_C_EVENT received, exiting"); - proper_exit(0); - break; - case CTRL_BREAK_EVENT: - applog(LOG_INFO, "CTRL_BREAK_EVENT received, exiting"); - proper_exit(0); - break; - default: - return false; - } - return true; -} -#endif - -int main(int argc, char *argv[]) -{ - struct thr_info *thr; - long flags; - int i; - - printf("*** ccMiner for nVidia GPUs by Christian Buchner and Christian H. 
***\n"); - printf("\t This is the forked version "PROGRAM_VERSION" (sp-hash@github)\n"); -#ifdef WIN32 - printf("\t Built with VC++ 2013 and nVidia CUDA SDK 6.5\n\n"); -#else - printf("\t Built with the nVidia CUDA SDK 6.5\n\n"); -#endif - printf("\t based on pooler-cpuminer 2.3.2 (c) 2010 Jeff Garzik, 2012 pooler\n"); - printf("\t and HVC extension from http://hvc.1gh.com/" "\n\n"); - printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n\n"); - printf("\tInclude some of djm34 additions, cleaned by Tanguy Pruvot\n"); - printf("\t\t Optimized Kernals By SP^Cryptoburners.\n\n"); - - rpc_user = strdup(""); - rpc_pass = strdup(""); - - pthread_mutex_init(&applog_lock, NULL); - num_processors = cuda_num_devices(); - - /* parse command line */ - parse_cmdline(argc, argv); - - cuda_devicenames(); - - if (!opt_benchmark && !rpc_url) { - fprintf(stderr, "%s: no URL supplied\n", argv[0]); - show_usage_and_exit(1); - } - - if (!rpc_userpass) { - rpc_userpass = (char*)malloc(strlen(rpc_user) + strlen(rpc_pass) + 2); - if (!rpc_userpass) - return 1; - sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass); - } - - /* init stratum data.. */ - memset(&stratum.url, 0, sizeof(stratum)); - - pthread_mutex_init(&stats_lock, NULL); - pthread_mutex_init(&g_work_lock, NULL); - pthread_mutex_init(&stratum.sock_lock, NULL); - pthread_mutex_init(&stratum.work_lock, NULL); - - flags = !opt_benchmark && strncmp(rpc_url, "https:", 6) - ? 
(CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) - : CURL_GLOBAL_ALL; - if (curl_global_init(flags)) { - applog(LOG_ERR, "CURL initialization failed"); - return 1; - } - -#ifndef WIN32 - if (opt_background) { - i = fork(); - if (i < 0) exit(1); - if (i > 0) exit(0); - i = setsid(); - if (i < 0) - applog(LOG_ERR, "setsid() failed (errno = %d)", errno); - i = chdir("/"); - if (i < 0) - applog(LOG_ERR, "chdir() failed (errno = %d)", errno); - signal(SIGHUP, signal_handler); - signal(SIGTERM, signal_handler); - } - /* Always catch Ctrl+C */ - signal(SIGINT, signal_handler); -#else - SetConsoleCtrlHandler((PHANDLER_ROUTINE)ConsoleHandler, TRUE); -#endif - - if (num_processors == 0) - { - applog(LOG_ERR, "No CUDA devices found! terminating."); - exit(1); - } - if (!opt_n_threads) - opt_n_threads = num_processors; - -#ifdef HAVE_SYSLOG_H - if (use_syslog) - openlog("cpuminer", LOG_PID, LOG_USER); -#endif - - work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart)); - if (!work_restart) - return 1; - - thr_info = (struct thr_info *)calloc(opt_n_threads + 3, sizeof(*thr)); - if (!thr_info) - return 1; - - thr_hashrates = (double *) calloc(opt_n_threads, sizeof(double)); - if (!thr_hashrates) - return 1; - - /* init workio thread info */ - work_thr_id = opt_n_threads; - thr = &thr_info[work_thr_id]; - thr->id = work_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - - /* start work I/O thread */ - if (pthread_create(&thr->pth, NULL, workio_thread, thr)) { - applog(LOG_ERR, "workio thread create failed"); - return 1; - } - - if (want_longpoll && !have_stratum) { - /* init longpoll thread info */ - longpoll_thr_id = opt_n_threads + 1; - thr = &thr_info[longpoll_thr_id]; - thr->id = longpoll_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - - /* start longpoll thread */ - if (unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) { - applog(LOG_ERR, "longpoll thread create failed"); - return 1; - } - } - if (want_stratum) { - /* init 
stratum thread info */ - stratum_thr_id = opt_n_threads + 2; - thr = &thr_info[stratum_thr_id]; - thr->id = stratum_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - - /* start stratum thread */ - if (unlikely(pthread_create(&thr->pth, NULL, stratum_thread, thr))) { - applog(LOG_ERR, "stratum thread create failed"); - return 1; - } - - if (have_stratum) - tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); - } - - /* start mining threads */ - for (i = 0; i < opt_n_threads; i++) { - thr = &thr_info[i]; - - thr->id = i; - thr->q = tq_new(); - if (!thr->q) - return 1; - - if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) { - applog(LOG_ERR, "thread %d create failed", i); - return 1; - } - } - - applog(LOG_INFO, "%d miner threads started, " - "using '%s' algorithm.", - opt_n_threads, - algo_names[opt_algo]); - -#ifdef WIN32 - timeBeginPeriod(1); // enable high timer precision (similar to Google Chrome Trick) -#endif - - /* main loop - simply wait for workio thread to exit */ - pthread_join(thr_info[work_thr_id].pth, NULL); - -#ifdef WIN32 - timeEndPeriod(1); // be nice and forego high timer precision -#endif - - applog(LOG_INFO, "workio thread dead, exiting."); - - return 0; -} diff --git a/cpuminer-config.h.in b/cpuminer-config.h.in deleted file mode 100644 index c172559ba5..0000000000 --- a/cpuminer-config.h.in +++ /dev/null @@ -1,196 +0,0 @@ -/* cpuminer-config.h.in. Generated from configure.ac by autoheader. */ - -/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP - systems. This function is required for `alloca.c' support on those systems. - */ -#undef CRAY_STACKSEG_END - -/* Define to 1 if using `alloca.c'. */ -#undef C_ALLOCA - -/* Define to 1 if you have `alloca', as a function or macro. */ -#undef HAVE_ALLOCA - -/* Define to 1 if you have and it should be used (not on Ultrix). - */ -#undef HAVE_ALLOCA_H - -/* Define to 1 if you have the declaration of `be32dec', and to 0 if you - don't. 
*/ -#undef HAVE_DECL_BE32DEC - -/* Define to 1 if you have the declaration of `be32enc', and to 0 if you - don't. */ -#undef HAVE_DECL_BE32ENC - -/* Define to 1 if you have the declaration of `le32dec', and to 0 if you - don't. */ -#undef HAVE_DECL_LE32DEC - -/* Define to 1 if you have the declaration of `le32enc', and to 0 if you - don't. */ -#undef HAVE_DECL_LE32ENC - -/* Define to 1 if you have the `getopt_long' function. */ -#undef HAVE_GETOPT_LONG - -/* Define to 1 if you have the header file. */ -#undef HAVE_INTTYPES_H - -/* Define to 1 if you have the `crypto' library (-lcrypto). */ -#undef HAVE_LIBCRYPTO - -/* Define to 1 if you have a functional curl library. */ -#undef HAVE_LIBCURL - -/* Define to 1 if you have the `ssl' library (-lssl). */ -#undef HAVE_LIBSSL - -/* Define to 1 if you have the header file. */ -#undef HAVE_MEMORY_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDINT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDLIB_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRINGS_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRING_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYSLOG_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_ENDIAN_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_PARAM_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_STAT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_SYSCTL_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_TYPES_H - -/* Define to 1 if you have the header file. 
*/ -#undef HAVE_UNISTD_H - -/* Defined if libcurl supports AsynchDNS */ -#undef LIBCURL_FEATURE_ASYNCHDNS - -/* Defined if libcurl supports IDN */ -#undef LIBCURL_FEATURE_IDN - -/* Defined if libcurl supports IPv6 */ -#undef LIBCURL_FEATURE_IPV6 - -/* Defined if libcurl supports KRB4 */ -#undef LIBCURL_FEATURE_KRB4 - -/* Defined if libcurl supports libz */ -#undef LIBCURL_FEATURE_LIBZ - -/* Defined if libcurl supports NTLM */ -#undef LIBCURL_FEATURE_NTLM - -/* Defined if libcurl supports SSL */ -#undef LIBCURL_FEATURE_SSL - -/* Defined if libcurl supports SSPI */ -#undef LIBCURL_FEATURE_SSPI - -/* Defined if libcurl supports DICT */ -#undef LIBCURL_PROTOCOL_DICT - -/* Defined if libcurl supports FILE */ -#undef LIBCURL_PROTOCOL_FILE - -/* Defined if libcurl supports FTP */ -#undef LIBCURL_PROTOCOL_FTP - -/* Defined if libcurl supports FTPS */ -#undef LIBCURL_PROTOCOL_FTPS - -/* Defined if libcurl supports HTTP */ -#undef LIBCURL_PROTOCOL_HTTP - -/* Defined if libcurl supports HTTPS */ -#undef LIBCURL_PROTOCOL_HTTPS - -/* Defined if libcurl supports IMAP */ -#undef LIBCURL_PROTOCOL_IMAP - -/* Defined if libcurl supports LDAP */ -#undef LIBCURL_PROTOCOL_LDAP - -/* Defined if libcurl supports POP3 */ -#undef LIBCURL_PROTOCOL_POP3 - -/* Defined if libcurl supports RTSP */ -#undef LIBCURL_PROTOCOL_RTSP - -/* Defined if libcurl supports SMTP */ -#undef LIBCURL_PROTOCOL_SMTP - -/* Defined if libcurl supports TELNET */ -#undef LIBCURL_PROTOCOL_TELNET - -/* Defined if libcurl supports TFTP */ -#undef LIBCURL_PROTOCOL_TFTP - -/* Name of package */ -#undef PACKAGE - -/* Define to the address where bug reports for this package should be sent. */ -#undef PACKAGE_BUGREPORT - -/* Define to the full name of this package. */ -#undef PACKAGE_NAME - -/* Define to the full name and version of this package. */ -#undef PACKAGE_STRING - -/* Define to the one symbol short name of this package. */ -#undef PACKAGE_TARNAME - -/* Define to the home page for this package. 
*/ -#undef PACKAGE_URL - -/* Define to the version of this package. */ -#undef PACKAGE_VERSION - -/* If using the C implementation of alloca, define if you know the - direction of stack growth for your system; otherwise it will be - automatically deduced at runtime. - STACK_DIRECTION > 0 => grows toward higher addresses - STACK_DIRECTION < 0 => grows toward lower addresses - STACK_DIRECTION = 0 => direction of growth unknown */ -#undef STACK_DIRECTION - -/* Define to 1 if you have the ANSI C header files. */ -#undef STDC_HEADERS - -/* Define to 1 if AVX assembly is available. */ -#undef USE_AVX - -/* Define to 1 if AVX2 assembly is available. */ -#undef USE_AVX2 - -/* Define to 1 if XOP assembly is available. */ -#undef USE_XOP - -/* Version number of package */ -#undef VERSION - -/* Define curl_free() as free() if our version of curl lacks curl_free. */ -#undef curl_free - -/* Define to `unsigned int' if does not define. */ -#undef size_t diff --git a/crc32.c b/crc32.c index f036bcbd3b..ad65c57120 100644 --- a/crc32.c +++ b/crc32.c @@ -40,8 +40,13 @@ * CRC32 code derived from work by Gary S. Brown. */ -#include +#ifdef __cplusplus +#include +#include +#else #include +#include +#endif static uint32_t crc32_tab[] = { 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, diff --git a/cuPrintf.cu b/cuPrintf.cu deleted file mode 100644 index f06653f2db..0000000000 --- a/cuPrintf.cu +++ /dev/null @@ -1,879 +0,0 @@ -/* - Copyright 2009 NVIDIA Corporation. All rights reserved. - - NOTICE TO LICENSEE: - - This source code and/or documentation ("Licensed Deliverables") are subject - to NVIDIA intellectual property rights under U.S. and international Copyright - laws. - - These Licensed Deliverables contained herein is PROPRIETARY and CONFIDENTIAL - to NVIDIA and is being provided under the terms and conditions of a form of - NVIDIA software license agreement by and between NVIDIA and Licensee ("License - Agreement") or electronically accepted by Licensee. 
Notwithstanding any terms - or conditions to the contrary in the License Agreement, reproduction or - disclosure of the Licensed Deliverables to any third party without the express - written consent of NVIDIA is prohibited. - - NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE AGREEMENT, - NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THESE LICENSED - DELIVERABLES FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED - WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE - LICENSED DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. NOTWITHSTANDING ANY - TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE AGREEMENT, IN NO EVENT SHALL - NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, - OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER - IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THESE LICENSED DELIVERABLES. - - U.S. Government End Users. These Licensed Deliverables are a "commercial item" - as that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of - "commercial computer software" and "commercial computer software documentation" - as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the - U.S. Government only as a commercial end item. Consistent with 48 C.F.R.12.212 - and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all U.S. Government - End Users acquire the Licensed Deliverables with only those rights set forth - herein. - - Any use of the Licensed Deliverables in individual and commercial software must - include, in the user documentation and internal comments to the code, the above - Disclaimer and U.S. Government End Users Notice. - */ - -/* - * cuPrintf.cu - * - * This is a printf command callable from within a kernel. 
It is set - * up so that output is sent to a memory buffer, which is emptied from - * the host side - but only after a cudaThreadSynchronize() on the host. - * - * Currently, there is a limitation of around 200 characters of output - * and no more than 10 arguments to a single cuPrintf() call. Issue - * multiple calls if longer format strings are required. - * - * It requires minimal setup, and is *NOT* optimised for performance. - * For example, writes are not coalesced - this is because there is an - * assumption that people will not want to printf from every single one - * of thousands of threads, but only from individual threads at a time. - * - * Using this is simple - it requires one host-side call to initialise - * everything, and then kernels can call cuPrintf at will. Sample code - * is the easiest way to demonstrate: - * - #include "cuPrintf.cu" - - __global__ void testKernel(int val) - { - cuPrintf("Value is: %d\n", val); - } - - int main() - { - cudaPrintfInit(); - testKernel<<< 2, 3 >>>(10); - cudaPrintfDisplay(stdout, true); - cudaPrintfEnd(); - return 0; - } - * - * See the header file, "cuPrintf.cuh" for more info, especially - * arguments to cudaPrintfInit() and cudaPrintfDisplay(); - */ - -#ifndef CUPRINTF_CU -#define CUPRINTF_CU - -#include "cuPrintf.cuh" -#if __CUDA_ARCH__ > 100 // Atomics only used with > sm_10 architecture -#include -#endif - -// This is the smallest amount of memory, per-thread, which is allowed. -// It is also the largest amount of space a single printf() can take up -const static int CUPRINTF_MAX_LEN = 256; - -// This structure is used internally to track block/thread output restrictions. -typedef struct __align__(8) { - int threadid; // CUPRINTF_UNRESTRICTED for unrestricted - int blockid; // CUPRINTF_UNRESTRICTED for unrestricted -} cuPrintfRestriction; - -// The main storage is in a global print buffer, which has a known -// start/end/length. These are atomically updated so it works as a -// circular buffer. 
-// Since the only control primitive that can be used is atomicAdd(), -// we cannot wrap the pointer as such. The actual address must be -// calculated from printfBufferPtr by mod-ing with printfBufferLength. -// For sm_10 architecture, we must subdivide the buffer per-thread -// since we do not even have an atomic primitive. -__constant__ static char *globalPrintfBuffer = NULL; // Start of circular buffer (set up by host) -__constant__ static int printfBufferLength = 0; // Size of circular buffer (set up by host) -__device__ static cuPrintfRestriction restrictRules; // Output restrictions -__device__ volatile static char *printfBufferPtr = NULL; // Current atomically-incremented non-wrapped offset - -// This is the header preceeding all printf entries. -// NOTE: It *must* be size-aligned to the maximum entity size (size_t) -typedef struct __align__(8) { - unsigned short magic; // Magic number says we're valid - unsigned short fmtoffset; // Offset of fmt string into buffer - unsigned short blockid; // Block ID of author - unsigned short threadid; // Thread ID of author -} cuPrintfHeader; - -// Special header for sm_10 architecture -#define CUPRINTF_SM10_MAGIC 0xC810 // Not a valid ascii character -typedef struct __align__(16) { - unsigned short magic; // sm_10 specific magic number - unsigned short unused; - unsigned int thread_index; // thread ID for this buffer - unsigned int thread_buf_len; // per-thread buffer length - unsigned int offset; // most recent printf's offset -} cuPrintfHeaderSM10; - - -// Because we can't write an element which is not aligned to its bit-size, -// we have to align all sizes and variables on maximum-size boundaries. 
-// That means sizeof(double) in this case, but we'll use (long long) for -// better arch<1.3 support -#define CUPRINTF_ALIGN_SIZE sizeof(long long) - -// All our headers are prefixed with a magic number so we know they're ready -#define CUPRINTF_SM11_MAGIC (unsigned short)0xC811 // Not a valid ascii character - - -// -// getNextPrintfBufPtr -// -// Grabs a block of space in the general circular buffer, using an -// atomic function to ensure that it's ours. We handle wrapping -// around the circular buffer and return a pointer to a place which -// can be written to. -// -// Important notes: -// 1. We always grab CUPRINTF_MAX_LEN bytes -// 2. Because of 1, we never worry about wrapping around the end -// 3. Because of 1, printfBufferLength *must* be a factor of CUPRINTF_MAX_LEN -// -// This returns a pointer to the place where we own. -// -__device__ static char *getNextPrintfBufPtr() -{ - // Initialisation check - if(!printfBufferPtr) - return NULL; - - // Thread/block restriction check - if((restrictRules.blockid != CUPRINTF_UNRESTRICTED) && (restrictRules.blockid != (blockIdx.x + gridDim.x*blockIdx.y))) - return NULL; - if((restrictRules.threadid != CUPRINTF_UNRESTRICTED) && (restrictRules.threadid != (threadIdx.x + blockDim.x*threadIdx.y + blockDim.x*blockDim.y*threadIdx.z))) - return NULL; - - // Conditional section, dependent on architecture -#if __CUDA_ARCH__ == 100 - // For sm_10 architectures, we have no atomic add - this means we must split the - // entire available buffer into per-thread blocks. Inefficient, but what can you do. - int thread_count = (gridDim.x * gridDim.y) * (blockDim.x * blockDim.y * blockDim.z); - int thread_index = threadIdx.x + blockDim.x*threadIdx.y + blockDim.x*blockDim.y*threadIdx.z + - (blockIdx.x + gridDim.x*blockIdx.y) * (blockDim.x * blockDim.y * blockDim.z); - - // Find our own block of data and go to it. 
Make sure the per-thread length - // is a precise multiple of CUPRINTF_MAX_LEN, otherwise we risk size and - // alignment issues! We must round down, of course. - unsigned int thread_buf_len = printfBufferLength / thread_count; - thread_buf_len &= ~(CUPRINTF_MAX_LEN-1); - - // We *must* have a thread buffer length able to fit at least two printfs (one header, one real) - if(thread_buf_len < (CUPRINTF_MAX_LEN * 2)) - return NULL; - - // Now address our section of the buffer. The first item is a header. - char *myPrintfBuffer = globalPrintfBuffer + (thread_buf_len * thread_index); - cuPrintfHeaderSM10 hdr = *(cuPrintfHeaderSM10 *)(void *)myPrintfBuffer; - if(hdr.magic != CUPRINTF_SM10_MAGIC) - { - // If our header is not set up, initialise it - hdr.magic = CUPRINTF_SM10_MAGIC; - hdr.thread_index = thread_index; - hdr.thread_buf_len = thread_buf_len; - hdr.offset = 0; // Note we start at 0! We pre-increment below. - *(cuPrintfHeaderSM10 *)(void *)myPrintfBuffer = hdr; // Write back the header - - // For initial setup purposes, we might need to init thread0's header too - // (so that cudaPrintfDisplay() below will work). This is only run once. - cuPrintfHeaderSM10 *tophdr = (cuPrintfHeaderSM10 *)(void *)globalPrintfBuffer; - tophdr->thread_buf_len = thread_buf_len; - } - - // Adjust the offset by the right amount, and wrap it if need be - unsigned int offset = hdr.offset + CUPRINTF_MAX_LEN; - if(offset >= hdr.thread_buf_len) - offset = CUPRINTF_MAX_LEN; - - // Write back the new offset for next time and return a pointer to it - ((cuPrintfHeaderSM10 *)(void *)myPrintfBuffer)->offset = offset; - return myPrintfBuffer + offset; -#else - // Much easier with an atomic operation! 
- size_t offset = atomicAdd((unsigned int *)&printfBufferPtr, CUPRINTF_MAX_LEN) - (size_t)globalPrintfBuffer; - offset %= printfBufferLength; - return globalPrintfBuffer + offset; -#endif -} - - -// -// writePrintfHeader -// -// Inserts the header for containing our UID, fmt position and -// block/thread number. We generate it dynamically to avoid -// issues arising from requiring pre-initialisation. -// -__device__ static void writePrintfHeader(char *ptr, char *fmtptr) -{ - if(ptr) - { - cuPrintfHeader header; - header.magic = CUPRINTF_SM11_MAGIC; - header.fmtoffset = (unsigned short)(fmtptr - ptr); - header.blockid = blockIdx.x + gridDim.x*blockIdx.y; - header.threadid = threadIdx.x + blockDim.x*threadIdx.y + blockDim.x*blockDim.y*threadIdx.z; - *(cuPrintfHeader *)(void *)ptr = header; - } -} - - -// -// cuPrintfStrncpy -// -// This special strncpy outputs an aligned length value, followed by the -// string. It then zero-pads the rest of the string until a 64-aligned -// boundary. The length *includes* the padding. A pointer to the byte -// just after the \0 is returned. -// -// This function could overflow CUPRINTF_MAX_LEN characters in our buffer. -// To avoid it, we must count as we output and truncate where necessary. -// -__device__ static char *cuPrintfStrncpy(char *dest, const char *src, int n, char *end) -{ - // Initialisation and overflow check - if(!dest || !src || (dest >= end)) - return NULL; - - // Prepare to write the length specifier. We're guaranteed to have - // at least "CUPRINTF_ALIGN_SIZE" bytes left because we only write out in - // chunks that size, and CUPRINTF_MAX_LEN is aligned with CUPRINTF_ALIGN_SIZE. - int *lenptr = (int *)(void *)dest; - int len = 0; - dest += CUPRINTF_ALIGN_SIZE; - - // Now copy the string - while(n--) - { - if(dest >= end) // Overflow check - break; - - len++; - *dest++ = *src; - if(*src++ == '\0') - break; - } - - // Now write out the padding bytes, and we have our length. 
- while((dest < end) && (((long)dest & (CUPRINTF_ALIGN_SIZE-1)) != 0)) - { - len++; - *dest++ = 0; - } - *lenptr = len; - return (dest < end) ? dest : NULL; // Overflow means return NULL -} - - -// -// copyArg -// -// This copies a length specifier and then the argument out to the -// data buffer. Templates let the compiler figure all this out at -// compile-time, making life much simpler from the programming -// point of view. I'm assuimg all (const char *) is a string, and -// everything else is the variable it points at. I'd love to see -// a better way of doing it, but aside from parsing the format -// string I can't think of one. -// -// The length of the data type is inserted at the beginning (so that -// the display can distinguish between float and double), and the -// pointer to the end of the entry is returned. -// -__device__ static char *copyArg(char *ptr, const char *arg, char *end) -{ - // Initialisation check - if(!ptr || !arg) - return NULL; - - // strncpy does all our work. We just terminate. - if((ptr = cuPrintfStrncpy(ptr, arg, CUPRINTF_MAX_LEN, end)) != NULL) - *ptr = 0; - - return ptr; -} - -template -__device__ static char *copyArg(char *ptr, T &arg, char *end) -{ - // Initisalisation and overflow check. Alignment rules mean that - // we're at least CUPRINTF_ALIGN_SIZE away from "end", so we only need - // to check that one offset. - if(!ptr || ((ptr+CUPRINTF_ALIGN_SIZE) >= end)) - return NULL; - - // Write the length and argument - *(int *)(void *)ptr = sizeof(arg); - ptr += CUPRINTF_ALIGN_SIZE; - *(T *)(void *)ptr = arg; - ptr += CUPRINTF_ALIGN_SIZE; - *ptr = 0; - - return ptr; -} - - -// -// cuPrintf -// -// Templated printf functions to handle multiple arguments. -// Note we return the total amount of data copied, not the number -// of characters output. But then again, who ever looks at the -// return from printf() anyway? 
-// -// The format is to grab a block of circular buffer space, the -// start of which will hold a header and a pointer to the format -// string. We then write in all the arguments, and finally the -// format string itself. This is to make it easy to prevent -// overflow of our buffer (we support up to 10 arguments, each of -// which can be 12 bytes in length - that means that only the -// format string (or a %s) can actually overflow; so the overflow -// check need only be in the strcpy function. -// -// The header is written at the very last because that's what -// makes it look like we're done. -// -// Errors, which are basically lack-of-initialisation, are ignored -// in the called functions because NULL pointers are passed around -// - -// All printf variants basically do the same thing, setting up the -// buffer, writing all arguments, then finalising the header. For -// clarity, we'll pack the code into some big macros. -#define CUPRINTF_PREAMBLE \ - char *start, *end, *bufptr, *fmtstart; \ - if((start = getNextPrintfBufPtr()) == NULL) return 0; \ - end = start + CUPRINTF_MAX_LEN; \ - bufptr = start + sizeof(cuPrintfHeader); - -// Posting an argument is easy -#define CUPRINTF_ARG(argname) \ - bufptr = copyArg(bufptr, argname, end); - -// After args are done, record start-of-fmt and write the fmt and header -#define CUPRINTF_POSTAMBLE \ - fmtstart = bufptr; \ - end = cuPrintfStrncpy(bufptr, fmt, CUPRINTF_MAX_LEN, end); \ - writePrintfHeader(start, end ? fmtstart : NULL); \ - return end ? 
(int)(end - start) : 0; - -__device__ int cuPrintf(const char *fmt) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - CUPRINTF_ARG(arg4); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - CUPRINTF_ARG(arg4); - CUPRINTF_ARG(arg5); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - CUPRINTF_ARG(arg4); - CUPRINTF_ARG(arg5); - CUPRINTF_ARG(arg6); - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - CUPRINTF_ARG(arg4); - CUPRINTF_ARG(arg5); - CUPRINTF_ARG(arg6); - CUPRINTF_ARG(arg7); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - CUPRINTF_ARG(arg4); - CUPRINTF_ARG(arg5); - CUPRINTF_ARG(arg6); - 
CUPRINTF_ARG(arg7); - CUPRINTF_ARG(arg8); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8, T9 arg9) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - CUPRINTF_ARG(arg4); - CUPRINTF_ARG(arg5); - CUPRINTF_ARG(arg6); - CUPRINTF_ARG(arg7); - CUPRINTF_ARG(arg8); - CUPRINTF_ARG(arg9); - - CUPRINTF_POSTAMBLE; -} -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8, T9 arg9, T10 arg10) -{ - CUPRINTF_PREAMBLE; - - CUPRINTF_ARG(arg1); - CUPRINTF_ARG(arg2); - CUPRINTF_ARG(arg3); - CUPRINTF_ARG(arg4); - CUPRINTF_ARG(arg5); - CUPRINTF_ARG(arg6); - CUPRINTF_ARG(arg7); - CUPRINTF_ARG(arg8); - CUPRINTF_ARG(arg9); - CUPRINTF_ARG(arg10); - - CUPRINTF_POSTAMBLE; -} -#undef CUPRINTF_PREAMBLE -#undef CUPRINTF_ARG -#undef CUPRINTF_POSTAMBLE - - -// -// cuPrintfRestrict -// -// Called to restrict output to a given thread/block. -// We store the info in "restrictRules", which is set up at -// init time by the host. It's not the cleanest way to do this -// because it means restrictions will last between -// invocations, but given the output-pointer continuity, -// I feel this is reasonable. 
-// -__device__ void cuPrintfRestrict(int threadid, int blockid) -{ - int thread_count = blockDim.x * blockDim.y * blockDim.z; - if(((threadid < thread_count) && (threadid >= 0)) || (threadid == CUPRINTF_UNRESTRICTED)) - restrictRules.threadid = threadid; - - int block_count = gridDim.x * gridDim.y; - if(((blockid < block_count) && (blockid >= 0)) || (blockid == CUPRINTF_UNRESTRICTED)) - restrictRules.blockid = blockid; -} - - -/////////////////////////////////////////////////////////////////////////////// -// HOST SIDE - -#include -static FILE *printf_fp; - -static char *printfbuf_start=NULL; -static char *printfbuf_device=NULL; -static int printfbuf_len=0; - - -// -// outputPrintfData -// -// Our own internal function, which takes a pointer to a data buffer -// and passes it through libc's printf for output. -// -// We receive the formate string and a pointer to where the data is -// held. We then run through and print it out. -// -// Returns 0 on failure, 1 on success -// -static int outputPrintfData(char *fmt, char *data) -{ - // Format string is prefixed by a length that we don't need - fmt += CUPRINTF_ALIGN_SIZE; - - // Now run through it, printing everything we can. We must - // run to every % character, extract only that, and use printf - // to format it. - char *p = strchr(fmt, '%'); - while(p != NULL) - { - // Print up to the % character - *p = '\0'; - fputs(fmt, printf_fp); - *p = '%'; // Put back the % - - // Now handle the format specifier - char *format = p++; // Points to the '%' - p += strcspn(p, "%cdiouxXeEfgGaAnps"); - if(*p == '\0') // If no format specifier, print the whole thing - { - fmt = format; - break; - } - - // Cut out the format bit and use printf to print it. It's prefixed - // by its length. 
- int arglen = *(int *)data; - if(arglen > CUPRINTF_MAX_LEN) - { - fputs("Corrupt printf buffer data - aborting\n", printf_fp); - return 0; - } - - data += CUPRINTF_ALIGN_SIZE; - - char specifier = *p++; - char c = *p; // Store for later - *p = '\0'; - switch(specifier) - { - // These all take integer arguments - case 'c': - case 'd': - case 'i': - case 'o': - case 'u': - case 'x': - case 'X': - case 'p': - fprintf(printf_fp, format, *((int *)data)); - break; - - // These all take double arguments - case 'e': - case 'E': - case 'f': - case 'g': - case 'G': - case 'a': - case 'A': - if(arglen == 4) // Float vs. Double thing - fprintf(printf_fp, format, *((float *)data)); - else - fprintf(printf_fp, format, *((double *)data)); - break; - - // Strings are handled in a special way - case 's': - fprintf(printf_fp, format, (char *)data); - break; - - // % is special - case '%': - fprintf(printf_fp, "%%"); - break; - - // Everything else is just printed out as-is - default: - fprintf(printf_fp, format); - break; - } - data += CUPRINTF_ALIGN_SIZE; // Move on to next argument - *p = c; // Restore what we removed - fmt = p; // Adjust fmt string to be past the specifier - p = strchr(fmt, '%'); // and get the next specifier - } - - // Print out the last of the string - fputs(fmt, printf_fp); - return 1; -} - - -// -// doPrintfDisplay -// -// This runs through the blocks of CUPRINTF_MAX_LEN-sized data, calling the -// print function above to display them. We've got this separate from -// cudaPrintfDisplay() below so we can handle the SM_10 architecture -// partitioning. 
-// -static int doPrintfDisplay(int headings, int clear, char *bufstart, char *bufend, char *bufptr, char *endptr) -{ - // Grab, piece-by-piece, each output element until we catch - // up with the circular buffer end pointer - int printf_count=0; - char printfbuf_local[CUPRINTF_MAX_LEN+1]; - printfbuf_local[CUPRINTF_MAX_LEN] = '\0'; - - while(bufptr != endptr) - { - // Wrap ourselves at the end-of-buffer - if(bufptr == bufend) - bufptr = bufstart; - - // Adjust our start pointer to within the circular buffer and copy a block. - cudaMemcpy(printfbuf_local, bufptr, CUPRINTF_MAX_LEN, cudaMemcpyDeviceToHost); - - // If the magic number isn't valid, then this write hasn't gone through - // yet and we'll wait until it does (or we're past the end for non-async printfs). - cuPrintfHeader *hdr = (cuPrintfHeader *)printfbuf_local; - if((hdr->magic != CUPRINTF_SM11_MAGIC) || (hdr->fmtoffset >= CUPRINTF_MAX_LEN)) - { - //fprintf(printf_fp, "Bad magic number in printf header\n"); - break; - } - - // Extract all the info and get this printf done - if(headings) - fprintf(printf_fp, "[%d, %d]: ", hdr->blockid, hdr->threadid); - if(hdr->fmtoffset == 0) - fprintf(printf_fp, "printf buffer overflow\n"); - else if(!outputPrintfData(printfbuf_local+hdr->fmtoffset, printfbuf_local+sizeof(cuPrintfHeader))) - break; - printf_count++; - - // Clear if asked - if(clear) - cudaMemset(bufptr, 0, CUPRINTF_MAX_LEN); - - // Now advance our start location, because we're done, and keep copying - bufptr += CUPRINTF_MAX_LEN; - } - - return printf_count; -} - - -// -// cudaPrintfInit -// -// Takes a buffer length to allocate, creates the memory on the device and -// returns a pointer to it for when a kernel is called. It's up to the caller -// to free it. -// -extern "C" cudaError_t cudaPrintfInit(size_t bufferLen) -{ - // Fix up bufferlen to be a multiple of CUPRINTF_MAX_LEN - bufferLen = (bufferLen < CUPRINTF_MAX_LEN) ? 
CUPRINTF_MAX_LEN : bufferLen; - if((bufferLen % CUPRINTF_MAX_LEN) > 0) - bufferLen += (CUPRINTF_MAX_LEN - (bufferLen % CUPRINTF_MAX_LEN)); - printfbuf_len = (int)bufferLen; - - // Allocate a print buffer on the device and zero it - if(cudaMalloc((void **)&printfbuf_device, printfbuf_len) != cudaSuccess) - return cudaErrorInitializationError; - cudaMemset(printfbuf_device, 0, printfbuf_len); - printfbuf_start = printfbuf_device; // Where we start reading from - - // No restrictions to begin with - cuPrintfRestriction restrict; - restrict.threadid = restrict.blockid = CUPRINTF_UNRESTRICTED; - cudaMemcpyToSymbol(restrictRules, &restrict, sizeof(restrict)); - - // Initialise the buffer and the respective lengths/pointers. - cudaMemcpyToSymbol(globalPrintfBuffer, &printfbuf_device, sizeof(char *)); - cudaMemcpyToSymbol(printfBufferPtr, &printfbuf_device, sizeof(char *)); - cudaMemcpyToSymbol(printfBufferLength, &printfbuf_len, sizeof(printfbuf_len)); - - return cudaSuccess; -} - - -// -// cudaPrintfEnd -// -// Frees up the memory which we allocated -// -extern "C" void cudaPrintfEnd() -{ - if(!printfbuf_start || !printfbuf_device) - return; - - cudaFree(printfbuf_device); - printfbuf_start = printfbuf_device = NULL; -} - - -// -// cudaPrintfDisplay -// -// Each call to this function dumps the entire current contents -// of the printf buffer to the pre-specified FILE pointer. The -// circular "start" pointer is advanced so that subsequent calls -// dumps only new stuff. -// -// In the case of async memory access (via streams), call this -// repeatedly to keep trying to empty the buffer. If it's a sync -// access, then the whole buffer should empty in one go. -// -// Arguments: -// outputFP - File descriptor to output to (NULL => stdout) -// showThreadID - If true, prints [block,thread] before each line -// -extern "C" cudaError_t cudaPrintfDisplay(void *outputFP, bool showThreadID) -{ - printf_fp = (FILE *)((outputFP == NULL) ? 
stdout : outputFP); - - // For now, we force "synchronous" mode which means we're not concurrent - // with kernel execution. This also means we don't need clearOnPrint. - // If you're patching it for async operation, here's where you want it. - bool sync_printfs = true; - bool clearOnPrint = false; - - // Initialisation check - if(!printfbuf_start || !printfbuf_device || !printf_fp) - return cudaErrorMissingConfiguration; - - // To determine which architecture we're using, we read the - // first short from the buffer - it'll be the magic number - // relating to the version. - unsigned short magic; - cudaMemcpy(&magic, printfbuf_device, sizeof(unsigned short), cudaMemcpyDeviceToHost); - - // For SM_10 architecture, we've split our buffer into one-per-thread. - // That means we must do each thread block separately. It'll require - // extra reading. We also, for now, don't support async printfs because - // that requires tracking one start pointer per thread. - if(magic == CUPRINTF_SM10_MAGIC) - { - sync_printfs = true; - clearOnPrint = false; - int blocklen = 0; - char *blockptr = printfbuf_device; - while(blockptr < (printfbuf_device + printfbuf_len)) - { - cuPrintfHeaderSM10 hdr; - cudaMemcpy(&hdr, blockptr, sizeof(hdr), cudaMemcpyDeviceToHost); - - // We get our block-size-step from the very first header - if(hdr.thread_buf_len != 0) - blocklen = hdr.thread_buf_len; - - // No magic number means no printfs from this thread - if(hdr.magic != CUPRINTF_SM10_MAGIC) - { - if(blocklen == 0) - { - fprintf(printf_fp, "No printf headers found at all!\n"); - break; // No valid headers! 
- } - blockptr += blocklen; - continue; - } - - // "offset" is non-zero then we can print the block contents - if(hdr.offset > 0) - { - // For synchronous printfs, we must print from endptr->bufend, then from start->end - if(sync_printfs) - doPrintfDisplay(showThreadID, clearOnPrint, blockptr+CUPRINTF_MAX_LEN, blockptr+hdr.thread_buf_len, blockptr+hdr.offset+CUPRINTF_MAX_LEN, blockptr+hdr.thread_buf_len); - doPrintfDisplay(showThreadID, clearOnPrint, blockptr+CUPRINTF_MAX_LEN, blockptr+hdr.thread_buf_len, blockptr+CUPRINTF_MAX_LEN, blockptr+hdr.offset+CUPRINTF_MAX_LEN); - } - - // Move on to the next block and loop again - blockptr += hdr.thread_buf_len; - } - } - // For SM_11 and up, everything is a single buffer and it's simple - else if(magic == CUPRINTF_SM11_MAGIC) - { - // Grab the current "end of circular buffer" pointer. - char *printfbuf_end = NULL; - cudaMemcpyFromSymbol(&printfbuf_end, printfBufferPtr, sizeof(char *)); - - // Adjust our starting and ending pointers to within the block - char *bufptr = ((printfbuf_start - printfbuf_device) % printfbuf_len) + printfbuf_device; - char *endptr = ((printfbuf_end - printfbuf_device) % printfbuf_len) + printfbuf_device; - - // For synchronous (i.e. after-kernel-exit) printf display, we have to handle circular - // buffer wrap carefully because we could miss those past "end". - if(sync_printfs) - doPrintfDisplay(showThreadID, clearOnPrint, printfbuf_device, printfbuf_device+printfbuf_len, endptr, printfbuf_device+printfbuf_len); - doPrintfDisplay(showThreadID, clearOnPrint, printfbuf_device, printfbuf_device+printfbuf_len, bufptr, endptr); - - printfbuf_start = printfbuf_end; - } - else - ;//printf("Bad magic number in cuPrintf buffer header\n"); - - // If we were synchronous, then we must ensure that the memory is cleared on exit - // otherwise another kernel launch with a different grid size could conflict. 
- if(sync_printfs) - cudaMemset(printfbuf_device, 0, printfbuf_len); - - return cudaSuccess; -} - -// Cleanup -#undef CUPRINTF_MAX_LEN -#undef CUPRINTF_ALIGN_SIZE -#undef CUPRINTF_SM10_MAGIC -#undef CUPRINTF_SM11_MAGIC - -#endif diff --git a/cuPrintf.cuh b/cuPrintf.cuh deleted file mode 100644 index cf3fe48688..0000000000 --- a/cuPrintf.cuh +++ /dev/null @@ -1,162 +0,0 @@ -/* - Copyright 2009 NVIDIA Corporation. All rights reserved. - - NOTICE TO LICENSEE: - - This source code and/or documentation ("Licensed Deliverables") are subject - to NVIDIA intellectual property rights under U.S. and international Copyright - laws. - - These Licensed Deliverables contained herein is PROPRIETARY and CONFIDENTIAL - to NVIDIA and is being provided under the terms and conditions of a form of - NVIDIA software license agreement by and between NVIDIA and Licensee ("License - Agreement") or electronically accepted by Licensee. Notwithstanding any terms - or conditions to the contrary in the License Agreement, reproduction or - disclosure of the Licensed Deliverables to any third party without the express - written consent of NVIDIA is prohibited. - - NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE AGREEMENT, - NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THESE LICENSED - DELIVERABLES FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED - WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE - LICENSED DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
NOTWITHSTANDING ANY - TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE AGREEMENT, IN NO EVENT SHALL - NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, - OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER - IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THESE LICENSED DELIVERABLES. - - U.S. Government End Users. These Licensed Deliverables are a "commercial item" - as that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of - "commercial computer software" and "commercial computer software documentation" - as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the - U.S. Government only as a commercial end item. Consistent with 48 C.F.R.12.212 - and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all U.S. Government - End Users acquire the Licensed Deliverables with only those rights set forth - herein. - - Any use of the Licensed Deliverables in individual and commercial software must - include, in the user documentation and internal comments to the code, the above - Disclaimer and U.S. Government End Users Notice. - */ - -#ifndef CUPRINTF_H -#define CUPRINTF_H - -/* - * This is the header file supporting cuPrintf.cu and defining both - * the host and device-side interfaces. See that file for some more - * explanation and sample use code. See also below for details of the - * host-side interfaces. 
- * - * Quick sample code: - * - #include "cuPrintf.cu" - - __global__ void testKernel(int val) - { - cuPrintf("Value is: %d\n", val); - } - - int main() - { - cudaPrintfInit(); - testKernel<<< 2, 3 >>>(10); - cudaPrintfDisplay(stdout, true); - cudaPrintfEnd(); - return 0; - } - */ - -/////////////////////////////////////////////////////////////////////////////// -// DEVICE SIDE -// External function definitions for device-side code - -// Abuse of templates to simulate varargs -__device__ int cuPrintf(const char *fmt); -template __device__ int cuPrintf(const char *fmt, T1 arg1); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8, T9 arg9); -template __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8, T9 arg9, T10 arg10); - - -// -// cuPrintfRestrict -// -// Called to restrict output to a given thread/block. Pass -// the constant CUPRINTF_UNRESTRICTED to unrestrict output -// for thread/block IDs. Note you can therefore allow -// "all printfs from block 3" or "printfs from thread 2 -// on all blocks", or "printfs only from block 1, thread 5". 
-// -// Arguments: -// threadid - Thread ID to allow printfs from -// blockid - Block ID to allow printfs from -// -// NOTE: Restrictions last between invocations of -// kernels unless cudaPrintfInit() is called again. -// -#define CUPRINTF_UNRESTRICTED -1 -__device__ void cuPrintfRestrict(int threadid, int blockid); - - - -/////////////////////////////////////////////////////////////////////////////// -// HOST SIDE -// External function definitions for host-side code - -// -// cudaPrintfInit -// -// Call this once to initialise the printf system. If the output -// file or buffer size needs to be changed, call cudaPrintfEnd() -// before re-calling cudaPrintfInit(). -// -// The default size for the buffer is 1 megabyte. For CUDA -// architecture 1.1 and above, the buffer is filled linearly and -// is completely used; however for architecture 1.0, the buffer -// is divided into as many segments are there are threads, even -// if some threads do not call cuPrintf(). -// -// Arguments: -// bufferLen - Length, in bytes, of total space to reserve -// (in device global memory) for output. -// -// Returns: -// cudaSuccess if all is well. -// -extern "C" cudaError_t cudaPrintfInit(size_t bufferLen=1048576); // 1-meg - that's enough for 4096 printfs by all threads put together - -// -// cudaPrintfEnd -// -// Cleans up all memories allocated by cudaPrintfInit(). -// Call this at exit, or before calling cudaPrintfInit() again. -// -extern "C" void cudaPrintfEnd(); - -// -// cudaPrintfDisplay -// -// Dumps the contents of the output buffer to the specified -// file pointer. If the output pointer is not specified, -// the default "stdout" is used. -// -// Arguments: -// outputFP - A file pointer to an output stream. -// showThreadID - If "true", output strings are prefixed -// by "[blockid, threadid] " at output. -// -// Returns: -// cudaSuccess if all is well. 
-// -extern "C" cudaError_t cudaPrintfDisplay(void *outputFP=NULL, bool showThreadID=false); - -#endif // CUPRINTF_H diff --git a/cuda.cpp b/cuda.cpp index 95c8221bdf..02f253c297 100644 --- a/cuda.cpp +++ b/cuda.cpp @@ -1,8 +1,8 @@ -#include +#include #include -#include +#include #include - +using namespace std; #ifndef _WIN32 #include #endif @@ -16,10 +16,15 @@ #include #endif +#include "nvml.h" #include "miner.h" #include "cuda_runtime.h" +cudaDeviceProp device_props[MAX_GPUS]; +cudaStream_t gpustream[MAX_GPUS] = { 0 }; +extern uint16_t opt_api_listen; + // CUDA Devices on the System int cuda_num_devices() { @@ -31,10 +36,9 @@ int cuda_num_devices() exit(1); } - int maj = version / 1000, min = version % 100; // same as in deviceQuery sample - if (maj < 5 || (maj == 5 && min < 5)) + if (version < CUDART_VERSION) { - applog(LOG_ERR, "Driver does not support CUDA %d.%d API! Update your nVidia driver!", 5, 5); + applog(LOG_ERR, "Driver does not support CUDA %d.%d API! Update your nVidia driver!", CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10); exit(1); } @@ -42,37 +46,89 @@ int cuda_num_devices() err = cudaGetDeviceCount(&GPU_N); if (err != cudaSuccess) { - applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?"); + if(err!=cudaErrorNoDevice) + applog(LOG_ERR, "No CUDA device found!"); + else + applog(LOG_ERR, "Unable to query number of CUDA devices!"); exit(1); } return GPU_N; } +int cuda_version() +{ + return (int)CUDART_VERSION; +} + void cuda_devicenames() { cudaError_t err; int GPU_N; err = cudaGetDeviceCount(&GPU_N); - if (err != cudaSuccess) + if(err != cudaSuccess) { applog(LOG_ERR, "Unable to query number of CUDA devices! 
Is an nVidia driver installed?"); exit(1); } - for (int i=0; i < GPU_N; i++) + if(opt_n_threads) + GPU_N = min(MAX_GPUS, opt_n_threads); + for(int i = 0; i < GPU_N; i++) { + char vendorname[32] = {0}; + int dev_id = device_map[i]; cudaDeviceProp props; - cudaGetDeviceProperties(&props, device_map[i]); + cudaGetDeviceProperties(&props, dev_id); + + device_sm[dev_id] = (props.major * 100 + props.minor * 10); - device_name[i] = strdup(props.name); - device_sm[i] = (props.major * 100 + props.minor * 10); + if(device_name[dev_id]) + { + free(device_name[dev_id]); + device_name[dev_id] = NULL; + } +#ifdef USE_WRAPNVML + if(gpu_vendor((uint8_t)props.pciBusID, vendorname) > 0 && strlen(vendorname)) + { + device_name[dev_id] = (char*)calloc(1, strlen(vendorname) + strlen(props.name) + 2); + if(device_name[dev_id] == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(1); + } + if(!strncmp(props.name, "GeForce ", 8)) + sprintf(device_name[dev_id], "%s %s", vendorname, &props.name[8]); + else + sprintf(device_name[dev_id], "%s %s", vendorname, props.name); + } + else +#endif + device_name[dev_id] = strdup(props.name); + } +} + + +void cuda_print_devices() +{ + int ngpus = cuda_num_devices(); + for (int n=0; n < ngpus; n++) { + int m = device_map[n]; + cudaDeviceProp props; + cudaGetDeviceProperties(&props, m); + if (!opt_n_threads || n < opt_n_threads) + fprintf(stderr, "GPU #%d: SM %d.%d %s\n", m, props.major, props.minor, props.name); } } // Can't be called directly in cpu-miner.c void cuda_devicereset() { - cudaDeviceReset(); + for (int i = 0; i < active_gpus; i++) + { + cudaSetDevice(device_map[i]); + cudaDeviceSynchronize(); + cudaDeviceReset(); + } } static bool substringsearch(const char *haystack, const char *needle, int &match) @@ -113,7 +169,7 @@ int cuda_finddevice(char *name) uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount) { uint32_t throughput = gpus_intensity[thr_id] ? 
gpus_intensity[thr_id] : defcount; - api_set_throughput(thr_id, throughput); + if(opt_api_listen!=0) api_set_throughput(thr_id, throughput); return throughput; } diff --git a/cuda_bitcoin.cu b/cuda_bitcoin.cu index 80c6dc6965..1b6d4e0b46 100644 --- a/cuda_bitcoin.cu +++ b/cuda_bitcoin.cu @@ -1,7 +1,11 @@ // Original version written by Schleicher (KlausT @github) // Redistribution and use in source and binary forms, with or without modification, are permitted +#ifdef __cplusplus +#include +#else #include +#endif #include "miner.h" #include "cuda_helper.h" @@ -9,553 +13,548 @@ void bitcoin_cpu_init(int thr_id); void bitcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *const ms, uint32_t merkle, uint32_t time, uint32_t compacttarget, uint32_t *const h_nounce); void bitcoin_midstate(const uint32_t *data, uint32_t *midstate); + __constant__ uint32_t pTarget[8]; static uint32_t *d_result[MAX_GPUS]; #define TPB 512 -#define NONCES_PER_THREAD 2048 +#define NONCES_PER_THREAD 32 -#if __CUDA_ARCH__ < 320 -#define rrot(x, n) ((x >> n) | (x << (32 - n))) -#else -#define rrot(x, n) __funnelshift_r((x), (x), (n)) -#endif - -__global__ __launch_bounds__(TPB, 1) +__global__ __launch_bounds__(TPB, 2) void bitcoin_gpu_hash(const uint32_t threads, const uint32_t startNounce, uint32_t *const result, const uint32_t t1c, const uint32_t t2c, const uint32_t w16, const uint32_t w16rot, const uint32_t w17, const uint32_t w17rot, const uint32_t b2, const uint32_t c2, const uint32_t d2, const uint32_t f2, const uint32_t g2, const uint32_t h2, const uint32_t ms0, const uint32_t ms1, const uint32_t ms2, const uint32_t ms3, const uint32_t ms4, const uint32_t ms5, const uint32_t ms6, const uint32_t ms7, const uint32_t compacttarget) { - uint32_t threadindex = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t threadindex = (blockDim.x * blockIdx.x + threadIdx.x); if (threadindex < threads) { uint32_t t1, a, b, c, d, e, f, g, h; uint32_t w[64]; const uint32_t 
numberofthreads = blockDim.x*gridDim.x; const uint32_t maxnonce = startNounce + threadindex + numberofthreads*NONCES_PER_THREAD - 1; - const uint32_t threadindex = blockIdx.x*blockDim.x + threadIdx.x; - - for (uint32_t nonce = startNounce + threadindex; nonce <= maxnonce; nonce += numberofthreads) + + #pragma unroll + for (uint32_t nonce = startNounce + threadindex; nonce-1 < maxnonce; nonce += numberofthreads) { - w[18] = (rrot(nonce, 7) ^ rrot(nonce, 18) ^ (nonce >> 3)) + w16rot; + w[18] = (ROTR32(nonce, 7) ^ ROTR32(nonce, 18) ^ (nonce >> 3)) + w16rot; w[19] = nonce + w17rot; - w[20] = 0x80000000U + (rrot(w[18], 17) ^ rrot(w[18], 19) ^ (w[18] >> 10)); - w[21] = (rrot(w[19], 17) ^ rrot(w[19], 19) ^ (w[19] >> 10)); - w[22] = 0x280U + (rrot(w[20], 17) ^ rrot(w[20], 19) ^ (w[20] >> 10)); - w[23] = w16 + (rrot(w[21], 17) ^ rrot(w[21], 19) ^ (w[21] >> 10)); - w[24] = w17 + (rrot(w[22], 17) ^ rrot(w[22], 19) ^ (w[22] >> 10)); - w[25] = w[18] + (rrot(w[23], 17) ^ rrot(w[23], 19) ^ (w[23] >> 10)); - w[26] = w[19] + (rrot(w[24], 17) ^ rrot(w[24], 19) ^ (w[24] >> 10)); - w[27] = w[20] + (rrot(w[25], 17) ^ rrot(w[25], 19) ^ (w[25] >> 10)); - w[28] = w[21] + (rrot(w[26], 17) ^ rrot(w[26], 19) ^ (w[26] >> 10)); - w[29] = w[22] + (rrot(w[27], 17) ^ rrot(w[27], 19) ^ (w[27] >> 10)); - w[30] = w[23] + 0xa00055U + (rrot(w[28], 17) ^ rrot(w[28], 19) ^ (w[28] >> 10)); - w[31] = 0x280U + w[24] + (rrot(w16, 7) ^ rrot(w16, 18) ^ (w16 >> 3)) + (rrot(w[29], 17) ^ rrot(w[29], 19) ^ (w[29] >> 10)); - w[32] = w16 + w[25] + (rrot(w17, 7) ^ rrot(w17, 18) ^ (w17 >> 3)) + (rrot(w[30], 17) ^ rrot(w[30], 19) ^ (w[30] >> 10)); - w[33] = w17 + w[26] + (rrot(w[18], 7) ^ rrot(w[18], 18) ^ (w[18] >> 3)) + (rrot(w[31], 17) ^ rrot(w[31], 19) ^ (w[31] >> 10)); + w[20] = 0x80000000U + (ROTR32(w[18], 17) ^ ROTR32(w[18], 19) ^ (w[18] >> 10)); + w[21] = (ROTR32(w[19], 17) ^ ROTR32(w[19], 19) ^ (w[19] >> 10)); + w[22] = 0x280U + (ROTR32(w[20], 17) ^ ROTR32(w[20], 19) ^ (w[20] >> 10)); + w[23] = w16 + 
(ROTR32(w[21], 17) ^ ROTR32(w[21], 19) ^ (w[21] >> 10)); + w[24] = w17 + (ROTR32(w[22], 17) ^ ROTR32(w[22], 19) ^ (w[22] >> 10)); + w[25] = w[18] + (ROTR32(w[23], 17) ^ ROTR32(w[23], 19) ^ (w[23] >> 10)); + w[26] = w[19] + (ROTR32(w[24], 17) ^ ROTR32(w[24], 19) ^ (w[24] >> 10)); + w[27] = w[20] + (ROTR32(w[25], 17) ^ ROTR32(w[25], 19) ^ (w[25] >> 10)); + w[28] = w[21] + (ROTR32(w[26], 17) ^ ROTR32(w[26], 19) ^ (w[26] >> 10)); + w[29] = w[22] + (ROTR32(w[27], 17) ^ ROTR32(w[27], 19) ^ (w[27] >> 10)); + w[30] = w[23] + 0xa00055U + (ROTR32(w[28], 17) ^ ROTR32(w[28], 19) ^ (w[28] >> 10)); + w[31] = 0x280U + w[24] + (ROTR32(w16, 7) ^ ROTR32(w16, 18) ^ (w16 >> 3)) + (ROTR32(w[29], 17) ^ ROTR32(w[29], 19) ^ (w[29] >> 10)); + w[32] = w16 + w[25] + (ROTR32(w17, 7) ^ ROTR32(w17, 18) ^ (w17 >> 3)) + (ROTR32(w[30], 17) ^ ROTR32(w[30], 19) ^ (w[30] >> 10)); + w[33] = w17 + w[26] + (ROTR32(w[18], 7) ^ ROTR32(w[18], 18) ^ (w[18] >> 3)) + (ROTR32(w[31], 17) ^ ROTR32(w[31], 19) ^ (w[31] >> 10)); #pragma unroll for (int i = 34; i < 62; i++) - w[i] = w[i-16] + w[i-7] + (rrot(w[i-15], 7) ^ rrot(w[i-15], 18) ^ (w[i-15] >> 3)) + (rrot(w[i-2], 17) ^ rrot(w[i-2], 19) ^ (w[i-2] >> 10)); + w[i] = w[i-16] + w[i-7] + (ROTR32(w[i-15], 7) ^ ROTR32(w[i-15], 18) ^ (w[i-15] >> 3)) + (ROTR32(w[i-2], 17) ^ ROTR32(w[i-2], 19) ^ (w[i-2] >> 10)); t1 = t1c + (uint32_t)nonce; a = ms0 + t1; e = t1 + t2c; // - t1 = d2 + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c2 ^ (a & (b2 ^ c2))) + 0xb956c25bU; + t1 = d2 + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c2 ^ (a & (b2 ^ c2))) + 0xb956c25bU; h = h2 + t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g2 & f2) | (e & (g2 | f2))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g2 & f2) | (e & (g2 | f2))); // - t1 = c2 + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b2 ^ (h & (a ^ b2))) + 0x59f111f1U; + t1 = c2 + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b2 ^ (h & (a ^ b2))) + 0x59f111f1U; g = g2 + t1; - c = t1 + (rrot(d, 2) 
^ rrot(d, 13) ^ rrot(d, 22)) + ((f2 & e) | (d & (f2 | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f2 & e) | (d & (f2 | e))); // - t1 = b2 + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x923f82a4U; + t1 = b2 + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x923f82a4U; f = f2 + t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0xab1c5ed5U; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0xab1c5ed5U; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0xd807aa98U; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0xd807aa98U; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x12835b01U; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x12835b01U; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x243185beU; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x243185beU; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 
6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x550c7dc3U; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x550c7dc3U; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x72be5d74U; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x72be5d74U; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x80deb1feU; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x80deb1feU; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x9bdc06a7U; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x9bdc06a7U; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0xc19bf3f4U; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0xc19bf3f4U; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0xe49b69c1U + w16; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0xe49b69c1U + w16; d 
+= t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0xefbe4786U + w17; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0xefbe4786U + w17; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x0fc19dc6U + w[18]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x0fc19dc6U + w[18]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x240ca1ccU + w[19]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x240ca1ccU + w[19]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x2de92c6fU + w[20]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x2de92c6fU + w[20]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x4a7484aaU + w[21]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x4a7484aaU + w[21]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + 
(ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x5cb0a9dcU + w[22]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x5cb0a9dcU + w[22]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x76f988daU + w[23]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x76f988daU + w[23]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x983e5152U + w[24]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x983e5152U + w[24]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0xa831c66dU + w[25]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0xa831c66dU + w[25]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0xb00327c8U + w[26]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0xb00327c8U + w[26]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 
6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0xbf597fc7U + w[27]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0xbf597fc7U + w[27]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0xc6e00bf3U + w[28]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0xc6e00bf3U + w[28]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0xd5a79147U + w[29]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0xd5a79147U + w[29]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x06ca6351U + w[30]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x06ca6351U + w[30]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x14292967U + w[31]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x14292967U + w[31]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x27b70a85U + w[32]; + t1 = h + (ROTR32(e, 
6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x27b70a85U + w[32]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x2e1b2138U + w[33]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x2e1b2138U + w[33]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x4d2c6dfcU + w[34]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x4d2c6dfcU + w[34]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x53380d13U + w[35]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x53380d13U + w[35]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x650a7354U + w[36]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x650a7354U + w[36]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x766a0abbU + w[37]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x766a0abbU + w[37]; g += t1; - c = t1 
+ (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x81c2c92eU + w[38]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x81c2c92eU + w[38]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x92722c85U + w[39]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x92722c85U + w[39]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0xa2bfe8a1U + w[40]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0xa2bfe8a1U + w[40]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0xa81a664bU + w[41]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0xa81a664bU + w[41]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0xc24b8b70U + w[42]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0xc24b8b70U + w[42]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ 
ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0xc76c51a3U + w[43]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0xc76c51a3U + w[43]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0xd192e819U + w[44]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0xd192e819U + w[44]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0xd6990624U + w[45]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0xd6990624U + w[45]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0xf40e3585U + w[46]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0xf40e3585U + w[46]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x106aa070U + w[47]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x106aa070U + w[47]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) 
^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x19a4c116U + w[48]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x19a4c116U + w[48]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x1e376c08U + w[49]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x1e376c08U + w[49]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x2748774cU + w[50]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x2748774cU + w[50]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x34b0bcb5U + w[51]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x34b0bcb5U + w[51]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x391c0cb3U + w[52]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x391c0cb3U + w[52]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x4ed8aa4aU + w[53]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 
11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x4ed8aa4aU + w[53]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x5b9cca4fU + w[54]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x5b9cca4fU + w[54]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x682e6ff3U + w[55]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x682e6ff3U + w[55]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x748f82eeU + w[56]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x748f82eeU + w[56]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x78a5636fU + w[57]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x78a5636fU + w[57]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x84c87814U + w[58]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x84c87814U + w[58]; b += t1; - f = t1 + (rrot(g, 2) ^ 
rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x8cc70208U + w[59]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x8cc70208U + w[59]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x90befffaU + w[60]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x90befffaU + w[60]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0xa4506cebU + w[61]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0xa4506cebU + w[61]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0xbef9a3f7U + w[46] + w[55] + (rrot(w[47], 7) ^ rrot(w[47], 18) ^ (w[47] >> 3)) + (rrot(w[60], 17) ^ rrot(w[60], 19) ^ (w[60] >> 10)); + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0xbef9a3f7U + w[46] + w[55] + (ROTR32(w[47], 7) ^ ROTR32(w[47], 18) ^ (w[47] >> 3)) + (ROTR32(w[60], 17) ^ ROTR32(w[60], 19) ^ (w[60] >> 10)); f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0xc67178f2U + w[47] + 
w[56] + (rrot(w[48], 7) ^ rrot(w[48], 18) ^ (w[48] >> 3)) + (rrot(w[61], 17) ^ rrot(w[61], 19) ^ (w[61] >> 10)); + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0xc67178f2U + w[47] + w[56] + (ROTR32(w[48], 7) ^ ROTR32(w[48], 18) ^ (w[48] >> 3)) + (ROTR32(w[61], 17) ^ ROTR32(w[61], 19) ^ (w[61] >> 10)); e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // w[0] = a + ms0; w[1] = b + ms1; w[2] = c + ms2; w[3] = d + ms3; w[4] = e + ms4; w[5] = f + ms5; w[6] = g + ms6; w[7] = h + ms7; // hash the hash *************************************************************** - w[16] = w[0] + (rrot(w[1], 7) ^ rrot(w[1], 18) ^ (w[1] >> 3)); - w[17] = w[1] + (rrot(w[2], 7) ^ rrot(w[2], 18) ^ (w[2] >> 3)) + (rrot(0x100, 17) ^ rrot(0x100, 19) ^ (0x100 >> 10)); - w[18] = w[2] + (rrot(w[3], 7) ^ rrot(w[3], 18) ^ (w[3] >> 3)) + (rrot(w[16], 17) ^ rrot(w[16], 19) ^ (w[16] >> 10)); - w[19] = w[3] + (rrot(w[4], 7) ^ rrot(w[4], 18) ^ (w[4] >> 3)) + (rrot(w[17], 17) ^ rrot(w[17], 19) ^ (w[17] >> 10)); - w[20] = w[4] + (rrot(w[5], 7) ^ rrot(w[5], 18) ^ (w[5] >> 3)) + (rrot(w[18], 17) ^ rrot(w[18], 19) ^ (w[18] >> 10)); - w[21] = w[5] + (rrot(w[6], 7) ^ rrot(w[6], 18) ^ (w[6] >> 3)) + (rrot(w[19], 17) ^ rrot(w[19], 19) ^ (w[19] >> 10)); - w[22] = w[6] + 0x100 + (rrot(w[7], 7) ^ rrot(w[7], 18) ^ (w[7] >> 3)) + (rrot(w[20], 17) ^ rrot(w[20], 19) ^ (w[20] >> 10)); - w[23] = w[7] + w[16] + 0x11002000U + (rrot(w[21], 17) ^ rrot(w[21], 19) ^ (w[21] >> 10)); - w[24] = 0x80000000U + w[17] + (rrot(w[22], 17) ^ rrot(w[22], 19) ^ (w[22] >> 10)); - w[25] = w[18] + (rrot(w[23], 17) ^ rrot(w[23], 19) ^ (w[23] >> 10)); - w[26] = w[19] + (rrot(w[24], 17) ^ rrot(w[24], 19) ^ (w[24] >> 10)); - w[27] = w[20] + (rrot(w[25], 17) ^ rrot(w[25], 19) ^ (w[25] >> 10)); - w[28] = w[21] + (rrot(w[26], 17) ^ rrot(w[26], 19) ^ (w[26] >> 10)); - w[29] = w[22] + 
(rrot(w[27], 17) ^ rrot(w[27], 19) ^ (w[27] >> 10)); - w[30] = w[23] + (rrot(0x100, 7) ^ rrot(0x100, 18) ^ (0x100 >> 3)) + (rrot(w[28], 17) ^ rrot(w[28], 19) ^ (w[28] >> 10)); - w[31] = 0x100 + w[24] + (rrot(w[16], 7) ^ rrot(w[16], 18) ^ (w[16] >> 3)) + (rrot(w[29], 17) ^ rrot(w[29], 19) ^ (w[29] >> 10)); + w[16] = w[0] + (ROTR32(w[1], 7) ^ ROTR32(w[1], 18) ^ (w[1] >> 3)); + w[17] = w[1] + (ROTR32(w[2], 7) ^ ROTR32(w[2], 18) ^ (w[2] >> 3)) + (ROTR32(0x100, 17) ^ ROTR32(0x100, 19) ^ (0x100 >> 10)); + w[18] = w[2] + (ROTR32(w[3], 7) ^ ROTR32(w[3], 18) ^ (w[3] >> 3)) + (ROTR32(w[16], 17) ^ ROTR32(w[16], 19) ^ (w[16] >> 10)); + w[19] = w[3] + (ROTR32(w[4], 7) ^ ROTR32(w[4], 18) ^ (w[4] >> 3)) + (ROTR32(w[17], 17) ^ ROTR32(w[17], 19) ^ (w[17] >> 10)); + w[20] = w[4] + (ROTR32(w[5], 7) ^ ROTR32(w[5], 18) ^ (w[5] >> 3)) + (ROTR32(w[18], 17) ^ ROTR32(w[18], 19) ^ (w[18] >> 10)); + w[21] = w[5] + (ROTR32(w[6], 7) ^ ROTR32(w[6], 18) ^ (w[6] >> 3)) + (ROTR32(w[19], 17) ^ ROTR32(w[19], 19) ^ (w[19] >> 10)); + w[22] = w[6] + 0x100 + (ROTR32(w[7], 7) ^ ROTR32(w[7], 18) ^ (w[7] >> 3)) + (ROTR32(w[20], 17) ^ ROTR32(w[20], 19) ^ (w[20] >> 10)); + w[23] = w[7] + w[16] + 0x11002000U + (ROTR32(w[21], 17) ^ ROTR32(w[21], 19) ^ (w[21] >> 10)); + w[24] = 0x80000000U + w[17] + (ROTR32(w[22], 17) ^ ROTR32(w[22], 19) ^ (w[22] >> 10)); + w[25] = w[18] + (ROTR32(w[23], 17) ^ ROTR32(w[23], 19) ^ (w[23] >> 10)); + w[26] = w[19] + (ROTR32(w[24], 17) ^ ROTR32(w[24], 19) ^ (w[24] >> 10)); + w[27] = w[20] + (ROTR32(w[25], 17) ^ ROTR32(w[25], 19) ^ (w[25] >> 10)); + w[28] = w[21] + (ROTR32(w[26], 17) ^ ROTR32(w[26], 19) ^ (w[26] >> 10)); + w[29] = w[22] + (ROTR32(w[27], 17) ^ ROTR32(w[27], 19) ^ (w[27] >> 10)); + w[30] = w[23] + (ROTR32(0x100, 7) ^ ROTR32(0x100, 18) ^ (0x100 >> 3)) + (ROTR32(w[28], 17) ^ ROTR32(w[28], 19) ^ (w[28] >> 10)); + w[31] = 0x100 + w[24] + (ROTR32(w[16], 7) ^ ROTR32(w[16], 18) ^ (w[16] >> 3)) + (ROTR32(w[29], 17) ^ ROTR32(w[29], 19) ^ (w[29] >> 10)); #pragma unroll for (int 
i = 32; i < 59; i++) - w[i] = w[i - 16] + w[i - 7] + (rrot(w[i - 15], 7) ^ rrot(w[i - 15], 18) ^ (w[i - 15] >> 3)) + (rrot(w[i - 2], 17) ^ rrot(w[i - 2], 19) ^ (w[i - 2] >> 10)); + w[i] = w[i - 16] + w[i - 7] + (ROTR32(w[i - 15], 7) ^ ROTR32(w[i - 15], 18) ^ (w[i - 15] >> 3)) + (ROTR32(w[i - 2], 17) ^ ROTR32(w[i - 2], 19) ^ (w[i - 2] >> 10)); d = 0x98c7e2a2U + w[0]; h = 0xfc08884dU + w[0]; // - t1 = (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (0x9b05688cU ^ (d & 0xca0b3af3)) + 0x90bb1e3cU + w[1]; + t1 = (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (0x9b05688cU ^ (d & 0xca0b3af3)) + 0x90bb1e3cU + w[1]; c = 0x3c6ef372U + t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + (0x2A01A605 | (h & 0xfb6feee7)); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + (0x2A01A605 | (h & 0xfb6feee7)); // - t1 = (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (0x510e527fU ^ (c & (d ^ 0x510e527fU))) + 0x50C6645BU + w[2]; + t1 = (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (0x510e527fU ^ (c & (d ^ 0x510e527fU))) + 0x50C6645BU + w[2]; b = 0xbb67ae85U + t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((0x6a09e667U & h) | (g & (0x6a09e667U | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((0x6a09e667U & h) | (g & (0x6a09e667U | h))); // - t1 = (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x3AC42E24U + w[3]; + t1 = (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x3AC42E24U + w[3]; a = 0x6a09e667U + t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x3956c25bU + w[4]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x3956c25bU + w[4]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ 
ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x59f111f1U + w[5]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x59f111f1U + w[5]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x923f82a4U + w[6]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x923f82a4U + w[6]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0xab1c5ed5U + w[7]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0xab1c5ed5U + w[7]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x5807aa98U; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x5807aa98U; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x12835b01U; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x12835b01U; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x243185beU; + 
t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x243185beU; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x550c7dc3U; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x550c7dc3U; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x72be5d74U; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x72be5d74U; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x80deb1feU; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x80deb1feU; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x9bdc06a7U; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x9bdc06a7U; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0xc19bf274U; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0xc19bf274U; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | 
c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0xe49b69c1U + w[16]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0xe49b69c1U + w[16]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0xefbe4786U + w[17]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0xefbe4786U + w[17]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x0fc19dc6U + w[18]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x0fc19dc6U + w[18]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x240ca1ccU + w[19]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x240ca1ccU + w[19]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x2de92c6fU + w[20]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x2de92c6fU + w[20]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - 
t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x4a7484aaU + w[21]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x4a7484aaU + w[21]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x5cb0a9dcU + w[22]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x5cb0a9dcU + w[22]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x76f988daU + w[23]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x76f988daU + w[23]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x983e5152U + w[24]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x983e5152U + w[24]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0xa831c66dU + w[25]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0xa831c66dU + w[25]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0xb00327c8U + w[26]; + t1 
= f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0xb00327c8U + w[26]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0xbf597fc7U + w[27]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0xbf597fc7U + w[27]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0xc6e00bf3U + w[28]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0xc6e00bf3U + w[28]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0xd5a79147U + w[29]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0xd5a79147U + w[29]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x06ca6351U + w[30]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x06ca6351U + w[30]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x14292967U + w[31]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x14292967U + w[31]; e 
+= t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x27b70a85U + w[32]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x27b70a85U + w[32]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x2e1b2138U + w[33]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x2e1b2138U + w[33]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x4d2c6dfcU + w[34]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x4d2c6dfcU + w[34]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x53380d13U + w[35]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x53380d13U + w[35]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x650a7354U + w[36]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x650a7354U + w[36]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + 
(ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x766a0abbU + w[37]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x766a0abbU + w[37]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x81c2c92eU + w[38]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x81c2c92eU + w[38]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x92722c85U + w[39]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x92722c85U + w[39]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0xa2bfe8a1U + w[40]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0xa2bfe8a1U + w[40]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0xa81a664bU + w[41]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0xa81a664bU + w[41]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 
6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0xc24b8b70U + w[42]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0xc24b8b70U + w[42]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0xc76c51a3U + w[43]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0xc76c51a3U + w[43]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0xd192e819U + w[44]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0xd192e819U + w[44]; h += t1; - d = t1 + (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0xd6990624U + w[45]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0xd6990624U + w[45]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0xf40e3585U + w[46]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0xf40e3585U + w[46]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x106aa070U + w[47]; + t1 = a + (ROTR32(f, 
6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x106aa070U + w[47]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x19a4c116U + w[48]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x19a4c116U + w[48]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - t1 = g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x1e376c08U + w[49]; + t1 = g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x1e376c08U + w[49]; c += t1; - g = t1 + (rrot(h, 2) ^ rrot(h, 13) ^ rrot(h, 22)) + ((b & a) | (h & (b | a))); + g = t1 + (ROTR32(h, 2) ^ ROTR32(h, 13) ^ ROTR32(h, 22)) + ((b & a) | (h & (b | a))); // - t1 = f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x2748774cU + w[50]; + t1 = f + (ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x2748774cU + w[50]; b += t1; - f = t1 + (rrot(g, 2) ^ rrot(g, 13) ^ rrot(g, 22)) + ((a & h) | (g & (a | h))); + f = t1 + (ROTR32(g, 2) ^ ROTR32(g, 13) ^ ROTR32(g, 22)) + ((a & h) | (g & (a | h))); // - t1 = e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x34b0bcb5U + w[51]; + t1 = e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x34b0bcb5U + w[51]; a += t1; - e = t1 + (rrot(f, 2) ^ rrot(f, 13) ^ rrot(f, 22)) + ((h & g) | (f & (h | g))); + e = t1 + (ROTR32(f, 2) ^ ROTR32(f, 13) ^ ROTR32(f, 22)) + ((h & g) | (f & (h | g))); // - t1 = d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x391c0cb3U + w[52]; + t1 = d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x391c0cb3U + w[52]; h += t1; - d = t1 
+ (rrot(e, 2) ^ rrot(e, 13) ^ rrot(e, 22)) + ((g & f) | (e & (g | f))); + d = t1 + (ROTR32(e, 2) ^ ROTR32(e, 13) ^ ROTR32(e, 22)) + ((g & f) | (e & (g | f))); // - t1 = c + (rrot(h, 6) ^ rrot(h, 11) ^ rrot(h, 25)) + (b ^ (h & (a ^ b))) + 0x4ed8aa4aU + w[53]; + t1 = c + (ROTR32(h, 6) ^ ROTR32(h, 11) ^ ROTR32(h, 25)) + (b ^ (h & (a ^ b))) + 0x4ed8aa4aU + w[53]; g += t1; - c = t1 + (rrot(d, 2) ^ rrot(d, 13) ^ rrot(d, 22)) + ((f & e) | (d & (f | e))); + c = t1 + (ROTR32(d, 2) ^ ROTR32(d, 13) ^ ROTR32(d, 22)) + ((f & e) | (d & (f | e))); // - t1 = b + (rrot(g, 6) ^ rrot(g, 11) ^ rrot(g, 25)) + (a ^ (g & (h ^ a))) + 0x5b9cca4fU + w[54]; + t1 = b + (ROTR32(g, 6) ^ ROTR32(g, 11) ^ ROTR32(g, 25)) + (a ^ (g & (h ^ a))) + 0x5b9cca4fU + w[54]; f += t1; - b = t1 + (rrot(c, 2) ^ rrot(c, 13) ^ rrot(c, 22)) + ((e & d) | (c & (e | d))); + b = t1 + (ROTR32(c, 2) ^ ROTR32(c, 13) ^ ROTR32(c, 22)) + ((e & d) | (c & (e | d))); // - t1 = a + (rrot(f, 6) ^ rrot(f, 11) ^ rrot(f, 25)) + (h ^ (f & (g ^ h))) + 0x682e6ff3U + w[55]; + t1 = a + (ROTR32(f, 6) ^ ROTR32(f, 11) ^ ROTR32(f, 25)) + (h ^ (f & (g ^ h))) + 0x682e6ff3U + w[55]; e += t1; - a = t1 + (rrot(b, 2) ^ rrot(b, 13) ^ rrot(b, 22)) + ((d & c) | (b & (d | c))); + a = t1 + (ROTR32(b, 2) ^ ROTR32(b, 13) ^ ROTR32(b, 22)) + ((d & c) | (b & (d | c))); // - t1 = h + (rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25)) + (g ^ (e & (f ^ g))) + 0x748f82eeU + w[56]; + t1 = h + (ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25)) + (g ^ (e & (f ^ g))) + 0x748f82eeU + w[56]; d += t1; - h = t1 + (rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22)) + ((c & b) | (a & (c | b))); + h = t1 + (ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22)) + ((c & b) | (a & (c | b))); // - c += g + (rrot(d, 6) ^ rrot(d, 11) ^ rrot(d, 25)) + (f ^ (d & (e ^ f))) + 0x78a5636fU + w[57]; + c += g + (ROTR32(d, 6) ^ ROTR32(d, 11) ^ ROTR32(d, 25)) + (f ^ (d & (e ^ f))) + 0x78a5636fU + w[57]; // - b += f + (rrot(c, 6) ^ rrot(c, 11) ^ rrot(c, 25)) + (e ^ (c & (d ^ e))) + 0x84c87814U + w[58]; + b += f + 
(ROTR32(c, 6) ^ ROTR32(c, 11) ^ ROTR32(c, 25)) + (e ^ (c & (d ^ e))) + 0x84c87814U + w[58]; // - a += e + (rrot(b, 6) ^ rrot(b, 11) ^ rrot(b, 25)) + (d ^ (b & (c ^ d))) + 0x8cc70208U + w[43] + w[52] + (rrot(w[44], 7) ^ rrot(w[44], 18) ^ (w[44] >> 3)) + (rrot(w[57], 17) ^ rrot(w[57], 19) ^ (w[57] >> 10)); + a += e + (ROTR32(b, 6) ^ ROTR32(b, 11) ^ ROTR32(b, 25)) + (d ^ (b & (c ^ d))) + 0x8cc70208U + w[43] + w[52] + (ROTR32(w[44], 7) ^ ROTR32(w[44], 18) ^ (w[44] >> 3)) + (ROTR32(w[57], 17) ^ ROTR32(w[57], 19) ^ (w[57] >> 10)); // - h += d + (rrot(a, 6) ^ rrot(a, 11) ^ rrot(a, 25)) + (c ^ (a & (b ^ c))) + 0x90befffaU + w[44] + w[53] + (rrot(w[45], 7) ^ rrot(w[45], 18) ^ (w[45] >> 3)) + (rrot(w[58], 17) ^ rrot(w[58], 19) ^ (w[58] >> 10)); + h += d + (ROTR32(a, 6) ^ ROTR32(a, 11) ^ ROTR32(a, 25)) + (c ^ (a & (b ^ c))) + 0x90befffaU + w[44] + w[53] + (ROTR32(w[45], 7) ^ ROTR32(w[45], 18) ^ (w[45] >> 3)) + (ROTR32(w[58], 17) ^ ROTR32(w[58], 19) ^ (w[58] >> 10)); // if (h == 0xa41f32e7) { @@ -595,8 +594,8 @@ void bitcoin_midstate(const uint32_t *data, uint32_t *midstate) } for (i = 16; i <= 63; i++) { - s0 = rrot(w[i - 15], 7) ^ rrot(w[i - 15], 18) ^ (w[i - 15] >> 3); - s1 = rrot(w[i - 2], 17) ^ rrot(w[i - 2], 19) ^ (w[i - 2] >> 10); + s0 = ROTR32(w[i - 15], 7) ^ ROTR32(w[i - 15], 18) ^ (w[i - 15] >> 3); + s1 = ROTR32(w[i - 2], 17) ^ ROTR32(w[i - 2], 19) ^ (w[i - 2] >> 10); w[i] = w[i - 16] + s0 + w[i - 7] + s1; } a = hc[0]; @@ -609,10 +608,10 @@ void bitcoin_midstate(const uint32_t *data, uint32_t *midstate) h = hc[7]; for (i = 0; i <= 63; i++) { - s0 = rrot(a, 2) ^ rrot(a, 13) ^ rrot(a, 22); + s0 = ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22); maj = (a & b) ^ (a & c) ^ (b & c); t2 = s0 + maj; - s1 = rrot(e, 6) ^ rrot(e, 11) ^ rrot(e, 25); + s1 = ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25); ch = (e & f) ^ ((~e) & g); t1 = h + s1 + ch + k[i] + w[i]; h = g; @@ -639,31 +638,31 @@ void bitcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, const { uint32_t b2, 
c2, d2, f2, g2, h2, t1, w16, w17, t1c, t2c, w16rot, w17rot; - cudaMemset(d_result[thr_id], 0xff, 2 * sizeof(uint32_t)); + cudaMemsetAsync(d_result[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); - t1 = ms[7] + (rrot(ms[4], 6) ^ rrot(ms[4], 11) ^ rrot(ms[4], 25)) + (ms[6] ^ (ms[4] & (ms[5] ^ ms[6]))) + 0x428a2f98U + merkle; + t1 = ms[7] + (ROTR32(ms[4], 6) ^ ROTR32(ms[4], 11) ^ ROTR32(ms[4], 25)) + (ms[6] ^ (ms[4] & (ms[5] ^ ms[6]))) + 0x428a2f98U + merkle; d2 = ms[3] + t1; - h2 = t1 + (rrot(ms[0], 2) ^ rrot(ms[0], 13) ^ rrot(ms[0], 22)) + ((ms[2] & ms[1]) | (ms[0] & (ms[2] | ms[1]))); + h2 = t1 + (ROTR32(ms[0], 2) ^ ROTR32(ms[0], 13) ^ ROTR32(ms[0], 22)) + ((ms[2] & ms[1]) | (ms[0] & (ms[2] | ms[1]))); // - t1 = ms[6] + (rrot(d2, 6) ^ rrot(d2, 11) ^ rrot(d2, 25)) + (ms[5] ^ (d2 & (ms[4] ^ ms[5]))) + 0x71374491U + time; + t1 = ms[6] + (ROTR32(d2, 6) ^ ROTR32(d2, 11) ^ ROTR32(d2, 25)) + (ms[5] ^ (d2 & (ms[4] ^ ms[5]))) + 0x71374491U + time; c2 = ms[2] + t1; - g2 = t1 + (rrot(h2, 2) ^ rrot(h2, 13) ^ rrot(h2, 22)) + ((ms[1] & ms[0]) | (h2 & (ms[1] | ms[0]))); + g2 = t1 + (ROTR32(h2, 2) ^ ROTR32(h2, 13) ^ ROTR32(h2, 22)) + ((ms[1] & ms[0]) | (h2 & (ms[1] | ms[0]))); // - t1 = ms[5] + (rrot(c2, 6) ^ rrot(c2, 11) ^ rrot(c2, 25)) + (ms[4] ^ (c2 & (d2 ^ ms[4]))) + 0xb5c0fbcfU + compacttarget; + t1 = ms[5] + (ROTR32(c2, 6) ^ ROTR32(c2, 11) ^ ROTR32(c2, 25)) + (ms[4] ^ (c2 & (d2 ^ ms[4]))) + 0xb5c0fbcfU + compacttarget; b2 = ms[1] + t1; - f2 = t1 + (rrot(g2, 2) ^ rrot(g2, 13) ^ rrot(g2, 22)) + ((ms[0] & h2) | (g2 & (ms[0] | h2))); + f2 = t1 + (ROTR32(g2, 2) ^ ROTR32(g2, 13) ^ ROTR32(g2, 22)) + ((ms[0] & h2) | (g2 & (ms[0] | h2))); - w16 = merkle + (rrot(time, 7) ^ rrot(time, 18) ^ (time >> 3)); - w16rot = (rrot(w16, 17) ^ rrot(w16, 19) ^ (w16 >> 10)) + compacttarget; - w17 = time + (rrot(compacttarget, 7) ^ rrot(compacttarget, 18) ^ (compacttarget >> 3)) + 0x01100000U; - w17rot = (rrot(w17, 17) ^ rrot(w17, 19) ^ (w17 >> 10)) + 0x11002000U; - t2c = (rrot(f2, 2) ^ 
rrot(f2, 13) ^ rrot(f2, 22)) + ((h2 & g2) | (f2 & (h2 | g2))); - t1c = ms[4] + (rrot(b2, 6) ^ rrot(b2, 11) ^ rrot(b2, 25)) + (d2 ^ (b2 & (c2 ^ d2))) + 0xe9b5dba5U; + w16 = merkle + (ROTR32(time, 7) ^ ROTR32(time, 18) ^ (time >> 3)); + w16rot = (ROTR32(w16, 17) ^ ROTR32(w16, 19) ^ (w16 >> 10)) + compacttarget; + w17 = time + (ROTR32(compacttarget, 7) ^ ROTR32(compacttarget, 18) ^ (compacttarget >> 3)) + 0x01100000U; + w17rot = (ROTR32(w17, 17) ^ ROTR32(w17, 19) ^ (w17 >> 10)) + 0x11002000U; + t2c = (ROTR32(f2, 2) ^ ROTR32(f2, 13) ^ ROTR32(f2, 22)) + ((h2 & g2) | (f2 & (h2 | g2))); + t1c = ms[4] + (ROTR32(b2, 6) ^ ROTR32(b2, 11) ^ ROTR32(b2, 25)) + (d2 ^ (b2 & (c2 ^ d2))) + 0xe9b5dba5U; dim3 grid((threads + TPB*NONCES_PER_THREAD - 1) / TPB / NONCES_PER_THREAD); dim3 block(TPB); - bitcoin_gpu_hash << > > (threads, startNounce, d_result[thr_id], t1c, t2c, w16, w16rot, w17, w17rot, b2, c2, d2, f2, g2, h2, ms[0], ms[1], ms[2], ms[3], ms[4], ms[5], ms[6], ms[7], compacttarget); - CUDA_SAFE_CALL(cudaMemcpy(h_nounce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + bitcoin_gpu_hash << >> (threads, startNounce, d_result[thr_id], t1c, t2c, w16, w16rot, w17, w17rot, b2, c2, d2, f2, g2, h2, ms[0], ms[1], ms[2], ms[3], ms[4], ms[5], ms[6], ms[7], compacttarget); + CUDA_SAFE_CALL(cudaMemcpyAsync(h_nounce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); cudaStreamSynchronize(gpustream[thr_id]); } __host__ diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu index 070c307b43..1fc51aa84b 100644 --- a/cuda_checkhash.cu +++ b/cuda_checkhash.cu @@ -5,7 +5,6 @@ #include #include "miner.h" - #include "cuda_helper.h" __constant__ uint32_t pTarget[8]; // 32 bytes @@ -17,15 +16,16 @@ static uint32_t* d_resNonces[MAX_GPUS]; __host__ void cuda_check_cpu_init(int thr_id, uint32_t threads) { - CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 8*sizeof(uint32_t))); - CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 8 * 
sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMallocHost(&h_resNonces[thr_id], 8*sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 8 * sizeof(uint32_t))); } // Target Difficulty + __host__ -void cuda_check_cpu_setTarget(const void *ptarget) +void cuda_check_cpu_setTarget(const void *ptarget, int thr_id) { - CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } /* --------------------------------------------------------------------------------------------- */ @@ -73,7 +73,7 @@ static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint3 __global__ __launch_bounds__(512, 4) void cuda_checkhash_64(uint32_t threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { // shl 4 = *16 x 4 (uint32) = 64 bytes @@ -90,16 +90,17 @@ void cuda_checkhash_64(uint32_t threads, uint32_t startNounce, uint32_t *hash, u __host__ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash) { - cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMemsetAsync(d_resNonces[thr_id], 0xff, sizeof(uint32_t), gpustream[thr_id])); const uint32_t threadsperblock = 512; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cuda_checkhash_64 <<>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]); + cuda_checkhash_64 <<>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]); - cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpyAsync(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + 
cudaStreamSynchronize(gpustream[thr_id]); return h_resNonces[thr_id][0]; } @@ -109,7 +110,7 @@ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uin __global__ __launch_bounds__(512, 4) void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t *hash, uint32_t *resNonces) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t *inpHash = &hash[thread << 4]; @@ -130,10 +131,11 @@ uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounc dim3 block(threadsperblock); // first element stores the count of found nonces - cudaMemset(d_resNonces[thr_id], 0, sizeof(uint32_t)); + cudaMemsetAsync(d_resNonces[thr_id], 0, sizeof(uint32_t), gpustream[thr_id]); - cuda_checkhash_64_suppl <<>> (startNounce, d_inputHash, d_resNonces[thr_id]); - cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); + cuda_checkhash_64_suppl <<>> (startNounce, d_inputHash, d_resNonces[thr_id]); + cudaMemcpyAsync(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + cudaStreamSynchronize(gpustream[thr_id]); rescnt = h_resNonces[thr_id][0]; if (rescnt > 1) @@ -156,12 +158,12 @@ uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounc __global__ void cuda_check_hash_branch_64(uint32_t threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = g_nonceVector[thread]; + const uint32_t nounce = g_nonceVector[thread]; uint32_t hashPosition = (nounce - startNounce) << 4; - uint32_t *inpHash = &g_hash[hashPosition]; + const uint32_t *const inpHash = &g_hash[hashPosition]; if (hashbelowtarget(inpHash, pTarget)) { @@ -174,12 +176,12 @@ void 
cuda_check_hash_branch_64(uint32_t threads, uint32_t startNounce, uint32_t __global__ void cuda_check_quarkcoin_64(uint32_t threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = g_nonceVector[thread]; + const uint32_t nounce = g_nonceVector[thread]; uint32_t hashPosition = (nounce - startNounce) << 4; - uint32_t *inpHash = &g_hash[hashPosition]; + const uint32_t *const inpHash = &g_hash[hashPosition]; if (inpHash[7] <= pTarget[7]) { @@ -191,35 +193,54 @@ void cuda_check_quarkcoin_64(uint32_t threads, uint32_t startNounce, uint32_t *g } __host__ -uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) +uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash) { uint32_t result = 0xffffffff; - cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); + cudaMemsetAsync(d_resNonces[thr_id], 0xff, sizeof(uint32_t), gpustream[thr_id]); const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - cuda_check_hash_branch_64 <<>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]); + cuda_check_hash_branch_64 <<>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]); - cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpyAsync(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + cudaStreamSynchronize(gpustream[thr_id]); result = *h_resNonces[thr_id]; return result; } __host__ -void cuda_check_quarkcoin(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t 
*d_inputHash, int order, uint32_t *resNonces) +void cuda_check_quarkcoin(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, uint32_t *resNonces) { - cudaMemset(d_resNonces[thr_id], 0xff, 2*sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMemsetAsync(d_resNonces[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id])); const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cuda_check_quarkcoin_64 << > > (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]); + cuda_check_quarkcoin_64 << >> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]); - cudaMemcpy(resNonces, d_resNonces[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost); -} \ No newline at end of file + cudaMemcpyAsync(resNonces, d_resNonces[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + cudaStreamSynchronize(gpustream[thr_id]); +} + +int cuda_arch[MAX_GPUS]; +__global__ void get_cuda_arch_gpu(int *d_version) +{ +#ifdef __CUDA_ARCH__ + *d_version = __CUDA_ARCH__; +#endif +} + +__host__ void get_cuda_arch(int *version) +{ + int *d_version; + cudaMalloc(&d_version, sizeof(int)); + get_cuda_arch_gpu << < 1, 1 >> > (d_version); + cudaMemcpy(version, d_version, sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(d_version); +} diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu index ce8ea18a5b..150ee8c18e 100644 --- a/cuda_groestlcoin.cu +++ b/cuda_groestlcoin.cu @@ -6,11 +6,11 @@ #include "cuda_helper.h" #include + // globaler Speicher für alle HeftyHashes aller Threads -__constant__ uint32_t pTarget[8]; // Single GPU -extern uint32_t *d_resultNonce[MAX_GPUS]; +static uint32_t *d_resultNonce[MAX_GPUS]; -__constant__ uint32_t groestlcoin_gpu_msg[32]; +__constant__ uint32_t groestlcoin_gpu_msg[20]; // 64 Register Variante für Compute 3.0 #include "groestl_functions_quad.cu" @@ -18,72 +18,54 @@ __constant__ uint32_t 
groestlcoin_gpu_msg[32]; #define SWAB32(x) cuda_swab32(x) -__global__ __launch_bounds__(256, 4) -void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *resNounce) +__global__ __launch_bounds__(512, 2) +void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, int thr_id, uint32_t target) { // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4; - if (thread < threads) + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; +// if (thread < threads) { // GROESTL - uint32_t paddedInput[8]; -#pragma unroll 8 - for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+(threadIdx.x & 3)]; - - uint32_t nounce = startNounce + thread; - if ((threadIdx.x & 3) == 3) - paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19 + uint32_t paddedInput[8] = { 0 }; + const uint32_t nounce = startNounce + thread; + paddedInput[0] = groestlcoin_gpu_msg[(threadIdx.x & 3)]; + paddedInput[1] = groestlcoin_gpu_msg[4 + (threadIdx.x & 3)]; + paddedInput[2] = groestlcoin_gpu_msg[8 + (threadIdx.x & 3)]; + paddedInput[3] = groestlcoin_gpu_msg[12 + (threadIdx.x & 3)]; + paddedInput[4] = groestlcoin_gpu_msg[16 + (threadIdx.x & 3)]; + if ((threadIdx.x & 3) == 3) paddedInput[4] = SWAB32(nounce); + if ((threadIdx.x & 3) == 0) paddedInput[5] = 0x80; + if ((threadIdx.x & 3)==3) paddedInput[7] = 0x01000000; uint32_t msgBitsliced[8]; - to_bitslice_quad(paddedInput, msgBitsliced); + myr_to_bitslice_quad(paddedInput, msgBitsliced); uint32_t state[8]; - for (int round=0; round<2; round++) - { - groestl512_progressMessage_quad(state, msgBitsliced); - - if (round < 1) - { - // Verkettung zweier Runden inclusive Padding. 
- msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x & 3)==3)*0x2000); - msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341); - msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341); - msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341); - msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341); - msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341); - msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341); - msgBitsliced[7] = __byte_perm(state[7], 0x00800100, 0x4341 + ((threadIdx.x & 3) == 0) * 0x0010); - } - } - // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash - uint32_t out_state[16]; - from_bitslice_quad(state, out_state); + groestl512_progressMessage_quad(state, msgBitsliced); + + msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x & 3)==3)*0x2000); + msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341); + msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341); + msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341); + msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341); + msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341); + msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341); + msgBitsliced[7] = __byte_perm(state[7], 0x00800100, 0x4341 + ((threadIdx.x & 3) == 0) * 0x0010); + + groestl512_progressMessage_quad(state, msgBitsliced); + + uint32_t out_state[16]; + from_bitslice_quad_final(state, out_state); if ((threadIdx.x & 3) == 0) { - int i, position = -1; - bool rc = true; - - #pragma unroll 8 - for (i = 7; i >= 0; i--) { - if (out_state[i] > pTarget[i]) { - if(position < i) { - position = i; - rc = false; - } - } - if (out_state[i] < pTarget[i]) { - if(position < i) { - position = i; - rc = true; - } - } - } - - if(rc == true) - if(resNounce[0] > nounce) - resNounce[0] = nounce; + if (out_state[7] <= target) + { + uint32_t tmp = atomicExch(resNounce, nounce); + if (tmp != 0xffffffff) + 
resNounce[1] = tmp; + } } } } @@ -91,53 +73,31 @@ void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t // Setup-Funktionen __host__ void groestlcoin_cpu_init(int thr_id, uint32_t threads) { - CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); - - // Speicher für Gewinner-Nonce belegen - cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); + cudaMalloc(&d_resultNonce[thr_id], 2 * sizeof(uint32_t)); } -__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn) +__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data ) { - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); + uint32_t msgBlock[20]; memcpy(&msgBlock[0], data, 80); + cudaMemcpyToSymbolAsync(groestlcoin_gpu_msg, msgBlock, 80, 0, cudaMemcpyHostToDevice, gpustream[thr_id]); - // Erweitere die Nachricht auf den Nachrichtenblock (padding) - // Unsere Nachricht hat 80 Byte - msgBlock[20] = 0x80; - msgBlock[31] = 0x01000000; - - // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird - // auf der GPU ausgeführt) - - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( groestlcoin_gpu_msg, - msgBlock, - 128); - - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - cudaMemcpyToSymbol( pTarget, - pTargetIn, - sizeof(uint32_t) * 8 ); + cudaMemsetAsync(d_resultNonce[thr_id], 0xFF, 2 * sizeof(uint32_t), gpustream[thr_id]); } -__host__ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) +__host__ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *nounce, uint32_t target) { - uint32_t threadsperblock = 256; + uint32_t threadsperblock = 512; // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl int factor = 4; - // berechne 
wie viele Thread Blocks wir brauchen + // berechne wie viele Thread Blocks wir brauchen dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); dim3 block(threadsperblock); - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - groestlcoin_gpu_hash_quad<<>>(threads, startNounce, d_resultNonce[thr_id]); + groestlcoin_gpu_hash_quad<<>>(threads, startNounce, d_resultNonce[thr_id], thr_id, target); - cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + CUDA_SAFE_CALL(cudaMemcpyAsync(nounce, d_resultNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); cudaStreamSynchronize(gpustream[thr_id]); } diff --git a/cuda_groestlcoin.h b/cuda_groestlcoin.h index 7b95b59a07..3dc121537f 100644 --- a/cuda_groestlcoin.h +++ b/cuda_groestlcoin.h @@ -2,7 +2,7 @@ #define _CUDA_GROESTLCOIN_H void groestlcoin_cpu_init(int thr_id, uint32_t threads); -void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn); -void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce); +void groestlcoin_cpu_setBlock(int thr_id, void *data); +void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *nounce, uint32_t target); #endif \ No newline at end of file diff --git a/cuda_helper.h b/cuda_helper.h index 8eb46f3ba2..497b6670e8 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -3,32 +3,45 @@ #include #include +#ifdef __cplusplus +#include +#include +using namespace std; +#else +#include +#endif #ifdef __INTELLISENSE__ +#define NOASM /* reduce vstudio warnings (__byteperm, blockIdx...) 
*/ #include #include #define __launch_bounds__(max_tpb, min_blocks) +#define __CUDA_ARCH__ 610 + uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z); uint32_t atomicExch(uint32_t *x, uint32_t y); uint32_t atomicAdd(uint32_t *x, uint32_t y); void __syncthreads(void); void __threadfence(void); +#define __ldg(x) (*(x)) #endif -#include - #ifndef MAX_GPUS -#define MAX_GPUS 16 +#define MAX_GPUS 8 #endif -extern "C" int device_map[MAX_GPUS]; -extern "C" long device_sm[MAX_GPUS]; +extern int device_map[MAX_GPUS]; +extern long device_sm[MAX_GPUS]; +extern cudaStream_t gpustream[MAX_GPUS]; +extern bool stop_mining; +extern volatile bool mining_has_stopped[MAX_GPUS]; // common functions extern void cuda_check_cpu_init(int thr_id, uint32_t threads); -extern void cuda_check_cpu_setTarget(const void *ptarget); +extern void cuda_check_cpu_setTarget(const void *ptarget, int thr_id); +extern void cuda_check_cpu_setTarget_mod(const void *ptarget, const void *ptarget2); extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); extern uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint32_t foundnonce); extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func); @@ -61,25 +74,63 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t #define SPH_T64(x) (x) // #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) #endif + +#if defined _MSC_VER && !defined __CUDA_ARCH__ +#define ROTL32c(x, n) _rotl(x, n) +#define ROTR32c(x, n) _rotr(x, n) +#else +#define ROTL32c(x, n) ((x) << (n)) | ((x) >> (32 - (n))) +#define ROTR32c(x, n) ((x) >> (n)) | ((x) << (32 - (n))) +#endif + +#ifndef __CUDA_ARCH__ +#define ROTR32(x, n) ROTR32c(x, n) +#define ROTL32(x, n) ROTL32c(x, n) +#else #if __CUDA_ARCH__ < 320 // Kepler (Compute 3.0) -#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - 
(n))) +__device__ __forceinline__ uint32_t ROTR32(const uint32_t x, const uint32_t n) +{ + return (x >> n) | (x << (32 - n)); +} +__device__ __forceinline__ uint32_t ROTL32(const uint32_t x, const uint32_t n) +{ + return (x << n) | (x >> (32 - n)); +} #else -// Kepler (Compute 3.5, 5.0) -#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) +__device__ __forceinline__ uint32_t ROTR32(const uint32_t x, const uint32_t n) +{ + return __funnelshift_r(x, x, n); +} +__device__ __forceinline__ uint32_t ROTL32(const uint32_t x, const uint32_t n) +{ + return __funnelshift_l(x, x, n); +} +#endif +#endif + +// #define NOASM here if you don't want asm +#ifndef __CUDA_ARCH__ +#define NOASM #endif +#define MAKE_ULONGLONG(lo, hi) MAKE_UINT64(lo, hi) -__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI) +__device__ __forceinline__ uint64_t MAKE_UINT64(uint32_t LO, uint32_t HI) { +#ifndef NOASM uint64_t result; asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(LO), "r"(HI)); return result; +#else + return LO + ((uint64_t)HI << 32); +#endif } __device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t x, const uint32_t y) { +#ifndef NOASM uint64_t result; asm( "{\n\t" @@ -89,10 +140,13 @@ __device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t x, const uint3 "}" : "=l"(result) : "l"(x), "r"(y) ); return result; - +#else + return (x & 0xffffffff) + ((uint64_t)y << 32); +#endif } __device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t x, const uint32_t y) { +#ifndef NOASM uint64_t result; asm( "{\n\t" @@ -102,25 +156,36 @@ __device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t x, const uint3 "}" : "=l"(result) : "l"(x), "r"(y) ); return result; +#else + return (x & 0xffffffff00000000) + y; +#endif } -// Endian Drehung für 32 Bit Typen +// endian change for 32bit #ifdef __CUDA_ARCH__ -__device__ __forceinline__ uint32_t cuda_swab32(const uint32_t x) -{ - /* device */ - return __byte_perm(x, x, 0x0123); -} + __device__ 
__forceinline__ uint32_t cuda_swab32(const uint32_t x) + { + /* device */ + return __byte_perm(x, x, 0x0123); + } #else /* host */ - #define cuda_swab32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) + #ifdef __GNUC__ + #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + #define cuda_swab32(x) __builtin_bswap32(x) + #endif + #else + #ifdef _MSC_VER + #define cuda_swab32(x) _byteswap_ulong(x) + #else + #define cuda_swab32(x) ( ((x) << 24) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | ((x) >> 24)) + #endif + #endif #endif - static __device__ uint32_t _HIWORD(const uint64_t x) { +#ifndef NOASM uint32_t result; asm( "{\n\t" @@ -129,10 +194,14 @@ static __device__ uint32_t _HIWORD(const uint64_t x) "}" : "=r"(result) : "l"(x) ); return result; +#else + return x >> 32; +#endif } static __device__ uint32_t _LOWORD(const uint64_t x) { +#ifndef NOASM uint32_t result; asm( "{\n\t" @@ -141,11 +210,13 @@ static __device__ uint32_t _LOWORD(const uint64_t x) "}" : "=r"(result) : "l"(x) ); return result; +#else + return x & 0xffffffff; +#endif } -// Input: 77665544 33221100 -// Output: 00112233 44556677 -#ifdef __CUDA_ARCH__ +// endian change for 64bit +#if (defined __CUDA_ARCH__ && !defined NOASM) __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x) { uint64_t result; @@ -160,15 +231,25 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x) } #else /* host */ - #define cuda_swab64(x) \ - ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ - (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ - (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ - (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ - (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ - (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ - (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ - (((uint64_t)(x) & 0x00000000000000ffULL) << 56))) + #ifdef __GNUC__ + #if ((__GNUC__ > 4) 
|| (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + #define cuda_swab64(x) __builtin_bswap64(x) + #endif + #else + #ifdef _MSC_VER + #define cuda_swab64(x) _byteswap_uint64(x) + #else + #define cuda_swab64(x) \ + ((uint64_t)((((uint64_t)(x)) >> 56) | \ + (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ + (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ + (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ + (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ + (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ + (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ + (((uint64_t)(x)) << 56))) + #endif + #endif #endif /*********************************************************************/ @@ -201,7 +282,7 @@ do { \ } while (0) /*********************************************************************/ -#ifdef _WIN64 +#if (defined _WIN64 || defined NOASM) #define USE_XOR_ASM_OPTS 0 #else #define USE_XOR_ASM_OPTS 1 @@ -217,7 +298,7 @@ uint64_t xor1(const uint64_t a, const uint64_t b) return result; } #else -#define xor1(a,b) (a ^ b) +#define xor1(a,b) ((a) ^ (b)) #endif #if USE_XOR_ASM_OPTS @@ -233,7 +314,7 @@ uint64_t xor3(const uint64_t a, const uint64_t b, const uint64_t c) return result; } #else -#define xor3(a,b,c) (a ^ b ^ c) +#define xor3(a,b,c) ((a) ^ (b) ^ (c)) #endif #if USE_XOR_ASM_OPTS @@ -252,7 +333,7 @@ uint64_t xor8(const uint64_t a, const uint64_t b, const uint64_t c, const uint64 return result; } #else -#define xor8(a,b,c,d,e,f,g,h) ((a^b)^(c^d)^(e^f)^(g^h)) +#define xor8(a,b,c,d,e,f,g,h) ((a)^(b)^(c)^(d)^(e)^(f)^(g)^(h)) #endif // device asm for x17 @@ -260,6 +341,7 @@ __device__ __forceinline__ uint64_t xandx(const uint64_t a, const uint64_t b, const uint64_t c) { uint64_t result; +#ifndef NOASM asm("{\n\t" ".reg .u64 n;\n\t" "xor.b64 %0, %2, %3;\n\t" @@ -267,6 +349,9 @@ uint64_t xandx(const uint64_t a, const uint64_t b, const uint64_t c) "xor.b64 %0, n, %3;" "}\n" : "=l"(result) : "l"(a), "l"(b), "l"(c)); +#else + result = ((((b) ^ (c)) & (a)) ^ (c)); 
+#endif return result; } @@ -275,6 +360,7 @@ __device__ __forceinline__ uint64_t andor(uint64_t a, uint64_t b, uint64_t c) { uint64_t result; +#ifndef NOASM asm("{\n\t" ".reg .u64 m,n;\n\t" "and.b64 m, %1, %2;\n\t" @@ -283,6 +369,9 @@ uint64_t andor(uint64_t a, uint64_t b, uint64_t c) " or.b64 %0, %0, m ;\n\t" "}\n" : "=l"(result) : "l"(a), "l"(b), "l"(c)); +#else + result = (((a) & (b)) | (((a) | (b)) & (c))); +#endif return result; } @@ -291,8 +380,12 @@ __device__ __forceinline__ uint64_t shr_t64(uint64_t x, uint32_t n) { uint64_t result; +#ifndef NOASM asm("shr.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); +#else + result = x >> n; +#endif return result; } @@ -301,8 +394,12 @@ __device__ __forceinline__ uint64_t shl_t64(uint64_t x, uint32_t n) { uint64_t result; +#ifndef NOASM asm("shl.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); +#else + result = x << n; +#endif return result; } @@ -310,6 +407,10 @@ uint64_t shl_t64(uint64_t x, uint32_t n) #define USE_ROT_ASM_OPT 1 #endif +#ifdef NOASM +#undef USE_ROT_ASM_OPT +#endif + // 64-bit ROTATE RIGHT #if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 /* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ @@ -343,7 +444,48 @@ uint64_t ROTR64(const uint64_t x, const int offset) } #else /* host */ -#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#if defined _MSC_VER && !defined __CUDA_ARCH__ + #define ROTR64(x, n) _rotr64(x, n) +#else +#ifndef __CUDA_ARCH__ + #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#else +#if __CUDA_ARCH__ >= 520 +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t value, const int offset) +{ + uint2 result; + if(offset < 32) + { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), 
"r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } + else + { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#else +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t x, const int offset) +{ + uint64_t result; + asm("{\n\t" + ".reg .b64 lhs;\n\t" + ".reg .u32 roff;\n\t" + "shr.b64 lhs, %1, %2;\n\t" + "sub.u32 roff, 64, %2;\n\t" + "shl.b64 %0, %1, roff;\n\t" + "add.u64 %0, %0, lhs;\n\t" + "}\n" + : "=l"(result) : "l"(x), "r"(offset)); + return result; +} +#endif +#endif +#endif #endif // 64-bit ROTATE LEFT @@ -397,13 +539,17 @@ uint64_t ROTL64(const uint64_t x, const int offset) } #else /* host */ +#if defined _MSC_VER && !defined __CUDA_ARCH__ +#define ROTL64(x, n) _rotl64(x, n) +#else #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) #endif +#endif __device__ __forceinline__ uint64_t SWAPDWORDS(uint64_t value) { -#if __CUDA_ARCH__ >= 320 +#if __CUDA_ARCH__ >= 320 && !defined NOASM uint2 temp; asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value)); asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x)); @@ -416,26 +562,39 @@ uint64_t SWAPDWORDS(uint64_t value) /* lyra2 - int2 operators */ __device__ __forceinline__ -void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) { +void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) +{ +#ifndef NOASM asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(lo), "=r"(hi) : "l"(x)); +#else + lo = x & 0xffffffff; + hi = x >> 32; +#endif } __device__ __forceinline__ uint64_t devectorize(uint2 x) { +#ifndef NOASM uint64_t result; asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(x.x), "r"(x.y)); return result; +#else + 
return x.x + ((uint64_t)x.y << 32); +#endif } - -__device__ __forceinline__ uint2 vectorize(uint64_t x) +__device__ __forceinline__ uint2 vectorize(const uint64_t x) { +#ifndef NOASM uint2 result; asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(result.x), "=r"(result.y) : "l"(x)); return result; +#else + return make_uint2(x & 0xffffffff, x >> 32); +#endif } static __device__ __forceinline__ uint2 vectorizelow(uint32_t v) { @@ -444,6 +603,19 @@ static __device__ __forceinline__ uint2 vectorizelow(uint32_t v) { result.y = 0; return result; } +static __device__ __forceinline__ uint2 vectorizehigh(uint32_t v) { + uint2 result; + result.x = 0; + result.y = v; + return result; +} +static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v) +{ + uint2 result; + result.y = u.x ^ v.x; + result.x = u.y ^ v.y; + return result; +} static __device__ __forceinline__ uint2 operator^ (uint2 a, uint32_t b) { return make_uint2(a.x^ b, a.y); } static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); } @@ -451,8 +623,10 @@ static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return ma static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); } static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); } static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; } + static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) { +#ifndef NOASM uint2 result; asm("{\n\t" "add.cc.u32 %0,%2,%4; \n\t" @@ -460,11 +634,24 @@ static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) "}\n\t" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); return result; +#else + return make_uint2(a.x + b.x, a.y + b.y); +#endif } - +static __device__ __forceinline__ uint2 operator+ (uint2 a, uint32_t b) +{ + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; 
\n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +} static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b) { +#ifndef NOASM uint2 result; asm("{\n\t" "sub.cc.u32 %0,%2,%4; \n\t" @@ -472,9 +659,27 @@ static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b) "}\n\t" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); return result; +#else +return make_uint2(a.x - b.x, a.y - b.y); +#endif +} + + +static __device__ __forceinline__ uint4 operator+ (uint4 a, uint4 b) +{ + return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __device__ __forceinline__ uint4 operator& (uint4 a, uint4 b) { return make_uint4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w); } +static __device__ __forceinline__ uint4 operator| (uint4 a, uint4 b) { return make_uint4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w); } +static __device__ __forceinline__ uint4 operator~ (uint4 a) { return make_uint4(~a.x, ~a.y, ~a.z, ~a.w); } +static __device__ __forceinline__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; } static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; } +static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b){return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);} +static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b){return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);} + +static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) { a = a ^ b; } /** * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b)) @@ -482,6 +687,7 @@ static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + */ static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b) { +#ifndef NOASM uint2 result; 
asm("{\n\t" "mul.lo.u32 %0,%2,%4; \n\t" @@ -491,10 +697,13 @@ static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b) "}\n\t" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); return result; +#else + return vectorize(devectorize(a)*devectorize(b)); +#endif } // uint2 method -#if __CUDA_ARCH__ >= 350 +#if __CUDA_ARCH__ >= 320 && !defined NOASM __device__ __inline__ uint2 ROR2(const uint2 a, const int offset) { uint2 result; @@ -519,14 +728,83 @@ __device__ __inline__ uint2 ROR2(const uint2 v, const int n) } else { - result.y = ((v.x >> (n - 32)) | (v.y << (32 - n))); - result.x = ((v.y >> (n - 32)) | (v.x << (32 - n))); + result.y = ((v.x >> (n - 32)) | (v.y << (64 - n))); + result.x = ((v.y >> (n - 32)) | (v.x << (64 - n))); } return result; } #endif -#if __CUDA_ARCH__ >= 350 +__device__ __inline__ uint32_t ROL8(const uint32_t x) +{ + return __byte_perm(x, x, 0x2103); +} +__device__ __inline__ uint32_t ROL16(const uint32_t x) +{ + return __byte_perm(x, x, 0x1032); +} +__device__ __inline__ uint32_t ROL24(const uint32_t x) +{ + return __byte_perm(x, x, 0x0321); +} + +__device__ __inline__ uint2 ROR8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x0765); + result.y = __byte_perm(a.y, a.x, 0x4321); + + return result; +} + +__device__ __inline__ uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + + return result; +} + +__device__ __inline__ uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + + return result; +} + +__device__ __inline__ uint2 ROL8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x6543); + result.y = __byte_perm(a.y, a.x, 0x2107); + + return result; +} + +__device__ __inline__ uint2 ROL16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x5432); + result.y = __byte_perm(a.y, a.x, 0x1076); 
+ + return result; +} + +__device__ __inline__ uint2 ROL24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x4321); + result.y = __byte_perm(a.y, a.x, 0x0765); + + return result; +} + +#if __CUDA_ARCH__ >= 320 && !defined NOASM + + __inline__ __device__ uint2 ROL2(const uint2 a, const int offset) { uint2 result; if (offset >= 32) { @@ -560,7 +838,7 @@ __inline__ __device__ uint2 ROL2(const uint2 v, const int n) __device__ __forceinline__ uint64_t ROTR16(uint64_t x) { -#if __CUDA_ARCH__ > 500 +#if __CUDA_ARCH__ > 500 && !defined NOASM short4 temp; asm("mov.b64 { %0, %1, %2, %3 }, %4; ": "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) : "l"(x)); asm("mov.b64 %0, {%1, %2, %3 , %4}; ": "=l"(x) : "h"(temp.y), "h"(temp.z), "h"(temp.w), "h"(temp.x)); @@ -569,10 +847,11 @@ uint64_t ROTR16(uint64_t x) return ROTR64(x, 16); #endif } + __device__ __forceinline__ uint64_t ROTL16(uint64_t x) { -#if __CUDA_ARCH__ > 500 +#if __CUDA_ARCH__ > 500 && !defined NOASM short4 temp; asm("mov.b64 { %0, %1, %2, %3 }, %4; ": "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) : "l"(x)); asm("mov.b64 %0, {%1, %2, %3 , %4}; ": "=l"(x) : "h"(temp.w), "h"(temp.x), "h"(temp.y), "h"(temp.z)); @@ -587,6 +866,7 @@ uint2 SWAPINT2(uint2 x) { return(make_uint2(x.y, x.x)); } + __device__ __forceinline__ bool cuda_hashisbelowtarget(const uint32_t *const __restrict__ hash, const uint32_t *const __restrict__ target) { if (hash[7] > target[7]) @@ -628,10 +908,10 @@ uint2 SWAPDWORDS2(uint2 value) return make_uint2(value.y, value.x); } -static __forceinline__ __device__ uint2 SHL2(uint2 a, int offset) +static __forceinline__ __device__ uint2 SHL2(const uint2 a, int offset) { -#if __CUDA_ARCH__ > 300 uint2 result; +#if __CUDA_ARCH__ > 300 && !defined NOASM if (offset<32) { asm("{\n\t" @@ -647,25 +927,25 @@ static __forceinline__ __device__ uint2 SHL2(uint2 a, int offset) "}\n\t" : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); } - return result; #else if 
(offset<=32) { - a.y = (a.y << offset) | (a.x >> (32 - offset)); - a.x = (a.x << offset); + result.y = (a.y << offset) | (a.x >> (32 - offset)); + result.x = (a.x << offset); } else { - a.y = (a.x << (offset-32)); - a.x = 0; + result.y = (a.x << (offset - 32)); + result.x = 0; } - return a; #endif + return result; } -static __forceinline__ __device__ uint2 SHR2(uint2 a, int offset) + +static __forceinline__ __device__ uint2 SHR2(const uint2 a, int offset) { - #if __CUDA_ARCH__ > 300 uint2 result; +#if __CUDA_ARCH__ >= 320 && !defined NOASM if (offset<32) { asm("{\n\t" "shf.r.clamp.b32 %0,%2,%3,%4; \n\t" @@ -680,24 +960,24 @@ static __forceinline__ __device__ uint2 SHR2(uint2 a, int offset) "}\n\t" : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); } - return result; #else if (offset<=32) { - a.x = (a.x >> offset) | (a.y << (32 - offset)); - a.y = (a.y >> offset); + result.x = (a.x >> offset) | (a.y << (32 - offset)); + result.y = (a.y >> offset); } else { - a.x = (a.y >> (offset - 32)); - a.y = 0; + result.x = (a.y >> (offset - 32)); + result.y = 0; } - return a; - #endif +#endif + return result; } -static __device__ __forceinline__ uint64_t devectorizeswap(uint2 v) { return MAKE_ULONGLONG(cuda_swab32(v.y), cuda_swab32(v.x)); } -static __device__ __forceinline__ uint2 vectorizeswap(uint64_t v) { +static __device__ __forceinline__ uint64_t devectorizeswap(uint2 v) { return MAKE_UINT64(cuda_swab32(v.y), cuda_swab32(v.x)); } +static __device__ __forceinline__ uint2 vectorizeswap(uint64_t v) +{ uint2 result; LOHI(result.y, result.x, v); result.x = cuda_swab32(result.x); @@ -705,12 +985,23 @@ static __device__ __forceinline__ uint2 vectorizeswap(uint64_t v) { return result; } +static __device__ __forceinline__ uint2 cuda_swap(uint2 v) +{ + uint32_t t = cuda_swab32(v.x); + v.x = cuda_swab32(v.y); + v.y = t; + return v; +} __device__ __forceinline__ uint32_t devectorize16(ushort2 x) { uint32_t result; +#ifndef NOASM asm("mov.b32 %0,{%1,%2}; \n\t" : 
"=r"(result) : "h"(x.x) , "h"(x.y)); +#else + result = x.x + (x.y << 16); +#endif return result; } @@ -718,11 +1009,164 @@ __device__ __forceinline__ uint32_t devectorize16(ushort2 x) __device__ __forceinline__ ushort2 vectorize16(uint32_t x) { ushort2 result; +#ifndef NOASM asm("mov.b32 {%0,%1},%2; \n\t" : "=h"(result.x), "=h"(result.y) : "r"(x)); +#else + result.x = x & 0xffff; + result.y = x >> 16; +#endif + return result; +} + +extern int cuda_arch[MAX_GPUS]; +extern void get_cuda_arch(int *); + +/* +static __device__ __forceinline__ uint4 mul4(uint4 a) +{ + uint4 result; + asm("{\n\t" + "mul.lo.u32 %0,%4,%5; \n\t" + "mul.hi.u32 %1,%4,%5; \n\t" + "mul.lo.u32 %2,%6,%7; \n\t" + "mul.hi.u32 %3,%6,%7; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w)); + return result; +} +static __device__ __forceinline__ uint4 add4(uint4 a, uint4 b) + { + uint4 result; + asm("{\n\t" + "add.cc.u32 %0,%4,%8; \n\t" + "addc.u32 %1,%5,%9; \n\t" + "add.cc.u32 %2,%6,%10; \n\t" + "addc.u32 %3,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; + } + +static __device__ __forceinline__ uint4 madd4(uint4 a, uint4 b) + { + uint4 result; + asm("{\n\t" + "mad.lo.cc.u32 %0,%4,%5,%8; \n\t" + "madc.hi.u32 %1,%4,%5,%9; \n\t" + "mad.lo.cc.u32 %2,%6,%7,%10; \n\t" + "madc.hi.u32 %3,%6,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; + } + +static __device__ __forceinline__ ulonglong2 madd4long(ulonglong2 a, ulonglong2 b) + { + ulonglong2 result; + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %2;\n\t" + "mov.b64 {a2,a3}, %3;\n\t" + "mov.b64 {b0,b1}, %4;\n\t" + "mov.b64 {b2,b3}, %5;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + 
"madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "=l"(result.x), "=l"(result.y) : "l"(a.x), "l"(a.y), "l"(b.x), "l"(b.y)); + return result; + } +*/ +static __device__ __forceinline__ void madd4long2(ulonglong2 &a, ulonglong2 b) + { +#ifndef NOASM + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %0;\n\t" + "mov.b64 {a2,a3}, %1;\n\t" + "mov.b64 {b0,b1}, %2;\n\t" + "mov.b64 {b2,b3}, %3;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + "madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "+l"(a.x), "+l"(a.y) : "l"(b.x), "l"(b.y)); +#else // ?? no idea what madd4long is supposed to do + a.x = a.x + b.x; + if(a.x < b.x) + a.y = a.y + b.y + 1; + else + a.y = a.y + b.y; +#endif +} + +__device__ __forceinline__ +uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; +#ifndef NOASM + asm("{ .reg .u32 t1;\n\t" + "xor.b32 t1, %2, %3;\n\t" + "xor.b32 %0, %1, t1;\n\t" + "}" + : "=r"(result) : "r"(a), "r"(b), "r"(c)); +#else + result = a ^ b ^ c; +#endif return result; } +__device__ __forceinline__ +uint32_t shr_t32(uint32_t x, uint32_t n) { + uint32_t result; +#ifndef NOASM + asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); +#else + result = x >> n; +#endif + return result; +} + +__device__ __forceinline__ +uint32_t shl_t32(uint32_t x, uint32_t n) { + uint32_t result; +#ifndef NOASM + asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); +#else + result = x << n; +#endif + return result; +} + +// device asm 32 for pluck +__device__ __forceinline__ +uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; +#ifndef NOASM + asm("{ .reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" + : 
"=r"(result) : "r"(a), "r"(b), "r"(c)); +#else + result = ((a | b) & c) | (a & b); +#endif + return result; +} + +#if __CUDA_ARCH__ < 350 +#ifndef __ldg +#define __ldg(x) (*(x)) +#endif +#endif #endif // #ifndef CUDA_HELPER_H diff --git a/cuda_helper.h.orig b/cuda_helper.h.orig new file mode 100644 index 0000000000..142bfa9da6 --- /dev/null +++ b/cuda_helper.h.orig @@ -0,0 +1,1115 @@ +#ifndef CUDA_HELPER_H +#define CUDA_HELPER_H + +#include +#include +#ifdef __cplusplus +#include +#include +using namespace std; +#else +#include +#endif + +#ifdef __INTELLISENSE__ +#define NOASM +/* reduce vstudio warnings (__byteperm, blockIdx...) */ +#include +#include +#define __launch_bounds__(max_tpb, min_blocks) + +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z); +uint32_t atomicExch(uint32_t *x, uint32_t y); +uint32_t atomicAdd(uint32_t *x, uint32_t y); +void __syncthreads(void); +void __threadfence(void); +#define __ldg(x) (*(x)) +#endif + +#ifndef MAX_GPUS +#define MAX_GPUS 8 +#endif + +extern int device_map[MAX_GPUS]; +extern long device_sm[MAX_GPUS]; +extern cudaStream_t gpustream[MAX_GPUS]; +extern bool stop_mining; +extern volatile bool mining_has_stopped[MAX_GPUS]; + +// common functions +extern void cuda_check_cpu_init(int thr_id, uint32_t threads); +extern void cuda_check_cpu_setTarget(const void *ptarget, int thr_id); +extern void cuda_check_cpu_setTarget_mod(const void *ptarget, const void *ptarget2); +extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); +extern uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint32_t foundnonce); +extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func); + +#ifndef __CUDA_ARCH__ +// define blockDim and threadIdx for host +extern const dim3 blockDim; +extern const uint3 threadIdx; +#endif + +extern cudaError_t 
MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + + +#ifndef SPH_C32 +#define SPH_C32(x) ((x ## U)) +// #define SPH_C32(x) ((uint32_t)(x ## U)) +#endif + +#ifndef SPH_C64 +#define SPH_C64(x) ((x ## ULL)) +// #define SPH_C64(x) ((uint64_t)(x ## ULL)) +#endif + +#ifndef SPH_T32 +#define SPH_T32(x) (x) +// #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#endif + +#ifndef SPH_T64 +#define SPH_T64(x) (x) +// #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) +#endif + +#if defined _MSC_VER && !defined __CUDA_ARCH__ +#define ROTL32c(x, n) _rotl(x, n) +#define ROTR32c(x, n) _rotr(x, n) +#else +#define ROTL32c(x, n) ((x) << (n)) | ((x) >> (32 - (n))) +#define ROTR32c(x, n) ((x) >> (n)) | ((x) << (32 - (n))) +#endif + +#ifndef __CUDA_ARCH__ +#define ROTR32(x, n) ROTR32c(x, n) +#define ROTL32(x, n) ROTL32c(x, n) +#else +#if __CUDA_ARCH__ < 320 +// Kepler (Compute 3.0) +__device__ __forceinline__ uint32_t ROTR32(const uint32_t x, const uint32_t n) +{ + return (x >> n) | (x << (32 - n)); +} +__device__ __forceinline__ uint32_t ROTL32(const uint32_t x, const uint32_t n) +{ + return (x << n) | (x >> (32 - n)); +} +#else +__device__ __forceinline__ uint32_t ROTR32(const uint32_t x, const uint32_t n) +{ + return __funnelshift_r(x, x, n); +} +__device__ __forceinline__ uint32_t ROTL32(const uint32_t x, const uint32_t n) +{ + return __funnelshift_l(x, x, n); +} +#endif +#endif + +// #define NOASM here if you don't want asm +#ifndef __CUDA_ARCH__ +#define NOASM +#endif + +#define MAKE_ULONGLONG(lo, hi) MAKE_UINT64(lo, hi) + +__device__ __forceinline__ uint64_t MAKE_UINT64(uint32_t LO, uint32_t HI) +{ +#ifndef NOASM + uint64_t result; + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(result) : "r"(LO), "r"(HI)); + return result; +#else + return LO + (uint64_t)HI << 32; +#endif +} + +__device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t x, const uint32_t y) +{ +#ifndef NOASM + uint64_t result; + asm( + "{\n\t" + ".reg .u32 t,t2; \n\t" + "mov.b64 {t2,t},%1; 
\n\t" + "mov.b64 %0,{t2,%2}; \n\t" + "}" : "=l"(result) : "l"(x), "r"(y) + ); + return result; +#else + return (x & 0xffffffff) + ((uint64_t)y << 32); +#endif +} +__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t x, const uint32_t y) +{ +#ifndef NOASM + uint64_t result; + asm( + "{\n\t" + ".reg .u32 t,t2; \n\t" + "mov.b64 {t2,t},%1; \n\t" + "mov.b64 %0,{%2,t}; \n\t" + "}" : "=l"(result) : "l"(x), "r"(y) + ); + return result; +#else + return (x & 0xffffffff00000000) + y; +#endif +} + +// endian change for 32bit +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ uint32_t cuda_swab32(const uint32_t x) +{ + /* device */ + return __byte_perm(x, x, 0x0123); +} +#else + /* host */ + #define cuda_swab32(x) \ + ( ((x) << 24) | (((x) << 8) & 0x00ff0000u) | \ + (((x) >> 8) & 0x0000ff00u) | ((x) >> 24)) +#endif + + +static __device__ uint32_t _HIWORD(const uint64_t x) +{ +#ifndef NOASM + uint32_t result; + asm( + "{\n\t" + ".reg .u32 xl; \n\t" + "mov.b64 {xl,%0},%1; \n\t" + "}" : "=r"(result) : "l"(x) + ); + return result; +#else + return x >> 32; +#endif +} + +static __device__ uint32_t _LOWORD(const uint64_t x) +{ +#ifndef NOASM + uint32_t result; + asm( + "{\n\t" + ".reg .u32 xh; \n\t" + "mov.b64 {%0,xh},%1; \n\t" + "}" : "=r"(result) : "l"(x) + ); + return result; +#else + return x & 0xffffffff; +#endif +} + +// endian change for 64bit +#if (defined __CUDA_ARCH__ && !defined NOASM) +__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x) +{ + uint64_t result; + uint2 t; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(t.x), "=r"(t.y) : "l"(x)); + t.x=__byte_perm(t.x, 0, 0x0123); + t.y=__byte_perm(t.y, 0, 0x0123); + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(result) : "r"(t.y), "r"(t.x)); + return result; +} +#else + /* host */ + #define cuda_swab64(x) \ + ((uint64_t)((((uint64_t)(x)) >> 56) | \ + (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ + (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ + (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ + 
(((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ + (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ + (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ + (((uint64_t)(x)) << 56))) +#endif + +/*********************************************************************/ +// Macros to catch CUDA errors in CUDA runtime calls + +#define CUDA_SAFE_CALL(call) \ +do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \ + __FUNCTION__, __LINE__, cudaGetErrorString(err) ); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define CUDA_CALL_OR_RET(call) do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + cudaReportHardwareFailure(thr_id, err, __FUNCTION__); \ + return; \ + } \ +} while (0) + +#define CUDA_CALL_OR_RET_X(call, ret) do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + cudaReportHardwareFailure(thr_id, err, __FUNCTION__); \ + return ret; \ + } \ +} while (0) + +/*********************************************************************/ +#if (defined _WIN64 || defined NOASM) +#define USE_XOR_ASM_OPTS 0 +#else +#define USE_XOR_ASM_OPTS 1 +#endif + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor1(const uint64_t a, const uint64_t b) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b)); + return result; +} +#else +#define xor1(a,b) ((a) ^ (b)) +#endif + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor3(const uint64_t a, const uint64_t b, const uint64_t c) +{ + uint64_t result; + asm("xor.b64 %0, %2, %3;\n\t" + "xor.b64 %0, %0, %1;\n\t" + /* output : input registers */ + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} +#else +#define xor3(a,b,c) ((a) ^ (b) ^ (c)) +#endif + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor8(const uint64_t a, const uint64_t b, const uint64_t c, const uint64_t d, const 
uint64_t e, const uint64_t f, const uint64_t g, const uint64_t h) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g) ,"l"(h)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); + return result; +} +#else +#define xor8(a,b,c,d,e,f,g,h) ((a)^(b)^(c)^(d)^(e)^(f)^(g)^(h)) +#endif + +// device asm for x17 +__device__ __forceinline__ +uint64_t xandx(const uint64_t a, const uint64_t b, const uint64_t c) +{ + uint64_t result; +#ifndef NOASM + asm("{\n\t" + ".reg .u64 n;\n\t" + "xor.b64 %0, %2, %3;\n\t" + "and.b64 n, %0, %1;\n\t" + "xor.b64 %0, n, %3;" + "}\n" + : "=l"(result) : "l"(a), "l"(b), "l"(c)); +#else + result = ((((b) ^ (c)) & (a)) ^ (c)); +#endif + return result; +} + +// device asm for x17 +__device__ __forceinline__ +uint64_t andor(uint64_t a, uint64_t b, uint64_t c) +{ + uint64_t result; +#ifndef NOASM + asm("{\n\t" + ".reg .u64 m,n;\n\t" + "and.b64 m, %1, %2;\n\t" + " or.b64 n, %1, %2;\n\t" + "and.b64 %0, n, %3;\n\t" + " or.b64 %0, %0, m ;\n\t" + "}\n" + : "=l"(result) : "l"(a), "l"(b), "l"(c)); +#else + result = (((a) & (b)) | (((a) | (b)) & (c))); +#endif + return result; +} + +// device asm for x17 +__device__ __forceinline__ +uint64_t shr_t64(uint64_t x, uint32_t n) +{ + uint64_t result; +#ifndef NOASM + asm("shr.b64 %0,%1,%2;\n\t" + : "=l"(result) : "l"(x), "r"(n)); +#else + result = x >> n; +#endif + return result; +} + +// device asm for ? 
+__device__ __forceinline__ +uint64_t shl_t64(uint64_t x, uint32_t n) +{ + uint64_t result; +#ifndef NOASM + asm("shl.b64 %0,%1,%2;\n\t" + : "=l"(result) : "l"(x), "r"(n)); +#else + result = x << n; +#endif + return result; +} + +#ifndef USE_ROT_ASM_OPT +#define USE_ROT_ASM_OPT 1 +#endif + +#ifdef NOASM +#undef USE_ROT_ASM_OPT +#endif + +// 64-bit ROTATE RIGHT +#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 +/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t value, const int offset) { + uint2 result; + if(offset < 32) { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2 +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t x, const int offset) +{ + uint64_t result; + asm("{\n\t" + ".reg .b64 lhs;\n\t" + ".reg .u32 roff;\n\t" + "shr.b64 lhs, %1, %2;\n\t" + "sub.u32 roff, 64, %2;\n\t" + "shl.b64 %0, %1, roff;\n\t" + "add.u64 %0, %0, lhs;\n\t" + "}\n" + : "=l"(result) : "l"(x), "r"(offset)); + return result; +} +#else +/* host */ +#if defined _MSC_VER && !defined __CUDA_ARCH__ +#define ROTR64(x, n) _rotr64(x, n) +#else +#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#endif +#endif + +// 64-bit ROTATE 
LEFT +#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 +__device__ __forceinline__ +uint64_t ROTL64(const uint64_t value, const int offset) { + uint2 result; + if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2 +__device__ __forceinline__ +uint64_t ROTL64(const uint64_t x, const int offset) +{ + uint64_t result; + asm("{\n\t" + ".reg .b64 lhs;\n\t" + ".reg .u32 roff;\n\t" + "shl.b64 lhs, %1, %2;\n\t" + "sub.u32 roff, 64, %2;\n\t" + "shr.b64 %0, %1, roff;\n\t" + "add.u64 %0, lhs, %0;\n\t" + "}\n" + : "=l"(result) : "l"(x), "r"(offset)); + return result; +} +#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3 +__device__ +uint64_t ROTL64(const uint64_t x, const int offset) +{ + uint64_t res; + asm("{\n\t" + ".reg .u32 tl,th,vl,vh;\n\t" + ".reg .pred p;\n\t" + "mov.b64 {tl,th}, %1;\n\t" + "shf.l.wrap.b32 vl, tl, th, %2;\n\t" + "shf.l.wrap.b32 vh, th, tl, %2;\n\t" + "setp.lt.u32 p, %2, 32;\n\t" + "@!p mov.b64 %0, {vl,vh};\n\t" + "@p mov.b64 %0, {vh,vl};\n\t" + "}" + : "=l"(res) : "l"(x) , "r"(offset) + ); + return res; +} +#else +/* host */ +#if defined _MSC_VER && !defined __CUDA_ARCH__ +#define ROTL64(x, n) _rotl64(x, n) +#else +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) 
+#endif +#endif + +__device__ __forceinline__ +uint64_t SWAPDWORDS(uint64_t value) +{ +#if __CUDA_ARCH__ >= 320 && !defined NOASM + uint2 temp; + asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value)); + asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x)); + return value; +#else + return ROTL64(value, 32); +#endif +} + +/* lyra2 - int2 operators */ + +__device__ __forceinline__ +void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) +{ +#ifndef NOASM + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(lo), "=r"(hi) : "l"(x)); +#else + lo = x & 0xffffffff; + hi = x >> 32; +#endif +} + +__device__ __forceinline__ uint64_t devectorize(uint2 x) +{ +#ifndef NOASM + uint64_t result; + asm("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(result) : "r"(x.x), "r"(x.y)); + return result; +#else + return x.x + ((uint64_t)x.y << 32); +#endif +} + +__device__ __forceinline__ uint2 vectorize(const uint64_t x)
+{ +#ifndef NOASM + uint2 result; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.x), "=r"(result.y) : "l"(x)); + return result; +#else + return make_uint2(x & 0xffffffff, x >> 32); +#endif +} + +static __device__ __forceinline__ uint2 vectorizelow(uint32_t v) { + uint2 result; + result.x = v; + result.y = 0; + return result; +} +static __device__ __forceinline__ uint2 vectorizehigh(uint32_t v) { + uint2 result; + result.x = 0; + result.y = v; + return result; +} + +static __device__ __forceinline__ uint2 operator^ (uint2 a, uint32_t b) { return make_uint2(a.x^ b, a.y); } +static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); } +static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); } +static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); } +static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); } +static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; } + +static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) +{ +#ifndef NOASM + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +#else + return vectorize(devectorize(a) + devectorize(b)); +#endif +} + +static __device__ __forceinline__ uint2 operator+ (uint2 a, uint32_t b) +{ +#ifndef NOASM + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +#else + return vectorize(devectorize(a) + (uint64_t)b); +#endif +} + + +static __device__ __forceinline__ uint2 operator- (uint2 a, uint32_t b) +{ +#ifndef NOASM + uint2 result; + asm("{\n\t" + "sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b),
 "r"(0)); + return result; +#else + return vectorize(devectorize(a) - (uint64_t)b); +#endif +} + + +static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b) +{ +#ifndef NOASM + uint2 result; + asm("{\n\t" + "sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +#else +return vectorize(devectorize(a) - devectorize(b)); +#endif +} + + + +static __device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __device__ __forceinline__ uint4 operator& (uint4 a, uint4 b) { return make_uint4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w); } +static __device__ __forceinline__ uint4 operator| (uint4 a, uint4 b) { return make_uint4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w); } +static __device__ __forceinline__ uint4 operator~ (uint4 a) { return make_uint4(~a.x, ~a.y, ~a.z, ~a.w); } +static __device__ __forceinline__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; } +static __device__ __forceinline__ uint4 operator^ (uint4 a, uint2 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.x, a.w ^ b.y); } + + +static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; } + +/** + * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b)) + * (what does uint64 "*" operator) + */ +static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b) +{ +#ifndef NOASM + uint2 result; + asm("{\n\t" + "mul.lo.u32 %0,%2,%4; \n\t" + "mul.hi.u32 %1,%2,%4; \n\t" + "mad.lo.cc.u32 %1,%3,%4,%1; \n\t" + "madc.lo.u32 %1,%3,%5,%1; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +#else + return vectorize(devectorize(a)*devectorize(b)); +#endif +} + +// uint2 method +#if __CUDA_ARCH__ >= 320 && !defined NOASM +__device__ __inline__ uint2 ROR2(const uint2 a, const int offset) +{ + uint2 result; + if (offset < 32) { + asm("shf.r.wrap.b32 %0, 
%1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + return result; +} +#else +__device__ __inline__ uint2 ROR2(const uint2 v, const int n) +{ + uint2 result; + if (n <= 32) + { + result.y = ((v.y >> (n)) | (v.x << (32 - n))); + result.x = ((v.x >> (n)) | (v.y << (32 - n))); + } + else + { + result.y = ((v.x >> (n - 32)) | (v.y << (64 - n))); + result.x = ((v.y >> (n - 32)) | (v.x << (64 - n))); + } + return result; +} +#endif + +__device__ __inline__ uint32_t ROL8(const uint32_t x) +{ + return __byte_perm(x, x, 0x2103); +} +__device__ __inline__ uint32_t ROL16(const uint32_t x) +{ + return __byte_perm(x, x, 0x1032); +} +__device__ __inline__ uint32_t ROL24(const uint32_t x) +{ + return __byte_perm(x, x, 0x0321); +} + +__device__ __inline__ uint2 ROR8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x0765); + result.y = __byte_perm(a.y, a.x, 0x4321); + + return result; +} + +__device__ __inline__ uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + + return result; +} + +__device__ __inline__ uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + + return result; +} + +__device__ __inline__ uint2 ROL8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x6543); + result.y = __byte_perm(a.y, a.x, 0x2107); + + return result; +} + +__device__ __inline__ uint2 ROL16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x5432); + result.y = __byte_perm(a.y, a.x, 0x1076); + + return result; +} + +__device__ __inline__ uint2 ROL24(const uint2 a) +{ + uint2 
result; + result.x = __byte_perm(a.y, a.x, 0x4321); + result.y = __byte_perm(a.y, a.x, 0x0765); + + return result; +} + +#if __CUDA_ARCH__ >= 320 && !defined NOASM + + +__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) { + uint2 result; + if (offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + return result; +} +#else +__inline__ __device__ uint2 ROL2(const uint2 v, const int n) +{ + uint2 result; + if (n <= 32) + { + result.y = ((v.y << (n)) | (v.x >> (32 - n))); + result.x = ((v.x << (n)) | (v.y >> (32 - n))); + } + else + { + result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); + result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); + } + return result; +} +#endif + +__device__ __forceinline__ +uint64_t ROTR16(uint64_t x) +{ +#if __CUDA_ARCH__ > 500 && !defined NOASM + short4 temp; + asm("mov.b64 { %0, %1, %2, %3 }, %4; ": "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) : "l"(x)); + asm("mov.b64 %0, {%1, %2, %3 , %4}; ": "=l"(x) : "h"(temp.y), "h"(temp.z), "h"(temp.w), "h"(temp.x)); + return x; +#else + return ROTR64(x, 16); +#endif +} + +__device__ __forceinline__ +uint64_t ROTL16(uint64_t x) +{ +#if __CUDA_ARCH__ > 500 && !defined NOASM + short4 temp; + asm("mov.b64 { %0, %1, %2, %3 }, %4; ": "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) : "l"(x)); + asm("mov.b64 %0, {%1, %2, %3 , %4}; ": "=l"(x) : "h"(temp.w), "h"(temp.x), "h"(temp.y), "h"(temp.z)); + return x; +#else + return ROTL64(x, 16); +#endif +} + +__device__ __forceinline__ +uint2 SWAPINT2(uint2 x) +{ + return(make_uint2(x.y, x.x)); +} + +__device__ __forceinline__ bool cuda_hashisbelowtarget(const uint32_t *const __restrict__ 
hash, const uint32_t *const __restrict__ target) +{ + if (hash[7] > target[7]) + return false; + if (hash[7] < target[7]) + return true; + if (hash[6] > target[6]) + return false; + if (hash[6] < target[6]) + return true; + if (hash[5] > target[5]) + return false; + if (hash[5] < target[5]) + return true; + if (hash[4] > target[4]) + return false; + if (hash[4] < target[4]) + return true; + if (hash[3] > target[3]) + return false; + if (hash[3] < target[3]) + return true; + if (hash[2] > target[2]) + return false; + if (hash[2] < target[2]) + return true; + if (hash[1] > target[1]) + return false; + if (hash[1] < target[1]) + return true; + if (hash[0] > target[0]) + return false; + return true; +} + +__device__ __forceinline__ +uint2 SWAPDWORDS2(uint2 value) +{ + return make_uint2(value.y, value.x); +} + +static __forceinline__ __device__ uint2 SHL2(const uint2 a, int offset) +{ + uint2 result; +#if __CUDA_ARCH__ > 300 && !defined NOASM + if (offset<32) + { + asm("{\n\t" + "shf.l.clamp.b32 %1,%2,%3,%4; \n\t" + "shl.b32 %0,%2,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + else { + asm("{\n\t" + "shf.l.clamp.b32 %1,%2,%3,%4; \n\t" + "shl.b32 %0,%2,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } +#else + if (offset<=32) + { + result.y = (a.y << offset) | (a.x >> (32 - offset)); + result.x = (a.x << offset); + } + else + { + result.y = (a.x << (offset - 32)); + result.x = 0; + } +#endif + return result; +} + +static __forceinline__ __device__ uint2 SHR2(const uint2 a, int offset) +{ + uint2 result; +#if __CUDA_ARCH__ >= 320 && !defined NOASM + if (offset<32) { + asm("{\n\t" + "shf.r.clamp.b32 %0,%2,%3,%4; \n\t" + "shr.b32 %1,%3,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + else { + asm("{\n\t" + "shf.l.clamp.b32 %0,%2,%3,%4; \n\t" + "shl.b32 %1,%3,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), 
"r"(offset)); + } + #else + if (offset<=32) + { + result.x = (a.x >> offset) | (a.y << (32 - offset)); + result.y = (a.y >> offset); + } + else + { + result.x = (a.y >> (offset - 32)); + result.y = 0; + } +#endif + return result; +} + +static __device__ __forceinline__ uint64_t devectorizeswap(uint2 v) { return MAKE_UINT64(cuda_swab32(v.y), cuda_swab32(v.x)); } +static __device__ __forceinline__ uint2 vectorizeswap(uint64_t v) +{ + uint2 result; + LOHI(result.y, result.x, v); + result.x = cuda_swab32(result.x); + result.y = cuda_swab32(result.y); + return result; +} + + +__device__ __forceinline__ uint32_t devectorize16(ushort2 x) +{ + uint32_t result; +#ifndef NOASM + asm("mov.b32 %0,{%1,%2}; \n\t" + : "=r"(result) : "h"(x.x) , "h"(x.y)); +#else + result = x.x + (x.y << 16); +#endif + return result; +} + + +__device__ __forceinline__ ushort2 vectorize16(uint32_t x) +{ + ushort2 result; +#ifndef NOASM + asm("mov.b32 {%0,%1},%2; \n\t" + : "=h"(result.x), "=h"(result.y) : "r"(x)); +#else + result.x = x & 0xffff; + result.y = x >> 16; +#endif + return result; +} + +extern int cuda_arch[MAX_GPUS]; +extern void get_cuda_arch(int *); + +/* +static __device__ __forceinline__ uint4 mul4(uint4 a) +{ + uint4 result; + asm("{\n\t" + "mul.lo.u32 %0,%4,%5; \n\t" + "mul.hi.u32 %1,%4,%5; \n\t" + "mul.lo.u32 %2,%6,%7; \n\t" + "mul.hi.u32 %3,%6,%7; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w)); + return result; +} +static __device__ __forceinline__ uint4 add4(uint4 a, uint4 b) + { + uint4 result; + asm("{\n\t" + "add.cc.u32 %0,%4,%8; \n\t" + "addc.u32 %1,%5,%9; \n\t" + "add.cc.u32 %2,%6,%10; \n\t" + "addc.u32 %3,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; + } + +static __device__ __forceinline__ uint4 madd4(uint4 a, uint4 b) + { + uint4 result; + asm("{\n\t" + 
"mad.lo.cc.u32 %0,%4,%5,%8; \n\t" + "madc.hi.u32 %1,%4,%5,%9; \n\t" + "mad.lo.cc.u32 %2,%6,%7,%10; \n\t" + "madc.hi.u32 %3,%6,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; + } + +static __device__ __forceinline__ ulonglong2 madd4long(ulonglong2 a, ulonglong2 b) + { + ulonglong2 result; + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %2;\n\t" + "mov.b64 {a2,a3}, %3;\n\t" + "mov.b64 {b0,b1}, %4;\n\t" + "mov.b64 {b2,b3}, %5;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + "madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "=l"(result.x), "=l"(result.y) : "l"(a.x), "l"(a.y), "l"(b.x), "l"(b.y)); + return result; + } +*/ +static __device__ __forceinline__ void madd4long2(ulonglong2 &a, ulonglong2 b) + { +#ifndef NOASM + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %0;\n\t" + "mov.b64 {a2,a3}, %1;\n\t" + "mov.b64 {b0,b1}, %2;\n\t" + "mov.b64 {b2,b3}, %3;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + "madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "+l"(a.x), "+l"(a.y) : "l"(b.x), "l"(b.y)); +#else // ?? 
no idea what madd4long is supposed to do + a.x = a.x + b.x; + if(a.x < b.x) + a.y = a.y + b.y + 1; + else + a.y = a.y + b.y; +#endif +} + +__device__ __forceinline__ +uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; +#ifndef NOASM + asm("{ .reg .u32 t1;\n\t" + "xor.b32 t1, %2, %3;\n\t" + "xor.b32 %0, %1, t1;\n\t" + "}" + : "=r"(result) : "r"(a), "r"(b), "r"(c)); +#else + result = a ^ b ^ c; +#endif + return result; +} + +__device__ __forceinline__ +uint32_t shr_t32(uint32_t x, uint32_t n) { + uint32_t result; +#ifndef NOASM + asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); +#else + result = x >> n; +#endif + return result; +} + +__device__ __forceinline__ +uint32_t shl_t32(uint32_t x, uint32_t n) { + uint32_t result; +#ifndef NOASM + asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); +#else + result = x << n; +#endif + return result; +} + +// device asm 32 for pluck +__device__ __forceinline__ +uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; +#ifndef NOASM + asm("{ .reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" + : "=r"(result) : "r"(a), "r"(b), "r"(c)); +#else + result = ((a | b) & c) | (a & b); +#endif + return result; +} + +#endif // #ifndef CUDA_HELPER_H + + diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu index d404afe7b4..6316309fbb 100644 --- a/cuda_myriadgroestl.cu +++ b/cuda_myriadgroestl.cu @@ -2,15 +2,16 @@ #include #include - +#include "miner.h" #include "cuda_helper.h" + // globaler Speicher für alle HeftyHashes aller Threads __constant__ uint32_t pTarget[8]; // Single GPU -uint32_t *d_outputHashes[MAX_GPUS]; +static uint32_t *d_outputHashes[MAX_GPUS]; static uint32_t *d_resultNonce[MAX_GPUS]; -__constant__ uint32_t myriadgroestl_gpu_msg[32]; +__constant__ uint32_t myriadgroestl_gpu_msg[20]; // muss expandiert werden __constant__ uint32_t myr_sha256_gpu_constantTable[64] = { @@ -23,7 
+24,17 @@ __constant__ uint32_t myr_sha256_gpu_constantTable[64] = { 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; -// __constant__ uint32_t myr_sha256_gpu_constantTable2[64]; +__constant__ uint32_t myr_sha256_gpu_constantTable2[64] = { + 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374, + 0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254, 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa, + 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7, 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0, + 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd, 0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16, + 0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537, 0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37, + 0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7, 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890, + 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c, 0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76 +}; + __constant__ uint32_t myr_sha256_gpu_hashTable[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; @@ -41,15 +52,6 @@ __constant__ uint32_t myr_sha256_gpu_w2Table[64] = { #include "groestl_functions_quad.cu" #include "bitslice_transformations_quad.cu" -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) - -#if __CUDA_ARCH__ < 350 - // Kepler (Compute 3.0) - #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) -#else - // Kepler (Compute 3.5) - #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) -#endif #define R(x, n) ((x) >> (n)) #define Ch(x, y, z) ((x & (y ^ z)) ^ z) #define Maj(x, y, z) ((x & (y | z)) | (y & z)) @@ -181,49 +183,56 @@ __device__ void 
myriadgroestl_gpu_sha256(uint32_t *message) for(int k=0;k<8;k++) regs[k] = hash[k]; -// to do: precalculate constants - uint32_t myr_sha256_gpu_constantTable2[64]; -#pragma unroll 64 - for (int i = 0; i < 64; i++) - myr_sha256_gpu_constantTable2[i] = myr_sha256_gpu_constantTable[i] + myr_sha256_gpu_w2Table[i]; - // Progress W1 -#pragma unroll 64 - for(int j=0;j<61;j++) +#pragma unroll + for(int j=0;j<57;j++) { uint32_t T1, T2; T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable2[j]; T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); -#pragma unroll 7 +#pragma unroll for (int k = 6; k >= 0; k--) regs[k + 1] = regs[k]; regs[0] = T1 + T2; regs[4] += T1; } - //// FERTIG + regs[3] += regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable2[57]; + regs[2] += regs[6] + S1(regs[3]) + Ch(regs[3], regs[4], regs[5]) + myr_sha256_gpu_constantTable2[58]; + regs[1] += regs[5] + S1(regs[2]) + Ch(regs[2], regs[3], regs[4]) + myr_sha256_gpu_constantTable2[59]; + regs[0] += regs[4] + S1(regs[1]) + Ch(regs[1], regs[2], regs[3]) + myr_sha256_gpu_constantTable2[60]; - message[7] = cuda_swab32(hash[7] + regs[4]); + message[7] = cuda_swab32(hash[7] + regs[0]); } -__global__ void __launch_bounds__(256, 4) +__global__ void __launch_bounds__(512, 2) myriadgroestl_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer) { // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4; + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; if (thread < threads) { // GROESTL - uint32_t paddedInput[8]; -#pragma unroll 8 - for (int k = 0; k<8; k++) paddedInput[k] = myriadgroestl_gpu_msg[4 * k + (threadIdx.x & 3)]; - - uint32_t nounce = startNounce + thread; - if ((threadIdx.x & 3) == 3) - paddedInput[4] = cuda_swab32(nounce); // 4*4+3 = 19 + uint32_t paddedInput[8]; + paddedInput[0] = myriadgroestl_gpu_msg[4 * 0 + 
(threadIdx.x & 3)]; + paddedInput[1] = myriadgroestl_gpu_msg[4 * 1 + (threadIdx.x & 3)]; + paddedInput[2] = myriadgroestl_gpu_msg[4 * 2 + (threadIdx.x & 3)]; + paddedInput[3] = myriadgroestl_gpu_msg[4 * 3 + (threadIdx.x & 3)]; + paddedInput[4] = myriadgroestl_gpu_msg[4 * 4 + (threadIdx.x & 3)]; + paddedInput[5] = 0; + paddedInput[6] = 0; + paddedInput[7] = 0; + + if((threadIdx.x & 3) == 0) + paddedInput[5] = 0x80; + if((threadIdx.x & 3) == 3) + { + paddedInput[4] = cuda_swab32(startNounce + thread); + paddedInput[7] = 0x01000000; + } uint32_t msgBitsliced[8]; - to_bitslice_quad(paddedInput, msgBitsliced); + myr_to_bitslice_quad(paddedInput, msgBitsliced); uint32_t state[8]; @@ -234,23 +243,25 @@ __global__ void __launch_bounds__(256, 4) if ((threadIdx.x & 0x03) == 0) { - uint32_t *outpHash = &hashBuffer[16 * thread]; -#pragma unroll 16 - for(int k=0;k<16;k++) outpHash[k] = out_state[k]; - } + uint4 *outpHash = (uint4*)&hashBuffer[16 * thread]; + uint4 *phash = (uint4*)out_state; + uint4 *outpt = outpHash; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + outpt[2] = phash[2]; + outpt[3] = phash[3]; + } } } -__global__ void __launch_bounds__(256, 3) - myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer) +__global__ void __launch_bounds__(512, 1) + myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ resNounce, const uint32_t *const __restrict__ hashBuffer) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = startNounce + thread; - uint32_t out_state[16]; - uint32_t *inpHash = &hashBuffer[16 * thread]; + const uint32_t *inpHash = &hashBuffer[16 * thread]; #pragma unroll 16 for (int i=0; i < 16; i++) out_state[i] = inpHash[i]; @@ -259,72 +270,54 @@ __global__ void __launch_bounds__(256, 3) if (out_state[7] <= pTarget[7]) { - uint32_t tmp = 
atomicExch(resNounce, nounce); + uint32_t tmp = atomicExch(resNounce, startNounce + thread); if (tmp != 0xffffffff) resNounce[1] = tmp; } } } +static THREAD cudaStream_t stream[3]; // Setup-Funktionen __host__ void myriadgroestl_cpu_init(int thr_id, uint32_t threads) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); - - // Speicher für Gewinner-Nonce belegen - cudaMalloc(&d_resultNonce[thr_id], 4*sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaStreamCreate(&stream[0])); + CUDA_SAFE_CALL(cudaStreamCreate(&stream[1])); + CUDA_SAFE_CALL(cudaStreamCreate(&stream[2])); + cudaMalloc(&d_resultNonce[thr_id], 4 * sizeof(uint32_t)); // Speicher für temporäreHashes - cudaMalloc(&d_outputHashes[thr_id], 16*sizeof(uint32_t)*threads); + CUDA_SAFE_CALL(cudaMalloc(&d_outputHashes[thr_id], 16 * sizeof(uint32_t)*threads)); } __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn) { - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, 80); + cudaMemcpyToSymbolAsync(myriadgroestl_gpu_msg, data, 80, 0, cudaMemcpyHostToDevice, stream[0]); - // Erweitere die Nachricht auf den Nachrichtenblock (padding) - // Unsere Nachricht hat 80 Byte - msgBlock[20] = 0x80; - msgBlock[31] = 0x01000000; - - // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird - // auf der GPU ausgeführt) - - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( myriadgroestl_gpu_msg, - msgBlock, - 128); - - cudaMemset(d_resultNonce[thr_id], 0xFF, 4*sizeof(uint32_t)); - cudaMemcpyToSymbol( pTarget, - pTargetIn, - sizeof(uint32_t) * 8 ); + cudaMemsetAsync(d_resultNonce[thr_id], 0xFF, 4 * sizeof(uint32_t), stream[1]); + cudaMemcpyToSymbolAsync(pTarget, pTargetIn, sizeof(uint32_t) * 8, 0, 
cudaMemcpyHostToDevice, stream[2]); } __host__ void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *nounce) { - uint32_t threadsperblock = 256; - + const uint32_t threadsperblock = 512; + const uint32_t threadsperblock2 = 512; // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl const int factor=4; - cudaMemset(d_resultNonce[thr_id], 0xFF, 4*sizeof(uint32_t)); // berechne wie viele Thread Blocks wir brauchen dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); dim3 block(threadsperblock); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); myriadgroestl_gpu_hash_quad<<>>(threads, startNounce, d_outputHashes[thr_id]); - dim3 grid2((threads + threadsperblock-1)/threadsperblock); + dim3 grid2((threads + threadsperblock2-1)/threadsperblock2); myriadgroestl_gpu_hash_quad2<<>>(threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]); - cudaMemcpy(nounce, d_resultNonce[thr_id], 4*sizeof(uint32_t), cudaMemcpyDeviceToHost); + CUDA_SAFE_CALL(cudaMemcpy(nounce, d_resultNonce[thr_id], 4*sizeof(uint32_t), cudaMemcpyDeviceToHost)); } diff --git a/cuda_nist5.cu b/cuda_nist5.cu index 437889b648..a4270cde30 100644 --- a/cuda_nist5.cu +++ b/cuda_nist5.cu @@ -11,26 +11,27 @@ extern "C" #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, 
uint32_t startNounce, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads); -extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t *h_found, uint32_t target, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t *h_found, uint32_t target); // Original nist5hash Funktion aus einem miner Quelltext -extern "C" void nist5hash(void *state, const void *input) +void nist5hash(void *state, const void *input) { sph_blake512_context ctx_blake; sph_groestl512_context ctx_groestl; @@ -63,91 +64,125 @@ extern "C" void 
nist5hash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; -static uint32_t *h_found[MAX_GPUS]; - -extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_nist5(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *h_found = nullptr; + static THREAD uint32_t oldthroughput; + const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << 20); // 256*256*16 - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughput = device_intensity(device_map[thr_id], __func__, 1 << 19); // 256*256*16 + throughput = min(throughput, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0Fu; + ptarget[7] = 0x0Fu; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + oldthroughput = throughput; + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughput > 0x7fffffffULL / (16 * sizeof(uint32_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif // Konstanten kopieren, Speicher belegen quark_groestl512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); - CUDA_CALL_OR_RET_X(cudaMallocHost(&(h_found[thr_id]), 2 * sizeof(uint32_t)), 0); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * 
sizeof(uint32_t) * throughput)); + CUDA_SAFE_CALL(cudaMallocHost(&(h_found), 2 * sizeof(uint32_t))); - cuda_check_cpu_init(thr_id, throughput); - init[thr_id] = true; +// cuda_check_cpu_init(thr_id, throughput); + mining_has_stopped[thr_id] = false; + init = true; + } + if(throughput > oldthroughput) + { + oldthroughput = throughput; + CUDA_SAFE_CALL(cudaFree(d_hash)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughput)); } uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - quark_blake512_cpu_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + quark_blake512_cpu_setBlock_80(thr_id, (uint64_t *)endiandata); +// cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; // Hash with CUDA - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_skein512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], h_found[thr_id], ptarget[7], order++); - - if (h_found[thr_id][0] != 0xffffffff) + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_skein512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash, h_found, ptarget[7]); + + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_found[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], 
h_found[thr_id][0]); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_found[0]); nist5hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; - // check if there was some other ones... *hashes_done = pdata[19] - first_nonce + throughput; - if (h_found[thr_id][1] != 0xffffffff) + if (h_found[1] != 0xffffffff) { - pdata[21] = h_found[thr_id][1]; - res++; - if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, h_found[thr_id][1]); + if(opt_verify){ be32enc(&endiandata[19], h_found[1]); + nist5hash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_found[1]); + } + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } + } + } - pdata[19] = h_found[thr_id][0]; + pdata[19] = h_found[0]; if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, h_found[thr_id][0]); + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_found[0]); return res; } else { if (vhash64[7] != Htarg) { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, h_found[thr_id][0]); + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); } } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/cuda_vector.h b/cuda_vector.h new file mode 100644 index 0000000000..0644c902d6 --- /dev/null +++ b/cuda_vector.h @@ -0,0 
+1,1385 @@ +#ifndef CUDA_VECTOR_H +#define CUDA_VECTOR_H + + +/////////////////////////////////////////////////////////////////////////////////// +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#include "cuda_helper.h" + +//typedef __device_builtin__ struct ulong16 ulong16; + + +typedef struct __align__(32) uint8 +{ + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; +} uint8; + +typedef struct __align__(64) uint2_8 +{ + uint2 s0, s1, s2, s3, s4, s5, s6, s7; +} uint2_8; + + +typedef struct __align__(64) ulonglong2to8 +{ +ulonglong2 l0,l1,l2,l3; +} ulonglong2to8; + +typedef struct __align__(128) ulonglong8to16 +{ + ulonglong2to8 lo, hi; +} ulonglong8to16; + +typedef struct __align__(256) ulonglong16to32 +{ + ulonglong8to16 lo, hi; +} ulonglong16to32; + +typedef struct __align__(512) ulonglong32to64 +{ + ulonglong16to32 lo, hi; +} ulonglong32to64; + + + +typedef struct __align__(128) ulonglonglong +{ + ulonglong2 s0,s1,s2,s3,s4,s5,s6,s7; +} ulonglonglong; + + + + +typedef struct __align__(64) uint16 +{ + union { + struct {unsigned int s0, s1, s2, s3, s4, s5, s6, s7;}; + uint8 lo; + }; + union { + struct {unsigned int s8, s9, sa, sb, sc, sd, se, sf;}; + uint8 hi; + }; +} uint16; + +typedef struct __align__(128) uint2_16 +{ + union { + struct { uint2 s0, s1, s2, s3, s4, s5, s6, s7; }; + uint2_8 lo; + }; + union { + struct { uint2 s8, s9, sa, sb, sc, sd, se, sf; }; + uint2_8 hi; + }; +} uint2_16; + + + + +typedef struct __align__(128) uint32 +{ + + uint16 lo,hi; +} uint32; + + + +struct __align__(128) ulong8 +{ + ulonglong4 s0, s1, s2, s3; +}; +typedef __device_builtin__ struct ulong8 ulong8; + +/* +typedef struct __align__(256) ulonglong16 +{ + ulonglong2 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf; +} ulonglong16; +*/ +typedef struct __align__(256) ulonglong16 +{ + ulonglong4 s0, s1, s2, s3, s4, s5, s6, s7; +} ulonglong16; + + + +//typedef struct __align__(32) uint48 +//{ 
+// uint4 s0, s1; +// +//} uint48; + +typedef struct __align__(16) uint28 +{ + uint2 x, y, z, w; + +} uint28; + +/* +typedef struct __builtin_align__(32) uint48 +{ + union { + uint4 s0; + struct { uint2 x, y;}; + }; + union { + uint4 s1; + struct { uint2 z, w; }; + + }; +} uint48; +*/ + +typedef struct __builtin_align__(32) uint48 +{ + uint4 s0,s1; +} uint48; + +typedef struct __align__(64) uint816 +{ + uint48 s0, s1; + +} uint816; + +typedef struct __align__(128) uint1632 +{ + uint816 s0, s1; + +} uint1632; + +typedef struct __align__(256) uintx64 +{ + uint1632 s0, s1; + +} uintx64; + +typedef struct __builtin_align__(256) uintx64bis +{ + uint28 s0, s1, s2, s3, s4, s5, s6, s7; + +} uintx64bis; + + + +typedef struct __align__(512) uintx128 +{ + uintx64 s0, s1; + +} uintx128; + +typedef struct __align__(1024) uintx256 +{ + uintx128 s0, s1; + +} uintx256; + + + +typedef struct __align__(256) uint4x16 +{ + uint4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; +} uint4x16; + +static __inline__ __device__ ulonglong2to8 make_ulonglong2to8(ulonglong2 s0, ulonglong2 s1, ulonglong2 s2, ulonglong2 s3) +{ +ulonglong2to8 t; t.l0=s0; t.l1=s1; t.l2=s2; t.l3=s3; +return t; +} + +static __inline__ __device__ ulonglong8to16 make_ulonglong8to16(const ulonglong2to8 &s0, const ulonglong2to8 &s1) +{ + ulonglong8to16 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong16to32 make_ulonglong16to32(const ulonglong8to16 &s0, const ulonglong8to16 &s1) +{ + ulonglong16to32 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong32to64 make_ulonglong32to64(const ulonglong16to32 &s0, const ulonglong16to32 &s1) +{ + ulonglong32to64 t; t.lo = s0; t.hi = s1; + return t; +} + + +static __inline__ __host__ __device__ ulonglonglong make_ulonglonglong( + const ulonglong2 &s0, const ulonglong2 &s1, const ulonglong2 &s2, const ulonglong2 &s3, + const ulonglong2 &s4, const ulonglong2 &s5) +{ + ulonglonglong t; t.s0 = s0; t.s1 = 
s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; + return t; +} + + +static __inline__ __device__ uint48 make_uint48(uint4 s0, uint4 s1) +{ + uint48 t; t.s0 = s0; t.s1 = s1; + return t; +} +/* +static __inline__ __device__ uint48 make_uint48(uint2 s0, uint2 s1, uint2 s2, uint2 s3) +{ + uint48 t; t.x = s0; t.y = s1; t.z = s2; t.w = s3; + return t; +} + +static __inline__ __device__ uint48 make_uint48(const uint28 &s0) +{ + uint48 t; t.x = s0.x; t.y = s0.y; t.z = s0.z; t.w = s0.w; + return t; +} +*/ +static __inline__ __device__ uint28 make_uint28(uint2 s0, uint2 s1, uint2 s2, uint2 s3) +{ + uint28 t; t.x = s0; t.y = s1; t.z = s2; t.w = s3; + return t; +} + + +static __inline__ __device__ uint816 make_uint816(const uint48 &s0, const uint48 &s1) +{ + uint816 t; t.s0 = s0; t.s1 = s1; + return t; +} + + + + + +static __inline__ __device__ uint1632 make_uint1632(const uint816 &s0, const uint816 &s1) +{ + uint1632 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx64 make_uintx64(const uint1632 &s0, const uint1632 &s1) +{ + uintx64 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx64bis make_uintx64bis( + const uint28 &s0, const uint28 &s1, const uint28 &s2, const uint28 &s3, + const uint28 &s4, const uint28 &s5, const uint28 &s6, const uint28 &s7 +) +{ + uintx64bis t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __inline__ __device__ uintx128 make_uintx128(const uintx64 &s0, const uintx64 &s1) +{ + uintx128 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx256 make_uintx256(const uintx128 &s0, const uintx128 &s1) +{ + uintx256 t; t.s0 = s0; t.s1 = s1; + return t; +} + + +static __inline__ __device__ uintx256 make_uintx64(const uintx128 &s0, const uintx128 &s1) +{ + uintx256 t; t.s0 = s0; t.s1 = s1; + return t; +} + + +static __inline__ __host__ __device__ uint4x16 make_uint4x16( + uint4 s0, uint4 s1, uint4 s2, uint4 s3, 
uint4 s4, uint4 s5, uint4 s6, uint4 s7, + uint4 s8, uint4 s9, uint4 sa, uint4 sb, uint4 sc, uint4 sd, uint4 se, uint4 sf) +{ + uint4x16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.s10 = sa; t.s11 = sb; t.s12 = sc; t.s13 = sd; t.s14 = se; t.s15 = sf; + return t; +} + + +static __inline__ __device__ uint2_16 make_uint2_16( + uint2 s0, uint2 s1, uint2 s2, uint2 s3, uint2 s4, uint2 s5, uint2 s6, uint2 s7, + uint2 s8, uint2 s9, uint2 sa, uint2 sb, uint2 sc, uint2 sd, uint2 se, uint2 sf) +{ + uint2_16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + + +static __inline__ __host__ __device__ uint16 make_uint16( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7, + unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf) +{ + uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b) +{ + uint16 t; t.lo=a; t.hi=b; return t; +} + +static __inline__ __host__ __device__ uint32 make_uint32(const uint16 &a, const uint16 &b) +{ + uint32 t; t.lo = a; t.hi = b; return t; +} + + +static __inline__ __host__ __device__ uint8 make_uint8( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7) +{ + uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ uint2_8 make_uint2_8( + uint2 s0, uint2 s1, 
uint2 s2, uint2 s3, uint2 s4, uint2 s5, uint2 s6, uint2 s7) +{ + uint2_8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __inline__ __host__ __device__ ulonglong16 make_ulonglong16(const ulonglong4 &s0, const ulonglong4 &s1, + const ulonglong4 &s2, const ulonglong4 &s3, const ulonglong4 &s4, const ulonglong4 &s5, const ulonglong4 &s6, const ulonglong4 &s7) +{ + ulonglong16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + + + +static __inline__ __host__ __device__ ulong8 make_ulong8( + ulonglong4 s0, ulonglong4 s1, ulonglong4 s2, ulonglong4 s3) +{ + ulong8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3;// t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); } +static __forceinline__ __device__ ulonglong2 operator+ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x + b.x, a.y + b.y); } + +static __forceinline__ __device__ ulong8 operator^ (const ulong8 &a, const ulong8 &b) { + return make_ulong8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3); +} //, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ ulong8 operator+ (const ulong8 &a, const ulong8 &b) { + return make_ulong8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3); +} //, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + + +static __forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, 
a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + +static __forceinline__ __device__ uint2_8 operator^ (const uint2_8 &a, const uint2_8 &b) { return make_uint2_8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ uint2_8 operator+ (const uint2_8 &a, const uint2_8 &b) { return make_uint2_8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + + +////////////// mess++ ////// + +static __forceinline__ __device__ uint48 operator^ (const uint48 &a, const uint48 &b) { + return make_uint48(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uint28 operator^ (const uint28 &a, const uint28 &b) { + return make_uint28(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +static __forceinline__ __device__ uint28 operator+ (const uint28 &a, const uint28 &b) { + return make_uint28(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +static __forceinline__ __device__ uint48 operator+ (const uint48 &a, const uint48 &b) { + return make_uint48(a.s0 + b.s0, a.s1 + b.s1); +} +/* +static __forceinline__ __device__ uint48 operator+ (const uint48 &a, const uint48 &b) { + return make_uint48(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +*/ + +static __forceinline__ __device__ uint816 operator^ (const uint816 &a, const uint816 &b) { + return make_uint816(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uint816 operator+ (const uint816 &a, const uint816 &b) { + return make_uint816(a.s0 + b.s0, a.s1 + b.s1); +} + + +static __forceinline__ __device__ uint1632 operator^ (const uint1632 &a, const uint1632 &b) { + return make_uint1632(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + + +static 
__forceinline__ __device__ uintx64 operator^ (const uintx64 &a, const uintx64 &b) { + return make_uintx64(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uintx128 operator^ (const uintx128 &a, const uintx128 &b) { + return make_uintx128(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uintx256 operator^ (const uintx256 &a, const uintx256 &b) { + return make_uintx256(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +///////////////////////// + +static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uint2_16 operator^ (const uint2_16 &a, const uint2_16 &b) { + return make_uint2_16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ uint2_16 operator+ (const uint2_16 &a, const uint2_16 &b) { + return make_uint2_16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uintx64bis operator^ (const uintx64bis &a, const uintx64bis &b) { + return make_uintx64bis(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, 
a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ uintx64bis operator+ (const uintx64bis &a, const uintx64bis &b) { + return make_uintx64bis(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + + +static __forceinline__ __device__ uint32 operator^ (const uint32 &a, const uint32 &b) { + return make_uint32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ uint32 operator+ (const uint32 &a, const uint32 &b) { + return make_uint32(a.lo + b.lo, a.hi + b.hi); +} + + +static __forceinline__ __device__ ulonglong16 operator^ (const ulonglong16 &a, const ulonglong16 &b) { + return make_ulonglong16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ ulonglong16 operator+ (const ulonglong16 &a, const ulonglong16 &b) { + return make_ulonglong16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + +static __forceinline__ __device__ void operator^= (ulong8 &a, const ulong8 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (uintx64 &a, const uintx64 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (uintx64bis &a, const uintx64bis &b) { a = a ^ b; } + + +static __forceinline__ __device__ void operator^= (uintx128 &a, const uintx128 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (uintx256 &a, const uintx256 &b) { a = a ^ b; } + + +static __forceinline__ __device__ void operator^= (uint816 &a, const uint816 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uint816 &a, const uint816 &b) { a = a + b; } + + +static __forceinline__ __device__ void operator^= (uint48 &a, const uint48 &b) { a = a ^ b; } + +//static __forceinline__ __device__ void operator+= (uint48 &a, const uint48 &b) { a = a + b; } + + +static __forceinline__ __device__ void 
operator^= (uint28 &a, const uint28 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uint28 &a, const uint28 &b) { a = a + b; } + +static __forceinline__ __device__ void operator^= (uint2_8 &a, const uint2_8 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uint2_8 &a, const uint2_8 &b) { a = a + b; } + + + +static __forceinline__ __device__ void operator^= (uint32 &a, const uint32 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uint32 &a, const uint32 &b) { a = a + b; } + + +//static __forceinline__ __device__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; } +static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) { a = a ^ b; } +static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (ulonglong16 &a, const ulonglong16 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (ulonglong4 &a, const ulonglong4 &b) { a = a + b; } + +static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (ulonglong2 &a, const ulonglong2 &b) { a = a + b; } + +static __forceinline__ __device__ +ulonglong2to8 operator^ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 ^ b.l0, a.l1 ^ b.l1, a.l2 ^ b.l2, a.l3 ^ b.l3); +} +static __forceinline__ __device__ +ulonglong2to8 operator+ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 + b.l0, a.l1 + b.l1, a.l2 + b.l2, a.l3 + b.l3); +} + + +static __forceinline__ __device__ +ulonglong8to16 operator^ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong8to16 
operator+ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator^ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator+ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator^ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator+ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo + b.lo, a.hi + b.hi); +} + + +static __forceinline__ __device__ ulonglonglong operator^ (const ulonglonglong &a, const ulonglonglong &b) { + return make_ulonglonglong(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5); +} + +static __forceinline__ __device__ ulonglonglong operator+ (const ulonglonglong &a, const ulonglonglong &b) { + return make_ulonglonglong(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5); +} + + +static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulonglong2to8 &b) { a = a ^ b; } + + +static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; } +static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (uint2_16 &a, const uint2_16 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (uint2_16 &a, const uint2_16 &b) { a = a + b; } + +static 
__forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a ^ b; } + + +static __forceinline__ __device__ void operator+= (ulonglonglong &a, const ulonglonglong &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglonglong &a, const ulonglonglong &b) { a = a ^ b; } + + +#define rotate ROTL32 +#define rotateR ROTR32 + +#if __CUDA_ARCH__ >= 320 + +static __forceinline__ __device__ uint4 rotate4(uint4 vec4, uint32_t shift) +{ + uint4 ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint4 rotate4R(uint4 vec4, uint32_t shift) +{ + uint4 ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : 
"r"(vec4.z), "r"(vec4.z), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift)); + return ret; +} + +static __device__ __inline__ uint8 __ldg8(const uint8_t *ptr) +{ + uint8 test; + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr)); + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), "=r"(test.s7) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ uint32_t __ldgtoint(const uint8_t *ptr) +{ + uint32_t test; + asm volatile ("ld.global.nc.u32 {%0},[%1];" : "=r"(test) : __LDG_PTR(ptr)); + return (test); +} + +static __device__ __inline__ uint32_t __ldgtoint64(const uint8_t *ptr) +{ + uint64_t test; + asm volatile ("ld.global.nc.u64 {%0},[%1];" : "=l"(test) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ uint32_t __ldgtoint_unaligned(const uint8_t *ptr) +{ + uint32_t test; + asm volatile ("{\n\t" + ".reg .u8 a,b,c,d; \n\t" + "ld.global.nc.u8 a,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "mov.b32 %0,{a,b,c,d}; }\n\t" + : "=r"(test) : __LDG_PTR(ptr)); + return (test); +} + +static __device__ __inline__ uint64_t __ldgtoint64_unaligned(const uint8_t *ptr) +{ + uint64_t test; + asm volatile ("{\n\t" + ".reg .u8 a,b,c,d,e,f,g,h; \n\t" + ".reg .u32 i,j; \n\t" + "ld.global.nc.u8 a,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "ld.global.nc.u8 e,[%1+4]; \n\t" + "ld.global.nc.u8 f,[%1+5]; \n\t" + "ld.global.nc.u8 g,[%1+6]; \n\t" + "ld.global.nc.u8 h,[%1+7]; \n\t" + "mov.b32 i,{a,b,c,d}; \n\t" + "mov.b32 j,{e,f,g,h}; \n\t" + "mov.b64 %0,{i,j}; }\n\t" + : "=l"(test) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ uint64_t __ldgtoint64_trunc(const uint8_t *ptr) +{ + uint32_t 
zero = 0; + uint64_t test; + asm volatile ("{\n\t" + ".reg .u8 a,b,c,d; \n\t" + ".reg .u32 i; \n\t" + "ld.global.nc.u8 a,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "mov.b32 i,{a,b,c,d}; \n\t" + "mov.b64 %0,{i,%1}; }\n\t" + : "=l"(test) : __LDG_PTR(ptr), "r"(zero)); + return (test); +} + + + +static __device__ __inline__ uint32_t __ldgtoint_unaligned2(const uint8_t *ptr) +{ + uint32_t test; + asm("{\n\t" + ".reg .u8 e,b,c,d; \n\t" + "ld.global.nc.u8 e,[%1]; \n\t" + "ld.global.nc.u8 b,[%1+1]; \n\t" + "ld.global.nc.u8 c,[%1+2]; \n\t" + "ld.global.nc.u8 d,[%1+3]; \n\t" + "mov.b32 %0,{e,b,c,d}; }\n\t" + : "=r"(test) : __LDG_PTR(ptr)); + return (test); +} + +#endif + +static __forceinline__ __device__ void shift256R2(uint32_t * ret, const uint8 &vec4, const uint32_t shift) +{ + uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[8] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s6); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[7] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s5); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[6] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s4); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[5] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s3); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[4] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s2); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[3] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s1); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[2] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s0); + 
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[1] = cuda_swab32(truc); + asm("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); + ret[0] = cuda_swab32(truc); + +} + +#define shift256R3(ret,vec4, shift) \ +{ \ + \ +uint32_t truc=0,truc2=cuda_swab32(vec4.s7),truc3=0; \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[8] = cuda_swab32(truc); \ +truc2=cuda_swab32(vec4.s6);truc3=cuda_swab32(vec4.s7); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[7] = cuda_swab32(truc); \ +truc2=cuda_swab32(vec4.s5);truc3=cuda_swab32(vec4.s6); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[6] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s4); truc3 = cuda_swab32(vec4.s5); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[5] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s3); truc3 = cuda_swab32(vec4.s4); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[4] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s2); truc3 = cuda_swab32(vec4.s3); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[3] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s1); truc3 = cuda_swab32(vec4.s2); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[2] = cuda_swab32(truc); \ +truc2 = cuda_swab32(vec4.s0); truc3 = cuda_swab32(vec4.s1); \ + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); \ + ret[1] = cuda_swab32(truc); \ +truc3 = cuda_swab32(vec4.s0); \ + asm volatile ("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); \ + ret[0] = cuda_swab32(truc); \ + \ + \ 
+} + +#if __CUDA_ARCH__ >= 320 && !defined NOASM +static __device__ __inline__ uint32 __ldg32b(const uint32 *ptr) +{ + uint32 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.lo.s0), "=r"(ret.lo.s1), "=r"(ret.lo.s2), "=r"(ret.lo.s3) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.lo.s4), "=r"(ret.lo.s5), "=r"(ret.lo.s6), "=r"(ret.lo.s7) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.lo.s8), "=r"(ret.lo.s9), "=r"(ret.lo.sa), "=r"(ret.lo.sb) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.lo.sc), "=r"(ret.lo.sd), "=r"(ret.lo.se), "=r"(ret.lo.sf) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.hi.s0), "=r"(ret.hi.s1), "=r"(ret.hi.s2), "=r"(ret.hi.s3) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.hi.s4), "=r"(ret.hi.s5), "=r"(ret.hi.s6), "=r"(ret.hi.s7) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.hi.s8), "=r"(ret.hi.s9), "=r"(ret.hi.sa), "=r"(ret.hi.sb) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.hi.sc), "=r"(ret.hi.sd), "=r"(ret.hi.se), "=r"(ret.hi.sf) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uint16 __ldg16b(const uint16 *ptr) +{ + uint16 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0), "=r"(ret.s1), "=r"(ret.s2), "=r"(ret.s3) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s4), "=r"(ret.s5), "=r"(ret.s6), "=r"(ret.s7) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s8), "=r"(ret.s9), "=r"(ret.sa), "=r"(ret.sb) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.sc), "=r"(ret.sd), "=r"(ret.se), "=r"(ret.sf) : __LDG_PTR(ptr)); + return ret; +} + + +static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr) +{ + uintx64 ret; + asm("ld.global.nc.v4.u32 
{%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s1.s0.s0.x), "=r"(ret.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s1.z), 
"=r"(ret.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uintx64 __ldg32c(const uintx64 *ptr) +{ + uintx64 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s1.s0.s0.x), "=r"(ret.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s0.z), 
"=r"(ret.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s1.z), "=r"(ret.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + + return ret; +} + +static __device__ __inline__ uintx128 __ldg128(const uintx128 *ptr) +{ + uintx128 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s0.s1.y), 
"=r"(ret.s0.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s0.s1.s0.s0.x), "=r"(ret.s0.s0.s1.s0.s0.y), "=r"(ret.s0.s0.s1.s0.s0.z), "=r"(ret.s0.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s0.s1.s0.s1.x), "=r"(ret.s0.s0.s1.s0.s1.y), "=r"(ret.s0.s0.s1.s0.s1.z), "=r"(ret.s0.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s0.s1.s1.s0.x), "=r"(ret.s0.s0.s1.s1.s0.y), "=r"(ret.s0.s0.s1.s1.s0.z), "=r"(ret.s0.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s0.s1.s1.s1.x), "=r"(ret.s0.s0.s1.s1.s1.y), "=r"(ret.s0.s0.s1.s1.s1.z), "=r"(ret.s0.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s0.s1.s0.s0.s0.x), "=r"(ret.s0.s1.s0.s0.s0.y), "=r"(ret.s0.s1.s0.s0.s0.z), "=r"(ret.s0.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s0.s1.s0.s0.s1.x), "=r"(ret.s0.s1.s0.s0.s1.y), "=r"(ret.s0.s1.s0.s0.s1.z), "=r"(ret.s0.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s0.s1.s0.s1.s0.x), "=r"(ret.s0.s1.s0.s1.s0.y), "=r"(ret.s0.s1.s0.s1.s0.z), "=r"(ret.s0.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s0.s1.s0.s1.s1.x), "=r"(ret.s0.s1.s0.s1.s1.y), "=r"(ret.s0.s1.s0.s1.s1.z), "=r"(ret.s0.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : 
"=r"(ret.s0.s1.s1.s0.s0.x), "=r"(ret.s0.s1.s1.s0.s0.y), "=r"(ret.s0.s1.s1.s0.s0.z), "=r"(ret.s0.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s0.s1.s1.s0.s1.x), "=r"(ret.s0.s1.s1.s0.s1.y), "=r"(ret.s0.s1.s1.s0.s1.z), "=r"(ret.s0.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s0.s1.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s0.s1.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+256];" : "=r"(ret.s1.s0.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+272];" : "=r"(ret.s1.s0.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+288];" : "=r"(ret.s1.s0.s0.s1.s0.x), "=r"(ret.s1.s0.s0.s1.s0.y), "=r"(ret.s1.s0.s0.s1.s0.z), "=r"(ret.s1.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+304];" : "=r"(ret.s1.s0.s0.s1.s1.x), "=r"(ret.s1.s0.s0.s1.s1.y), "=r"(ret.s1.s0.s0.s1.s1.z), "=r"(ret.s1.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+320];" : "=r"(ret.s1.s0.s1.s0.s0.x), "=r"(ret.s1.s0.s1.s0.s0.y), "=r"(ret.s1.s0.s1.s0.s0.z), "=r"(ret.s1.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+336];" : "=r"(ret.s1.s0.s1.s0.s1.x), "=r"(ret.s1.s0.s1.s0.s1.y), "=r"(ret.s1.s0.s1.s0.s1.z), "=r"(ret.s1.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+352];" : "=r"(ret.s1.s0.s1.s1.s0.x), "=r"(ret.s1.s0.s1.s1.s0.y), "=r"(ret.s1.s0.s1.s1.s0.z), "=r"(ret.s1.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + 
asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+368];" : "=r"(ret.s1.s0.s1.s1.s1.x), "=r"(ret.s1.s0.s1.s1.s1.y), "=r"(ret.s1.s0.s1.s1.s1.z), "=r"(ret.s1.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+384];" : "=r"(ret.s1.s1.s0.s0.s0.x), "=r"(ret.s1.s1.s0.s0.s0.y), "=r"(ret.s1.s1.s0.s0.s0.z), "=r"(ret.s1.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+400];" : "=r"(ret.s1.s1.s0.s0.s1.x), "=r"(ret.s1.s1.s0.s0.s1.y), "=r"(ret.s1.s1.s0.s0.s1.z), "=r"(ret.s1.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+416];" : "=r"(ret.s1.s1.s0.s1.s0.x), "=r"(ret.s1.s1.s0.s1.s0.y), "=r"(ret.s1.s1.s0.s1.s0.z), "=r"(ret.s1.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+432];" : "=r"(ret.s1.s1.s0.s1.s1.x), "=r"(ret.s1.s1.s0.s1.s1.y), "=r"(ret.s1.s1.s0.s1.s1.z), "=r"(ret.s1.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+448];" : "=r"(ret.s1.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+464];" : "=r"(ret.s1.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+480];" : "=r"(ret.s1.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+496];" : "=r"(ret.s1.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + + return ret; +} + +static __device__ __inline__ ulonglong2 __ldg2(const ulonglong2 *ptr) +{ + ulonglong2 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR(ptr)); +return ret; +} + +static __device__ __inline__ ulonglong4 __ldg4(const ulonglong4 *ptr) +{ + ulonglong4 ret; + 
asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.z), "=l"(ret.w) : __LDG_PTR(ptr)); + return ret; +} +static __device__ __inline__ void ldg4(const ulonglong4 *ptr,ulonglong4 *ret) +{ + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret[0].x), "=l"(ret[0].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret[0].z), "=l"(ret[0].w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret[1].x), "=l"(ret[1].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret[1].z), "=l"(ret[1].w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret[2].x), "=l"(ret[2].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret[2].z), "=l"(ret[2].w) : __LDG_PTR(ptr)); +} +static __device__ __inline__ void ldg4xor(const ulonglong4 *ptr, ulonglong4 *ret, ulonglong4 *state) +{ + + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret[0].x), "=l"(ret[0].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret[0].z), "=l"(ret[0].w) : __LDG_PTR(ptr)); + state[0] ^= ret[0]; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret[1].x), "=l"(ret[1].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret[1].z), "=l"(ret[1].w) : __LDG_PTR(ptr)); + state[1] ^= ret[1]; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret[2].x), "=l"(ret[2].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret[2].z), "=l"(ret[2].w) : __LDG_PTR(ptr)); + state[2] ^= ret[2]; +} + + +static __device__ __inline__ uint28 __ldg4(const uint28 *ptr) +{ + uint28 ret; +asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x.x), "=r"(ret.x.y), "=r"(ret.y.x), "=r"(ret.y.y) : __LDG_PTR(ptr)); +asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.z.x), "=r"(ret.z.y), "=r"(ret.w.x), "=r"(ret.w.y) : __LDG_PTR(ptr)); + + return ret; +} + +static 
__device__ __inline__ uint48 __ldg4(const uint48 *ptr) +{ + uint48 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.x), "=r"(ret.s0.y), "=r"(ret.s0.z), "=r"(ret.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s1.x), "=r"(ret.s1.y), "=r"(ret.s1.z), "=r"(ret.s1.w) : __LDG_PTR(ptr)); + return ret; +} + + +static __device__ __inline__ void ldg4(const uint28 *ptr, uint28 *ret) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); +} +static __device__ __inline__ void ldg4xor(const uint28 *ptr, uint28 *ret,uint28* state) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + state[0].x ^= ret[0].x; state[0].y ^= ret[0].y; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + state[0].z ^= ret[0].z; state[0].w ^= ret[0].w; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + state[1].x ^= ret[1].x; state[1].y ^= ret[1].y; + asm("ld.global.nc.v4.u32 
{%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + state[1].z ^= ret[1].z; state[1].w ^= ret[1].w; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + state[2].x ^= ret[2].x; state[2].y ^= ret[2].y; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); + state[2].z ^= ret[2].z; state[2].w ^= ret[2].w; + + +} + + +static __device__ __inline__ ulonglong2to8 __ldg2to8(const ulonglong2to8 *ptr) +{ + ulonglong2to8 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.l0.x), "=l"(ret.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.l1.x), "=l"(ret.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.l2.x), "=l"(ret.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.l3.x), "=l"(ret.l3.y) : __LDG_PTR(ptr)); + return ret; +} +static __device__ __inline__ ulonglong8to16 __ldg8to16(const ulonglong8to16 *ptr) +{ + ulonglong8to16 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.lo.l0.x), "=l"(ret.lo.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.lo.l1.x), "=l"(ret.lo.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.lo.l2.x), "=l"(ret.lo.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.lo.l3.x), "=l"(ret.lo.l3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.hi.l0.x), "=l"(ret.hi.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.hi.l1.x), "=l"(ret.hi.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.hi.l2.x), "=l"(ret.hi.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.hi.l3.x), "=l"(ret.hi.l3.y) : 
__LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ ulonglonglong __ldgxtralong(const ulonglonglong *ptr) +{ + ulonglonglong ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr)); + return ret; +} +static __device__ __inline__ uint8 ldg8bis(const uint8 *ptr) +{ + uint8 test; + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr)); + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), "=r"(test.s7) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ ulonglong16 __ldg32(const ulonglong4 *ptr) +{ + ulonglong16 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s0.z), "=l"(ret.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s1.z), "=l"(ret.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s2.z), "=l"(ret.s2.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.s3.z), "=l"(ret.s3.w) : 
__LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+128];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+144];" : "=l"(ret.s4.z), "=l"(ret.s4.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+160];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+176];" : "=l"(ret.s5.z), "=l"(ret.s5.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+192];" : "=l"(ret.s6.x), "=l"(ret.s6.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+208];" : "=l"(ret.s6.z), "=l"(ret.s6.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+224];" : "=l"(ret.s7.x), "=l"(ret.s7.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+240];" : "=l"(ret.s7.z), "=l"(ret.s7.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uintx64bis __ldg32(const uint28 *ptr) +{ + uintx64bis ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.x.x), "=r"(ret.s0.x.y), "=r"(ret.s0.y.x), "=r"(ret.s0.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.z.x), "=r"(ret.s0.z.y), "=r"(ret.s0.w.x), "=r"(ret.s0.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s1.x.x), "=r"(ret.s1.x.y), "=r"(ret.s1.y.x), "=r"(ret.s1.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s1.z.x), "=r"(ret.s1.z.y), "=r"(ret.s1.w.x), "=r"(ret.s1.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s2.x.x), "=r"(ret.s2.x.y), "=r"(ret.s2.y.x), "=r"(ret.s2.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s2.z.x), "=r"(ret.s2.z.y), "=r"(ret.s2.w.x), "=r"(ret.s2.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s3.x.x), "=r"(ret.s3.x.y), "=r"(ret.s3.y.x), "=r"(ret.s3.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s3.z.x), 
"=r"(ret.s3.z.y), "=r"(ret.s3.w.x), "=r"(ret.s3.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s4.x.x), "=r"(ret.s4.x.y), "=r"(ret.s4.y.x), "=r"(ret.s4.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s4.z.x), "=r"(ret.s4.z.y), "=r"(ret.s4.w.x), "=r"(ret.s4.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s5.x.x), "=r"(ret.s5.x.y), "=r"(ret.s5.y.x), "=r"(ret.s5.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s5.z.x), "=r"(ret.s5.z.y), "=r"(ret.s5.w.x), "=r"(ret.s5.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s6.x.x), "=r"(ret.s6.x.y), "=r"(ret.s6.y.x), "=r"(ret.s6.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s6.z.x), "=r"(ret.s6.z.y), "=r"(ret.s6.w.x), "=r"(ret.s6.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s7.x.x), "=r"(ret.s7.x.y), "=r"(ret.s7.y.x), "=r"(ret.s7.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s7.z.x), "=r"(ret.s7.z.y), "=r"(ret.s7.w.x), "=r"(ret.s7.w.y) : __LDG_PTR(ptr)); + return ret; +} +#else //not implemented yet +static __device__ __inline__ uint32 __ldg32b(const uint32 *ptr) +{ + return *ptr; +} + +static __device__ __inline__ uint16 __ldg16b(const uint16 *ptr) +{ + return *ptr; +} + + +static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr) +{ + return *((uintx64*)ptr); +} + +static __device__ __inline__ uintx64 __ldg32c(const uintx64 *ptr) +{ + return *ptr; +} + +static __device__ __inline__ uintx128 __ldg128(const uintx128 *ptr) +{ + return *ptr; +} + +static __device__ __inline__ ulonglong2 __ldg2(const ulonglong2 *ptr) +{ + return *ptr; +} + +static __device__ __inline__ ulonglong4 __ldg4(const ulonglong4 *ptr) +{ + return *ptr; +} +static __device__ __inline__ void ldg4(const ulonglong4 *ptr, ulonglong4 
*ret) +{ + *ret = *ptr; +} +static __device__ __inline__ void ldg4xor(const ulonglong4 *ptr, ulonglong4 *ret, ulonglong4 *state) +{ + ret[0] = ptr[0]; + ret[1] = ptr[1]; + ret[2] = ptr[2]; + state[0] ^= ret[0]; + state[1] ^= ret[1]; + state[2] ^= ret[2]; +} + + +static __device__ __inline__ uint28 __ldg4(const uint28 *ptr) +{ + uint28 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x.x), "=r"(ret.x.y), "=r"(ret.y.x), "=r"(ret.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.z.x), "=r"(ret.z.y), "=r"(ret.w.x), "=r"(ret.w.y) : __LDG_PTR(ptr)); + + return ret; +} + +static __device__ __inline__ uint48 __ldg4(const uint48 *ptr) +{ + uint48 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.x), "=r"(ret.s0.y), "=r"(ret.s0.z), "=r"(ret.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s1.x), "=r"(ret.s1.y), "=r"(ret.s1.z), "=r"(ret.s1.w) : __LDG_PTR(ptr)); + return ret; +} + + +static __device__ __inline__ void ldg4(const uint28 *ptr, uint28 *ret) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); +} +static __device__ __inline__ void ldg4xor(const uint28 *ptr, 
uint28 *ret, uint28* state) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + state[0].x ^= ret[0].x; state[0].y ^= ret[0].y; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + state[0].z ^= ret[0].z; state[0].w ^= ret[0].w; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + state[1].x ^= ret[1].x; state[1].y ^= ret[1].y; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + state[1].z ^= ret[1].z; state[1].w ^= ret[1].w; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + state[2].x ^= ret[2].x; state[2].y ^= ret[2].y; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); + state[2].z ^= ret[2].z; state[2].w ^= ret[2].w; + + +} + + +static __device__ __inline__ ulonglong2to8 __ldg2to8(const ulonglong2to8 *ptr) +{ + ulonglong2to8 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.l0.x), "=l"(ret.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.l1.x), "=l"(ret.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.l2.x), "=l"(ret.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.l3.x), "=l"(ret.l3.y) : __LDG_PTR(ptr)); + return ret; +} +static __device__ __inline__ ulonglong8to16 __ldg8to16(const ulonglong8to16 *ptr) +{ + ulonglong8to16 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.lo.l0.x), "=l"(ret.lo.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.lo.l1.x), 
"=l"(ret.lo.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.lo.l2.x), "=l"(ret.lo.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.lo.l3.x), "=l"(ret.lo.l3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.hi.l0.x), "=l"(ret.hi.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.hi.l1.x), "=l"(ret.hi.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.hi.l2.x), "=l"(ret.hi.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.hi.l3.x), "=l"(ret.hi.l3.y) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ ulonglonglong __ldgxtralong(const ulonglonglong *ptr) +{ + ulonglonglong ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr)); + return ret; +} +static __device__ __inline__ uint8 ldg8bis(const uint8 *ptr) +{ + uint8 test; + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr)); + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), "=r"(test.s7) : __LDG_PTR(ptr)); + return (test); +} + + +static __device__ __inline__ ulonglong16 __ldg32(const ulonglong4 *ptr) +{ + ulonglong16 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.s0.x), "=l"(ret.s0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, 
[%2+16];" : "=l"(ret.s0.z), "=l"(ret.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.s1.x), "=l"(ret.s1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.s1.z), "=l"(ret.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret.s2.x), "=l"(ret.s2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret.s2.z), "=l"(ret.s2.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+96];" : "=l"(ret.s3.x), "=l"(ret.s3.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+112];" : "=l"(ret.s3.z), "=l"(ret.s3.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+128];" : "=l"(ret.s4.x), "=l"(ret.s4.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+144];" : "=l"(ret.s4.z), "=l"(ret.s4.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+160];" : "=l"(ret.s5.x), "=l"(ret.s5.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+176];" : "=l"(ret.s5.z), "=l"(ret.s5.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+192];" : "=l"(ret.s6.x), "=l"(ret.s6.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+208];" : "=l"(ret.s6.z), "=l"(ret.s6.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+224];" : "=l"(ret.s7.x), "=l"(ret.s7.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+240];" : "=l"(ret.s7.z), "=l"(ret.s7.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uintx64bis __ldg32(const uint28 *ptr) +{ + uintx64bis ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.x.x), "=r"(ret.s0.x.y), "=r"(ret.s0.y.x), "=r"(ret.s0.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.z.x), "=r"(ret.s0.z.y), "=r"(ret.s0.w.x), "=r"(ret.s0.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s1.x.x), "=r"(ret.s1.x.y), "=r"(ret.s1.y.x), "=r"(ret.s1.y.y) : __LDG_PTR(ptr)); + 
asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s1.z.x), "=r"(ret.s1.z.y), "=r"(ret.s1.w.x), "=r"(ret.s1.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s2.x.x), "=r"(ret.s2.x.y), "=r"(ret.s2.y.x), "=r"(ret.s2.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s2.z.x), "=r"(ret.s2.z.y), "=r"(ret.s2.w.x), "=r"(ret.s2.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s3.x.x), "=r"(ret.s3.x.y), "=r"(ret.s3.y.x), "=r"(ret.s3.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s3.z.x), "=r"(ret.s3.z.y), "=r"(ret.s3.w.x), "=r"(ret.s3.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s4.x.x), "=r"(ret.s4.x.y), "=r"(ret.s4.y.x), "=r"(ret.s4.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s4.z.x), "=r"(ret.s4.z.y), "=r"(ret.s4.w.x), "=r"(ret.s4.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s5.x.x), "=r"(ret.s5.x.y), "=r"(ret.s5.y.x), "=r"(ret.s5.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s5.z.x), "=r"(ret.s5.z.y), "=r"(ret.s5.w.x), "=r"(ret.s5.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s6.x.x), "=r"(ret.s6.x.y), "=r"(ret.s6.y.x), "=r"(ret.s6.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s6.z.x), "=r"(ret.s6.z.y), "=r"(ret.s6.w.x), "=r"(ret.s6.w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s7.x.x), "=r"(ret.s7.x.y), "=r"(ret.s7.y.x), "=r"(ret.s7.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s7.z.x), "=r"(ret.s7.z.y), "=r"(ret.s7.w.x), "=r"(ret.s7.w.y) : __LDG_PTR(ptr)); + return ret; +} + +#endif + +static __forceinline__ __device__ uint8 swapvec(const uint8 &buf) +{ + uint8 vec; + vec.s0 
= cuda_swab32(buf.s0); + vec.s1 = cuda_swab32(buf.s1); + vec.s2 = cuda_swab32(buf.s2); + vec.s3 = cuda_swab32(buf.s3); + vec.s4 = cuda_swab32(buf.s4); + vec.s5 = cuda_swab32(buf.s5); + vec.s6 = cuda_swab32(buf.s6); + vec.s7 = cuda_swab32(buf.s7); + return vec; +} + + +static __forceinline__ __device__ uint8 swapvec(const uint8 *buf) +{ + uint8 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + return vec; +} + +static __forceinline__ __device__ uint16 swapvec(const uint16 *buf) +{ + uint16 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + vec.s8 = cuda_swab32(buf[0].s8); + vec.s9 = cuda_swab32(buf[0].s9); + vec.sa = cuda_swab32(buf[0].sa); + vec.sb = cuda_swab32(buf[0].sb); + vec.sc = cuda_swab32(buf[0].sc); + vec.sd = cuda_swab32(buf[0].sd); + vec.se = cuda_swab32(buf[0].se); + vec.sf = cuda_swab32(buf[0].sf); + return vec; +} + +static __forceinline__ __device__ uint16 swapvec(const uint16 &buf) +{ + uint16 vec; + vec.s0 = cuda_swab32(buf.s0); + vec.s1 = cuda_swab32(buf.s1); + vec.s2 = cuda_swab32(buf.s2); + vec.s3 = cuda_swab32(buf.s3); + vec.s4 = cuda_swab32(buf.s4); + vec.s5 = cuda_swab32(buf.s5); + vec.s6 = cuda_swab32(buf.s6); + vec.s7 = cuda_swab32(buf.s7); + vec.s8 = cuda_swab32(buf.s8); + vec.s9 = cuda_swab32(buf.s9); + vec.sa = cuda_swab32(buf.sa); + vec.sb = cuda_swab32(buf.sb); + vec.sc = cuda_swab32(buf.sc); + vec.sd = cuda_swab32(buf.sd); + vec.se = cuda_swab32(buf.se); + vec.sf = cuda_swab32(buf.sf); + return vec; +} + +static __device__ __forceinline__ uint28 shuffle4(const uint28 
&var, int lane) +{ +uint28 res; +res.x.x = __shfl(var.x.x, lane); +res.x.y = __shfl(var.x.y, lane); +res.y.x = __shfl(var.y.x, lane); +res.y.y = __shfl(var.y.y, lane); +res.z.x = __shfl(var.z.x, lane); +res.z.y = __shfl(var.z.y, lane); +res.w.x = __shfl(var.w.x, lane); +res.w.y = __shfl(var.w.y, lane); +return res; +} + + +static __device__ __forceinline__ ulonglong4 shuffle4(ulonglong4 var, int lane) +{ + ulonglong4 res; + uint2 temp; + temp = vectorize(var.x); + temp.x = __shfl(temp.x, lane); + temp.y = __shfl(temp.y, lane); + res.x = devectorize(temp); + temp = vectorize(var.y); + temp.x = __shfl(temp.x, lane); + temp.y = __shfl(temp.y, lane); + res.y = devectorize(temp); + temp = vectorize(var.z); + temp.x = __shfl(temp.x, lane); + temp.y = __shfl(temp.y, lane); + res.z = devectorize(temp); + temp = vectorize(var.w); + temp.x = __shfl(temp.x, lane); + temp.y = __shfl(temp.y, lane); + res.w = devectorize(temp); + return res; +} + + +#endif // #ifndef CUDA_VECTOR_H diff --git a/cuda_x11_aes_noasm.cu b/cuda_x11_aes_noasm.cu new file mode 100644 index 0000000000..1f595c6192 --- /dev/null +++ b/cuda_x11_aes_noasm.cu @@ -0,0 +1,347 @@ +#include "cuda_helper.h" + +/* AES Helper for inline-usage from SPH */ +#define AESx(x) SPH_C32(x) + +__constant__ __align__(64) uint32_t d_AES0[256] = { + AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), + AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), + AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), + AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC), + AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA), + AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB), + AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45), + AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B), + AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C), + AESx(0x5A36366C), 
AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83), + AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9), + AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A), + AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D), + AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F), + AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF), + AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA), + AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34), + AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B), + AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D), + AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413), + AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1), + AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6), + AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972), + AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85), + AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED), + AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511), + AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE), + AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B), + AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05), + AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1), + AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142), + AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF), + AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3), + AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E), + AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A), + AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6), + AESx(0xA06060C0), 
AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3), + AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B), + AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428), + AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD), + AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), + AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8), + AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), + AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), + AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), + AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), + AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), + AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), + AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), + AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), + AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), + AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), + AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), + AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), + AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), + AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), + AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), + AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), + AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), + AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), + AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), + AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), + AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), + AESx(0xCBB0B07B), 
AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) +}; + +__constant__ __align__(64) uint32_t d_AES1[256] = { + AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), + AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), + AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), + AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), + AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), + AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), + AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), + AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), + AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), + AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), + AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), + AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), + AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), + AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), + AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), + AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), + AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), + AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), + AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), + AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), + AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), + AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), + AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), + AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), + AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), + AESx(0x434386C5), AESx(0x4D4D9AD7), 
AESx(0x33336655), AESx(0x85851194), + AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), + AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), + AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), + AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), + AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), + AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), + AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), + AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), + AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), + AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), + AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), + AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), + AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), + AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), + AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), + AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), + AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), + AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), + AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), + AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), + AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), + AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), + AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), + AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), + AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), + AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), + AESx(0x7070E090), AESx(0x3E3E7C42), 
AESx(0xB5B571C4), AESx(0x6666CCAA), + AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), + AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), + AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), + AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), + AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), + AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), + AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), + AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), + AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), + AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), + AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) +}; + +__constant__ __align__(64) uint32_t d_AES2[256] = { + AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), + AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), + AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), + AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), + AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), + AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), + AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), + AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), + AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), + AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), + AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), + AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), + AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), + AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), + AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), 
AESx(0xE2DF3DE2), + AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), + AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), + AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), + AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), + AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), + AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), + AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), + AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), + AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), + AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), + AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), + AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), + AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), + AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), + AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), + AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), + AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), + AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), + AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), + AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), + AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), + AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), + AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), + AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), + AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), + AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), + AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), 
AESx(0x5CB8E45C), + AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), + AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), + AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), + AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), + AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), + AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), + AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), + AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), + AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), + AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), + AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), + AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), + AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), + AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), + AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), + AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), + AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), + AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), + AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), + AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), + AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), + AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) +}; + +__constant__ __align__(64) uint32_t d_AES3[256] = { + AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), + AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), + AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), + AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), 
+ AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), + AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), + AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), + AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), + AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), + AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), + AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), + AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), + AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), + AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), + AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2), + AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), + AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), + AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), + AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), + AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), + AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), + AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), + AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), + AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), + AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), + AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), + AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), + AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), + AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), + AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), + AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), + 
AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), + AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC), + AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), + AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), + AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), + AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), + AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), + AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), + AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB), + AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), + AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), + AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), + AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), + AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), + AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9), + AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), + AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), + AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), + AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), + AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), + AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), + AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), + AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), + AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), + AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), + AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), + AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), + 
AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), + AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), + AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), + AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), + AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F), + AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) +}; + + +__device__ __forceinline__ +void aes_gpu_init(uint32_t *const sharedMemory) +{ + /* each thread startup will fill a uint32 */ + if(threadIdx.x < 256) + { + sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; + sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; + sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; + } +} + +#define xor4_32(a,b,c,d) (a) ^ (b) ^ (c) ^ (d) + +// with k0 +__device__ +static void aes_round(const uint32_t *const __restrict__ sharedMemory, const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + + y0 = xor4_32( + sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]) ^ k0; + + y1 = xor4_32( + sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + + y2 = xor4_32( + sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); + + y3 = xor4_32( + sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); +} + 
+//without k0 +__device__ +static void aes_round(const uint32_t *const __restrict__ sharedMemory, const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = xor4_32( + sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); + + y1 = xor4_32( + sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + + y2 = xor4_32( + sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); + + y3 = xor4_32( + sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); +} diff --git a/fuguecoin.cpp b/fuguecoin.cpp index 4db4d54fa9..1d3234ace7 100644 --- a/fuguecoin.cpp +++ b/fuguecoin.cpp @@ -1,12 +1,17 @@ #include +#ifdef __cplusplus +#include +#else #include +#endif -#include "uint256.h" #include "sph/sph_fugue.h" #include "miner.h" - #include "cuda_fugue256.h" +#include +extern bool stop_mining; +extern volatile bool mining_has_stopped[MAX_GPUS]; extern "C" void my_fugue256_init(void *cc); extern "C" void my_fugue256(void *cc, const void *data, size_t len); @@ -16,28 +21,35 @@ extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, // vorbereitete Kontexte nach den ersten 80 Bytes // sph_fugue256_context ctx_fugue_const[MAX_GPUS]; -#define SWAP32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) - -static bool init[MAX_GPUS] = { 0 }; +#define SWAP32(x) swab32(x) 
-extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) +extern int scanhash_fugue256(int thr_id, uint32_t *pdata, uint32_t *ptarget, + uint32_t max_nonce, uint32_t *hashes_done) { - uint32_t start_nonce = pdata[19]++; + uint32_t start_nonce = pdata[19]; unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << intensity); // 256*256*8 - throughput = min(throughput, max_nonce - start_nonce); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1 << intensity); // 256*256*8 + uint32_t throughput = min(throughputmax, max_nonce - start_nonce) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0xf; + ptarget[7] = 0xf; // init - if(!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - fugue256_cpu_init(thr_id, throughput); - init[thr_id] = true; +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (8 * sizeof(uint32_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + proper_exit(2); + } +#endif + fugue256_cpu_init(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; + init = true; } // Endian Drehung ist notwendig @@ -53,6 +65,7 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt uint32_t foundNounce = 0xFFFFFFFF; fugue256_cpu_hash(thr_id, throughput, pdata[19], NULL, &foundNounce); + if(stop_mining) {mining_has_stopped[thr_id] = true; pthread_exit(nullptr);} if(foundNounce < 0xffffffff) { uint32_t hash[8]; @@ -66,18 +79,24 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt if (hash[7] <= Htarg && fulltest(hash, ptarget)) { + *hashes_done = pdata[19] - start_nonce + throughput; pdata[19] = foundNounce; - *hashes_done = foundNounce - start_nonce + 1; return 1; } else { - applog(LOG_INFO, "GPU #%d: 
result for nonce $%08X does not validate on CPU!", thr_id, foundNounce); + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundNounce); } } pdata[19] += throughput; + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + applog(LOG_ERR, "GPU #%d: %s", device_map[thr_id], cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - start_nonce + 1; + *hashes_done = pdata[19] - start_nonce; return 0; } diff --git a/groestl_functions_quad.cu b/groestl_functions_quad.cu index 3753866124..8fa044ab2a 100644 --- a/groestl_functions_quad.cu +++ b/groestl_functions_quad.cu @@ -1,18 +1,5 @@ #include "cuda_helper.h" -__device__ __forceinline__ void G256_Mul2(uint32_t *const regs) -{ - uint32_t tmp = regs[7]; - regs[7] = regs[6]; - regs[6] = regs[5]; - regs[5] = regs[4]; - regs[4] = regs[3] ^ tmp; - regs[3] = regs[2] ^ tmp; - regs[2] = regs[1]; - regs[1] = regs[0] ^ tmp; - regs[0] = tmp; -} - __device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, const int round) { x0 = ~x0; @@ -24,7 +11,7 @@ __device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32 x6 = ~x6; x7 = ~x7; - uint32_t andmask1 = ((-((threadIdx.x & 0x03) == 3)) & 0xffff0000); + const uint32_t andmask1 = ((-((threadIdx.x & 0x03) == 3)) & 0xffff0000); x0 ^= ((-(round & 0x01)) & andmask1); x1 ^= ((-(round & 0x02)) & andmask1); @@ -38,7 +25,7 @@ __device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32 __device__ __forceinline__ void G256_AddRoundConstantP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, const int round) { - uint32_t andmask1 = ((threadIdx.x & 0x03) - 1) >> 16; + const uint32_t 
andmask1 = ((threadIdx.x & 0x03) - 1) >> 16; x4 ^= (0xAAAA & andmask1); x5 ^= (0xCCCC & andmask1); @@ -240,15 +227,6 @@ __device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6 x7 = __byte_perm(t0, t1, 0x5410); } -#if __CUDA_ARCH__ < 300 -/** - * __shfl() returns the value of var held by the thread whose ID is given by srcLane. - * If srcLane is outside the range 0..width-1, the thread’s own value of var is returned. - */ -#undef __shfl -#define __shfl(var, srcLane, width) (uint32_t)(var) -#endif - __device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r) { #define SHIFT64_16(hi, lo) __byte_perm(lo, hi, 0x5432) @@ -262,19 +240,34 @@ __device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r) #define SINGLE_EVEN(i, bc) ( A(i, (bc)) ) uint32_t b[8]; -#pragma unroll 8 - for(int i=0;i<8;i++) - b[i] = DOUBLE_ODD(i, 1) ^ DOUBLE_EVEN(i, 3); - - G256_Mul2(b); -#pragma unroll 8 - for(int i=0;i<8;i++) - b[i] = b[i] ^ DOUBLE_ODD(i, 3) ^ DOUBLE_ODD(i, 4) ^ SINGLE_ODD(i, 6); - - G256_Mul2(b); -#pragma unroll 8 - for(int i=0;i<8;i++) - r[i] = b[i] ^ DOUBLE_EVEN(i, 2) ^ DOUBLE_EVEN(i, 3) ^ SINGLE_EVEN(i, 5); + b[0] = (S(0, (1)) ^ A(0, (1) + 1)) ^ DOUBLE_EVEN(0, 3); + b[1] = (S(1, (1)) ^ A(1, (1) + 1)) ^ DOUBLE_EVEN(1, 3); + b[2] = (S(2, (1)) ^ A(2, (1) + 1)) ^ DOUBLE_EVEN(2, 3); + b[3] = (S(3, (1)) ^ A(3, (1) + 1)) ^ DOUBLE_EVEN(3, 3); + b[4] = (S(4, (1)) ^ A(4, (1) + 1)) ^ DOUBLE_EVEN(4, 3); + b[5] = (S(5, (1)) ^ A(5, (1) + 1)) ^ DOUBLE_EVEN(5, 3); + b[6] = (S(6, (1)) ^ A(6, (1) + 1)) ^ DOUBLE_EVEN(6, 3); + b[7] = (S(7, (1)) ^ A(7, (1) + 1)) ^ DOUBLE_EVEN(7, 3); + + uint32_t tmp = b[7]; + b[7] = b[6] ^ (S(7, (3)) ^ A(7, (3) + 1)) ^ DOUBLE_ODD(7, 4) ^ SINGLE_ODD(7, 6); + b[6] = b[5] ^ (S(6, (3)) ^ A(6, (3) + 1)) ^ DOUBLE_ODD(6, 4) ^ SINGLE_ODD(6, 6); + b[5] = b[4] ^ (S(5, (3)) ^ A(5, (3) + 1)) ^ DOUBLE_ODD(5, 4) ^ SINGLE_ODD(5, 6); + b[4] = b[3] ^ (S(4, (3)) ^ A(4, (3) + 1)) ^ DOUBLE_ODD(4, 4) ^ SINGLE_ODD(4, 6) ^ tmp; + b[3] = b[2] ^ (S(3, 
(3)) ^ A(3, (3) + 1)) ^ DOUBLE_ODD(3, 4) ^ SINGLE_ODD(3, 6) ^ tmp; + b[2] = b[1] ^ (S(2, (3)) ^ A(2, (3) + 1)) ^ DOUBLE_ODD(2, 4) ^ SINGLE_ODD(2, 6); + b[1] = b[0] ^ (S(1, (3)) ^ A(1, (3) + 1)) ^ DOUBLE_ODD(1, 4) ^ SINGLE_ODD(1, 6) ^ tmp; + b[0] = tmp ^ (S(0, (3)) ^ A(0, (3) + 1)) ^ DOUBLE_ODD(0, 4) ^ SINGLE_ODD(0, 6); + + tmp = b[7]; + r[7] = b[6] ^ DOUBLE_EVEN(7, 2) ^ DOUBLE_EVEN(7, 3) ^ SINGLE_EVEN(7, 5); + r[6] = b[5] ^ DOUBLE_EVEN(6, 2) ^ DOUBLE_EVEN(6, 3) ^ SINGLE_EVEN(6, 5); + r[5] = b[4] ^ DOUBLE_EVEN(5, 2) ^ DOUBLE_EVEN(5, 3) ^ SINGLE_EVEN(5, 5); + r[4] = b[3] ^ DOUBLE_EVEN(4, 2) ^ DOUBLE_EVEN(4, 3) ^ SINGLE_EVEN(4, 5) ^ tmp; + r[3] = b[2] ^ DOUBLE_EVEN(3, 2) ^ DOUBLE_EVEN(3, 3) ^ SINGLE_EVEN(3, 5) ^ tmp; + r[2] = b[1] ^ DOUBLE_EVEN(2, 2) ^ DOUBLE_EVEN(2, 3) ^ SINGLE_EVEN(2, 5); + r[1] = b[0] ^ DOUBLE_EVEN(1, 2) ^ DOUBLE_EVEN(1, 3) ^ SINGLE_EVEN(1, 5)^tmp; + r[0] = tmp ^ DOUBLE_EVEN(0, 2) ^ DOUBLE_EVEN(0, 3) ^ SINGLE_EVEN(0, 5); #undef S #undef A @@ -285,19 +278,174 @@ __device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r) __device__ __forceinline__ void groestl512_perm_P_quad(uint32_t *const r) { +#if __CUDA_ARCH__ > 500 + const uint32_t andmask1 = ((threadIdx.x & 0x03) - 1) >> 16; + + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[0] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[1] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + + for 
(int round = 3; round<14; round++) + { + G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round); + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + } + +#else + for (int round = 0; round<14; round++) + { + G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round); + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + } +#endif - for(int round=0;round<14;round++) - { - G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round); - sbox_quad(r); - G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); - G256_MixFunction_quad(r); - } +/* + + + +r[4] ^= (0xAAAA & andmask1); +r[5] ^= (0xCCCC & andmask1); +r[6] ^= (0xF0F0 & andmask1); +r[7] ^= (0xFF00 & andmask1); +r[0] ^= andmask1; +r[1] ^= andmask1; +sbox_quad(r); +G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); +G256_MixFunction_quad(r); + +r[4] ^= (0xAAAA & andmask1); +r[5] ^= (0xCCCC & andmask1); +r[6] ^= (0xF0F0 & andmask1); +r[7] ^= (0xFF00 & andmask1); +r[2] ^= andmask1; +sbox_quad(r); +G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + + + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[2] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[0] ^= andmask1; + r[2] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[1] ^= andmask1; + r[2] ^= 
andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[0] ^= andmask1; + r[1] ^= andmask1; + r[2] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[3] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[0] ^= andmask1; + r[3] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[1] ^= andmask1; + r[3] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[0] ^= andmask1; + r[1] ^= andmask1; + r[3] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[2] ^= andmask1; + r[3] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + r[4] ^= (0xAAAA & andmask1); + r[5] ^= (0xCCCC & andmask1); + r[6] ^= (0xF0F0 & andmask1); + r[7] ^= (0xFF00 & andmask1); + r[0] ^= andmask1; + r[2] ^= andmask1; + 
r[3] ^= andmask1; + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + */ } __device__ __forceinline__ void groestl512_perm_Q_quad(uint32_t *const r) { - for(int round=0;round<14;round++) + for (int round = 0; round<14; round++) { G256_AddRoundConstantQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round); sbox_quad(r); @@ -308,18 +456,46 @@ __device__ __forceinline__ void groestl512_perm_Q_quad(uint32_t *const r) __device__ __forceinline__ void groestl512_progressMessage_quad(uint32_t *const __restrict__ state, uint32_t *const __restrict__ message) { -#pragma unroll 8 - for(int u=0;u<8;u++) state[u] = message[u]; + state[0] = message[0]; + state[1] = message[1]; + state[2] = message[2]; + state[3] = message[3]; + state[4] = message[4]; + state[5] = message[5]; + state[6] = message[6]; + state[7] = message[7]; if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000; groestl512_perm_P_quad(state); if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000; groestl512_perm_Q_quad(message); -#pragma unroll 8 - for(int u=0;u<8;u++) state[u] ^= message[u]; -#pragma unroll 8 - for(int u=0;u<8;u++) message[u] = state[u]; - groestl512_perm_P_quad(message); -#pragma unroll 8 - for(int u=0;u<8;u++) state[u] ^= message[u]; + + state[0] ^= message[0]; + state[1] ^= message[1]; + state[2] ^= message[2]; + state[3] ^= message[3]; + state[4] ^= message[4]; + state[5] ^= message[5]; + state[6] ^= message[6]; + state[7] ^= message[7]; + + message[0] = state[0]; + message[1] = state[1]; + message[2] = state[2]; + message[3] = state[3]; + message[4] = state[4]; + message[5] = state[5]; + message[6] = state[6]; + message[7] = state[7]; + + groestl512_perm_P_quad(message); + + state[0] ^= message[0]; + state[1] ^= message[1]; + state[2] ^= message[2]; + state[3] ^= message[3]; + state[4] ^= message[4]; + state[5] ^= message[5]; + state[6] ^= message[6]; + state[7] ^= message[7]; } diff --git a/groestlcoin.cpp 
b/groestlcoin.cpp index f732be93c2..9897a66888 100644 --- a/groestlcoin.cpp +++ b/groestlcoin.cpp @@ -1,19 +1,35 @@ + #include +#ifdef __cplusplus +#include +#else #include +#endif #include -#include "uint256.h" #include "sph/sph_groestl.h" #include "cuda_groestlcoin.h" #include "miner.h" - -#define SWAP32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#include +#include +extern bool stop_mining; +extern volatile bool mining_has_stopped[MAX_GPUS]; + +#define CUDA_SAFE_CALL(call) \ +do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \ + __FUNCTION__, __LINE__, cudaGetErrorString(err) ); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define SWAP32(x) swab32(x) // CPU-groestl -extern "C" void groestlhash(void *state, const void *input) +void groestlhash(void *state, const void *input) { sph_groestl512_context ctx_groestl; @@ -31,25 +47,34 @@ extern "C" void groestlhash(void *state, const void *input) memcpy(state, hashB, 32); } -static bool init[MAX_GPUS] = { 0 }; +extern cudaStream_t gpustream[MAX_GPUS]; -extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) +extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, uint32_t *ptarget, + uint32_t max_nonce, uint32_t *hashes_done) { - uint32_t start_nonce = pdata[19]++; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8 - throughput = min(throughput, max_nonce - start_nonce); + static THREAD uint32_t *foundNounce = nullptr; - uint32_t *outputHash = (uint32_t*)malloc(throughput * 16 * sizeof(uint32_t)); + uint32_t start_nonce = pdata[19]; + unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 
24 : 23; + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << intensity); + uint32_t throughput = min(throughputmax, max_nonce - start_nonce) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000000ff; + ptarget[7] = 0x0000000f; // init - if(!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - groestlcoin_cpu_init(thr_id, throughput); - init[thr_id] = true; + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + + groestlcoin_cpu_init(thr_id, throughputmax); + CUDA_SAFE_CALL(cudaMallocHost(&foundNounce, 2 * 4)); + mining_has_stopped[thr_id] = false; + init = true; } // Endian Drehung ist notwendig @@ -58,38 +83,69 @@ extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t be32enc(&endiandata[kk], pdata[kk]); // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) - groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget); - - do { - // GPU - uint32_t foundNounce = 0xFFFFFFFF; - const uint32_t Htarg = ptarget[7]; - - groestlcoin_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce); - - if(foundNounce < 0xffffffff) - { - uint32_t tmpHash[8]; - endiandata[19] = SWAP32(foundNounce); - groestlhash(tmpHash, endiandata); - - if (tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget)) { - pdata[19] = foundNounce; - *hashes_done = foundNounce - start_nonce + 1; - free(outputHash); - return true; - } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce); - } - - foundNounce = 0xffffffff; - } + groestlcoin_cpu_setBlock(thr_id, endiandata); + + do + { + // GPU + const uint32_t Htarg = ptarget[7]; + + groestlcoin_cpu_hash(thr_id, throughput, pdata[19], foundNounce, ptarget[7]); + + if(stop_mining) {mining_has_stopped[thr_id] = true; 
cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNounce[0] < 0xffffffff) + { + uint32_t tmpHash[8]; + endiandata[19] = SWAP32(foundNounce[0]); + groestlhash(tmpHash, endiandata); + + if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget)) + { + int res = 1; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], foundNounce[0]); + *hashes_done = pdata[19] - start_nonce + throughput; + if(foundNounce[1] != 0xffffffff) + { + endiandata[19] = SWAP32(foundNounce[1]); + groestlhash(tmpHash, endiandata); + if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget)) + { + pdata[21] = foundNounce[1]; + res++; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], foundNounce[1]); + } + else + { + if(tmpHash[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[1]); + } + } + } + pdata[19] = foundNounce[0]; + return res; + } + else + { + if(tmpHash[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[0]); + } + } + } pdata[19] += throughput; - } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - - *hashes_done = pdata[19] - start_nonce + 1; - free(outputHash); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) + { + applog(LOG_ERR, "GPU #%d: %s", device_map[thr_id], cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } + } while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + + *hashes_done = pdata[19] - start_nonce; return 0; } diff --git a/heavy/cuda_blake512.cu b/heavy/cuda_blake512.cu deleted file mode 100644 index 75e7c13b1f..0000000000 --- a/heavy/cuda_blake512.cu +++ /dev/null @@ -1,253 +0,0 @@ -#include -#include - -#include "cuda_helper.h" - -// globaler Speicher für alle HeftyHashes aller Threads -extern 
uint32_t *heavy_heftyHashes[MAX_GPUS]; -extern uint32_t *heavy_nonceVector[MAX_GPUS]; - -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash5output[MAX_GPUS]; - -// die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU -__constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding) - - -// ---------------------------- BEGIN CUDA blake512 functions ------------------------------------ - -__constant__ uint8_t c_sigma[16][16]; - -const uint8_t host_sigma[16][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } -}; - -/* in cuda_helper */ -#define SWAP32(x) cuda_swab32(x) -#define SWAP64(x) cuda_swab64(x) - -__constant__ uint64_t c_SecondRound[15]; - -const uint64_t host_SecondRound[15] = -{ - 0,0,0,0,0,0,0,0,0,0,0,0,0,SWAP64(1),0 -}; - -__constant__ uint64_t c_u512[16]; - -const uint64_t host_u512[16] = -{ - 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, - 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, - 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, - 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, - 0x9216d5d98979fb1bULL, 
0xd1310ba698dfb5acULL, - 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, - 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, - 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL -}; - - -#define G(a,b,c,d,e) \ - v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ - v[d] = SWAPDWORDS( v[d] ^ v[a]); \ - v[c] += v[d]; \ - v[b] = ROTR64( v[b] ^ v[c],25); \ - v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \ - v[d] = ROTR64( v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = ROTR64( v[b] ^ v[c],11); - -template __device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt, const uint8_t ((*sigma)[16]), const uint64_t *u512 ) -{ - uint64_t v[16], m[16], i; - -#pragma unroll 16 - for( i = 0; i < 16; ++i ) m[i] = cuda_swab64(block[i]); - -#pragma unroll 8 - for( i = 0; i < 8; ++i ) v[i] = h[i]; - - v[ 8] = u512[0]; - v[ 9] = u512[1]; - v[10] = u512[2]; - v[11] = u512[3]; - v[12] = u512[4]; - v[13] = u512[5]; - v[14] = u512[6]; - v[15] = u512[7]; - - /* don't xor t when the block is only padding */ - if ( !nullt ) { - v[12] ^= 8*(BLOCKSIZE+32); - v[13] ^= 8*(BLOCKSIZE+32); - } - -//#pragma unroll 16 - for( i = 0; i < 16; ++i ) - { - /* column step */ - G( 0, 4, 8, 12, 0 ); - G( 1, 5, 9, 13, 2 ); - G( 2, 6, 10, 14, 4 ); - G( 3, 7, 11, 15, 6 ); - /* diagonal step */ - G( 0, 5, 10, 15, 8 ); - G( 1, 6, 11, 12, 10 ); - G( 2, 7, 8, 13, 12 ); - G( 3, 4, 9, 14, 14 ); - } - -#pragma unroll 16 - for( i = 0; i < 16; ++i ) h[i & 7] ^= v[i]; -} - -template __global__ void blake512_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - //uint32_t nounce = startNounce + thread; - uint32_t nounce = nonceVector[thread]; - - // Index-Position des Hashes in den Hash Puffern bestimmen (Hefty1 und outputHash) - uint32_t hashPosition = nounce - startNounce; - - // State vorbereiten - uint64_t h[8]; - 
h[0] = 0x6a09e667f3bcc908ULL; - h[1] = 0xbb67ae8584caa73bULL; - h[2] = 0x3c6ef372fe94f82bULL; - h[3] = 0xa54ff53a5f1d36f1ULL; - h[4] = 0x510e527fade682d1ULL; - h[5] = 0x9b05688c2b3e6c1fULL; - h[6] = 0x1f83d9abfb41bd6bULL; - h[7] = 0x5be0cd19137e2179ULL; - - // 128 Byte für die Message - uint64_t buf[16]; - - // Message für die erste Runde in Register holen -#pragma unroll 16 - for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i]; - - // die Nounce durch die thread-spezifische ersetzen - buf[9] = REPLACE_HIWORD(buf[9], nounce); - - uint32_t *hefty = heftyHashes + 8 * hashPosition; - if (BLOCKSIZE == 84) { - // den thread-spezifischen Hefty1 hash einsetzen - // aufwändig, weil das nicht mit uint64_t Wörtern aligned ist. - buf[10] = REPLACE_HIWORD(buf[10], hefty[0]); - buf[11] = REPLACE_LOWORD(buf[11], hefty[1]); - buf[11] = REPLACE_HIWORD(buf[11], hefty[2]); - buf[12] = REPLACE_LOWORD(buf[12], hefty[3]); - buf[12] = REPLACE_HIWORD(buf[12], hefty[4]); - buf[13] = REPLACE_LOWORD(buf[13], hefty[5]); - buf[13] = REPLACE_HIWORD(buf[13], hefty[6]); - buf[14] = REPLACE_LOWORD(buf[14], hefty[7]); - } - else if (BLOCKSIZE == 80) { - buf[10] = MAKE_ULONGLONG(hefty[0], hefty[1]); - buf[11] = MAKE_ULONGLONG(hefty[2], hefty[3]); - buf[12] = MAKE_ULONGLONG(hefty[4], hefty[5]); - buf[13] = MAKE_ULONGLONG(hefty[6], hefty[7]); - } - - // erste Runde - blake512_compress( h, buf, 0, c_sigma, c_u512 ); - - - // zweite Runde -#pragma unroll 15 - for (int i=0; i < 15; ++i) buf[i] = c_SecondRound[i]; - buf[15] = SWAP64(8*(BLOCKSIZE+32)); // Blocksize in Bits einsetzen - blake512_compress( h, buf, 1, c_sigma, c_u512 ); - - // Hash rauslassen - uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition; -#pragma unroll 8 - for (int i=0; i < 8; ++i) outHash[i] = cuda_swab64( h[i] ); - } -} - - -// ---------------------------- END CUDA blake512 functions ------------------------------------ - -// Setup-Funktionen -__host__ void blake512_cpu_init(int thr_id, uint32_t threads) -{ - // 
Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( c_sigma, - host_sigma, - sizeof(host_sigma), - 0, cudaMemcpyHostToDevice); - - cudaMemcpyToSymbol( c_u512, - host_u512, - sizeof(host_u512), - 0, cudaMemcpyHostToDevice); - - cudaMemcpyToSymbol( c_SecondRound, - host_SecondRound, - sizeof(host_SecondRound), - 0, cudaMemcpyHostToDevice); - - // Speicher für alle Ergebnisse belegen - CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads)); -} - -static int BLOCKSIZE = 84; - -__host__ void blake512_cpu_setBlock(void *pdata, int len) - // data muss 84-Byte haben! - // heftyHash hat 32-Byte -{ - unsigned char PaddedMessage[128]; - if (len == 84) { - // Message mit Padding für erste Runde bereitstellen - memcpy(PaddedMessage, pdata, 84); - memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen - memset(PaddedMessage+116, 0, 12); - PaddedMessage[116] = 0x80; - } else if (len == 80) { - memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen - memset(PaddedMessage+112, 0, 16); - PaddedMessage[112] = 0x80; - } - // die Message (116 Bytes) ohne Padding zur Berechnung auf der GPU - cudaMemcpyToSymbol( c_PaddedMessage, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); - BLOCKSIZE = len; -} - -__host__ void blake512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce) -{ - const uint32_t threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - if (BLOCKSIZE == 80) - blake512_gpu_hash<80><<>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); - else if (BLOCKSIZE == 84) - blake512_gpu_hash<84><<>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); -} diff --git a/heavy/cuda_blake512.h b/heavy/cuda_blake512.h deleted file mode 100644 index 
7e24973348..0000000000 --- a/heavy/cuda_blake512.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _CUDA_BLAKE512_H -#define _CUDA_BLAKE512_H - -void blake512_cpu_init(int thr_id, int threads); -void blake512_cpu_setBlock(void *pdata, int len); -void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce); -#endif diff --git a/heavy/cuda_combine.cu b/heavy/cuda_combine.cu deleted file mode 100644 index 3365cf18c7..0000000000 --- a/heavy/cuda_combine.cu +++ /dev/null @@ -1,145 +0,0 @@ -#include - -#include "cuda_helper.h" - -// globaler Speicher für unsere Ergebnisse -static uint32_t *d_hashoutput[MAX_GPUS]; -extern uint32_t *d_hash2output[MAX_GPUS]; -extern uint32_t *d_hash3output[MAX_GPUS]; -extern uint32_t *d_hash4output[MAX_GPUS]; -extern uint32_t *d_hash5output[MAX_GPUS]; - -extern uint32_t *heavy_nonceVector[MAX_GPUS]; - -/* Combines top 64-bits from each hash into a single hash */ -__device__ -static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4) -{ - uint32_t lout[8]; // Combining in Registern machen - -#pragma unroll 8 - for (int i=0; i < 8; ++i) - lout[i] = 0; - - // das Makro setzt jeweils 4 Bits aus vier verschiedenen Hashes zu einem Nibble zusammen -#define MIX(bits, mask, i) \ - lout[(255 - (bits+3))/32] <<= 4; \ - if ((hash1[i] & mask) != 0) lout[(255 - (bits+0))/32] |= 8; \ - if ((hash2[i] & mask) != 0) lout[(255 - (bits+1))/32] |= 4; \ - if ((hash3[i] & mask) != 0) lout[(255 - (bits+2))/32] |= 2; \ - if ((hash4[i] & mask) != 0) lout[(255 - (bits+3))/32] |= 1; \ - - /* Transpose first 64 bits of each hash into out */ - MIX( 0, 0x80000000, 7); - MIX( 4, 0x40000000, 7); - MIX( 8, 0x20000000, 7); - MIX( 12, 0x10000000, 7); - MIX( 16, 0x08000000, 7); - MIX( 20, 0x04000000, 7); - MIX( 24, 0x02000000, 7); - MIX( 28, 0x01000000, 7); - MIX( 32, 0x00800000, 7); - MIX( 36, 0x00400000, 7); - MIX( 40, 0x00200000, 7); - MIX( 44, 0x00100000, 7); - MIX( 48, 0x00080000, 7); - MIX( 52, 0x00040000, 7); - MIX( 56, 
0x00020000, 7); - MIX( 60, 0x00010000, 7); - MIX( 64, 0x00008000, 7); - MIX( 68, 0x00004000, 7); - MIX( 72, 0x00002000, 7); - MIX( 76, 0x00001000, 7); - MIX( 80, 0x00000800, 7); - MIX( 84, 0x00000400, 7); - MIX( 88, 0x00000200, 7); - MIX( 92, 0x00000100, 7); - MIX( 96, 0x00000080, 7); - MIX(100, 0x00000040, 7); - MIX(104, 0x00000020, 7); - MIX(108, 0x00000010, 7); - MIX(112, 0x00000008, 7); - MIX(116, 0x00000004, 7); - MIX(120, 0x00000002, 7); - MIX(124, 0x00000001, 7); - - MIX(128, 0x80000000, 6); - MIX(132, 0x40000000, 6); - MIX(136, 0x20000000, 6); - MIX(140, 0x10000000, 6); - MIX(144, 0x08000000, 6); - MIX(148, 0x04000000, 6); - MIX(152, 0x02000000, 6); - MIX(156, 0x01000000, 6); - MIX(160, 0x00800000, 6); - MIX(164, 0x00400000, 6); - MIX(168, 0x00200000, 6); - MIX(172, 0x00100000, 6); - MIX(176, 0x00080000, 6); - MIX(180, 0x00040000, 6); - MIX(184, 0x00020000, 6); - MIX(188, 0x00010000, 6); - MIX(192, 0x00008000, 6); - MIX(196, 0x00004000, 6); - MIX(200, 0x00002000, 6); - MIX(204, 0x00001000, 6); - MIX(208, 0x00000800, 6); - MIX(212, 0x00000400, 6); - MIX(216, 0x00000200, 6); - MIX(220, 0x00000100, 6); - MIX(224, 0x00000080, 6); - MIX(228, 0x00000040, 6); - MIX(232, 0x00000020, 6); - MIX(236, 0x00000010, 6); - MIX(240, 0x00000008, 6); - MIX(244, 0x00000004, 6); - MIX(248, 0x00000002, 6); - MIX(252, 0x00000001, 6); - -#pragma unroll 8 - for (int i=0; i < 8; ++i) - out[i] = lout[i]; -} - -__global__ -void combine_gpu_hash(uint32_t threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = nonceVector[thread]; - uint32_t hashPosition = nounce - startNounce; - // Die Aufgabe der combine-funktion besteht aus zwei Teilen. 
- // 1) Komprimiere die hashes zu einem kleinen Array - // 2) Errechne dort den combines-value - - // Die Kompression wird dadurch verwirklicht, dass im out-array weiterhin mit "thread" indiziert - // wird. Die anderen Werte werden mit der nonce indiziert - - combine_hashes(&out[8 * thread], &hash2[8 * hashPosition], &hash3[16 * hashPosition], &hash4[16 * hashPosition], &hash5[16 * hashPosition]); - } -} - -__host__ -void combine_cpu_init(int thr_id, uint32_t threads) -{ - // Speicher für alle Ergebnisse belegen - CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads)); -} - -__host__ -void combine_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *hash) -{ - // diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten - - const uint32_t threadsperblock = 128; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - combine_gpu_hash <<>> (threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], heavy_nonceVector[thr_id]); - - // da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden - CUDA_SAFE_CALL(cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost)); -} diff --git a/heavy/cuda_combine.h b/heavy/cuda_combine.h deleted file mode 100644 index 5bb5832d19..0000000000 --- a/heavy/cuda_combine.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _CUDA_COMBINE_H -#define _CUDA_COMBINE_H - -void combine_cpu_init(int thr_id, int threads); -void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash); - -#endif diff --git a/heavy/cuda_groestl512.cu b/heavy/cuda_groestl512.cu deleted file mode 100644 index eac60fdaad..0000000000 --- a/heavy/cuda_groestl512.cu +++ /dev/null @@ -1,816 +0,0 @@ -#include -#include - -#define 
SPH_C32(x) ((uint32_t)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) - -#include "cuda_helper.h" - -// globaler Speicher für alle HeftyHashes aller Threads -extern uint32_t *heavy_heftyHashes[MAX_GPUS]; -extern uint32_t *heavy_nonceVector[MAX_GPUS]; - -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash4output[MAX_GPUS]; - -__constant__ uint32_t groestl_gpu_state[32]; -__constant__ uint32_t groestl_gpu_msg[32]; - -#define PC32up(j, r) ((uint32_t)((j) + (r))) -#define PC32dn(j, r) 0 -#define QC32up(j, r) 0xFFFFFFFF -#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ (~((uint32_t)(j) << 24))) - -#define B32_0(x) ((x) & 0xFF) -#define B32_1(x) (((x) >> 8) & 0xFF) -#define B32_2(x) (((x) >> 16) & 0xFF) -#define B32_3(x) ((x) >> 24) - -#define C32e(x) ((SPH_C32(x) >> 24) \ - | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ - | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ - | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) - -#define T0up(x) tex1Dfetch(t0up, x) -#define T0dn(x) tex1Dfetch(t0dn, x) -#define T1up(x) tex1Dfetch(t1up, x) -#define T1dn(x) tex1Dfetch(t1dn, x) -#define T2up(x) tex1Dfetch(t2up, x) -#define T2dn(x) tex1Dfetch(t2dn, x) -#define T3up(x) tex1Dfetch(t3up, x) -#define T3dn(x) tex1Dfetch(t3dn, x) - -texture t0up; -texture t0dn; -texture t1up; -texture t1dn; -texture t2up; -texture t2dn; -texture t3up; -texture t3dn; - -uint32_t T0up_cpu[] = { - C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d), - C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54), - C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d), - C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a), - C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287), - C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b), - C32e(0x416e2fec), C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea), - C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), C32e(0x9b76ed5b), - C32e(0x75285dc2), 
C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a), - C32e(0x6c82ee5a), C32e(0x7ebdc341), C32e(0xf5f30602), C32e(0x8352d14f), - C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808), - C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f), - C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), C32e(0x9d7fe25e), - C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5), - C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d), - C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f), - C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e), - C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb), - C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce), - C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297), - C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c), - C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced), - C32e(0xd40dd9be), C32e(0x8d47ca46), C32e(0x671770d9), C32e(0x72afdd4b), - C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a), - C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16), - C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794), - C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881), - C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3), - C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a), - C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04), - C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563), - C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d), - C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f), - C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39), - C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), C32e(0x7ab3c947), - C32e(0xc827efac), 
C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495), - C32e(0xc03bfba0), C32e(0x19aab398), C32e(0x9ef668d1), C32e(0xa322817f), - C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83), - C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c), - C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), C32e(0xad379a76), - C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e), - C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4), - C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6), - C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b), - C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7), - C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0), - C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25), - C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818), - C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672), - C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351), - C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321), - C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485), - C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa), - C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612), - C32e(0xc23cfea3), C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0), - C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9), - C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533), - C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7), - C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020), - C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a), - C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917), - C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), C32e(0xd003d3b8), - C32e(0x82dc5ec3), 
C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311), - C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a) -}; - -uint32_t T0dn_cpu[] = { - C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6), - C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491), - C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56), - C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec), - C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa), - C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb), - C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45), - C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b), - C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c), - C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83), - C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9), - C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a), - C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d), - C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f), - C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf), - C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea), - C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34), - C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b), - C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d), - C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713), - C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1), - C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6), - C32e(0xd9b3bed4), C32e(0xca01468d), C32e(0x70ced967), C32e(0xdde44b72), - C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85), - C32e(0xbd6d6bbb), C32e(0x7e912ac5), C32e(0x349ee54f), 
C32e(0x3ac116ed), - C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411), - C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe), - C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b), - C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05), - C32e(0xec7ead3f), C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1), - C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342), - C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf), - C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3), - C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e), - C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a), - C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6), - C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3), - C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), C32e(0x9e16830b), - C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28), - C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad), - C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14), - C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8), - C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4), - C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2), - C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da), - C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049), - C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf), - C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810), - C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c), - C32e(0x6c702438), C32e(0x08aef157), C32e(0x52e6c773), C32e(0xf3355197), - C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e), - C32e(0x7c37dd96), C32e(0x7fc2dc61), C32e(0x911a860d), 
C32e(0x941e850f), - C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc), - C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c), - C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069), - C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927), - C32e(0x48a938d9), C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322), - C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733), - C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9), - C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5), - C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a), - C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0), - C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e), - C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c) -}; - -uint32_t T1up_cpu[] = { - C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c), - C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc), - C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187), - C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5), - C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892), - C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d), - C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025), - C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed), - C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be), - C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1), - C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118), - C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41), - C32e(0x08081c14), C32e(0x959563f6), C32e(0x4646e9af), C32e(0x9d9d7fe2), - C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), C32e(0x2f2febc4), - C32e(0x0e0e151b), 
C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847), - C32e(0xcdcda76a), C32e(0x4e4ef5bb), C32e(0x7f7f334c), C32e(0xeaea50ba), - C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672), - C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16), - C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), C32e(0x7d7d3449), - C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2), - C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574), - C32e(0x4040e0a0), C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c), - C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd), - C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde), - C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a), - C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7), - C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698), - C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e), - C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085), - C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c), - C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5), - C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7), - C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271), - C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b), - C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9), - C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4), - C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281), - C32e(0x4444eeaa), C32e(0x5454d682), C32e(0x3b3bdde6), C32e(0x0b0b959e), - C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44), - C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a), - C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), C32e(0x14143622), - C32e(0x9292e476), 
C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37), - C32e(0x9f9f78e7), C32e(0xbdbd0fb2), C32e(0x4343692a), C32e(0xc4c435f1), - C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486), - C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2), - C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), C32e(0x4949723b), - C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f), - C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828), - C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96), - C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3), - C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63), - C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94), - C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5), - C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36), - C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b), - C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0), - C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755), - C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2), - C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960), - C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e), - C32e(0x0303898a), C32e(0x59594a13), C32e(0x0909929b), C32e(0x1a1a2339), - C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3), - C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33), - C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e) -}; - -uint32_t T1dn_cpu[] = { - C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d), - C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954), - C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d), - C32e(0x192bd519), C32e(0x62a67162), C32e(0xe6319ae6), 
C32e(0x9ab5c39a), - C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), C32e(0x8792ef87), - C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b), - C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea), - C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b), - C32e(0xc25deac2), C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a), - C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f), - C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908), - C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f), - C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e), - C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5), - C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d), - C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f), - C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e), - C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb), - C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face), - C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697), - C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c), - C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed), - C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b), - C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a), - C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116), - C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294), - C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781), - C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3), - C32e(0xf30e5ff3), C32e(0xfe19bafe), C32e(0xc05b1bc0), C32e(0x8a850a8a), - C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904), - C32e(0xdf7ac6df), C32e(0xc158eec1), C32e(0x759f4575), 
C32e(0x63a58463), - C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d), - C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f), - C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39), - C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447), - C32e(0xacef8bac), C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795), - C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f), - C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683), - C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c), - C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176), - C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e), - C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4), - C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6), - C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), C32e(0x8b86ff8b), - C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7), - C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0), - C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525), - C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018), - C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872), - C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551), - C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21), - C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85), - C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), C32e(0xaae583aa), - C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812), - C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0), - C32e(0x91a82e91), C32e(0x58e82958), C32e(0x27697427), C32e(0xb9d04eb9), - C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433), - C32e(0xbbd6bfbb), C32e(0x70904970), C32e(0x89800e89), 
C32e(0xa7f266a7), - C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920), - C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a), - C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417), - C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8), - C32e(0xc35e1fc3), C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11), - C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a) -}; - -uint32_t T2up_cpu[] = { - C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a), - C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d), - C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1), - C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59), - C32e(0x458f8f40), C32e(0x9d1f1fa3), C32e(0x40898949), C32e(0x87fafa68), - C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6), - C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560), - C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76), - C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2), - C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352), - C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1), - C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b), - C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f), - C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb), - C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98), - C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50), - C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446), - C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d), - C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34), - C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), C32e(0x971313b1), - C32e(0xf5a6a6a2), 
C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5), - C32e(0x604040e0), C32e(0x1fe3e3c2), C32e(0xc879793a), C32e(0xedb6b69a), - C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af), - C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b), - C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), C32e(0x16ededd7), - C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6), - C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66), - C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75), - C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580), - C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd), - C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7), - C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08), - C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2), - C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65), - C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3), - C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642), - C32e(0xa0c0c03b), C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322), - C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95), - C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c), - C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37), - C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436), - C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f), - C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435), - C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274), - C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18), - C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972), - C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), C32e(0x25cfcfa0), - C32e(0xafcaca20), 
C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038), - C32e(0xd56f6f0b), C32e(0x88f0f073), C32e(0x6f4a4afb), C32e(0x725c5cca), - C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764), - C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d), - C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), C32e(0x850f0f9b), - C32e(0x90e0e04b), C32e(0x427c7cba), C32e(0xc4717126), C32e(0xaacccc29), - C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a), - C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902), - C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7), - C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277), - C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1), - C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9), - C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b), - C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23), - C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003), - C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d), - C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62) -}; - -uint32_t T2dn_cpu[] = { - C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7), - C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39), - C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac), - C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3), - C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef), - C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded), - C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a), - C32e(0xdabfda46), C32e(0x02f702a6), C32e(0xa196a1d3), C32e(0xed5bed2d), - C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98), - C32e(0xee5aeed8), C32e(0xc341c3fc), C32e(0x060206f1), 
C32e(0xd14fd11d), - C32e(0xe45ce4d0), C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9), - C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154), - C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221), - C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e), - C32e(0x1b091b1c), C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5), - C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf), - C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268), - C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6), - C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa), - C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), C32e(0xa297a226), - C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499), - C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77), - C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4), - C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11), - C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1), - C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722), - C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7), - C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96), - C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a), - C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9), - C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584), - C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765), - C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d), - C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c), - C32e(0xf957f93d), C32e(0x0df20daa), C32e(0x9d829de3), C32e(0xc947c9f4), - C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7), - C32e(0xfba0fb9b), C32e(0xb398b332), C32e(0x68d16827), 
C32e(0x817f815d), - C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16), - C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450), - C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41), - C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228), - C32e(0x76db763f), C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b), - C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193), - C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff), - C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af), - C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92), - C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85), - C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820), - C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8), - C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335), - C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c), - C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e), - C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583), - C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638), - C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2), - C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e), - C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544), - C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266), - C32e(0xc1b6c15a), C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089), - C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51), - C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934), - C32e(0x75da75ca), C32e(0x533153b5), C32e(0x51c65113), C32e(0xd3b8d3bb), - C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c), - C32e(0x46cb46f6), C32e(0x1ffc1f4b), C32e(0x61d661da), 
C32e(0x4e3a4e58) -}; - -uint32_t T3up_cpu[] = { - C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6), - C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191), - C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656), - C32e(0xd519e7e7), C32e(0x7162b5b5), C32e(0x9ae64d4d), C32e(0xc39aecec), - C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa), - C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb), - C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545), - C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b), - C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c), - C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383), - C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9), - C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a), - C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d), - C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f), - C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf), - C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea), - C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434), - C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b), - C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d), - C32e(0xa47b5252), C32e(0xa13edddd), C32e(0xbc715e5e), C32e(0x26971313), - C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1), - C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6), - C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272), - C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585), - C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded), - C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), C32e(0x22941111), - C32e(0x0fcf8a8a), 
C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe), - C32e(0x5bf0a0a0), C32e(0xf0447878), C32e(0x4aba2525), C32e(0x96e34b4b), - C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505), - C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1), - C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), C32e(0x84634242), - C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf), - C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3), - C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e), - C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a), - C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6), - C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3), - C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b), - C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828), - C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad), - C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414), - C32e(0x3fdb9292), C32e(0x180a0c0c), C32e(0x906c4848), C32e(0x6be4b8b8), - C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4), - C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2), - C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada), - C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949), - C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf), - C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010), - C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c), - C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797), - C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e), - C32e(0x37dd9696), C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f), - C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), C32e(0x83aacccc), - C32e(0x3bd89090), 
C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c), - C32e(0x9fa3c2c2), C32e(0xd45f6a6a), C32e(0x47f9aeae), C32e(0xd2d06969), - C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727), - C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222), - C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), C32e(0x66a73333), - C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9), - C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5), - C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a), - C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0), - C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e), - C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c) -}; - -uint32_t T3dn_cpu[] = { - C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c), - C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc), - C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87), - C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5), - C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792), - C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d), - C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25), - C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed), - C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe), - C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1), - C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818), - C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41), - C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2), - C32e(0x48782878), C32e(0xcff8a1f8), C32e(0x1b110f11), C32e(0xebc4b5c4), - C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47), - C32e(0xa76a266a), C32e(0xf5bb69bb), C32e(0x334ccd4c), 
C32e(0x50ba9fba), - C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72), - C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16), - C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49), - C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2), - C32e(0xa204f504), C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74), - C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c), - C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd), - C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade), - C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), C32e(0xd73a163a), - C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7), - C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198), - C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e), - C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85), - C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c), - C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5), - C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7), - C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71), - C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b), - C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), C32e(0xb3c947c9), - C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4), - C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81), - C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e), - C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44), - C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a), - C32e(0x964d3b4d), C32e(0x9efa56fa), C32e(0xa6d24ed2), C32e(0x36221e22), - C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437), - C32e(0x78e75de7), C32e(0x0fb26eb2), C32e(0x692aef2a), 
C32e(0x35f1a6f1), - C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86), - C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2), - C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b), - C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f), - C32e(0x20eaafea), C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828), - C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296), - C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3), - C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163), - C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594), - C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5), - C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236), - C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b), - C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0), - C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355), - C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2), - C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060), - C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e), - C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739), - C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3), - C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133), - C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e) -}; - -__device__ void groestl512_perm_P(uint32_t *a) -{ - uint32_t t[32]; - -//#pragma unroll 14 - for(int r=0;r<14;r++) - { -#pragma unroll 16 - for(int k=0;k<16;k++) - { - a[(k*2)+0] ^= PC32up(k * 0x10, r); - //a[(k<<1)+1] ^= PC32dn(k * 0x10, r); - } - - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) - { - t[k + 0] = T0up( B32_0(a[k & 0x1f]) ) ^ - T1up( B32_1(a[(k + 2) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 4) & 0x1f]) ) 
^ - T3up( B32_3(a[(k + 6) & 0x1f]) ) ^ - T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^ - T3dn( B32_3(a[(k + 23) & 0x1f]) ); - - t[k + 1] = T0dn( B32_0(a[k & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^ - T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^ - T0up( B32_0(a[(k + 9) & 0x1f]) ) ^ - T1up( B32_1(a[(k + 11) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 13) & 0x1f]) ) ^ - T3up( B32_3(a[(k + 23) & 0x1f]) ); - } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; - } -} - -__device__ void groestl512_perm_Q(uint32_t *a) -{ -//#pragma unroll 14 - for(int r=0;r<14;r++) - { - uint32_t t[32]; - -#pragma unroll 16 - for(int k=0;k<16;k++) - { - a[(k*2)+0] ^= QC32up(k * 0x10, r); - a[(k*2)+1] ^= QC32dn(k * 0x10, r); - } - - // RBTT -#pragma unroll 16 - for(int k=0;k<32;k+=2) - { - t[k + 0] = T0up( B32_0(a[(k + 2) & 0x1f]) ) ^ - T1up( B32_1(a[(k + 6) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 10) & 0x1f]) ) ^ - T3up( B32_3(a[(k + 22) & 0x1f]) ) ^ - T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^ - T3dn( B32_3(a[(k + 13) & 0x1f]) ); - - t[k + 1] = T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^ - T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^ - T0up( B32_0(a[(k + 1) & 0x1f]) ) ^ - T1up( B32_1(a[(k + 5) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 9) & 0x1f]) ) ^ - T3up( B32_3(a[(k + 13) & 0x1f]) ); - } -#pragma unroll 32 - for(int k=0;k<32;k++) - a[k] = t[k]; - } -} - -template __global__ void groestl512_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t message[32]; - uint32_t state[32]; - - // lese message ein & verknüpfe diese mit dem hash1 von hefty1 - // lese den state ein - -#pragma unroll 32 - for(int k=0;k<32;k++) - { - state[k] 
= groestl_gpu_state[k]; - message[k] = groestl_gpu_msg[k]; - } - - uint32_t nounce = nonceVector[thread]; - // nounce setzen - //message[19] = startNounce + thread; - message[19] = nounce; - - uint32_t hashPosition = nounce - startNounce; - - // den richtigen Hefty1 Hash holen -// memcpy(&message[21], &heftyHashes[8 * hashPosition], sizeof(uint32_t) * 8); - uint32_t *heftyHash = &heftyHashes[8 * hashPosition]; -#pragma unroll 8 - for (int k=0; k<8; ++k) - message[BLOCKSIZE/4+k] = heftyHash[k]; - - uint32_t g[32]; -#pragma unroll 32 - for(int u=0;u<32;u++) - g[u] = message[u] ^ state[u]; - - // Perm - groestl512_perm_P(g); - groestl512_perm_Q(message); - -#pragma unroll 32 - for(int u=0;u<32;u++) - { - state[u] ^= g[u] ^ message[u]; - g[u] = state[u]; - } - - groestl512_perm_P(g); - -#pragma unroll 32 - for(int u=0;u<32;u++) - state[u] ^= g[u]; - - // kopiere Ergebnis -#pragma unroll 16 - for(int k=0;k<16;k++) - ((uint32_t*)outputHash)[16*hashPosition+k] = state[k + 16]; - } -} - -#define texDef(texname, texmem, texsource, texsize) \ - unsigned int *texmem; \ - cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ - texname.normalized = 0; \ - texname.filterMode = cudaFilterModePoint; \ - texname.addressMode[0] = cudaAddressModeClamp; \ - { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ - -// Setup-Funktionen -__host__ void groestl512_cpu_init(int thr_id, uint32_t threads) -{ - // Texturen mit obigem Makro initialisieren - texDef(t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256); - texDef(t0dn, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); - texDef(t1up, d_T1up, T1up_cpu, sizeof(uint32_t)*256); - texDef(t1dn, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); - texDef(t2up, d_T2up, T2up_cpu, sizeof(uint32_t)*256); - texDef(t2dn, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); - texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256); - texDef(t3dn, d_T3dn, 
T3dn_cpu, sizeof(uint32_t)*256); - - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads); -} - -static int BLOCKSIZE = 84; - -__host__ void groestl512_cpu_setBlock(void *data, int len) - // data muss 80/84-Byte haben! - // heftyHash hat 32-Byte -{ - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, len); - - // Erweitere die Nachricht auf den Nachrichtenblock (padding) - // Unsere Nachricht hat 112/116 Byte - if (len == 84) { - msgBlock[29] = 0x80; - msgBlock[31] = 0x01000000; - } else if (len == 80) { - msgBlock[28] = 0x80; - msgBlock[31] = 0x01000000; - } - // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird - // auf der GPU ausgeführt) - - // setze register - uint32_t groestl_state_init[32]; - memset(groestl_state_init, 0, sizeof(uint32_t) * 32); - groestl_state_init[31] = 0x20000; - - // state speichern - cudaMemcpyToSymbol(groestl_gpu_state, groestl_state_init, 128); - - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol(groestl_gpu_msg, msgBlock, 128); - BLOCKSIZE = len; -} - -__host__ void groestl512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy) -{ - // Hefty1 Hashes kopieren (eigentlich nur zum debuggen) - if (copy) - CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice)); -} - -__host__ void groestl512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce) -{ - const uint32_t threadsperblock = 128; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - if (BLOCKSIZE == 84) - groestl512_gpu_hash<84><<>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); - else if (BLOCKSIZE == 80) - 
groestl512_gpu_hash<80><<>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); -} diff --git a/heavy/cuda_groestl512.h b/heavy/cuda_groestl512.h deleted file mode 100644 index 0cdc13b809..0000000000 --- a/heavy/cuda_groestl512.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _CUDA_GROESTL512_H -#define _CUDA_GROESTL512_H - -void groestl512_cpu_init(int thr_id, int threads); -void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); -void groestl512_cpu_setBlock(void *data, int len); -void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce); - -#endif \ No newline at end of file diff --git a/heavy/cuda_hefty1.cu b/heavy/cuda_hefty1.cu deleted file mode 100644 index 7bc4411a29..0000000000 --- a/heavy/cuda_hefty1.cu +++ /dev/null @@ -1,410 +0,0 @@ -#include -#include - -#include "miner.h" - -#include "cuda_helper.h" - -#define USE_SHARED 1 - -// globaler Speicher für alle HeftyHashes aller Threads -uint32_t *heavy_heftyHashes[MAX_GPUS]; - -/* Hash-Tabellen */ -__constant__ uint32_t hefty_gpu_constantTable[64]; -#if USE_SHARED -#define heftyLookUp(x) (*((uint32_t*)heftytab + (x))) -#else -#define heftyLookUp(x) hefty_gpu_constantTable[x] -#endif - -// muss expandiert werden -__constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message -__constant__ uint32_t hefty_gpu_register[8]; -__constant__ uint32_t hefty_gpu_sponge[4]; - -uint32_t hefty_cpu_hashTable[] = { - 0x6a09e667UL, - 0xbb67ae85UL, - 0x3c6ef372UL, - 0xa54ff53aUL, - 0x510e527fUL, - 0x9b05688cUL, - 0x1f83d9abUL, - 0x5be0cd19UL }; - -uint32_t hefty_cpu_constantTable[] = { - 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, - 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, - 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, - 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, - 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, - 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, - 
0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, - 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, - 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, - 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, - 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, - 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, - 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, - 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, - 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, - 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL -}; - -#if 0 -#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) -#else -__host__ __device__ -static uint32_t S(uint32_t x, int n) -{ - return (((x) >> (n)) | ((x) << (32 - (n)))); -} -#endif - -#define R(x, n) ((x) >> (n)) -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) -#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) -#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) -#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) - -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) - -// uint8_t -#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) ) - -__host__ __forceinline__ __device__ -uint8_t smoosh2(uint32_t x) -{ - uint16_t w = (x >> 16) ^ (x & 0xffff); - uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) ); - return 24 - (((n >> 2) ^ (n & 0x03)) << 3); -} -// 4 auf einmal -#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F ) -#define getByte(x,y) ( ((x) >> (y)) & 0xFF ) - -__host__ __forceinline__ __device__ -void Mangle(uint32_t *inp) -{ - uint32_t r = smoosh4Quad(inp[0]); - uint32_t inp0org; - uint32_t tmp0Mask, tmp1Mask; - uint32_t in1, in2, isAddition; - int32_t tmp; - uint8_t b; - - inp[1] = inp[1] ^ S(inp[0], getByte(r, 24)); - - r += 0x01010101; - tmp = smoosh2(inp[1]); - b = getByte(r,tmp); - inp0org = S(inp[0], b); - tmp0Mask = (uint32_t) 
-((tmp >> 3) & 1); // Bit 3 an Position 0 - tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0 - - in1 = (inp[2] & ~inp0org) | - (tmp1Mask & ~inp[2] & inp0org) | - (~tmp0Mask & ~inp[2] & inp0org); - in2 = inp[2] += ~inp0org; - isAddition = ~tmp0Mask & tmp1Mask; - inp[2] = isAddition ? in2 : in1; - - r += 0x01010101; - tmp = smoosh2(inp[1] ^ inp[2]); - b = getByte(r,tmp); - inp0org = S(inp[0], b); - tmp0Mask = (uint32_t) -((tmp >> 3) & 1); // Bit 3 an Position 0 - tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0 - - in1 = (inp[3] & ~inp0org) | - (tmp1Mask & ~inp[3] & inp0org) | - (~tmp0Mask & ~inp[3] & inp0org); - in2 = inp[3] += ~inp0org; - isAddition = ~tmp0Mask & tmp1Mask; - inp[3] = isAddition ? in2 : in1; - - inp[0] ^= (inp[1] ^ inp[2]) + inp[3]; -} - -__host__ __forceinline__ __device__ -void Absorb(uint32_t *inp, uint32_t x) -{ - inp[0] ^= x; - Mangle(inp); -} - -__host__ __forceinline__ __device__ -uint32_t Squeeze(uint32_t *inp) -{ - uint32_t y = inp[0]; - Mangle(inp); - return y; -} - -__host__ __forceinline__ __device__ -uint32_t Br(uint32_t *sponge, uint32_t x) -{ - uint32_t r = Squeeze(sponge); - uint32_t t = ((r >> 8) & 0x1F); - uint32_t y = 1 << t; - - uint32_t a = (((r>>1) & 0x01) << t) & y; - uint32_t b = ((r & 0x01) << t) & y; - uint32_t c = x & y; - - uint32_t retVal = (x & ~y) | (~b & c) | (a & ~c); - return retVal; -} - -__device__ __forceinline__ -void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) -{ - uint32_t tmpBr; - - uint32_t brG = Br(sponge, regs[6]); - uint32_t brF = Br(sponge, regs[5]); - uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; - uint32_t brE = Br(sponge, regs[4]); - uint32_t tmp2 = tmp1 + S1(brE); - uint32_t brC = Br(sponge, regs[2]); - uint32_t brB = Br(sponge, regs[1]); - uint32_t brA = Br(sponge, regs[0]); - uint32_t tmp3 = Maj(brA, brB, brC); - tmpBr = Br(sponge, regs[0]); - uint32_t tmp4 = tmp3 + S0(tmpBr); - tmpBr = Br(sponge, tmp2); - - #pragma unroll 7 - 
for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = tmp2 + tmp4; - regs[4] += tmpBr; -} - -__host__ -void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) -{ - uint32_t tmpBr; - - uint32_t brG = Br(sponge, regs[6]); - uint32_t brF = Br(sponge, regs[5]); - uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; - uint32_t brE = Br(sponge, regs[4]); - uint32_t tmp2 = tmp1 + S1(brE); - uint32_t brC = Br(sponge, regs[2]); - uint32_t brB = Br(sponge, regs[1]); - uint32_t brA = Br(sponge, regs[0]); - uint32_t tmp3 = Maj(brA, brB, brC); - tmpBr = Br(sponge, regs[0]); - uint32_t tmp4 = tmp3 + S0(tmpBr); - tmpBr = Br(sponge, tmp2); - - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = tmp2 + tmp4; - regs[4] += tmpBr; -} - -__global__ -void hefty_gpu_hash(uint32_t threads, uint32_t startNounce, uint32_t *outputHash) -{ -#if USE_SHARED - extern __shared__ unsigned char heftytab[]; - if(threadIdx.x < 64) - { - *((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x]; - } - - __syncthreads(); -#endif - - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - uint32_t nounce = startNounce + thread; - - // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory - // reduktion von 256 byte auf 128 byte - uint32_t W1[16]; - uint32_t W2[16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - uint32_t sponge[4]; - -#pragma unroll 4 - for(int k=0; k < 4; k++) - sponge[k] = hefty_gpu_sponge[k]; - - // pre -#pragma unroll 8 - for (int k=0; k < 8; k++) - { - regs[k] = hefty_gpu_register[k]; - hash[k] = regs[k]; - } - - //memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding -#pragma unroll 16 - for(int k=0;k<16;k++) - W1[k] = hefty_gpu_blockHeader[k]; - W1[3] = SWAB32(nounce); - - // 2. 
Runde -#pragma unroll 16 - for(int j=0;j<16;j++) - Absorb(sponge, W1[j] ^ heftyLookUp(j)); - -// Progress W1 (Bytes 0...63) -#pragma unroll 16 - for(int j=0;j<16;j++) - { - Absorb(sponge, regs[3] ^ regs[7]); - hefty_gpu_round(regs, W1[j], heftyLookUp(j), sponge); - } - -// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ... - -#pragma unroll 3 - for(int k=0;k<3;k++) - { - #pragma unroll 2 - for(int j=0;j<2;j++) - W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - #pragma unroll 5 - for(int j=2;j<7;j++) - W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - - #pragma unroll 8 - for(int j=7;j<15;j++) - W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - #pragma unroll 16 - for(int j=0;j<16;j++) - { - Absorb(sponge, regs[3] + regs[7]); - hefty_gpu_round(regs, W2[j], heftyLookUp(j + ((k+1)<<4)), sponge); - } - #pragma unroll 16 - for(int j=0;j<16;j++) - W1[j] = W2[j]; - } - -#pragma unroll 8 - for(int k=0;k<8;k++) - hash[k] += regs[k]; - -#pragma unroll 8 - for(int k=0;k<8;k++) - ((uint32_t*)outputHash)[(thread<<3)+k] = SWAB32(hash[k]); - } -} - -__host__ -void hefty_cpu_init(int thr_id, uint32_t threads) -{ - CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); - - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( hefty_gpu_constantTable, - hefty_cpu_constantTable, - sizeof(uint32_t) * 64 ); - - // Speicher für alle Hefty1 hashes belegen - CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads)); -} - -__host__ -void hefty_cpu_setBlock(int thr_id, uint32_t threads, void *data, int len) -// data muss 80/84-Byte haben! 
-{ - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(msgBlock)); - memcpy(&msgBlock[0], data, len); - if (len == 84) { - msgBlock[21] |= 0x80; - msgBlock[31] = 672; // bitlen - } else if (len == 80) { - msgBlock[20] |= 0x80; - msgBlock[31] = 640; // bitlen - } - - for(int i=0;i<31;i++) // Byteorder drehen - msgBlock[i] = SWAB32(msgBlock[i]); - - // die erste Runde wird auf der CPU durchgeführt, da diese für - // alle Threads gleich ist. Der Hash wird dann an die Threads - // übergeben - - // Erstelle expandierten Block W - uint32_t W[64]; - memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); - for(int j=16;j<64;j++) - W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - uint32_t sponge[4]; - - // pre - memset(sponge, 0, sizeof(uint32_t) * 4); - for (int k=0; k < 8; k++) - { - regs[k] = hefty_cpu_hashTable[k]; - hash[k] = regs[k]; - } - - // 1. Runde - for(int j=0;j<16;j++) - Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]); - - for(int j=0;j<16;j++) - { - Absorb(sponge, regs[3] ^ regs[7]); - hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); - } - - for(int j=16;j<64;j++) - { - Absorb(sponge, regs[3] + regs[7]); - hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); - } - - for(int k=0;k<8;k++) - hash[k] += regs[k]; - - // sponge speichern - cudaMemcpyToSymbol(hefty_gpu_sponge, sponge, 16); - // hash speichern - cudaMemcpyToSymbol(hefty_gpu_register, hash, 32); - // Blockheader setzen (korrekte Nonce fehlt da drin noch) - CUDA_SAFE_CALL(cudaMemcpyToSymbol(hefty_gpu_blockHeader, &msgBlock[16], 64)); -} - -__host__ -void hefty_cpu_hash(int thr_id, uint32_t threads, int startNounce) -{ - uint32_t threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared 
Memory Bereichs -#if USE_SHARED - int shared_size = 8 * 64 * sizeof(uint32_t); -#else - int shared_size = 0; -#endif - - hefty_gpu_hash <<< grid, block, shared_size >>> (threads, startNounce, heavy_heftyHashes[thr_id]); - -} diff --git a/heavy/cuda_hefty1.h b/heavy/cuda_hefty1.h deleted file mode 100644 index 17b196c836..0000000000 --- a/heavy/cuda_hefty1.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _CUDA_HEFTY1_H -#define _CUDA_HEFTY1_H - -void hefty_cpu_hash(int thr_id, int threads, int startNounce); -void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len); -void hefty_cpu_init(int thr_id, int threads); - -#endif \ No newline at end of file diff --git a/heavy/cuda_keccak512.cu b/heavy/cuda_keccak512.cu deleted file mode 100644 index 76872e9824..0000000000 --- a/heavy/cuda_keccak512.cu +++ /dev/null @@ -1,276 +0,0 @@ -#include -#include - -#include "cuda_helper.h" - -// globaler Speicher für alle HeftyHashes aller Threads -extern uint32_t *heavy_heftyHashes[MAX_GPUS]; -extern uint32_t *heavy_nonceVector[MAX_GPUS]; - -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash3output[MAX_GPUS]; -extern uint32_t *d_hash4output[MAX_GPUS]; -extern uint32_t *d_hash5output[MAX_GPUS]; - -// der Keccak512 State nach der ersten Runde (72 Bytes) -__constant__ uint64_t c_State[25]; - -// die Message (72 Bytes) für die zweite Runde auf der GPU -__constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (Nonce at offset 4) plus padding - -// ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------ - -#define U32TO64_LE(p) \ - (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) - -#define U64TO32_LE(p, v) \ - *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); - -static __device__ void mycpy72(uint32_t *d, const uint32_t *s) { - #pragma unroll 18 - for (int k=0; k < 18; ++k) d[k] = s[k]; -} - -static __device__ void mycpy32(uint32_t *d, const uint32_t *s) { - #pragma unroll 8 - for (int k=0; k < 8; ++k) d[k] 
= s[k]; -} - -typedef struct keccak_hash_state_t { - uint64_t state[25]; // 25*2 - uint32_t buffer[72/4]; // 72 -} keccak_hash_state; - -__device__ void statecopy(uint64_t *d, uint64_t *s) -{ - #pragma unroll 25 - for (int i=0; i < 25; ++i) - d[i] = s[i]; -} - - -static const uint64_t host_keccak_round_constants[24] = { - 0x0000000000000001ull, 0x0000000000008082ull, - 0x800000000000808aull, 0x8000000080008000ull, - 0x000000000000808bull, 0x0000000080000001ull, - 0x8000000080008081ull, 0x8000000000008009ull, - 0x000000000000008aull, 0x0000000000000088ull, - 0x0000000080008009ull, 0x000000008000000aull, - 0x000000008000808bull, 0x800000000000008bull, - 0x8000000000008089ull, 0x8000000000008003ull, - 0x8000000000008002ull, 0x8000000000000080ull, - 0x000000000000800aull, 0x800000008000000aull, - 0x8000000080008081ull, 0x8000000000008080ull, - 0x0000000080000001ull, 0x8000000080008008ull -}; - -__constant__ uint64_t c_keccak_round_constants[24]; - -__host__ __device__ void -keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { - int i; - uint64_t t[5], u[5], v, w; - - /* absorb input */ - for (i = 0; i < 9 /* 72/8 */; i++, in += 2) - s[i] ^= U32TO64_LE(in); - - for (i = 0; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. 
a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[ 1]; - s[ 1] = ROTL64(s[ 6], 44); - s[ 6] = ROTL64(s[ 9], 20); - s[ 9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[ 2], 62); - s[ 2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[ 4], 27); - s[ 4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[ 8], 55); - s[ 8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[ 5], 36); - s[ 5] = ROTL64(s[ 3], 28); - s[ 3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[ 7], 6); - s[ 7] = ROTL64(s[10], 3); - s[10] = ROTL64( v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; - v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } -} - -// Die 
Hash-Funktion -template __global__ void keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - //uint32_t nounce = startNounce + thread; - uint32_t nounce = nonceVector[thread]; - - // Index-Position des Hashes in den Hash Puffern bestimmen (Hefty1 und outputHash) - uint32_t hashPosition = nounce - startNounce; - - // erstmal den State der ersten Runde holen - uint64_t keccak_gpu_state[25]; -#pragma unroll 25 - for (int i=0; i < 25; ++i) - keccak_gpu_state[i] = c_State[i]; - - // Message2 in den Puffer holen - uint32_t msgBlock[18]; - mycpy72(msgBlock, c_PaddedMessage2); - - // die individuelle Nonce einsetzen - msgBlock[1] = nounce; - - // den individuellen Hefty1 Hash einsetzen - mycpy32(&msgBlock[(BLOCKSIZE-72)/sizeof(uint32_t)], &heftyHashes[8 * hashPosition]); - - // den Block einmal gut durchschütteln - keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants); - - // das Hash erzeugen - uint32_t hash[16]; - -#pragma unroll 8 - for (int i = 0; i < 64; i += 8) { - U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); - } - - // und ins Global Memory rausschreiben -#pragma unroll 16 - for(int k=0;k<16;k++) - ((uint32_t*)outputHash)[16*hashPosition+k] = hash[k]; - } -} - -// ---------------------------- END CUDA keccak512 functions ------------------------------------ - -__host__ -void keccak512_cpu_init(int thr_id, uint32_t threads) -{ - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( c_keccak_round_constants, - host_keccak_round_constants, - sizeof(host_keccak_round_constants), - 0, cudaMemcpyHostToDevice); - - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads); -} - -// ----------------BEGIN keccak512 CPU version from scrypt-jane code -------------------- - -#define 
SCRYPT_HASH_DIGEST_SIZE 64 -#define SCRYPT_KECCAK_F 1600 -#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 1024 */ -#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 576 */ -#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8) /* 72 */ - -// --------------- END keccak512 CPU version from scrypt-jane code -------------------- - -static int BLOCKSIZE = 84; - -__host__ -void keccak512_cpu_setBlock(void *data, int len) - // data muss 80 oder 84-Byte haben! - // heftyHash hat 32-Byte -{ - // CH - // state init - uint64_t keccak_cpu_state[25]; - memset(keccak_cpu_state, 0, sizeof(keccak_cpu_state)); - - // erste Runde - keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants); - - // state kopieren - cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); - - // keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke - // zu jeweils - uint32_t msgBlock[18]; - memset(msgBlock, 0, 18 * sizeof(uint32_t)); - - // kopiere die restlichen Daten rein (aber nur alles nach Byte 72) - if (len == 84) - memcpy(&msgBlock[0], &((uint8_t*)data)[72], 12); - else if (len == 80) - memcpy(&msgBlock[0], &((uint8_t*)data)[72], 8); - - // Nachricht abschließen - if (len == 84) - msgBlock[11] = 0x01; - else if (len == 80) - msgBlock[10] = 0x01; - msgBlock[17] = 0x80000000; - - // Message 2 ins Constant Memory kopieren (die variable Nonce und - // der Hefty1 Anteil muss aber auf der GPU erst noch ersetzt werden) - cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice ); - - BLOCKSIZE = len; -} - -__host__ -void keccak512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy) -{ - // Hefty1 Hashes kopieren - if (copy) - CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice)); - //else cudaDeviceSynchronize(); -} - -__host__ -void keccak512_cpu_hash(int thr_id, 
uint32_t threads, uint32_t startNounce) -{ - const uint32_t threadsperblock = 128; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - if (BLOCKSIZE==84) - keccak512_gpu_hash<84><<>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); - else if (BLOCKSIZE==80) - keccak512_gpu_hash<80><<>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); -} diff --git a/heavy/cuda_keccak512.h b/heavy/cuda_keccak512.h deleted file mode 100644 index 1182447573..0000000000 --- a/heavy/cuda_keccak512.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _CUDA_KECCAK512_H -#define _CUDA_KECCAK512_H - -void keccak512_cpu_init(int thr_id, int threads); -void keccak512_cpu_setBlock(void *data, int len); -void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); -void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce); - -#endif diff --git a/heavy/cuda_sha256.cu b/heavy/cuda_sha256.cu deleted file mode 100644 index a4c309ba33..0000000000 --- a/heavy/cuda_sha256.cu +++ /dev/null @@ -1,272 +0,0 @@ -#include -#include - -#include "cuda_helper.h" - -// globaler Speicher für alle HeftyHashes aller Threads -extern uint32_t *heavy_heftyHashes[MAX_GPUS]; -extern uint32_t *heavy_nonceVector[MAX_GPUS]; - -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash2output[MAX_GPUS]; - - -/* Hash-Tabellen */ -__constant__ uint32_t sha256_gpu_constantTable[64]; - -// muss expandiert werden -__constant__ uint32_t sha256_gpu_blockHeader[16]; // 2x512 Bit Message -__constant__ uint32_t sha256_gpu_register[8]; - -uint32_t sha256_cpu_hashTable[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; -uint32_t sha256_cpu_constantTable[] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, -}; - -#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) -#define R(x, n) ((x) >> (n)) -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) -#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) -#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) -#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) - -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) - -// Die Hash-Funktion -template __global__ void sha256_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - uint32_t nounce = startNounce + thread; - nonceVector[thread] = nounce; - - // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory - uint32_t W1[16]; - uint32_t W2[16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - - // pre -#pragma unroll 8 - for (int k=0; k < 8; k++) - { - regs[k] = sha256_gpu_register[k]; - hash[k] = regs[k]; - } - - // 2. 
Runde - //memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke - //memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen -#pragma unroll 16 - for(int k=0;k<16;k++) - W1[k] = sha256_gpu_blockHeader[k]; - - uint32_t offset = 8 * (blockDim.x * blockIdx.x + threadIdx.x); -#pragma unroll 8 - for(int k=0;k<8;k++) - W1[((BLOCKSIZE-64)/4)+k] = heftyHashes[offset + k]; - -#pragma unroll 8 - for (int i=((BLOCKSIZE-64)/4); i < ((BLOCKSIZE-64)/4)+8; ++i) W1[i] = SWAB32(W1[i]); // die Hefty1 Hashes brauchen eine Drehung ;) - W1[3] = SWAB32(nounce); - -// Progress W1 -#pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W1[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -// Progress W2...W3 -#pragma unroll 3 - for(int k=0;k<3;k++) - { - #pragma unroll 2 - for(int j=0;j<2;j++) - W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - #pragma unroll 5 - for(int j=2;j<7;j++) - W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - - #pragma unroll 8 - for(int j=7;j<15;j++) - W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - // Rundenfunktion - #pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - - #pragma unroll 16 - for(int j=0;j<16;j++) - W1[j] = W2[j]; - } - -/* - for(int j=16;j<64;j++) - W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; - -#pragma unroll 64 - for(int j=0;j<64;j++) 
- { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } -*/ -#pragma unroll 8 - for(int k=0;k<8;k++) - hash[k] += regs[k]; - -#pragma unroll 8 - for(int k=0;k<8;k++) - ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]); - } -} - -// Setup-Funktionen -__host__ void sha256_cpu_init(int thr_id, uint32_t threads) -{ - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( sha256_gpu_constantTable, - sha256_cpu_constantTable, - sizeof(uint32_t) * 64 ); - - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads); -} - -static int BLOCKSIZE = 84; - -__host__ void sha256_cpu_setBlock(void *data, int len) - // data muss 80/84-Byte haben! - // heftyHash hat 32-Byte -{ - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, len); - if (len == 84) { - memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen - msgBlock[29] |= 0x80; - msgBlock[31] = 928; // bitlen - } else if (len == 80) { - memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen - msgBlock[28] |= 0x80; - msgBlock[31] = 896; // bitlen - } - - for(int i=0;i<31;i++) // Byteorder drehen - msgBlock[i] = SWAB32(msgBlock[i]); - - // die erste Runde wird auf der CPU durchgeführt, da diese für - // alle Threads gleich ist. 
Der Hash wird dann an die Threads - // übergeben - uint32_t W[64]; - - // Erstelle expandierten Block W - memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); - for(int j=16;j<64;j++) - W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t regs[8]; - uint32_t hash[8]; - - // pre - for (int k=0; k < 8; k++) - { - regs[k] = sha256_cpu_hashTable[k]; - hash[k] = regs[k]; - } - - // 1. Runde - for(int j=0;j<64;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_cpu_constantTable[j] + W[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - //#pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - // sollte mal noch durch memmov ersetzt werden! -// memcpy(®s[1], ®s[0], sizeof(uint32_t) * 7); - regs[0] = T1 + T2; - regs[4] += T1; - } - - for(int k=0;k<8;k++) - hash[k] += regs[k]; - - // hash speichern - cudaMemcpyToSymbol( sha256_gpu_register, - hash, - sizeof(uint32_t) * 8 ); - - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( sha256_gpu_blockHeader, - &msgBlock[16], - 64); - - BLOCKSIZE = len; -} - -__host__ void sha256_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy) -{ - // Hefty1 Hashes kopieren - if (copy) - CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice)); - //else cudaDeviceSynchronize(); -} - -__host__ void sha256_cpu_hash(int thr_id, uint32_t threads, int startNounce) -{ - const uint32_t threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - if (BLOCKSIZE == 84) - sha256_gpu_hash<84><<>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); - else if (BLOCKSIZE == 80) { - sha256_gpu_hash<80><<>>(threads, startNounce, 
d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); - } -} diff --git a/heavy/cuda_sha256.h b/heavy/cuda_sha256.h deleted file mode 100644 index 03385d125a..0000000000 --- a/heavy/cuda_sha256.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _CUDA_SHA256_H -#define _CUDA_SHA256_H - -void sha256_cpu_init(int thr_id, int threads); -void sha256_cpu_setBlock(void *data, int len); -void sha256_cpu_hash(int thr_id, int threads, int startNounce); -void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); -#endif diff --git a/heavy/heavy.cu b/heavy/heavy.cu deleted file mode 100644 index 99b0357f57..0000000000 --- a/heavy/heavy.cu +++ /dev/null @@ -1,352 +0,0 @@ -#include -#include -#include -// include thrust -#include -#include - -#include "miner.h" - -extern "C" { -#include "sph/sph_keccak.h" -#include "sph/sph_blake.h" -#include "sph/sph_groestl.h" -} -#include "hefty1.h" -#include "heavy/heavy.h" -#include "cuda_helper.h" - -extern uint32_t *d_hash2output[MAX_GPUS]; -extern uint32_t *d_hash3output[MAX_GPUS]; -extern uint32_t *d_hash4output[MAX_GPUS]; -extern uint32_t *d_hash5output[MAX_GPUS]; - -#define HEAVYCOIN_BLKHDR_SZ 84 -#define MNR_BLKHDR_SZ 80 - -// nonce-array für die threads -uint32_t *heavy_nonceVector[MAX_GPUS]; - -extern uint32_t *heavy_heftyHashes[MAX_GPUS]; - -/* Combines top 64-bits from each hash into a single hash */ -static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4) -{ - const uint32_t *hash[4] = { hash1, hash2, hash3, hash4 }; - int bits; - unsigned int i; - uint32_t mask; - unsigned int k; - - /* Transpose first 64 bits of each hash into out */ - memset(out, 0, 32); - bits = 0; - for (i = 7; i >= 6; i--) { - for (mask = 0x80000000; mask; mask >>= 1) { - for (k = 0; k < 4; k++) { - out[(255 - bits)/32] <<= 1; - if ((hash[k][i] & mask) != 0) - out[(255 - bits)/32] |= 1; - bits++; - } - } - } -} - -#ifdef _MSC_VER -#include 
-static uint32_t __inline bitsset( uint32_t x ) -{ - DWORD r = 0; - _BitScanReverse(&r, x); - return r; -} -#else -static uint32_t bitsset( uint32_t x ) -{ - return 31-__builtin_clz(x); -} -#endif - -// Finde das high bit in einem Multiword-Integer. -static int findhighbit(const uint32_t *ptarget, int words) -{ - int i; - int highbit = 0; - for (i=words-1; i >= 0; --i) - { - if (ptarget[i] != 0) { - highbit = i*32 + bitsset(ptarget[i])+1; - break; - } - } - return highbit; -} - -// Generiere ein Multiword-Integer das die Zahl -// (2 << highbit) - 1 repräsentiert. -static void genmask(uint32_t *ptarget, int words, int highbit) -{ - int i; - for (i=words-1; i >= 0; --i) - { - if ((i+1)*32 <= highbit) - ptarget[i] = UINT32_MAX; - else if (i*32 > highbit) - ptarget[i] = 0x00000000; - else - ptarget[i] = (1 << (highbit-i*32)) - 1; - } -} - -struct check_nonce_for_remove -{ - check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) : - m_target(target), - m_hashes(hashes), - m_hashlen(hashlen), - m_startNonce(startNonce) { } - - uint64_t m_target; - uint32_t *m_hashes; - uint32_t m_hashlen; - uint32_t m_startNonce; - - __device__ - bool operator()(const uint32_t x) - { - // Position im Hash Buffer - uint32_t hashIndex = x - m_startNonce; - // Wert des Hashes (als uint64_t) auslesen. - // Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits) - uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6])); - bool res = (hashValue & m_target) != hashValue; - //printf("ndx=%x val=%08x target=%lx\n", hashIndex, hashValue, m_target); - // gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein. 
- return res; - } -}; - -static bool init[MAX_GPUS] = { 0 }; - -__host__ -int scanhash_heavy(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote, int blocklen) -{ - const uint32_t first_nonce = pdata[19]; - // CUDA will process thousands of threads. - uint32_t throughput = device_intensity(thr_id, __func__, (1U << 19) - 256); - throughput = min(throughput, (max_nonce - first_nonce)); - - int rc = 0; - uint32_t *hash = NULL; - uint32_t *cpu_nonceVector = NULL; - CUDA_SAFE_CALL(cudaMallocHost(&hash, throughput*8*sizeof(uint32_t))); - CUDA_SAFE_CALL(cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t))); - - int nrmCalls[6]; - memset(nrmCalls, 0, sizeof(int) * 6); - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00ff; - - // für jeden Hash ein individuelles Target erstellen basierend - // auf dem höchsten Bit, das in ptarget gesetzt ist. - int highbit = findhighbit(ptarget, 8); - uint32_t target2[2], target3[2], target4[2], target5[2]; - genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256 - genmask(target3, 2, highbit/4+(((highbit%4)>2)?1:0) ); // keccak512 - genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512 - genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512 - - if (!init[thr_id]) - { - hefty_cpu_init(thr_id, throughput); - sha256_cpu_init(thr_id, throughput); - keccak512_cpu_init(thr_id, throughput); - groestl512_cpu_init(thr_id, throughput); - blake512_cpu_init(thr_id, throughput); - combine_cpu_init(thr_id, throughput); - - CUDA_SAFE_CALL(cudaMalloc(&heavy_nonceVector[thr_id], sizeof(uint32_t) * throughput)); - - init[thr_id] = true; - } - - if (blocklen == HEAVYCOIN_BLKHDR_SZ) - { - uint16_t *ext = (uint16_t *)&pdata[20]; - - if (opt_vote > maxvote) { - applog(LOG_WARNING, "Your block reward vote (%hu) exceeds " - "the maxvote reported by the pool (%hu).", - opt_vote, maxvote); - } - - if (opt_trust_pool && opt_vote > maxvote) { - 
applog(LOG_WARNING, "Capping block reward vote to maxvote reported by pool."); - ext[0] = maxvote; - } - else - ext[0] = opt_vote; - } - - // Setze die Blockdaten - hefty_cpu_setBlock(thr_id, throughput, pdata, blocklen); - sha256_cpu_setBlock(pdata, blocklen); - keccak512_cpu_setBlock(pdata, blocklen); - groestl512_cpu_setBlock(pdata, blocklen); - blake512_cpu_setBlock(pdata, blocklen); - - do { - - ////// Compaction init - thrust::device_ptr devNoncePtr(heavy_nonceVector[thr_id]); - thrust::device_ptr devNoncePtrEnd((heavy_nonceVector[thr_id]) + throughput); - uint32_t actualNumberOfValuesInNonceVectorGPU = throughput; - uint64_t *t; - - hefty_cpu_hash(thr_id, throughput, pdata[19]); - //cudaDeviceSynchronize(); - sha256_cpu_hash(thr_id, throughput, pdata[19]); - //cudaDeviceSynchronize(); - - - ////// Compaction - t = (uint64_t*) target2; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash2output[thr_id], 8, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - //cudaDeviceSynchronize(); - - ////// Compaction - t = (uint64_t*) target3; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash3output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - blake512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - //cudaDeviceSynchronize(); - - ////// Compaction - t = (uint64_t*) target5; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash5output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - 
if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - groestl512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - //cudaDeviceSynchronize(); - - ////// Compaction - t = (uint64_t*) target4; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash4output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - // combine - combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash); - - if (opt_tracegpu) { - applog(LOG_BLUE, "heavy GPU hash:"); - applog_hash((uchar*)hash); - } - - // Ergebnisse kopieren - if(actualNumberOfValuesInNonceVectorGPU > 0) - { - size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU; - CUDA_SAFE_CALL(cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost)); - - for (uint32_t i=0; i < actualNumberOfValuesInNonceVectorGPU; i++) - { - uint32_t nonce = cpu_nonceVector[i]; - uint32_t *foundhash = &hash[8*i]; - if (foundhash[7] <= ptarget[7]) { - if (fulltest(foundhash, ptarget)) { - uint32_t verification[8]; - pdata[19] += nonce - pdata[19]; - heavycoin_hash((uchar*)verification, (uchar*)pdata, blocklen); - if (memcmp(verification, foundhash, 8*sizeof(uint32_t))) { - applog(LOG_ERR, "hash for nonce=$%08X does not validate on CPU!\n", nonce); - } else { - *hashes_done = pdata[19] - first_nonce; - rc = 1; - goto exit; - } - } - } - } - } - -emptyNonceVector: - - pdata[19] += throughput; - - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce; - -exit: - cudaFreeHost(cpu_nonceVector); - cudaFreeHost(hash); - return rc; -} - -__host__ -void heavycoin_hash(uchar* output, const uchar* input, int len) -{ - unsigned char hash1[32]; - unsigned char hash2[32]; - uint32_t hash3[16]; - uint32_t hash4[16]; - uint32_t hash5[16]; - 
uint32_t *final; - SHA256_CTX ctx; - sph_keccak512_context keccakCtx; - sph_groestl512_context groestlCtx; - sph_blake512_context blakeCtx; - - HEFTY1(input, len, hash1); - - /* HEFTY1 is new, so take an extra security measure to eliminate - * the possiblity of collisions: - * - * Hash(x) = SHA256(x + HEFTY1(x)) - * - * N.B. '+' is concatenation. - */ - SHA256_Init(&ctx); - SHA256_Update(&ctx, input, len); - SHA256_Update(&ctx, hash1, sizeof(hash1)); - SHA256_Final(hash2, &ctx); - - /* Additional security: Do not rely on a single cryptographic hash - * function. Instead, combine the outputs of 4 of the most secure - * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512 - * and BLAKE512. - */ - - sph_keccak512_init(&keccakCtx); - sph_keccak512(&keccakCtx, input, len); - sph_keccak512(&keccakCtx, hash1, sizeof(hash1)); - sph_keccak512_close(&keccakCtx, (void *)&hash3); - - sph_groestl512_init(&groestlCtx); - sph_groestl512(&groestlCtx, input, len); - sph_groestl512(&groestlCtx, hash1, sizeof(hash1)); - sph_groestl512_close(&groestlCtx, (void *)&hash4); - - sph_blake512_init(&blakeCtx); - sph_blake512(&blakeCtx, input, len); - sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)); - sph_blake512_close(&blakeCtx, (void *)&hash5); - - final = (uint32_t *)output; - combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5); -} diff --git a/heavy/heavy.h b/heavy/heavy.h deleted file mode 100644 index 59f39139ba..0000000000 --- a/heavy/heavy.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _CUDA_HEAVY_H -#define _CUDA_HEAVY_H - -void blake512_cpu_init(int thr_id, uint32_t threads); -void blake512_cpu_setBlock(void *pdata, int len); -void blake512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce); - -void groestl512_cpu_init(int thr_id, uint32_t threads); -void groestl512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy); -void groestl512_cpu_setBlock(void *data, int len); -void groestl512_cpu_hash(int thr_id, uint32_t 
threads, uint32_t startNounce); - -void hefty_cpu_hash(int thr_id, uint32_t threads, int startNounce); -void hefty_cpu_setBlock(int thr_id, uint32_t threads, void *data, int len); -void hefty_cpu_init(int thr_id, uint32_t threads); - -void keccak512_cpu_init(int thr_id, uint32_t threads); -void keccak512_cpu_setBlock(void *data, int len); -void keccak512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy); -void keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce); - -void sha256_cpu_init(int thr_id, uint32_t threads); -void sha256_cpu_setBlock(void *data, int len); -void sha256_cpu_hash(int thr_id, uint32_t threads, int startNounce); -void sha256_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy); - -void combine_cpu_init(int thr_id, uint32_t threads); -void combine_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *hash); - -#endif diff --git a/install-sh b/install-sh deleted file mode 100644 index a9244eb078..0000000000 --- a/install-sh +++ /dev/null @@ -1,527 +0,0 @@ -#!/bin/sh -# install - install a program, script, or datafile - -scriptversion=2011-01-19.21; # UTC - -# This originates from X11R5 (mit/util/scripts/install.sh), which was -# later released in X11R6 (xc/config/util/install.sh) with the -# following copyright and license. -# -# Copyright (C) 1994 X Consortium -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN -# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- -# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# Except as contained in this notice, the name of the X Consortium shall not -# be used in advertising or otherwise to promote the sale, use or other deal- -# ings in this Software without prior written authorization from the X Consor- -# tium. -# -# -# FSF changes to this file are in the public domain. -# -# Calling this script install-sh is preferred over install.sh, to prevent -# `make' implicit rules from creating a file called install from it -# when there is no Makefile. -# -# This script is compatible with the BSD install script, but was written -# from scratch. - -nl=' -' -IFS=" "" $nl" - -# set DOITPROG to echo to test this script - -# Don't use :- since 4.3BSD and earlier shells don't like it. -doit=${DOITPROG-} -if test -z "$doit"; then - doit_exec=exec -else - doit_exec=$doit -fi - -# Put in absolute file names if you don't have them in your path; -# or use environment vars. - -chgrpprog=${CHGRPPROG-chgrp} -chmodprog=${CHMODPROG-chmod} -chownprog=${CHOWNPROG-chown} -cmpprog=${CMPPROG-cmp} -cpprog=${CPPROG-cp} -mkdirprog=${MKDIRPROG-mkdir} -mvprog=${MVPROG-mv} -rmprog=${RMPROG-rm} -stripprog=${STRIPPROG-strip} - -posix_glob='?' -initialize_posix_glob=' - test "$posix_glob" != "?" || { - if (set -f) 2>/dev/null; then - posix_glob= - else - posix_glob=: - fi - } -' - -posix_mkdir= - -# Desired mode of installed file. 
-mode=0755 - -chgrpcmd= -chmodcmd=$chmodprog -chowncmd= -mvcmd=$mvprog -rmcmd="$rmprog -f" -stripcmd= - -src= -dst= -dir_arg= -dst_arg= - -copy_on_change=false -no_target_directory= - -usage="\ -Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE - or: $0 [OPTION]... SRCFILES... DIRECTORY - or: $0 [OPTION]... -t DIRECTORY SRCFILES... - or: $0 [OPTION]... -d DIRECTORIES... - -In the 1st form, copy SRCFILE to DSTFILE. -In the 2nd and 3rd, copy all SRCFILES to DIRECTORY. -In the 4th, create DIRECTORIES. - -Options: - --help display this help and exit. - --version display version info and exit. - - -c (ignored) - -C install only if different (preserve the last data modification time) - -d create directories instead of installing files. - -g GROUP $chgrpprog installed files to GROUP. - -m MODE $chmodprog installed files to MODE. - -o USER $chownprog installed files to USER. - -s $stripprog installed files. - -t DIRECTORY install into DIRECTORY. - -T report an error if DSTFILE is a directory. - -Environment variables override the default commands: - CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG - RMPROG STRIPPROG -" - -while test $# -ne 0; do - case $1 in - -c) ;; - - -C) copy_on_change=true;; - - -d) dir_arg=true;; - - -g) chgrpcmd="$chgrpprog $2" - shift;; - - --help) echo "$usage"; exit $?;; - - -m) mode=$2 - case $mode in - *' '* | *' '* | *' -'* | *'*'* | *'?'* | *'['*) - echo "$0: invalid mode: $mode" >&2 - exit 1;; - esac - shift;; - - -o) chowncmd="$chownprog $2" - shift;; - - -s) stripcmd=$stripprog;; - - -t) dst_arg=$2 - # Protect names problematic for `test' and other utilities. 
- case $dst_arg in - -* | [=\(\)!]) dst_arg=./$dst_arg;; - esac - shift;; - - -T) no_target_directory=true;; - - --version) echo "$0 $scriptversion"; exit $?;; - - --) shift - break;; - - -*) echo "$0: invalid option: $1" >&2 - exit 1;; - - *) break;; - esac - shift -done - -if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then - # When -d is used, all remaining arguments are directories to create. - # When -t is used, the destination is already specified. - # Otherwise, the last argument is the destination. Remove it from $@. - for arg - do - if test -n "$dst_arg"; then - # $@ is not empty: it contains at least $arg. - set fnord "$@" "$dst_arg" - shift # fnord - fi - shift # arg - dst_arg=$arg - # Protect names problematic for `test' and other utilities. - case $dst_arg in - -* | [=\(\)!]) dst_arg=./$dst_arg;; - esac - done -fi - -if test $# -eq 0; then - if test -z "$dir_arg"; then - echo "$0: no input file specified." >&2 - exit 1 - fi - # It's OK to call `install-sh -d' without argument. - # This can happen when creating conditional directories. - exit 0 -fi - -if test -z "$dir_arg"; then - do_exit='(exit $ret); exit $ret' - trap "ret=129; $do_exit" 1 - trap "ret=130; $do_exit" 2 - trap "ret=141; $do_exit" 13 - trap "ret=143; $do_exit" 15 - - # Set umask so as not to create temps with too-generous modes. - # However, 'strip' requires both read and write access to temps. - case $mode in - # Optimize common cases. - *644) cp_umask=133;; - *755) cp_umask=22;; - - *[0-7]) - if test -z "$stripcmd"; then - u_plus_rw= - else - u_plus_rw='% 200' - fi - cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;; - *) - if test -z "$stripcmd"; then - u_plus_rw= - else - u_plus_rw=,u+rw - fi - cp_umask=$mode$u_plus_rw;; - esac -fi - -for src -do - # Protect names problematic for `test' and other utilities. - case $src in - -* | [=\(\)!]) src=./$src;; - esac - - if test -n "$dir_arg"; then - dst=$src - dstdir=$dst - test -d "$dstdir" - dstdir_status=$? 
- else - - # Waiting for this to be detected by the "$cpprog $src $dsttmp" command - # might cause directories to be created, which would be especially bad - # if $src (and thus $dsttmp) contains '*'. - if test ! -f "$src" && test ! -d "$src"; then - echo "$0: $src does not exist." >&2 - exit 1 - fi - - if test -z "$dst_arg"; then - echo "$0: no destination specified." >&2 - exit 1 - fi - dst=$dst_arg - - # If destination is a directory, append the input filename; won't work - # if double slashes aren't ignored. - if test -d "$dst"; then - if test -n "$no_target_directory"; then - echo "$0: $dst_arg: Is a directory" >&2 - exit 1 - fi - dstdir=$dst - dst=$dstdir/`basename "$src"` - dstdir_status=0 - else - # Prefer dirname, but fall back on a substitute if dirname fails. - dstdir=` - (dirname "$dst") 2>/dev/null || - expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$dst" : 'X\(//\)[^/]' \| \ - X"$dst" : 'X\(//\)$' \| \ - X"$dst" : 'X\(/\)' \| . 2>/dev/null || - echo X"$dst" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q' - ` - - test -d "$dstdir" - dstdir_status=$? - fi - fi - - obsolete_mkdir_used=false - - if test $dstdir_status != 0; then - case $posix_mkdir in - '') - # Create intermediate dirs using mode 755 as modified by the umask. - # This is like FreeBSD 'install' as of 1997-10-28. - umask=`umask` - case $stripcmd.$umask in - # Optimize common cases. - *[2367][2367]) mkdir_umask=$umask;; - .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;; - - *[0-7]) - mkdir_umask=`expr $umask + 22 \ - - $umask % 100 % 40 + $umask % 20 \ - - $umask % 10 % 4 + $umask % 2 - `;; - *) mkdir_umask=$umask,go-w;; - esac - - # With -d, create the new directory with the user-specified mode. - # Otherwise, rely on $mkdir_umask. 
- if test -n "$dir_arg"; then - mkdir_mode=-m$mode - else - mkdir_mode= - fi - - posix_mkdir=false - case $umask in - *[123567][0-7][0-7]) - # POSIX mkdir -p sets u+wx bits regardless of umask, which - # is incompatible with FreeBSD 'install' when (umask & 300) != 0. - ;; - *) - tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ - trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0 - - if (umask $mkdir_umask && - exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1 - then - if test -z "$dir_arg" || { - # Check for POSIX incompatibilities with -m. - # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or - # other-writeable bit of parent directory when it shouldn't. - # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. - ls_ld_tmpdir=`ls -ld "$tmpdir"` - case $ls_ld_tmpdir in - d????-?r-*) different_mode=700;; - d????-?--*) different_mode=755;; - *) false;; - esac && - $mkdirprog -m$different_mode -p -- "$tmpdir" && { - ls_ld_tmpdir_1=`ls -ld "$tmpdir"` - test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" - } - } - then posix_mkdir=: - fi - rmdir "$tmpdir/d" "$tmpdir" - else - # Remove any dirs left behind by ancient mkdir implementations. - rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null - fi - trap '' 0;; - esac;; - esac - - if - $posix_mkdir && ( - umask $mkdir_umask && - $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir" - ) - then : - else - - # The umask is ridiculous, or mkdir does not conform to POSIX, - # or it failed possibly due to a race condition. Create the - # directory the slow way, step by step, checking for races as we go. 
- - case $dstdir in - /*) prefix='/';; - [-=\(\)!]*) prefix='./';; - *) prefix='';; - esac - - eval "$initialize_posix_glob" - - oIFS=$IFS - IFS=/ - $posix_glob set -f - set fnord $dstdir - shift - $posix_glob set +f - IFS=$oIFS - - prefixes= - - for d - do - test X"$d" = X && continue - - prefix=$prefix$d - if test -d "$prefix"; then - prefixes= - else - if $posix_mkdir; then - (umask=$mkdir_umask && - $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break - # Don't fail if two instances are running concurrently. - test -d "$prefix" || exit 1 - else - case $prefix in - *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;; - *) qprefix=$prefix;; - esac - prefixes="$prefixes '$qprefix'" - fi - fi - prefix=$prefix/ - done - - if test -n "$prefixes"; then - # Don't fail if two instances are running concurrently. - (umask $mkdir_umask && - eval "\$doit_exec \$mkdirprog $prefixes") || - test -d "$dstdir" || exit 1 - obsolete_mkdir_used=true - fi - fi - fi - - if test -n "$dir_arg"; then - { test -z "$chowncmd" || $doit $chowncmd "$dst"; } && - { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } && - { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false || - test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1 - else - - # Make a couple of temp file names in the proper directory. - dsttmp=$dstdir/_inst.$$_ - rmtmp=$dstdir/_rm.$$_ - - # Trap to clean up those temp files at exit. - trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 - - # Copy the file name to the temp name. - (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") && - - # and set any options; do chmod last to preserve setuid bits. - # - # If any of these fail, we abort the whole thing. If we want to - # ignore errors from any of these, just make sure not to ignore - # errors from the above "$doit $cpprog $src $dsttmp" command. 
- # - { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } && - { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } && - { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } && - { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } && - - # If -C, don't bother to copy if it wouldn't change the file. - if $copy_on_change && - old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` && - new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` && - - eval "$initialize_posix_glob" && - $posix_glob set -f && - set X $old && old=:$2:$4:$5:$6 && - set X $new && new=:$2:$4:$5:$6 && - $posix_glob set +f && - - test "$old" = "$new" && - $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1 - then - rm -f "$dsttmp" - else - # Rename the file to the real destination. - $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null || - - # The rename failed, perhaps because mv can't rename something else - # to itself, or perhaps because mv is so ancient that it does not - # support -f. - { - # Now remove or move aside any old file at destination location. - # We try this two ways since rm can't unlink itself on some - # systems and the destination file might be busy for other - # reasons. In this case, the final cleanup might fail but the new - # file should still install successfully. - { - test ! -f "$dst" || - $doit $rmcmd -f "$dst" 2>/dev/null || - { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null && - { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; } - } || - { echo "$0: cannot unlink or rename $dst" >&2 - (exit 1); exit 1 - } - } && - - # Now rename the file to the real destination. 
- $doit $mvcmd "$dsttmp" "$dst" - } - fi || exit 1 - - trap '' 0 - fi -done - -# Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" -# time-stamp-end: "; # UTC" -# End: diff --git a/lyra2/Lyra2.c b/lyra2/Lyra2.c index 452f37cab7..d99659a624 100644 --- a/lyra2/Lyra2.c +++ b/lyra2/Lyra2.c @@ -21,7 +21,6 @@ #include #include #include - #include "Lyra2.h" #include "Sponge.h" @@ -44,8 +43,176 @@ * * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) */ -int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) -{ +int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) { + + //============================= Basic variables ============================// + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + //==========================================================================/ + + //========== Initializing the Memory Matrix and pointers to it =============// + //Tries to allocate enough space for the whole memory matrix + + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + + i = (int64_t) ((int64_t) nRows * 
(int64_t) ROW_LEN_BYTES); + uint64_t *wholeMatrix = malloc(i); + if (wholeMatrix == NULL) { + return -1; + } + memset(wholeMatrix, 0, i); + + //Allocates pointers to each row of the matrix + uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*)); + if (memMatrix == NULL) { + return -1; + } + //Places the pointers in the correct positions + uint64_t *ptrWord = wholeMatrix; + for (i = 0; i < nRows; i++) { + memMatrix[i] = ptrWord; + ptrWord += ROW_LEN_INT64; + } + //==========================================================================/ + + //============= Getting the password + salt + basil padded with 10*1 ===============// + //OBS.:The memory matrix will temporarily hold the password: not for saving memory, + //but this ensures that the password copied locally will be overwritten as soon as possible + + //First, we clean enough blocks for the password, salt, basil and padding + uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + byte *ptrByte = (byte*) wholeMatrix; + memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES); + + //Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + //Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface + memcpy(ptrByte, &kLen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &pwdlen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &saltlen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &timeCost, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &nRows, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &nCols, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + + //Now comes the padding + *ptrByte = 0x80; //first byte of padding: right after the password + ptrByte = (byte*) wholeMatrix; //resets 
the pointer to the start of the memory matrix + ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block + *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block + //==========================================================================/ + + //======================= Initializing the Sponge State ====================// + //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) + uint64_t *state = malloc(16 * sizeof (uint64_t)); + if (state == NULL) { + return -1; + } + initState(state); + //==========================================================================/ + + //================================ Setup Phase =============================// + //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits + ptrWord = wholeMatrix; + for (i = 0; i < nBlocksInput; i++) { + absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil) + } + //Initializes M[0] and M[1] + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); + + do { + //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); + + + //updates the value of row* (deterministically picked during Setup)) + rowa = (rowa + step) & (window - 1); + //update prev: it now points to the last row ever computed + prev = row; + //updates row: goes to the next row to be computed + row++; + + //Checks if all rows in the window where visited. 
+ if (rowa == 0) { + step = window + gap; //changes the step: approximately doubles its value + window *= 2; //doubles the size of the re-visitation window + gap = -gap; //inverts the modifier to the step + } + + } while (row < nRows); + //==========================================================================/ + + //============================ Wandering Phase =============================// + row = 0; //Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= timeCost; tau++) { + //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 + step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; + do { + //Selects a pseudorandom index row* + //------------------------------------------------------------------------------------------ + //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); + + //update prev: it now points to the last row ever computed + prev = row; + + //updates row: goes to the next row to be computed + //------------------------------------------------------------------------------------------ + //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + } while (row != 0); + } + //==========================================================================/ + + //============================ Wrap-up Phase ===============================// + //Absorbs the last block of the memory matrix + absorbBlock(state, 
memMatrix[rowa]); + + //Squeezes the key + squeeze(state, K, kLen); + //==========================================================================/ + + //========================= Freeing the memory =============================// + free(memMatrix); + free(wholeMatrix); + + //Wiping out the sponge's internal state before freeing it + memset(state, 0, 16 * sizeof (uint64_t)); + free(state); + //==========================================================================/ + + return 0; +} + +int LYRA2_old(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) { + //============================= Basic variables ============================// int64_t row = 2; //index of row to be processed int64_t prev = 1; //index of prev (last row ever computed/modified) @@ -59,21 +226,26 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //========== Initializing the Memory Matrix and pointers to it =============// //Tries to allocate enough space for the whole memory matrix - i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); - uint64_t *wholeMatrix = (uint64_t*) malloc((size_t) i); + + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + + i = (int64_t)((int64_t)nRows * (int64_t)ROW_LEN_BYTES); + uint64_t *wholeMatrix = malloc(i); if (wholeMatrix == NULL) { return -1; } - memset(wholeMatrix, 0, (size_t) i); + memset(wholeMatrix, 0, i); //Allocates pointers to each row of the matrix - uint64_t **memMatrix = malloc((size_t) nRows * sizeof(uint64_t*)); + uint64_t **memMatrix = malloc(nRows * sizeof(uint64_t*)); if (memMatrix == NULL) { return -1; } //Places the pointers in the correct positions uint64_t *ptrWord = wholeMatrix; - for (i = 0; i < (int64_t) nRows; i++) { + for (i = 0; i < nRows; i++) { memMatrix[i] = ptrWord; ptrWord += ROW_LEN_INT64; } @@ -84,43 +256,42 @@ int LYRA2(void *K, uint64_t 
kLen, const void *pwd, uint64_t pwdlen, const void * //but this ensures that the password copied locally will be overwritten as soon as possible //First, we clean enough blocks for the password, salt, basil and padding - uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; - - byte *ptrByte = (byte*) wholeMatrix; - memset(ptrByte, 0, (size_t) nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES); + uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + byte *ptrByte = (byte*)wholeMatrix; + memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES); //Prepends the password - memcpy(ptrByte, pwd, (size_t) pwdlen); + memcpy(ptrByte, pwd, pwdlen); ptrByte += pwdlen; //Concatenates the salt - memcpy(ptrByte, salt, (size_t) saltlen); + memcpy(ptrByte, salt, saltlen); ptrByte += saltlen; //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface - memcpy(ptrByte, &kLen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &pwdlen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &saltlen, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &timeCost, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &nRows, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); - memcpy(ptrByte, &nCols, sizeof (uint64_t)); - ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &kLen, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &pwdlen, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &saltlen, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &timeCost, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &nRows, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); + memcpy(ptrByte, &nCols, sizeof(uint64_t)); + ptrByte += sizeof(uint64_t); //Now comes the padding *ptrByte = 0x80; //first byte of padding: 
right after the password - ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix + ptrByte = (byte*)wholeMatrix; //resets the pointer to the start of the memory matrix ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block //==========================================================================/ //======================= Initializing the Sponge State ====================// //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - uint64_t *state = malloc(16 * sizeof (uint64_t)); + uint64_t *state = malloc(16 * sizeof(uint64_t)); if (state == NULL) { return -1; } @@ -130,20 +301,18 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //================================ Setup Phase =============================// //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits ptrWord = wholeMatrix; - for (i = 0; i < (int64_t) nBlocksInput; i++) { + for (i = 0; i < nBlocksInput; i++) { absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil) } - //Initializes M[0] and M[1] - reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here - - reducedDuplexRow1(state, memMatrix[0], memMatrix[1]); + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); do { //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); - reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], 
memMatrix[row]); //updates the value of row* (deterministically picked during Setup)) rowa = (rowa + step) & (window - 1); @@ -154,35 +323,35 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //Checks if all rows in the window where visited. if (rowa == 0) { - step = window + gap; //changes the step: approximately doubles its value - window *= 2; //doubles the size of the re-visitation window - gap = -gap; //inverts the modifier to the step - } + step = window + gap; //changes the step: approximately doubles its value + window *= 2; //doubles the size of the re-visitation window + gap = -gap; //inverts the modifier to the step + } - } while (row < (int64_t) nRows); + } while (row < nRows); //==========================================================================/ //============================ Wandering Phase =============================// row = 0; //Resets the visitation to the first row of the memory matrix - for (tau = 1; tau <= (int64_t) timeCost; tau++) { + for (tau = 1; tau <= timeCost; tau++) { //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 - step = ((tau & 1) == 0) ? -1 : nRows / 2 - 1; + step = (tau % 2 == 0) ? 
-1 : nRows / 2 - 1; do { //Selects a pseudorandom index row* //------------------------------------------------------------------------------------------ - //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) - rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + rowa = ((uint64_t)(state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) //------------------------------------------------------------------------------------------ //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] - reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); //update prev: it now points to the last row ever computed prev = row; //updates row: goes to the next row to be computed //------------------------------------------------------------------------------------------ - //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) //------------------------------------------------------------------------------------------ @@ -195,7 +364,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * absorbBlock(state, memMatrix[rowa]); //Squeezes the key - squeeze(state, K, (size_t) kLen); + squeeze(state, K, kLen); //==========================================================================/ //========================= Freeing the memory =============================// @@ -203,7 +372,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * free(wholeMatrix); //Wiping out the sponge's internal state before freeing it - memset(state, 0, 16 * sizeof (uint64_t)); + memset(state, 0, 16 * 
sizeof(uint64_t)); free(state); //==========================================================================/ diff --git a/lyra2/Lyra2.h b/lyra2/Lyra2.h index 229b2c9cc3..e595ecea95 100644 --- a/lyra2/Lyra2.h +++ b/lyra2/Lyra2.h @@ -18,9 +18,13 @@ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef LYRA2_H_ -#define LYRA2_H_ - +#define LYRA2_H_ + +#ifdef __cplusplus +#include +#else #include +#endif typedef unsigned char byte; @@ -37,14 +41,7 @@ typedef unsigned char byte; #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes #endif -#ifndef N_COLS - #define N_COLS 8 //Number of columns in the memory matrix: fixed to 64 by default -#endif - -#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks -#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row - - int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols); +int LYRA2_old(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols); #endif /* LYRA2_H_ */ diff --git a/lyra2/Sponge.c b/lyra2/Sponge.c index e0a001e0ee..104c188f7a 100644 --- a/lyra2/Sponge.c +++ b/lyra2/Sponge.c @@ -41,7 +41,6 @@ //First 512 bis are zeros memset(state, 0, 64); //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV - state[8] = blake2b_IV[0]; state[9] = blake2b_IV[1]; state[10] = blake2b_IV[2]; @@ -50,7 +49,6 @@ state[13] = blake2b_IV[5]; state[14] = blake2b_IV[6]; state[15] = blake2b_IV[7]; - } /** @@ -80,7 +78,7 @@ __inline static void blake2bLyra(uint64_t *v) { __inline static void reducedBlake2bLyra(uint64_t *v) { ROUND_LYRA(0); } - + /** * Performs a squeeze operation, using Blake2b's G function as the * internal permutation @@ -95,9 +93,9 @@ __inline static void reducedBlake2bLyra(uint64_t *v) { int i; //Squeezes full blocks for (i = 0; i < fullBlocks; i++) { - 
memcpy(ptr, state, BLOCK_LEN_BYTES); - blake2bLyra(state); - ptr += BLOCK_LEN_BYTES; + memcpy(ptr, state, BLOCK_LEN_BYTES); + blake2bLyra(state); + ptr += BLOCK_LEN_BYTES; } //Squeezes remaining bytes @@ -111,7 +109,7 @@ __inline static void reducedBlake2bLyra(uint64_t *v) { * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_INT64 words) */ -void absorbBlock(uint64_t *state, const uint64_t *in) { + void absorbBlock(uint64_t *state, const uint64_t *in) { //XORs the first BLOCK_LEN_INT64 words of "in" with the current state state[0] ^= in[0]; state[1] ^= in[1]; @@ -137,9 +135,10 @@ void absorbBlock(uint64_t *state, const uint64_t *in) { * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) */ -void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { + void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state - state[0] ^= in[0]; + + state[0] ^= in[0]; state[1] ^= in[1]; state[2] ^= in[2]; state[3] ^= in[3]; @@ -148,14 +147,10 @@ void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { state[6] ^= in[6]; state[7] ^= in[7]; + //Applies the transformation f to the sponge's state blake2bLyra(state); -/* - for(int i = 0; i<16; i++) { - printf(" final state %d %08x %08x in %08x %08x\n", i, (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32), - (uint32_t)(in[i] & 0xFFFFFFFFULL), (uint32_t)(in[i] >> 32)); - } -*/ + } /** @@ -166,12 +161,11 @@ void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { * @param state The current state of the sponge * @param rowOut Row to receive the data squeezed */ -void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { - uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] + void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, uint64_t nCols) { + uint64_t* 
ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] int i; //M[row][C-1-col] = H.reduced_squeeze() - for (i = 0; i < N_COLS; i++) { - + for (i = 0; i < nCols; i++) { ptrWord[0] = state[0]; ptrWord[1] = state[1]; ptrWord[2] = state[2]; @@ -184,12 +178,7 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { ptrWord[9] = state[9]; ptrWord[10] = state[10]; ptrWord[11] = state[11]; - /* -for (int i = 0; i<12; i++) { - printf(" after reducedSqueezeRow0 %d %08x %08x in %08x %08x\n", i, (uint32_t)(ptrWord[i] & 0xFFFFFFFFULL), (uint32_t)(ptrWord[i] >> 32), - (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32)); - } -*/ + //Goes to next block (column) that will receive the squeezed data ptrWord -= BLOCK_LEN_INT64; @@ -207,12 +196,12 @@ for (int i = 0; i<12; i++) { * @param rowIn Row to feed the sponge * @param rowOut Row to receive the sponge's output */ - void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { + void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { //Absorbing "M[prev][col]" state[0] ^= (ptrWordIn[0]); @@ -267,12 +256,13 @@ for (int i = 0; i<12; i++) { * @param rowOut Row receiving the output * */ - void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + uint64_t* ptrWordOut = 
rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + + for (i = 0; i < nCols; i++) { //Absorbing "M[prev] [+] M[row*]" state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); @@ -340,13 +330,13 @@ for (int i = 0; i<12; i++) { * @param rowOut Row receiving the output * */ -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { //Absorbing "M[prev] [+] M[row*]" state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); @@ -744,12 +734,13 @@ inline void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInO Prints an array of unsigned chars */ void printArray(unsigned char *array, unsigned int size, char *name) { - unsigned int i; - printf("%s: ", name); - for (i = 0; i < size; i++) { - printf("%2x|", array[i]); - } - printf("\n"); + int i; + printf("%s: ", name); + for (i = 0; i < size; i++) { + printf("%2x|", array[i]); + } + printf("\n"); } //////////////////////////////////////////////////////////////////////////////////////////////// + \ No newline at end of file diff --git a/lyra2/Sponge.h b/lyra2/Sponge.h index 9bd8ed664e..2ce23d876d 100644 --- a/lyra2/Sponge.h +++ b/lyra2/Sponge.h @@ -22,7 +22,11 @@ #ifndef SPONGE_H_ #define SPONGE_H_ +#ifdef __cplusplus +#include +#else #include +#endif #if defined(__GNUC__) #define ALIGN __attribute__ ((aligned(32))) @@ -74,20 +78,20 @@ static __inline uint64_t rotr64( const uint64_t w, const unsigned c ){ //---- Housekeeping -void initState(uint64_t state[/*16*/]); + void initState(uint64_t state[/*16*/]); //---- Squeezes -void 
squeeze(uint64_t *state, unsigned char *out, unsigned int len); -void reducedSqueezeRow0(uint64_t* state, uint64_t* row); + void squeeze(uint64_t *state, unsigned char *out, unsigned int len); + void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols); //---- Absorbs -void absorbBlock(uint64_t *state, const uint64_t *in); -void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); + void absorbBlock(uint64_t *state, const uint64_t *in); + void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); //---- Duplexes -void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut); -void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); + void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols); + void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); + void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); //---- Misc void printArray(unsigned char *array, unsigned int size, char *name); diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu deleted file mode 100644 index 55986bc793..0000000000 --- a/lyra2/cuda_lyra2.cu +++ /dev/null @@ -1,295 +0,0 @@ -#include - -#include "cuda_helper.h" - -#define TPB 160 - -static __constant__ uint2 blake2b_IV[8] = { - { 0xf3bcc908, 0x6a09e667 }, - { 0x84caa73b, 0xbb67ae85 }, - { 0xfe94f82b, 0x3c6ef372 }, - { 0x5f1d36f1, 0xa54ff53a }, - { 0xade682d1, 0x510e527f }, - { 0x2b3e6c1f, 0x9b05688c }, - { 0xfb41bd6b, 0x1f83d9ab }, - { 0x137e2179, 0x5be0cd19 } -}; - - -#define reduceDuplexRow(rowIn, rowInOut, rowOut) { \ - for (int i = 0; i < 8; i++) { \ - for (int j = 0; j < 12; j++) \ - state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ - round_lyra_v35(state); \ - for (int j = 0; j < 12; j++) \ 
- Matrix[j + 12 * i][rowOut] ^= state[j]; \ - Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ - Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ - Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ - Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ - Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ - Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ - Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ - Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ - Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ - Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ - Matrix[10+ 12 * i][rowInOut] ^= state[9]; \ - Matrix[11+ 12 * i][rowInOut] ^= state[10]; \ - } \ - } - -#define absorbblock(in) { \ - state[0] ^= Matrix[0][in]; \ - state[1] ^= Matrix[1][in]; \ - state[2] ^= Matrix[2][in]; \ - state[3] ^= Matrix[3][in]; \ - state[4] ^= Matrix[4][in]; \ - state[5] ^= Matrix[5][in]; \ - state[6] ^= Matrix[6][in]; \ - state[7] ^= Matrix[7][in]; \ - state[8] ^= Matrix[8][in]; \ - state[9] ^= Matrix[9][in]; \ - state[10] ^= Matrix[10][in]; \ - state[11] ^= Matrix[11][in]; \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - } - -//// test version -#define reduceDuplexRowSetup_test(rowIn, rowInOut, rowOut) { \ - for (int i = 0; i < 8; i++) { \ - for (int j = 0; j < 12; j++) \ - state[j] ^= Matrix[j][i][rowIn] + Matrix[j][i][rowInOut]; \ - round_lyra_v35(state); \ - for (int j = 0; j < 12; j++) \ - Matrix[j][7-i][rowOut] = Matrix[j][i][rowIn] ^ state[j]; \ - Matrix[0][i][rowInOut] ^= state[11]; \ - Matrix[1][i][rowInOut] ^= state[0]; \ - Matrix[2][i][rowInOut] ^= state[1]; \ - Matrix[3][i][rowInOut] ^= state[2]; \ - Matrix[4][i][rowInOut] ^= state[3]; \ - Matrix[5][i][rowInOut] ^= state[4]; \ - Matrix[6][i][rowInOut] ^= state[5]; \ - Matrix[7][i][rowInOut] ^= state[6]; \ 
- Matrix[8][i][rowInOut] ^= state[7]; \ - Matrix[9][i][rowInOut] ^= state[8]; \ - Matrix[10][i][rowInOut] ^= state[9]; \ - Matrix[11][i][rowInOut] ^= state[10]; \ - } \ - } - -#define reduceDuplexRow_test(rowIn, rowInOut, rowOut) { \ - for (int i = 0; i < 8; i++) { \ - for (int j = 0; j < 12; j++) \ - state[j] ^= Matrix[j][i][rowIn] + Matrix[j][i][rowInOut]; \ - round_lyra_v35(state); \ - for (int j = 0; j < 12; j++) \ - Matrix[j][i][rowOut] ^= state[j]; \ - Matrix[0][i][rowInOut] ^= state[11]; \ - Matrix[1][i][rowInOut] ^= state[0]; \ - Matrix[2][i][rowInOut] ^= state[1]; \ - Matrix[3][i][rowInOut] ^= state[2]; \ - Matrix[4][i][rowInOut] ^= state[3]; \ - Matrix[5][i][rowInOut] ^= state[4]; \ - Matrix[6][i][rowInOut] ^= state[5]; \ - Matrix[7][i][rowInOut] ^= state[6]; \ - Matrix[8][i][rowInOut] ^= state[7]; \ - Matrix[9][i][rowInOut] ^= state[8]; \ - Matrix[10][i][rowInOut] ^= state[9]; \ - Matrix[11][i][rowInOut] ^= state[10]; \ - } \ - } - -#define absorbblock_test(in) { \ - state[0] ^= Matrix[0][0][ in]; \ - state[1] ^= Matrix[1][0][in]; \ - state[2] ^= Matrix[2][0][in]; \ - state[3] ^= Matrix[3][0][in]; \ - state[4] ^= Matrix[4][0][in]; \ - state[5] ^= Matrix[5][0][in]; \ - state[6] ^= Matrix[6][0][in]; \ - state[7] ^= Matrix[7][0][in]; \ - state[8] ^= Matrix[8][0][in]; \ - state[9] ^= Matrix[9][0][in]; \ - state[10] ^= Matrix[10][0][in]; \ - state[11] ^= Matrix[11][0][in]; \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - round_lyra_v35(state); \ - } - -static __device__ __forceinline__ -void Gfunc_v35(uint2 & a, uint2 &b, uint2 &c, uint2 &d) -{ - a += b; d = SWAPINT2(d ^ a); - c += d; b = ROR2(b ^ c, 24); - a += b; d = ROR2(d ^ a, 16); - c += d; b = ROR2(b ^ c, 63); -} - - -#define round_lyra_v35_new(state) { \ 
- Gfunc_v35(state[0], state[4], state[8], state[12]); \ - Gfunc_v35(state[1], state[5], state[9], state[13]); \ - Gfunc_v35(state[2], state[6], state[10], state[14]); \ - Gfunc_v35(state[3], state[7], state[11], state[15]); \ - Gfunc_v35(state[0], state[5], state[10], state[15]); \ - Gfunc_v35(state[1], state[6], state[11], state[12]); \ - Gfunc_v35(state[2], state[7], state[8], state[13]); \ - Gfunc_v35(state[3], state[4], state[9], state[14]); \ -} - -static __device__ __forceinline__ void round_lyra_v35(uint2 *s) -{ - Gfunc_v35(s[0], s[4], s[8], s[12]); - Gfunc_v35(s[1], s[5], s[9], s[13]); - Gfunc_v35(s[2], s[6], s[10], s[14]); - Gfunc_v35(s[3], s[7], s[11], s[15]); - Gfunc_v35(s[0], s[5], s[10], s[15]); - Gfunc_v35(s[1], s[6], s[11], s[12]); - Gfunc_v35(s[2], s[7], s[8], s[13]); - Gfunc_v35(s[3], s[4], s[9], s[14]); -} - -__device__ __forceinline__ void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[16], uint2 Matrix[96][8]) -{ - for (int i = 0; i < 8; i++) - { - #pragma unroll - for (int j = 0; j < 12; j++) - state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; - round_lyra_v35(state); - #pragma unroll - for (int j = 0; j < 12; j++) - Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; - Matrix[0 + 12 * i][rowInOut] ^= state[11]; - Matrix[1 + 12 * i][rowInOut] ^= state[0]; - Matrix[2 + 12 * i][rowInOut] ^= state[1]; - Matrix[3 + 12 * i][rowInOut] ^= state[2]; - Matrix[4 + 12 * i][rowInOut] ^= state[3]; - Matrix[5 + 12 * i][rowInOut] ^= state[4]; - Matrix[6 + 12 * i][rowInOut] ^= state[5]; - Matrix[7 + 12 * i][rowInOut] ^= state[6]; - Matrix[8 + 12 * i][rowInOut] ^= state[7]; - Matrix[9 + 12 * i][rowInOut] ^= state[8]; - Matrix[10 + 12 * i][rowInOut] ^= state[9]; - Matrix[11 + 12 * i][rowInOut] ^= state[10]; - } -} - -__global__ __launch_bounds__(TPB, 1) -void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) -{ - uint32_t thread = (blockDim.x * 
blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint2 state[16]; - #pragma unroll - for (int i = 0; i<4; i++) - { - LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); - } //password - #pragma unroll - for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt - #pragma unroll - for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; } - - // blake2blyra x2 - //#pragma unroll 24 - for (int i = 0; i<24; i++) { round_lyra_v35(state); } //because 12 is not enough - - uint2 Matrix[96][8]; // not cool - - // reducedSqueezeRow0 - #pragma unroll 8 - for (int i = 0; i < 8; i++) - { - #pragma unroll 12 - for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][0] = state[j]; } - round_lyra_v35(state); - } - - // reducedSqueezeRow1 - #pragma unroll 8 - for (int i = 0; i < 8; i++) - { - #pragma unroll 12 - for (int j = 0; j<12; j++) { state[j] ^= Matrix[j + 12 * i][0]; } - round_lyra_v35(state); - #pragma unroll 12 - for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j]; } - } - - - - reduceDuplexRowSetup(1, 0, 2,state, Matrix); - reduceDuplexRowSetup(2, 1, 3, state, Matrix); - reduceDuplexRowSetup(3, 0, 4, state, Matrix); - reduceDuplexRowSetup(4, 3, 5, state, Matrix); - reduceDuplexRowSetup(5, 2, 6, state, Matrix); - reduceDuplexRowSetup(6, 1, 7, state, Matrix); - - uint32_t rowa; - rowa = state[0].x & 7; - reduceDuplexRow(7, rowa, 0); - rowa = state[0].x & 7; - reduceDuplexRow(0, rowa, 3); - rowa = state[0].x & 7; - reduceDuplexRow(3, rowa, 6); - rowa = state[0].x & 7; - reduceDuplexRow(6, rowa, 1); - rowa = state[0].x & 7; - reduceDuplexRow(1, rowa, 4); - rowa = state[0].x & 7; - reduceDuplexRow(4, rowa, 7); - rowa = state[0].x & 7; - reduceDuplexRow(7, rowa, 2); - rowa = state[0].x & 7; - reduceDuplexRow(2, rowa, 5); - - absorbblock(rowa); - - #pragma unroll - for (int i = 0; i<4; i++) { - outputHash[threads*i + thread] = devectorize(state[i]); - } //password - - } //thread -} - -__host__ -void lyra2_cpu_init(int 
thr_id, uint32_t threads) -{ - //not used -} - -__host__ -void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) -{ - dim3 grid((threads + TPB - 1) / TPB); - dim3 block(TPB); - - lyra2_gpu_hash_32 <<>> (threads, startNounce, d_outputHash); - - //MyStreamSynchronize(NULL, order, thr_id); -} - diff --git a/lyra2/cuda_lyra2_vectors.h b/lyra2/cuda_lyra2_vectors.h new file mode 100644 index 0000000000..716158eedb --- /dev/null +++ b/lyra2/cuda_lyra2_vectors.h @@ -0,0 +1,735 @@ +/* DJM CRAP to strip (again) made for SM 3.2+ */ + +#ifndef CUDA_LYRA_VECTOR_H +#define CUDA_LYRA_VECTOR_H + +/////////////////////////////////////////////////////////////////////////////////// +#include "cuda_helper.h" + +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#if __CUDA_ARCH__ < 300 +#define __shfl(x, y) (x) +#endif + +#if __CUDA_ARCH__ < 320 && !defined(__ldg4) +#define __ldg4(x) (*(x)) +#endif + +typedef struct __align__(32) uint8{ + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; +} uint8; + +typedef struct __align__(64) uint2_8{ + uint2 s0, s1, s2, s3, s4, s5, s6, s7; +} uint2_8; + +typedef struct __align__(64) ulonglong2to8{ + ulonglong2 l0, l1, l2, l3; +} ulonglong2to8; + +typedef struct __align__(128) ulonglong8to16{ + ulonglong2to8 lo, hi; +} ulonglong8to16; + +typedef struct __align__(256) ulonglong16to32{ + ulonglong8to16 lo, hi; +} ulonglong16to32; + +typedef struct __align__(512) ulonglong32to64{ + ulonglong16to32 lo, hi; +} ulonglong32to64; + +typedef struct __align__(128) ulonglonglong{ + ulonglong2 s0, s1, s2, s3, s4, s5, s6, s7; +} ulonglonglong; + +typedef struct __align__(64) uint16{ + union + { + struct + { + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; + }; + uint8 lo; + }; + union + { + struct + { + unsigned int s8, s9, sa, sb, sc, sd, se, sf; + }; + uint8 hi; + }; +} uint16; + +typedef struct __align__(128) uint2_16{ + union + { + 
struct + { + uint2 s0, s1, s2, s3, s4, s5, s6, s7; + }; + uint2_8 lo; + }; + union + { + struct + { + uint2 s8, s9, sa, sb, sc, sd, se, sf; + }; + uint2_8 hi; + }; +} uint2_16; + +typedef struct __align__(128) uint32{ + uint16 lo, hi; +} uint32; + +struct __align__(128) ulong8{ + ulonglong4 s0, s1, s2, s3; +}; +typedef __device_builtin__ struct ulong8 ulong8; + +typedef struct __align__(256) ulonglong16{ + ulonglong4 s0, s1, s2, s3, s4, s5, s6, s7; +} ulonglong16; + +typedef struct __align__(16) uint28{ + uint2 x, y, z, w; +} uint2x4; +typedef uint2x4 uint28; /* name deprecated */ + +typedef struct __builtin_align__(32) uint48{ + uint4 s0, s1; +} uint48; + +typedef struct __align__(256) uint4x16{ + uint4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; +} uint4x16; + +static __inline__ __device__ ulonglong2to8 make_ulonglong2to8(ulonglong2 s0, ulonglong2 s1, ulonglong2 s2, ulonglong2 s3) +{ + ulonglong2to8 t; t.l0 = s0; t.l1 = s1; t.l2 = s2; t.l3 = s3; + return t; +} + +static __inline__ __device__ ulonglong8to16 make_ulonglong8to16(const ulonglong2to8 &s0, const ulonglong2to8 &s1) +{ + ulonglong8to16 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong16to32 make_ulonglong16to32(const ulonglong8to16 &s0, const ulonglong8to16 &s1) +{ + ulonglong16to32 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong32to64 make_ulonglong32to64(const ulonglong16to32 &s0, const ulonglong16to32 &s1) +{ + ulonglong32to64 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __host__ __device__ ulonglonglong make_ulonglonglong( + const ulonglong2 &s0, const ulonglong2 &s1, const ulonglong2 &s2, const ulonglong2 &s3, + const ulonglong2 &s4, const ulonglong2 &s5) +{ + ulonglonglong t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; + return t; +} + +static __inline__ __device__ uint48 make_uint48(uint4 s0, uint4 s1) +{ + uint48 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static 
__inline__ __device__ uint28 make_uint28(uint2 s0, uint2 s1, uint2 s2, uint2 s3) +{ + uint28 t; t.x = s0; t.y = s1; t.z = s2; t.w = s3; + return t; +} + +static __inline__ __host__ __device__ uint4x16 make_uint4x16( + uint4 s0, uint4 s1, uint4 s2, uint4 s3, uint4 s4, uint4 s5, uint4 s6, uint4 s7, + uint4 s8, uint4 s9, uint4 sa, uint4 sb, uint4 sc, uint4 sd, uint4 se, uint4 sf) +{ + uint4x16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.s10 = sa; t.s11 = sb; t.s12 = sc; t.s13 = sd; t.s14 = se; t.s15 = sf; + return t; +} + +static __inline__ __device__ uint2_16 make_uint2_16( + uint2 s0, uint2 s1, uint2 s2, uint2 s3, uint2 s4, uint2 s5, uint2 s6, uint2 s7, + uint2 s8, uint2 s9, uint2 sa, uint2 sb, uint2 sc, uint2 sd, uint2 se, uint2 sf) +{ + uint2_16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7, + unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf) +{ + uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b) +{ + uint16 t; t.lo = a; t.hi = b; return t; +} + +static __inline__ __host__ __device__ uint32 make_uint32(const uint16 &a, const uint16 &b) +{ + uint32 t; t.lo = a; t.hi = b; return t; +} + + +static __inline__ __host__ __device__ uint8 make_uint8( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int 
s4, unsigned int s5, unsigned int s6, unsigned int s7) +{ + uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ uint2_8 make_uint2_8( + uint2 s0, uint2 s1, uint2 s2, uint2 s3, uint2 s4, uint2 s5, uint2 s6, uint2 s7) +{ + uint2_8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ ulonglong16 make_ulonglong16(const ulonglong4 &s0, const ulonglong4 &s1, + const ulonglong4 &s2, const ulonglong4 &s3, const ulonglong4 &s4, const ulonglong4 &s5, const ulonglong4 &s6, const ulonglong4 &s7) +{ + ulonglong16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ ulong8 make_ulong8( + ulonglong4 s0, ulonglong4 s1, ulonglong4 s2, ulonglong4 s3) +{ + ulong8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3;// t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) +{ + return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} +static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) +{ + return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) +{ + return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); +} +static __forceinline__ __device__ ulonglong2 operator+ (ulonglong2 a, ulonglong2 b) +{ + return make_ulonglong2(a.x + b.x, a.y + b.y); +} + +static __forceinline__ __device__ ulong8 operator^ (const ulong8 &a, const ulong8 &b) +{ + return make_ulong8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3); +} + +static __forceinline__ __device__ ulong8 operator+ (const ulong8 &a, const ulong8 &b) +{ + return make_ulong8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3); +} + +static 
__forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) +{ + return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) +{ + return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + +static __forceinline__ __device__ uint2_8 operator^ (const uint2_8 &a, const uint2_8 &b) +{ + return make_uint2_8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ uint2_8 operator+ (const uint2_8 &a, const uint2_8 &b) +{ + return make_uint2_8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + + +////////////// mess++ ////// + +static __forceinline__ __device__ uint28 operator^ (const uint28 &a, const uint28 &b) +{ + return make_uint28(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +static __forceinline__ __device__ uint28 operator+ (const uint28 &a, const uint28 &b) +{ + return make_uint28(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +static __forceinline__ __device__ uint48 operator+ (const uint48 &a, const uint48 &b) +{ + return make_uint48(a.s0 + b.s0, a.s1 + b.s1); +} + +///////////////////////// + +static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) +{ + return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) +{ + return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, 
a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uint2_16 operator^ (const uint2_16 &a, const uint2_16 &b) +{ + return make_uint2_16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ uint2_16 operator+ (const uint2_16 &a, const uint2_16 &b) +{ + return make_uint2_16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uint32 operator^ (const uint32 &a, const uint32 &b) +{ + return make_uint32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ uint32 operator+ (const uint32 &a, const uint32 &b) +{ + return make_uint32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ ulonglong16 operator^ (const ulonglong16 &a, const ulonglong16 &b) +{ + return make_ulonglong16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ ulonglong16 operator+ (const ulonglong16 &a, const ulonglong16 &b) +{ + return make_ulonglong16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + +static __forceinline__ __device__ void operator^= (ulong8 &a, const ulong8 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator^= (uint28 &a, const uint28 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (uint28 &a, const uint28 &b) +{ + a = a + b; +} + +static __forceinline__ __device__ void operator^= (uint2_8 &a, const uint2_8 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (uint2_8 
&a, const uint2_8 &b) +{ + a = a + b; +} + +static __forceinline__ __device__ void operator^= (uint32 &a, const uint32 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (uint32 &a, const uint32 &b) +{ + a = a + b; +} + +static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator^= (ulonglong16 &a, const ulonglong16 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (ulonglong4 &a, const ulonglong4 &b) +{ + a = a + b; +} + +static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (ulonglong2 &a, const ulonglong2 &b) +{ + a = a + b; +} + +static __forceinline__ __device__ +ulonglong2to8 operator^ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 ^ b.l0, a.l1 ^ b.l1, a.l2 ^ b.l2, a.l3 ^ b.l3); +} +static __forceinline__ __device__ +ulonglong2to8 operator+ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 + b.l0, a.l1 + b.l1, a.l2 + b.l2, a.l3 + b.l3); +} + +static __forceinline__ __device__ +ulonglong8to16 operator^ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong8to16 operator+ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator^ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator+ (const 
ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator^ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator+ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ ulonglonglong operator^ (const ulonglonglong &a, const ulonglonglong &b) +{ + return make_ulonglonglong(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5); +} + +static __forceinline__ __device__ ulonglonglong operator+ (const ulonglonglong &a, const ulonglonglong &b) +{ + return make_ulonglonglong(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5); +} + +static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulonglong2to8 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) +{ + a = a + b; +} +static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) +{ + a = a + b; +} +static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (uint2_16 &a, const uint2_16 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (uint2_16 &a, const uint2_16 &b) +{ + a = a + b; +} + +static __forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (ulonglong8to16 &a, const ulonglong8to16 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void 
operator^= (ulonglong8to16 &a, const ulonglong8to16 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (ulonglong16to32 &a, const ulonglong16to32 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (ulonglong16to32 &a, const ulonglong16to32 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (ulonglong32to64 &a, const ulonglong32to64 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (ulonglong32to64 &a, const ulonglong32to64 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (ulonglonglong &a, const ulonglonglong &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (ulonglonglong &a, const ulonglonglong &b) +{ + a = a ^ b; +} + +#if __CUDA_ARCH__ < 320 + +#define rotate ROTL32 +#define rotateR ROTR32 + +#else + +static __forceinline__ __device__ uint4 rotate4(uint4 vec4, uint32_t shift) +{ + uint4 ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint4 rotate4R(uint4 vec4, uint32_t shift) +{ + uint4 ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint32_t rotate(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : 
"r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + +static __device__ __inline__ ulonglong4 __ldg4(const ulonglong4 *ptr) +{ + ulonglong4 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.z), "=l"(ret.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ void ldg4(const ulonglong4 *ptr, ulonglong4 *ret) +{ + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret[0].x), "=l"(ret[0].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret[0].z), "=l"(ret[0].w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret[1].x), "=l"(ret[1].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret[1].z), "=l"(ret[1].w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret[2].x), "=l"(ret[2].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret[2].z), "=l"(ret[2].w) : __LDG_PTR(ptr)); +} + +static __device__ __inline__ uint28 __ldg4(const uint28 *ptr) +{ + uint28 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x.x), "=r"(ret.x.y), "=r"(ret.y.x), "=r"(ret.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.z.x), "=r"(ret.z.y), "=r"(ret.w.x), "=r"(ret.w.y) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uint48 __ldg4(const uint48 *ptr) +{ + uint48 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.x), "=r"(ret.s0.y), "=r"(ret.s0.z), "=r"(ret.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s1.x), "=r"(ret.s1.y), "=r"(ret.s1.z), "=r"(ret.s1.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ void 
ldg4(const uint28 *ptr, uint28 *ret) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); +} + +#endif /* __CUDA_ARCH__ < 320 */ + + +static __forceinline__ __device__ uint8 swapvec(const uint8 &buf) +{ + uint8 vec; + vec.s0 = cuda_swab32(buf.s0); + vec.s1 = cuda_swab32(buf.s1); + vec.s2 = cuda_swab32(buf.s2); + vec.s3 = cuda_swab32(buf.s3); + vec.s4 = cuda_swab32(buf.s4); + vec.s5 = cuda_swab32(buf.s5); + vec.s6 = cuda_swab32(buf.s6); + vec.s7 = cuda_swab32(buf.s7); + return vec; +} + +static __forceinline__ __device__ uint8 swapvec(const uint8 *buf) +{ + uint8 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + return vec; +} + +static __forceinline__ __device__ uint16 swapvec(const uint16 *buf) +{ + uint16 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = 
cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + vec.s8 = cuda_swab32(buf[0].s8); + vec.s9 = cuda_swab32(buf[0].s9); + vec.sa = cuda_swab32(buf[0].sa); + vec.sb = cuda_swab32(buf[0].sb); + vec.sc = cuda_swab32(buf[0].sc); + vec.sd = cuda_swab32(buf[0].sd); + vec.se = cuda_swab32(buf[0].se); + vec.sf = cuda_swab32(buf[0].sf); + return vec; +} + +static __forceinline__ __device__ uint16 swapvec(const uint16 &buf) +{ + uint16 vec; + vec.s0 = cuda_swab32(buf.s0); + vec.s1 = cuda_swab32(buf.s1); + vec.s2 = cuda_swab32(buf.s2); + vec.s3 = cuda_swab32(buf.s3); + vec.s4 = cuda_swab32(buf.s4); + vec.s5 = cuda_swab32(buf.s5); + vec.s6 = cuda_swab32(buf.s6); + vec.s7 = cuda_swab32(buf.s7); + vec.s8 = cuda_swab32(buf.s8); + vec.s9 = cuda_swab32(buf.s9); + vec.sa = cuda_swab32(buf.sa); + vec.sb = cuda_swab32(buf.sb); + vec.sc = cuda_swab32(buf.sc); + vec.sd = cuda_swab32(buf.sd); + vec.se = cuda_swab32(buf.se); + vec.sf = cuda_swab32(buf.sf); + return vec; +} + +static __device__ __forceinline__ uint28 shuffle4(const uint28 &var, int lane) +{ +#if __CUDA_ARCH__ >= 300 + uint28 res; + res.x.x = __shfl(var.x.x, lane); + res.x.y = __shfl(var.x.y, lane); + res.y.x = __shfl(var.y.x, lane); + res.y.y = __shfl(var.y.y, lane); + res.z.x = __shfl(var.z.x, lane); + res.z.y = __shfl(var.z.y, lane); + res.w.x = __shfl(var.w.x, lane); + res.w.y = __shfl(var.w.y, lane); + return res; +#else + return var; +#endif +} + +static __device__ __forceinline__ ulonglong4 shuffle4(ulonglong4 var, int lane) +{ +#if __CUDA_ARCH__ >= 300 + ulonglong4 res; + uint2 temp; + temp = vectorize(var.x); + temp.x = __shfl(temp.x, lane); + temp.y = __shfl(temp.y, lane); + res.x = devectorize(temp); + temp = vectorize(var.y); + temp.x = __shfl(temp.x, lane); + temp.y = __shfl(temp.y, lane); + res.y = devectorize(temp); + temp = vectorize(var.z); + temp.x = __shfl(temp.x, lane); + temp.y = __shfl(temp.y, lane); + res.z = devectorize(temp); + temp = vectorize(var.w); + temp.x = __shfl(temp.x, lane); + 
temp.y = __shfl(temp.y, lane); + res.w = devectorize(temp); + return res; +#else + return var; +#endif +} + +#endif // #ifndef CUDA_LYRA_VECTOR_H diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu new file mode 100644 index 0000000000..05e75838d3 --- /dev/null +++ b/lyra2/cuda_lyra2v2.cu @@ -0,0 +1,493 @@ +/* +* Lyra2 (v2) CUDA Implementation +* +* Based on tpruvot/djm34/VTC sources and incredible 2x boost by Nanashi Meiyo-Meijin (May 2016) +*/ + +#include +#include + +#include "cuda_lyra2v2_sm3.cuh" + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 500 +#endif + +__device__ __forceinline__ +uint2 SWAPUINT2(uint2 value) +{ + return make_uint2(value.y, value.x); +} + +#define TPB5x 128 + +#if __CUDA_ARCH__ >= 500 + +#include "cuda_lyra2_vectors.h" + +#define Nrow 4 +#define Ncol 4 +#define memshift 3 + +__device__ uint2x4 *DMatrix; + +__device__ __forceinline__ uint2 LD4S(uint2 *shared_mem, const int index) +{ + return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +} + +__device__ __forceinline__ void ST4S(uint2 *shared_mem, const int index, const uint2 data) +{ + shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; +} + +__device__ __forceinline__ uint2 shuffle2(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ +void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d) +{ + a += b; d = eorswap32(a, d); + c += d; b ^= c; b = ROR24(b); + a += b; d ^= a; d = ROR16(d); + c += d; b ^= c; b = ROR2(b, 63); +} + +__device__ __forceinline__ +void round_lyra_v5(uint2x4 s[4]) +{ + Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v5(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc_v5(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v5(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v5(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v5(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v5(s[0].w, 
s[1].x, s[2].y, s[3].z); +} + +__device__ __forceinline__ +void round_lyra_v5(uint2 s[4]) +{ + Gfunc_v5(s[0], s[1], s[2], s[3]); + s[1] = shuffle2(s[1], threadIdx.x + 1, 4); + s[2] = shuffle2(s[2], threadIdx.x + 2, 4); + s[3] = shuffle2(s[3], threadIdx.x + 3, 4); + Gfunc_v5(s[0], s[1], s[2], s[3]); + s[1] = shuffle2(s[1], threadIdx.x + 3, 4); + s[2] = shuffle2(s[2], threadIdx.x + 2, 4); + s[3] = shuffle2(s[3], threadIdx.x + 1, 4); +} + +__device__ __forceinline__ +void reduceDuplexRowSetup2(uint2 *shared_mem, uint2 state[4]) +{ + uint2 state1[Ncol][3], state0[Ncol][3], state2[3]; + int i, j; + +#pragma unroll + for(int i = 0; i < Ncol; i++) + { +#pragma unroll + for(j = 0; j < 3; j++) + state0[Ncol - i - 1][j] = state[j]; + round_lyra_v5(state); + } + + //#pragma unroll 4 + for(i = 0; i < Ncol; i++) + { +#pragma unroll + for(j = 0; j < 3; j++) + state[j] ^= state0[i][j]; + + round_lyra_v5(state); + +#pragma unroll + for(j = 0; j < 3; j++) + state1[Ncol - i - 1][j] = state0[i][j]; + +#pragma unroll + for(j = 0; j < 3; j++) + state1[Ncol - i - 1][j] ^= state[j]; + } + + for(i = 0; i < Ncol; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift; + +#pragma unroll + for(j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[i][j]; + + round_lyra_v5(state); + +#pragma unroll + for(j = 0; j < 3; j++) + state2[j] = state1[i][j]; + +#pragma unroll + for(j = 0; j < 3; j++) + state2[j] ^= state[j]; + +#pragma unroll + for(j = 0; j < 3; j++) + ST4S(shared_mem, s2 + j, state2[j]); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if(threadIdx.x == 0) + { + state0[i][0] ^= Data2; + state0[i][1] ^= Data0; + state0[i][2] ^= Data1; + } + else + { + state0[i][0] ^= Data0; + state0[i][1] ^= Data1; + state0[i][2] ^= Data2; + } + +#pragma unroll + for(j = 0; j < 3; j++) + 
ST4S(shared_mem, s0 + j, state0[i][j]); + +#pragma unroll + for(j = 0; j < 3; j++) + state0[i][j] = state2[j]; + + } + + for(i = 0; i < Ncol; i++) + { + const uint32_t s1 = memshift * Ncol * 1 + i*memshift; + const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift; + +#pragma unroll + for(j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[Ncol - i - 1][j]; + + round_lyra_v5(state); + +#pragma unroll + for(j = 0; j < 3; j++) + state0[Ncol - i - 1][j] ^= state[j]; + +#pragma unroll + for(j = 0; j < 3; j++) + ST4S(shared_mem, s3 + j, state0[Ncol - i - 1][j]); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if(threadIdx.x == 0) + { + state1[i][0] ^= Data2; + state1[i][1] ^= Data0; + state1[i][2] ^= Data1; + } + else + { + state1[i][0] ^= Data0; + state1[i][1] ^= Data1; + state1[i][2] ^= Data2; + } + +#pragma unroll + for(j = 0; j < 3; j++) + ST4S(shared_mem, s1 + j, state1[i][j]); + } + __syncthreads(); +} + +__device__ +void reduceDuplexRowt2(uint2 *shared_mem, const int rowIn, const int rowInOut, const int rowOut, uint2 state[4]) +{ + uint2 state1[3], state2[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + const uint32_t ps3 = memshift * Ncol * rowOut; + + for(int i = 0; i < Ncol; i++) + { + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + const uint32_t s3 = ps3 + i*memshift; + +#pragma unroll + for(int j = 0; j < 3; j++) + state1[j] = LD4S(shared_mem, s1 + j); + +#pragma unroll + for(int j = 0; j < 3; j++) + state2[j] = LD4S(shared_mem, s2 + j); + +#pragma unroll + for(int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra_v5(state); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + 
if(threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for(int j = 0; j < 3; j++) + ST4S(shared_mem, s2 + j, state2[j]); + __syncthreads(); + +#pragma unroll + for(int j = 0; j < 3; j++) + ST4S(shared_mem, s3 + j, LD4S(shared_mem, s3 + j) ^ state[j]); + __syncthreads(); + } +} + +__device__ +void reduceDuplexRowt2x4(uint2 *shared_mem, const int rowInOut, uint2 state[4]) +{ + const int rowIn = 2; + const int rowOut = 3; + + int i, j; + uint2 last[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + +#pragma unroll + for(int j = 0; j < 3; j++) + last[j] = LD4S(shared_mem, ps2 + j); + +#pragma unroll + for(int j = 0; j < 3; j++) + state[j] ^= LD4S(shared_mem, ps1 + j) + last[j]; + + round_lyra_v5(state); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if(threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } + else + { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if(rowInOut == rowOut) + { +#pragma unroll + for(j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for(i = 1; i < Ncol; i++) + { + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + +#pragma unroll + for(j = 0; j < 3; j++) + state[j] ^= LD4S(shared_mem, s1 + j) + LD4S(shared_mem, s2 + j); + + round_lyra_v5(state); + } + +#pragma unroll + for(int j = 0; j < 3; j++) + state[j] ^= last[j]; +} + +__global__ +__launch_bounds__(TPB5x, 1) +void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + const uint2x4 blake2b_IV[2] = { + 0xf3bcc908UL, 0x6a09e667UL, 0x84caa73bUL, 0xbb67ae85UL, + 0xfe94f82bUL, 0x3c6ef372UL, 0x5f1d36f1UL, 0xa54ff53aUL, + 
0xade682d1UL, 0x510e527fUL, 0x2b3e6c1fUL, 0x9b05688cUL, + 0xfb41bd6bUL, 0x1f83d9abUL, 0x137e2179UL, 0x5be0cd19UL + }; + + const uint2x4 Mask[2] = { + 0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL, + 0x00000020UL, 0x00000000UL, 0x00000001UL, 0x00000000UL, + 0x00000004UL, 0x00000000UL, 0x00000004UL, 0x00000000UL, + 0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL + }; + + uint2x4 state[4]; + + if(thread < threads) + { + state[0].x = state[1].x = __ldg(&inputHash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&inputHash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&inputHash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&inputHash[thread + threads * 3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for(int i = 0; i<12; i++) + round_lyra_v5(state); + + state[0] ^= Mask[0]; + state[1] ^= Mask[1]; + + for(int i = 0; i<12; i++) + round_lyra_v5(state); + + DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0]; + DMatrix[blockDim.x * gridDim.x * 1 + thread] = state[1]; + DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2]; + DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3]; + } +} + +__global__ +__launch_bounds__(32, 1) +void lyra2v2_gpu_hash_32_2(uint32_t threads) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; + __shared__ uint2 shared_mem[1536]; + if(thread < threads) + { + uint2 state[4]; + state[0] = ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[1] = ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[2] = ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[3] = ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + + reduceDuplexRowSetup2(shared_mem, state); + + uint32_t rowa; + int prev = 3; + + for(int i = 0; i < 3; i++) + { + rowa = __shfl(state[0].x, 0, 4) & 3; + reduceDuplexRowt2(shared_mem, prev, rowa, i, 
state); + prev = i; + } + + rowa = __shfl(state[0].x, 0, 4) & 3; + reduceDuplexRowt2x4(shared_mem, rowa, state); + + ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0]; + ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1]; + ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2]; + ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB5x, 1) +void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint2x4 state[4]; + + if(thread < threads) + { + state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]); + state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]); + state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]); + state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]); + + for(int i = 0; i < 12; i++) + round_lyra_v5(state); + + outputHash[thread + threads * 0] = state[0].x; + outputHash[thread + threads * 1] = state[0].y; + outputHash[thread + threads * 2] = state[0].z; + outputHash[thread + threads * 3] = state[0].w; + } +} + +#else +#include "cuda_helper.h" +#if __CUDA_ARCH__ < 200 +__device__ void* DMatrix; +#endif +__global__ void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) +{} +__global__ void lyra2v2_gpu_hash_32_2(uint32_t threads) +{} +__global__ void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) +{} +#endif + + +__host__ +void lyra2v2_cpu_init(int thr_id, uint64_t *d_matrix) +{ + get_cuda_arch(&cuda_arch[thr_id]); + // just assign the device pointer allocated in main loop + cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); +} + +__host__ +void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +{ + if(cuda_arch[thr_id] >= 500) + 
{ + + const uint32_t tpb = TPB5x; + + dim3 grid2((threads + tpb - 1) / tpb); + dim3 block2(tpb); + dim3 grid4((threads * 4 + 32 - 1) / 32); + dim3 block4(4, 32 / 4); + + lyra2v2_gpu_hash_32_1 << < grid2, block2, 0, gpustream[thr_id] >> > (threads, (uint2*)g_hash); + lyra2v2_gpu_hash_32_2 << < grid4, block4, 0, gpustream[thr_id] >> > (threads); + lyra2v2_gpu_hash_32_3 << < grid2, block2, 0, gpustream[thr_id] >> > (threads, (uint2*)g_hash); + + } + else + { + + uint32_t tpb = 16; + if(cuda_arch[thr_id] >= 350) tpb = TPB35; + else if(cuda_arch[thr_id] >= 300) tpb = TPB30; + else if(cuda_arch[thr_id] >= 200) tpb = TPB20; + + dim3 grid((threads + tpb - 1) / tpb); + dim3 block(tpb); + lyra2v2_gpu_hash_32_v3 << < grid, block, 0, gpustream[thr_id] >> > (threads, startNounce, (uint2*)g_hash); + + } + CUDA_SAFE_CALL(cudaGetLastError()); +} \ No newline at end of file diff --git a/lyra2/cuda_lyra2v2_sm3.cuh b/lyra2/cuda_lyra2v2_sm3.cuh new file mode 100644 index 0000000000..56c6ccfe38 --- /dev/null +++ b/lyra2/cuda_lyra2v2_sm3.cuh @@ -0,0 +1,345 @@ +/* SM 2/3/3.5 Variant for lyra2REv2 */ +#include +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#undef __CUDA_ARCH__ +#define __CUDA_ARCH__ 350 +#endif + +#define TPB20 64 +#define TPB30 64 +#define TPB35 64 + +#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500 + +#include "cuda_lyra2_vectors.h" + +#define Nrow 4 +#define Ncol 4 + +#define vectype ulonglong4 +#define memshift 4 + +__device__ vectype *DMatrix; + +static __device__ __forceinline__ +void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d) +{ + a += b; d ^= a; d = ROTR64(d, 32); + c += d; b ^= c; b = ROTR64(b, 24); + a += b; d ^= a; d = ROTR64(d, 16); + c += d; b ^= c; b = ROTR64(b, 63); +} + +static __device__ __forceinline__ +void round_lyra_v35(vectype* s) +{ + Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z); + 
Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplexV3(vectype state[4], uint32_t thread) +{ + vectype state1[3]; + uint32_t ps1 = (Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread); + +#pragma unroll 4 + for(int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i *memshift; + uint32_t s2 = ps2 - Nrow * i *memshift; + + for(int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for(int j = 0; j < 3; j++) + state[j] ^= state1[j]; + round_lyra_v35(state); + + for(int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + for(int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state1[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) +{ + vectype state2[3], state1[3]; + + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread); + + for(int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow*i*memshift; + uint32_t s2 = ps2 + Nrow*i*memshift; + uint32_t s3 = ps3 - Nrow*i*memshift; + + for(int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + for(int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); + for(int j = 0; j < 3; j++) + { + vectype tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + + round_lyra_v35(state); + + for(int j = 0; j < 3; j++) + { + state1[j] ^= state[j]; + (DMatrix + s3)[j] = state1[j]; + } + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + for(int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= 
((uint2*)state)[j]; + + for(int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) +{ + vectype state1[3], state2[3]; + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread); + +#pragma nounroll + for(int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i*memshift; + uint32_t s2 = ps2 + Nrow * i*memshift; + uint32_t s3 = ps3 + Nrow * i*memshift; + + for(int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for(int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); + + for(int j = 0; j < 3; j++) + state1[j] += state2[j]; + + for(int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra_v35(state); + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + + for(int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + if(rowInOut != rowOut) + { + + for(int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + + for(int j = 0; j < 3; j++) + (DMatrix + s3)[j] ^= state[j]; + + } + else + { + + for(int j = 0; j < 3; j++) + state2[j] ^= state[j]; + + for(int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } + } +} + +#if __CUDA_ARCH__ >= 300 +__global__ __launch_bounds__(TPB35, 1) +void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + vectype blake2b_IV[2]; + vectype padding[2]; + + if(threadIdx.x == 0) + { + + ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85, + 0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a, + 0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c, + 0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( 
+ 0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0, + 0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000 + ); + } + + if(thread < threads) + { + ((uint2*)state)[0] = __ldg(&outputHash[thread]); + ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); + ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); + ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); + + state[1] = state[0]; + state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); + state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0); + + for(int i = 0; i<12; i++) + round_lyra_v35(state); + + state[0] ^= shuffle4(((vectype*)padding)[0], 0); + state[1] ^= shuffle4(((vectype*)padding)[1], 0); + + for(int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); + + //#pragma unroll 4 + for(int i = 0; i < 4; i++) + { + uint32_t s1 = ps1 - 4 * memshift * i; + for(int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplexV3(state, thread); + reduceDuplexRowSetupV3(1, 0, 2, state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + + uint32_t rowa; + int prev = 3; + for(int i = 0; i < 4; i++) + { + rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread); + prev = i; + } + + uint32_t shift = (memshift * rowa + 16 * memshift * thread); + + for(int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for(int i = 0; i < 12; i++) + round_lyra_v35(state); + + outputHash[thread] = ((uint2*)state)[0]; + outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; + + } //thread +} +#elif __CUDA_ARCH__ >= 200 +__global__ __launch_bounds__(TPB20, 1) +void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + vectype blake2b_IV[2]; + vectype padding[2]; + 
+ ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85, + 0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a, + 0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c, + 0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( + 0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0, + 0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000 + ); + + if(thread < threads) + { + + ((uint2*)state)[0] = outputHash[thread]; + ((uint2*)state)[1] = outputHash[thread + threads]; + ((uint2*)state)[2] = outputHash[thread + 2 * threads]; + ((uint2*)state)[3] = outputHash[thread + 3 * threads]; + + state[1] = state[0]; + state[2] = ((vectype*)blake2b_IV)[0]; + state[3] = ((vectype*)blake2b_IV)[1]; + + for(int i = 0; i<12; i++) + round_lyra_v35(state); + + state[0] ^= ((vectype*)padding)[0]; + state[1] ^= ((vectype*)padding)[1]; + + for(int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); + + //#pragma unroll 4 + for(int i = 0; i < 4; i++) + { + uint32_t s1 = ps1 - 4 * memshift * i; + for(int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplexV3(state, thread); + reduceDuplexRowSetupV3(1, 0, 2, state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + + uint32_t rowa; + int prev = 3; + for(int i = 0; i < 4; i++) + { + rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread); + prev = i; + } + + uint32_t shift = (memshift * rowa + 16 * memshift * thread); + + for(int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for(int i = 0; i < 12; i++) + round_lyra_v35(state); + + outputHash[thread] = ((uint2*)state)[0]; + outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; + + } //thread +} +#endif + +#else +/* host & sm5+ */ +__global__ void lyra2v2_gpu_hash_32_v3(uint32_t 
threads, uint32_t startNounce, uint2 *outputHash) +{} +#endif \ No newline at end of file diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu index 083caeb734..a05b48ae19 100644 --- a/lyra2/lyra2RE.cu +++ b/lyra2/lyra2RE.cu @@ -8,22 +8,25 @@ extern "C" { #include "miner.h" #include "cuda_helper.h" - +#include static _ALIGN(64) uint64_t *d_hash[MAX_GPUS]; +static THREAD uint32_t *foundNonce; + -extern void blake256_cpu_init(int thr_id, uint32_t threads); -extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); -extern void blake256_cpu_setBlock_80(uint32_t *pdata); -extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash); +extern void blake256_cpu_setBlock_80(int thr_id, uint32_t *pdata); +extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); extern void keccak256_cpu_init(int thr_id, uint32_t threads); -extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); extern void skein256_cpu_init(int thr_id, uint32_t threads); -extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); -extern void lyra2_cpu_init(int thr_id, uint32_t threads); +extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); +extern void lyra2_cpu_hash_32_multi(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); -extern void groestl256_setTarget(const void *ptarget); -extern void groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order, 
uint32_t *resultnonces); +extern void groestl256_setTarget(int thr_id, const void *ptarget); +extern void lyra2_cpu_init(int thr_id, uint32_t threads); +extern void lyra2_cpu_init_multi(int thr_id, uint32_t threads, uint64_t *hash, uint64_t* hash2); +extern void groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, uint32_t *resultnonces); extern void groestl256_cpu_init(int thr_id, uint32_t threads); extern "C" void lyra2_hash(void *state, const void *input) @@ -43,8 +46,7 @@ extern "C" void lyra2_hash(void *state, const void *input) sph_keccak256(&ctx_keccak, hashA, 32); sph_keccak256_close(&ctx_keccak, hashB); - LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); - + LYRA2_old(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); sph_skein256_init(&ctx_skein); sph_skein256(&ctx_skein, hashA, 32); sph_skein256_close(&ctx_skein, hashB); @@ -56,63 +58,61 @@ extern "C" void lyra2_hash(void *state, const void *input) memcpy(state, hashA, 32); } -static bool init[MAX_GPUS] = { 0 }; +static volatile bool init[MAX_GPUS] = { false }; -extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_lyra2(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { const uint32_t first_nonce = pdata[19]; - unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 256 * 256 * 25 : 256 * 256 * 14; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity); // 18=256*256*4; - throughput = min(throughput, (max_nonce - first_nonce)); + unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 256 * 256 * 4 : 256 * 256 * 4 ; + intensity = (device_sm[device_map[thr_id]] == 500) ? 
256 * 256 * 2 : intensity; + uint32_t throughput = device_intensity(device_map[thr_id], __func__, intensity); // 18=256*256*4; + if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000f; + ptarget[7] = 0x00ff; - if (!init[thr_id]) - { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + + if(!init[thr_id]) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); - - blake256_cpu_init(thr_id, throughput); - keccak256_cpu_init(thr_id,throughput); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + CUDA_SAFE_CALL(cudaProfilerStop()); + CUDA_SAFE_CALL(cudaMallocHost(&foundNonce, 2 * 4)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 8 * sizeof(uint32_t) * throughput)); + keccak256_cpu_init(thr_id, throughput); skein256_cpu_init(thr_id, throughput); groestl256_cpu_init(thr_id, throughput); lyra2_cpu_init(thr_id, throughput); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); - - init[thr_id] = true; + init[thr_id] = true; } + else + CUDA_SAFE_CALL(cudaProfilerStart()); uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); - - blake256_cpu_setBlock_80(pdata); - groestl256_setTarget(ptarget); + be32enc(&endiandata[k], pdata[k]); + blake256_cpu_setBlock_80(thr_id, pdata); + groestl256_setTarget(thr_id, ptarget); do { - int order = 0; - uint32_t foundNonce[2] = { 0, 0 }; - - blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - //MyStreamSynchronize(NULL, 2, thr_id); - groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++, foundNonce); - if (foundNonce[0] != 0) + blake256_cpu_hash_80(thr_id, throughput, 
pdata[19], d_hash[thr_id]); + keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]); + lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]); + skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]); + groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); + CUDA_SAFE_CALL(cudaGetLastError()); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce[0] != 0) { - CUDA_SAFE_CALL(cudaGetLastError()); const uint32_t Htarg = ptarget[7]; uint32_t vhash64[8]; be32enc(&endiandata[19], foundNonce[0]); lyra2_hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; @@ -120,22 +120,34 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata, *hashes_done = pdata[19] - first_nonce + throughput; if (foundNonce[1] != 0) { - pdata[21] = foundNonce[1]; - res++; - if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, foundNonce[1], vhash64[7], Htarg); + be32enc(&endiandata[19], foundNonce[1]); + lyra2_hash(vhash64, endiandata); + + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = foundNonce[1]; + res++; + if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found second nounce %08x", device_map[thr_id], foundNonce[1]); + } + else + { + if (vhash64[7] != Htarg) // don't show message if it is equal but fails fulltest + applog(LOG_WARNING, "GPU #%d: result %08x does not validate on CPU!", device_map[thr_id], foundNonce[1]); + } } pdata[19] = foundNonce[0]; - if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found nounce % 08x", thr_id, foundNonce[0], vhash64[7], Htarg); + if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found nounce %08x", device_map[thr_id], foundNonce[0]); return res; } else { - if (vhash64[7] > Htarg) // don't show message if it is equal but fails fulltest - applog(LOG_WARNING, "GPU #%d: result does not validate on CPU!", thr_id); + if (vhash64[7] 
!= Htarg) // don't show message if it is equal but fails fulltest + applog(LOG_WARNING, "GPU #%d: result %08x does not validate on CPU!", device_map[thr_id], foundNonce[0]); } } pdata[19] += throughput; + } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); *hashes_done = pdata[19] - first_nonce + 1; diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu new file mode 100644 index 0000000000..076d4217da --- /dev/null +++ b/lyra2/lyra2REv2.cu @@ -0,0 +1,230 @@ +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_skein.h" +#include "sph/sph_keccak.h" +#include "sph/sph_cubehash.h" +#include "lyra2/Lyra2.h" +} + +#include "miner.h" +#include "cuda_helper.h" + +extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash); +extern void blake256_cpu_setBlock_80(int thr_id, uint32_t *pdata); + +extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); +extern void keccak256_cpu_init(int thr_id, uint32_t threads); + +extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); +extern void skein256_cpu_init(int thr_id, uint32_t threads); + +extern void skeinCube256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); + + +extern void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash); +extern void lyra2v2_cpu_init(int thr_id, uint64_t* matrix); + +extern void bmw256_cpu_init(int thr_id); +extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t target); + +extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash); + +extern "C" void lyra2v2_hash(void *state, const void *input) +{ + sph_blake256_context ctx_blake; + 
sph_keccak256_context ctx_keccak; + sph_skein256_context ctx_skein; + sph_bmw256_context ctx_bmw; + sph_cubehash256_context ctx_cube; + + uint32_t hashA[8], hashB[8]; + + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, input, 80); + sph_blake256_close(&ctx_blake, hashA); + + sph_keccak256_init(&ctx_keccak); + sph_keccak256(&ctx_keccak, hashA, 32); + sph_keccak256_close(&ctx_keccak, hashB); + + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashB, 32); + sph_cubehash256_close(&ctx_cube, hashA); + + + LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); + + sph_skein256_init(&ctx_skein); + sph_skein256(&ctx_skein, hashB, 32); + sph_skein256_close(&ctx_skein, hashA); + + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashA, 32); + sph_cubehash256_close(&ctx_cube, hashB); + + + sph_bmw256_init(&ctx_bmw); + sph_bmw256(&ctx_bmw, hashB, 32); + sph_bmw256_close(&ctx_bmw, hashA); + + memcpy(state, hashA, 32); +} + +int scanhash_lyra2v2(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) +{ + static THREAD uint64_t *d_hash = nullptr; + static THREAD uint64_t *d_hash2 = nullptr; + + const uint32_t first_nonce = pdata[19]; + uint32_t intensity = 256 * 256 * 8; + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device_map[thr_id]); + + if(strstr(props.name, "1080")) + { + intensity = 256 * 256 * 15; +#ifdef _WIN64 + intensity = 256 * 256 * 22; +#endif + } + else if(strstr(props.name, "1070")) + { + intensity = 256 * 256 * 15; +#ifdef _WIN64 + intensity = 256 * 256 * 22; +#endif + } + else if(strstr(props.name, "970")) + { + intensity = 256 * 256 * 15; +#ifdef _WIN64 + intensity = 256 * 256 * 22; +#endif + } + else if (strstr(props.name, "980")) + { + intensity = 256 * 256 * 15; +#ifdef _WIN64 + intensity = 256 * 256 * 22; +#endif + } + else if (strstr(props.name, "750 Ti")) + { + intensity = 256 * 256 * 12; + } + else if (strstr(props.name, "750")) + { + intensity = 256 * 256 * 5; + } 
+ else if (strstr(props.name, "960")) + { + intensity = 256 * 256 * 8; + } + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); + uint32_t throughput = min(throughputmax, max_nonce - first_nonce) & 0xfffffe00; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x004f; + + static THREAD bool init = false; + if (!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (16 * 4 * 4 * sizeof(uint64_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif + CUDA_SAFE_CALL(cudaMalloc(&d_hash2, 16 * 4 * 4 * sizeof(uint64_t) * throughputmax)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 8 * sizeof(uint32_t) * throughputmax)); + + bmw256_cpu_init(thr_id); + lyra2v2_cpu_init(thr_id, d_hash2); + mining_has_stopped[thr_id] = false; + + init = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + + blake256_cpu_setBlock_80(thr_id, pdata); + + do { + uint32_t foundNonce[2] = { 0, 0 }; + + blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); +// keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash); + cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash); + lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash); + skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash); + cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash); + bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash, foundNonce, ptarget[7]); + if(stop_mining) + { + mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr); + } + if(foundNonce[0] != 0) + { + const uint32_t 
Htarg = ptarget[7]; + uint32_t vhash64[8]={0}; + if(opt_verify) + { + be32enc(&endiandata[19], foundNonce[0]); + lyra2v2_hash(vhash64, endiandata); + } + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + int res = 1; + // check if there was some other ones... + *hashes_done = pdata[19] - first_nonce + throughput; + if (foundNonce[1] != 0) + { + if(opt_verify) + { + be32enc(&endiandata[19], foundNonce[1]); + lyra2v2_hash(vhash64, endiandata); + } + if(vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = foundNonce[1]; + res++; + if(opt_benchmark) applog(LOG_INFO, "GPU #%d Found second nonce %08x", thr_id, foundNonce[1]); + } + else + { + if(vhash64[7] != Htarg) // don't show message if it is equal but fails fulltest + applog(LOG_WARNING, "GPU #%d: result does not validate on CPU!", thr_id); + } + } + pdata[19] = foundNonce[0]; + if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found nonce % 08x", thr_id, foundNonce[0]); + return res; + } + else + { + if (vhash64[7] != Htarg) // don't show message if it is equal but fails fulltest + applog(LOG_WARNING, "GPU #%d: result does not validate on CPU!", thr_id); + } + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + + *hashes_done = pdata[19] - first_nonce ; + return 0; +} diff --git a/miner.h b/miner.h index f84a3b9a11..12fc2f1fab 100644 --- a/miner.h +++ b/miner.h @@ -1,13 +1,15 @@ #ifndef __MINER_H__ #define __MINER_H__ -#ifdef __cplusplus -extern "C" { +#ifndef WIN32 +#include "ccminer-config.h" +#else +#include "ccminer-config-win.h" #endif -#include "cpuminer-config.h" - +#ifndef __cplusplus #include +#endif #include #include #include @@ -15,6 +17,9 @@ extern "C" { #include #ifdef WIN32 +#ifndef __cplusplus +#define inline __inline +#endif #define snprintf(...) 
_snprintf(__VA_ARGS__) #define strdup(x) _strdup(x) #define strncasecmp(x,y,z) _strnicmp(x,y,z) @@ -54,19 +59,10 @@ void *alloca (size_t); #include "compat.h" -#ifdef __INTELLISENSE__ -/* should be in stdint.h but... */ -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -typedef __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef __int16 int8_t; -typedef unsigned __int16 uint8_t; - -typedef unsigned __int32 time_t; -typedef char * va_list; +#ifdef _MSC_VER +#define THREAD __declspec(thread) +#else +#define THREAD __thread #endif #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0 @@ -130,28 +126,31 @@ static inline bool is_windows(void) { #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) #define WANT_BUILTIN_BSWAP -#else -#define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ - | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) -#define bswap_64(x) (((uint64_t) bswap_32((uint32_t)((x) & 0xffffffffu)) << 32) \ - | (uint64_t) bswap_32((uint32_t)((x) >> 32))) #endif -static inline uint32_t swab32(uint32_t v) +static inline uint32_t swab32(uint32_t x) { #ifdef WANT_BUILTIN_BSWAP - return __builtin_bswap32(v); + return __builtin_bswap32(x); #else - return bswap_32(v); +#ifdef _MSC_VER + return _byteswap_ulong(x); +#else + return ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)); +#endif #endif } -static inline uint64_t swab64(uint64_t v) +static inline uint64_t swab64(uint64_t x) { #ifdef WANT_BUILTIN_BSWAP - return __builtin_bswap64(v); + return __builtin_bswap64(x); #else - return bswap_64(v); +#ifdef _MSC_VER + return _byteswap_uint64(x); +#else + return (((uint64_t)bswap_32((uint32_t)((x)& 0xffffffffu)) << 32) | (uint64_t)bswap_32((uint32_t)((x) >> 32))); +#endif #endif } @@ -177,9 +176,7 @@ static inline void swab256(void *dest_p, const void *src_p) #if !HAVE_DECL_BE32DEC 
static inline uint32_t be32dec(const void *pp) { - const uint8_t *p = (uint8_t const *)pp; - return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + - ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); + return swab32(*((uint32_t*)pp)); } #endif @@ -195,11 +192,7 @@ static inline uint32_t le32dec(const void *pp) #if !HAVE_DECL_BE32ENC static inline void be32enc(void *pp, uint32_t x) { - uint8_t *p = (uint8_t *)pp; - p[3] = x & 0xff; - p[2] = (x >> 8) & 0xff; - p[1] = (x >> 16) & 0xff; - p[0] = (x >> 24) & 0xff; + *((uint32_t*)pp) = swab32(x); } #endif @@ -260,134 +253,142 @@ void aligned_free(void *ptr); #define USER_AGENT PACKAGE_NAME "/" PACKAGE_VERSION -void sha256_init(uint32_t *state); -void sha256_transform(uint32_t *state, const uint32_t *block, int swap); -void sha256d(unsigned char *hash, const unsigned char *data, int len); - -#if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__) -#define HAVE_SHA256_4WAY 0 -int sha256_use_4way(); -void sha256_init_4way(uint32_t *state); -void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); +#ifdef __cplusplus +extern "C" { #endif -#if defined(__x86_64__) && defined(USE_AVX2) -#define HAVE_SHA256_8WAY 0 -int sha256_use_8way(); -void sha256_init_8way(uint32_t *state); -void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap); + void sha256_init(uint32_t *state); + void sha256_transform(uint32_t *state, const uint32_t *block, int swap); + void sha256d(unsigned char *hash, const unsigned char *data, int len); + +#ifdef __cplusplus +} #endif -extern int scanhash_sha256d(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); +struct work_restart +{ + volatile unsigned long restart; + char padding[128 - sizeof(unsigned long)]; +}; +extern struct work_restart *work_restart; -extern unsigned char *scrypt_buffer_alloc(); +bool fulltest(const uint32_t *hash, const uint32_t *target); extern int scanhash_deep(int thr_id, 
uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_doom(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_fugue256(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); -extern int scanhash_heavy(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote, int blocklen); +extern int scanhash_c11(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_keccak256(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_myriad(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_jackpot(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_quark(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); -extern int scanhash_anime(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); 
extern int scanhash_blake256(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, int8_t blakerounds); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done, int8_t blakerounds); extern int scanhash_fresh(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); -extern int scanhash_lyra2(int thr_id, uint32_t *pdata, +extern int scanhash_lyra2v2(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *hashes_done); extern int scanhash_nist5(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_pentablake(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_qubit(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); + -extern int scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_skeincoin(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_s3(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_whc(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); + +extern int scanhash_whirlpoolx(int thr_id, 
uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_x11(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_x13(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_x14(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_x15(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_x17(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); extern int scanhash_bitcoin(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); + +extern int scanhash_neoscrypt(bool stratum, int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); +extern int scanhash_sia(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done); /* api related */ void *api_thread(void *userdata); void api_set_throughput(int thr_id, uint32_t throughput); -struct cgpu_info { +struct cgpu_info +{ uint8_t gpu_id; uint8_t thr_id; int accepted; @@ -403,7 +404,7 @@ struct cgpu_info { int gpu_clock; int gpu_memclock; size_t gpu_mem; - uint32_t gpu_usage; + uint32_t gpu_power; double gpu_vddc; int16_t gpu_pstate; int16_t gpu_bus; @@ -459,23 +460,24 @@ struct thr_info { struct cgpu_info gpu; }; -struct 
work_restart { - volatile unsigned long restart; - char padding[128 - sizeof(unsigned long)]; -}; - +extern int cuda_num_devices(); +extern int cuda_version(); +extern int cuda_gpu_clocks(struct cgpu_info *gpu); +extern bool opt_verify; extern bool opt_benchmark; extern bool opt_debug; extern bool opt_quiet; extern bool opt_protocol; extern bool opt_tracegpu; extern int opt_n_threads; +extern int num_cpus; extern int active_gpus; extern int opt_timeout; extern bool want_longpoll; extern bool have_longpoll; extern bool want_stratum; extern bool have_stratum; +extern bool opt_stratum_stats; extern char *opt_cert; extern char *opt_proxy; extern long opt_proxy_type; @@ -486,14 +488,12 @@ extern struct thr_info *thr_info; extern int longpoll_thr_id; extern int stratum_thr_id; extern int api_thr_id; -extern struct work_restart *work_restart; extern bool opt_trust_pool; -extern uint16_t opt_vote; extern uint64_t global_hashrate; extern double global_diff; -#define MAX_GPUS 16 +#define MAX_GPUS 8 extern char* device_name[MAX_GPUS]; extern int device_map[MAX_GPUS]; extern long device_sm[MAX_GPUS]; @@ -530,18 +530,16 @@ extern uint32_t gpus_intensity[MAX_GPUS]; #define CL_WHT "\x1B[01;37m" /* white */ -extern void applog(int prio, const char *fmt, ...); -extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass, - const char *rpc_req, bool, bool, int *); -extern void cbin2hex(char *out, const char *in, size_t len); -extern char *bin2hex(const unsigned char *in, size_t len); -extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len); -extern int timeval_subtract(struct timeval *result, struct timeval *x, - struct timeval *y); -extern bool fulltest(const uint32_t *hash, const uint32_t *target); -extern void diff_to_target(uint32_t *target, double diff); -extern void get_currentalgo(char* buf, int sz); -extern uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount); +void format_hashrate(double hashrate, char *output); +void 
applog(int prio, const char *fmt, ...); +json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass, const char *rpc_req, bool, bool, int *); +void cbin2hex(char *out, const char *in, size_t len); +char *bin2hex(const unsigned char *in, size_t len); +bool hex2bin(unsigned char *p, const char *hexstr, size_t len); +int timeval_subtract(struct timeval *result, struct timeval *x, struct timeval *y); +void diff_to_target(uint32_t *target, double diff); +void get_currentalgo(char* buf, int sz); +uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount); struct stratum_job { char *job_id; @@ -565,7 +563,6 @@ struct stratum_ctx { CURL *curl; char *curl_url; - char curl_err_str[CURL_ERROR_SIZE]; curl_socket_t sock; size_t sockbuf_size; char *sockbuf; @@ -589,9 +586,10 @@ struct stratum_ctx { }; struct work { - uint32_t data[32]; + uint32_t data[64]; + size_t datasize; + uint32_t midstate[8]; uint32_t target[8]; - uint32_t maxvote; char job_id[128]; size_t xnonce2_len; @@ -609,13 +607,49 @@ struct work { uint32_t scanned_to; }; +enum sha_algos +{ + ALGO_BITCOIN, + ALGO_BLAKE, + ALGO_BLAKECOIN, + ALGO_C11, + ALGO_DEEP, + ALGO_DMD_GR, + ALGO_DOOM, + ALGO_FRESH, + ALGO_FUGUE256, /* Fugue256 */ + ALGO_GROESTL, + ALGO_KECCAK, + ALGO_JACKPOT, + ALGO_LUFFA_DOOM, + ALGO_LYRA2v2, + ALGO_MYR_GR, + ALGO_NIST5, + ALGO_PENTABLAKE, + ALGO_QUARK, + ALGO_QUBIT, + ALGO_SIA, + ALGO_SKEIN, + ALGO_S3, + ALGO_SPREADX11, + ALGO_WHC, + ALGO_WHCX, + ALGO_X11, + ALGO_X13, + ALGO_X14, + ALGO_X15, + ALGO_X17, + ALGO_VANILLA, + ALGO_NEO +}; + bool stratum_socket_full(struct stratum_ctx *sctx, int timeout); bool stratum_send_line(struct stratum_ctx *sctx, char *s); char *stratum_recv_line(struct stratum_ctx *sctx); bool stratum_connect(struct stratum_ctx *sctx, const char *url); void stratum_disconnect(struct stratum_ctx *sctx); bool stratum_subscribe(struct stratum_ctx *sctx); -bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); +bool 
stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass,bool extranonce); bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); void hashlog_remember_submit(struct work* work, uint32_t nonce); @@ -647,6 +681,7 @@ extern void tq_freeze(struct thread_q *tq); extern void tq_thaw(struct thread_q *tq); void proper_exit(int reason); +void restart_threads(void); size_t time2str(char* buf, time_t timer); char* atime2str(time_t timer); @@ -655,22 +690,21 @@ void applog_hash(unsigned char *hash); void applog_compare_hash(unsigned char *hash, unsigned char *hash2); void print_hash_tests(void); -void animehash(void *state, const void *input); + void blake256hash(void *output, const void *input, int8_t rounds); void deephash(void *state, const void *input); void doomhash(void *state, const void *input); void fresh_hash(void *state, const void *input); void fugue256_hash(unsigned char* output, const unsigned char* input, int len); -void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); void keccak256_hash(void *state, const void *input); unsigned int jackpothash(void *state, const void *input); void groestlhash(void *state, const void *input); -void lyra2_hash(void *state, const void *input); void myriadhash(void *state, const void *input); void nist5hash(void *state, const void *input); void pentablakehash(void *output, const void *input); void quarkhash(void *state, const void *input); void qubithash(void *state, const void *input); +void skeincoinhash(void *output, const void *input); void s3hash(void *output, const void *input); void wcoinhash(void *state, const void *input); void x11hash(void *output, const void *input); @@ -679,8 +713,4 @@ void x14hash(void *output, const void *input); void x15hash(void *output, const void *input); void x17hash(void *output, const void *input); -#ifdef __cplusplus -} -#endif - #endif /* __MINER_H__ */ diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp index 
76ed46732c..39774c3a4c 100644 --- a/myriadgroestl.cpp +++ b/myriadgroestl.cpp @@ -1,25 +1,23 @@ #include +#ifdef __cplusplus +#include +#else #include +#endif #include -#include "uint256.h" #include "sph/sph_groestl.h" #include "miner.h" #include - -static bool init[MAX_GPUS] = { 0 }; -static uint32_t *h_found[MAX_GPUS]; +extern bool stop_mining; +extern volatile bool mining_has_stopped[MAX_GPUS]; void myriadgroestl_cpu_init(int thr_id, uint32_t threads); void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn); void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *nounce); -#define SWAP32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) - -extern "C" void myriadhash(void *state, const void *input) +void myriadhash(void *state, const void *input) { uint32_t hashA[16], hashB[16]; sph_groestl512_context ctx_groestl; @@ -36,25 +34,38 @@ extern "C" void myriadhash(void *state, const void *input) memcpy(state, hashB, 32); } -extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) +extern int scanhash_myriad(int thr_id, uint32_t *pdata, uint32_t *ptarget, + uint32_t max_nonce, uint32_t *hashes_done) { - uint32_t start_nonce = pdata[19]++; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << 17); - throughput = min(throughput, max_nonce - start_nonce); + static THREAD uint32_t *h_found = nullptr; + + uint32_t start_nonce = pdata[19]; + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1 << 19); + uint32_t throughput = min(throughputmax, max_nonce - start_nonce) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ptarget[7] = 0x0000ff; // init - if(!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { #if BIG_DEBUG #else - myriadgroestl_cpu_init(thr_id, throughput); +#if defined WIN32 && 
!defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (16 * sizeof(uint32_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + proper_exit(2); + } +#endif + myriadgroestl_cpu_init(thr_id, throughputmax); #endif - cudaMallocHost(&(h_found[thr_id]), 4 * sizeof(uint32_t)); - init[thr_id] = true; + cudaMallocHost(&h_found, 4 * sizeof(uint32_t)); + mining_has_stopped[thr_id] = false; + init = true; } uint32_t endiandata[32]; @@ -67,39 +78,64 @@ extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptar do { const uint32_t Htarg = ptarget[7]; - myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], h_found[thr_id]); + myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], h_found); - if (h_found[thr_id][0] < 0xffffffff) + if(stop_mining) {mining_has_stopped[thr_id] = true; pthread_exit(nullptr);} + if(h_found[0] != 0xffffffff) { - uint32_t tmpHash[8]; - endiandata[19] = SWAP32(h_found[thr_id][0]); - myriadhash(tmpHash, endiandata); - if (tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget)) + const uint32_t Htarg = ptarget[7]; + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_found[0]); + myriadhash(vhash64, endiandata); + + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; *hashes_done = pdata[19] - start_nonce + throughput; - if (h_found[thr_id][1] != 0xffffffff) + if (h_found[1] != 0xffffffff) { - if (opt_benchmark) applog(LOG_INFO, "found second nounce %08x", thr_id, h_found[thr_id][1]); - pdata[21] = h_found[thr_id][1]; - res++; + if(opt_verify){ be32enc(&endiandata[19], h_found[1]); + myriadhash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_found[1]); + } + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", 
device_map[thr_id], h_found[1]); + } + } + } - pdata[19] = h_found[thr_id][0]; + pdata[19] = h_found[0]; if (opt_benchmark) - applog(LOG_INFO, "found nounce %08x", thr_id, h_found[thr_id][0]); + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_found[0]); return res; } else { - if (tmpHash[7] != Htarg) // don't show message if it is equal but fails fulltest - applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", thr_id, h_found[thr_id][0]); + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); + } } } - pdata[19] += throughput; + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + applog(LOG_ERR, "GPU #%d: %s", device_map[thr_id], cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - start_nonce + 1; + *hashes_done = pdata[19] - start_nonce; return 0; } diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu new file mode 100644 index 0000000000..273a6b7e72 --- /dev/null +++ b/neoscrypt/cuda_neoscrypt.cu @@ -0,0 +1,1498 @@ +// originally from djm34 (https://github.com/djm34/ccminer-sp-neoscrypt/) + +#include +#include +#include "cuda_helper.h" +#include "cuda_vector.h" + +#define vectype uintx64bis +#define vectypeS uint28 + +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#ifdef _MSC_VER +#define THREAD __declspec(thread) +#else +#define THREAD __thread +#endif + +static THREAD cudaStream_t stream[2]; + +__device__ __align__(16) vectypeS * W; +__device__ __align__(16) vectypeS * W2; +__device__ __align__(16) vectypeS* Tr; +__device__ __align__(16) vectypeS* Tr2; +__device__ __align__(16) vectypeS* Input; +__device__ __align__(16) vectypeS* B2; + +static uint32_t *d_NNonce[MAX_GPUS]; + 
+__constant__ uint32_t pTarget[8]; +__constant__ uint32_t key_init[16]; +__constant__ uint32_t input_init[16]; +__constant__ uint32_t c_data[64]; + +#define SALSA_SMALL_UNROLL 1 +#define CHACHA_SMALL_UNROLL 1 +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U +#define BLAKE2S_KEY_SIZE 32U +#define BLOCK_SIZE 64U +#define FASTKDF_BUFFER_SIZE 256U +#define PASSWORD_LEN 80U +/// constants /// + +static const __constant__ uint8 BLAKE2S_IV_Vec = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + + +static const uint8 BLAKE2S_IV_Vechost = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint32_t BLAKE2S_SIGMA_host[10][16] = +{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, +}; + +__constant__ uint32_t BLAKE2S_SIGMA[10][16] = +{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, +}; + 
+#define SALSA(a,b,c,d) { \ + b^=rotate(a+d, 7); \ + c^=rotate(b+a, 9); \ + d^=rotate(c+b, 13); \ + a^=rotate(d+c, 18); \ +} + +#define SALSA_CORE(state) { \ +\ +SALSA(state.s0,state.s4,state.s8,state.sc); \ +SALSA(state.s5,state.s9,state.sd,state.s1); \ +SALSA(state.sa,state.se,state.s2,state.s6); \ +SALSA(state.sf,state.s3,state.s7,state.sb); \ +SALSA(state.s0,state.s1,state.s2,state.s3); \ +SALSA(state.s5,state.s6,state.s7,state.s4); \ +SALSA(state.sa,state.sb,state.s8,state.s9); \ +SALSA(state.sf,state.sc,state.sd,state.se); \ + } + +static __forceinline__ __device__ void shift256R4(uint32_t * ret, const uint8 &vec4, uint32_t shift2) +{ + uint32_t shift = 32 - shift2; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[0]) : "r"(0), "r"(vec4.s0), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift)); + asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift)); +} + +/*static __device__ __inline__ void chacha_step(uint32_t &a, uint32_t &b, uint32_t &c, uint32_t &d) +{ + asm("{\n\t" + "add.u32 %0,%0,%1; \n\t" + "xor.b32 %3,%3,%0; \n\t" + "prmt.b32 %3, %3, 0, 0x1032; \n\t" + "add.u32 %2,%2,%3; \n\t" + "xor.b32 %1,%1,%2; \n\t" + "shf.l.wrap.b32 %1, %1, %1, 12; \n\t" + "add.u32 %0,%0,%1; \n\t" + "xor.b32 %3,%3,%0; \n\t" + "prmt.b32 %3, %3, 0, 0x2103; \n\t" + "add.u32 %2,%2,%3; \n\t" + "xor.b32 %1,%1,%2; \n\t" + 
"shf.l.wrap.b32 %1, %1, %1, 7; \n\t}" + : "+r"(a), "+r"(b), "+r"(c), "+r"(d)); +} +*/ +#if __CUDA_ARCH__ >=500 + +#define CHACHA_STEP(a,b,c,d) { \ +a += b; d = __byte_perm(d^a,0,0x1032); \ +c += d; b = rotate(b^c, 12); \ +a += b; d = __byte_perm(d^a,0,0x2103); \ +c += d; b = rotate(b^c, 7); \ + } + +//#define CHACHA_STEP(a,b,c,d) chacha_step(a,b,c,d) +#else +#define CHACHA_STEP(a,b,c,d) { \ +a += b; d = rotate(d^a,16); \ +c += d; b = rotate(b^c, 12); \ +a += b; d = rotate(d^a,8); \ +c += d; b = rotate(b^c, 7); \ + } +#endif + +#define CHACHA_CORE_PARALLEL(state) { \ + \ + CHACHA_STEP(state.lo.s0, state.lo.s4, state.hi.s0, state.hi.s4); \ + CHACHA_STEP(state.lo.s1, state.lo.s5, state.hi.s1, state.hi.s5); \ + CHACHA_STEP(state.lo.s2, state.lo.s6, state.hi.s2, state.hi.s6); \ + CHACHA_STEP(state.lo.s3, state.lo.s7, state.hi.s3, state.hi.s7); \ + CHACHA_STEP(state.lo.s0, state.lo.s5, state.hi.s2, state.hi.s7); \ + CHACHA_STEP(state.lo.s1, state.lo.s6, state.hi.s3, state.hi.s4); \ + CHACHA_STEP(state.lo.s2, state.lo.s7, state.hi.s0, state.hi.s5); \ + CHACHA_STEP(state.lo.s3, state.lo.s4, state.hi.s1, state.hi.s6); \ +\ + } + +#define CHACHA_CORE_PARALLEL2(i0,state) { \ + \ + CHACHA_STEP(state[2*i0].x.x, state[2*i0].z.x, state[2*i0+1].x.x, state[2*i0+1].z.x); \ + CHACHA_STEP(state[2*i0].x.y, state[2*i0].z.y, state[2*i0+1].x.y, state[2*i0+1].z.y); \ + CHACHA_STEP(state[2*i0].y.x, state[2*i0].w.x, state[2*i0+1].y.x, state[2*i0+1].w.x); \ + CHACHA_STEP(state[2*i0].y.y, state[2*i0].w.y, state[2*i0+1].y.y, state[2*i0+1].w.y); \ + CHACHA_STEP(state[2*i0].x.x, state[2*i0].z.y, state[2*i0+1].y.x, state[2*i0+1].w.y); \ + CHACHA_STEP(state[2*i0].x.y, state[2*i0].w.x, state[2*i0+1].y.y, state[2*i0+1].z.x); \ + CHACHA_STEP(state[2*i0].y.x, state[2*i0].w.y, state[2*i0+1].x.x, state[2*i0+1].z.y); \ + CHACHA_STEP(state[2*i0].y.y, state[2*i0].z.x, state[2*i0+1].x.y, state[2*i0+1].w.x); \ +\ + } + +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U +#define BLAKE2S_KEY_SIZE 32U 
+ +#if __CUDA_ARCH__ >= 500 +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a,0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a,0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} +#else +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = rotate(d ^ a,16); \ + c += d; b = rotateR(b ^ c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \ + a += b; d = rotateR(d ^ a,8); \ + c += d; b = rotateR(b ^ c, 7); \ +} +#endif + +#if __CUDA_ARCH__ >= 500 + +#define BLAKE(a, b, c, d, key1,key2) { \ + a += b + key1; \ + d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b + key2; \ + d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ + a += b + key[idx0]; \ + d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b + key[idx1]; \ + d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE0(idx0, idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE1(idx0, idx1, a, b, c, d, key) { \ + a += b + key[idx0]; \ + d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE2(idx0, idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d^a,0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b + key[idx1]; \ + d = __byte_perm(d^a,0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#else +#define BLAKE(a, b, c, d, key1,key2) { \ + \ + a += key1; \ + a += b; d = rotate(d^a,16); \ + c += d; b = rotateR(b^c, 12); \ + a += key2; \ + a += 
b; d = rotateR(d^a,8); \ + c += d; b = rotateR(b^c, 7); \ + } + +#define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = rotate(d^a,16); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = rotateR(d^a,8); \ + c += d; b = rotateR(b^c, 7); \ + } + +#define BLAKE_G_PRE0(idx0, idx1, a, b, c, d, key) { \ + \ + a += b; d = rotate(d^a,16); \ + c += d; b = rotateR(b^c, 12); \ + \ + a += b; d = rotateR(d^a,8); \ + c += d; b = rotateR(b^c, 7); \ + } + +#define BLAKE_G_PRE1(idx0, idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = rotate(d^a,16); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = rotateR(d^a,8); \ + c += d; b = rotateR(b^c, 7); \ + } + +#define BLAKE_G_PRE2(idx0, idx1, a, b, c, d, key) { \ + \ + a += b; d = rotate(d^a,16); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = rotateR(d^a,8); \ + c += d; b = rotateR(b^c, 7); \ + } +#endif + +#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA_host[idx0][idx1]; \ + a += b + key[idx]; \ + d = ROTR32(d ^ a, 16); \ + c += d; b = ROTR32(b ^ c, 12); \ + idx = BLAKE2S_SIGMA_host[idx0][idx1 + 1]; \ + a += b + key[idx]; \ + d = ROTR32(d ^ a, 8); \ + c += d; b = ROTR32(b ^ c, 7); \ +} +#if __CUDA_ARCH__ < 500 +static __forceinline__ __device__ void Blake2S(uint32_t * __restrict__ out, const uint32_t* __restrict__ inout, const uint32_t * __restrict__ TheKey) +{ + uint16 V; + uint32_t idx; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, 
V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, 
V.hi.s1, V.hi.s6, TheKey); + + + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, 
V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + V.lo ^= V.hi ^ tmpblock; + + V.hi = BLAKE2S_IV_Vec; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + 
BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, 
V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + for(int x = 4; x < 10; ++x) + { + BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)out)[0] = V.lo; +} +#else +static __forceinline__ __device__ void Blake2S_v2(uint32_t * __restrict__ out, const uint32_t* __restrict__ inout, const uint32_t * __restrict__ TheKey) +{ + uint16 V; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, 
V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, 
V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, 
V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vec; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, 
inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + //#pragma unroll + + // 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, + // 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, + // 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0, + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[9], inout[0]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[5], inout[7]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[2], inout[4]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[10], inout[15]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, 
V.hi.s7, inout[14], inout[1]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[11], inout[12]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[6], inout[8]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[3], inout[13]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[2], inout[12]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[6], inout[10]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[0], inout[11]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[8], inout[3]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[4], inout[13]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[7], inout[5]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[15], inout[14]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[1], inout[9]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[12], inout[5]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[1], inout[15]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[14], inout[13]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[4], inout[10]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[0], inout[7]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[6], inout[3]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[9], inout[2]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[8], inout[11]); + + // 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, + // 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[13], inout[11]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[7], inout[14]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[12], inout[1]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[3], inout[9]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[5], inout[0]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[15], inout[4]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[8], inout[6]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[2], inout[10]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[6], 
inout[15]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[14], inout[9]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[11], inout[3]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[0], inout[8]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[12], inout[2]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[13], inout[7]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[1], inout[4]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[10], inout[5]); + // 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[10], inout[2]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[8], inout[4]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[7], inout[6]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[1], inout[5]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[15], inout[11]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[9], inout[14]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[3], inout[12]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[13], inout[0]); + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + ((uint8*)out)[0] = V.lo; +} +#endif + +static __forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X) +{ + uint16 state = X; + +#pragma unroll 1 + for(int i = 0; i < 10; ++i) + { + SALSA_CORE(state); + } + + return(X + state); +} + +static __device__ __forceinline__ uint16 chacha_small_parallel_rnd(const uint16 &X) +{ + uint16 st = X; +#pragma nounroll + for(int i = 0; i < 10; ++i) + { + CHACHA_CORE_PARALLEL(st); + } + return(X + st); +} + +static __device__ __forceinline__ void neoscrypt_chacha(uint16 *XV) +{ + uint16 temp; + + XV[0] = chacha_small_parallel_rnd(XV[0] ^ XV[3]); + temp = chacha_small_parallel_rnd(XV[1] ^ XV[0]); + XV[1] = chacha_small_parallel_rnd(XV[2] ^ temp); + XV[3] = chacha_small_parallel_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + +static __device__ __forceinline__ void neoscrypt_salsa(uint16 *XV) +{ + uint16 temp; + + XV[0] = 
salsa_small_scalar_rnd(XV[0] ^ XV[3]); + temp = salsa_small_scalar_rnd(XV[1] ^ XV[0]); + XV[1] = salsa_small_scalar_rnd(XV[2] ^ temp); + XV[3] = salsa_small_scalar_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + +static __forceinline__ __host__ void Blake2Shost(uint32_t * inout, const uint32_t * inkey) +{ + uint16 V; + uint32_t idx; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + V.lo = BLAKE2S_IV_Vechost; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + for(int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey); + } + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + for(int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)inout)[0] = V.lo; +} + +#if __CUDA_ARCH__ < 500 +static __forceinline__ __device__ void fastkdf256_v1(uint32_t thread, const uint32_t nonce, const uint32_t 
* __restrict__ s_data) //, vectypeS * output) +{ + vectypeS __align__(16) output[8]; + uint8_t bufidx; + uchar4 bufhelper; + uint32_t B[64]; + uint32_t qbuf, rbuf, bitbuf; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = {0}; + + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; + + ((uintx64*)(B))[0] = ((uintx64*)s_data)[0]; + ((uint32_t*)B)[19] = nonce; + ((uint32_t*)B)[39] = nonce; + ((uint32_t*)B)[59] = nonce; + + ((uint816*)input)[0] = ((uint816*)input_init)[0]; + ((uint48*)key)[0] = ((uint48*)key_init)[0]; + +#pragma unroll 1 + for(int i = 0; i < 31; ++i) + { + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + //#pragma unroll + uint32_t temp[9]; + + for(int k = 0; k < 9; ++k) + { + uint32_t indice = (k + qbuf) & 0x0000003f; + temp[k] = B[indice] ^ shifted[k]; + B[indice] = temp[k]; + } + + uint32_t a = s_data[qbuf & 0x0000003f], b; + //#pragma unroll + for(int k = 0; k<16; k+=2) + { + b = s_data[(qbuf + k + 1) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20; + if(noncepos <= 16 && qbuf<60) + { + if(noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + for(int k = 0; k<8; k++) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[k]) : "r"(temp[k]), "r"(temp[k + 1]), "r"(bitbuf)); + + 
Blake2S(input, input, key); //yeah right... + } + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + for(int i = 0; i<64; i++) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[(qbuf + i) & 0x0000003f]), "r"(B[(qbuf + i + 1) & 0x0000003f]), "r"(bitbuf)); + + ((ulonglong4*)output)[0] ^= ((ulonglong4*)input)[0]; + + ((uintx64*)output)[0] ^= ((uintx64*)s_data)[0]; + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce; + + for(int i = 0; i<8; i++) + (Input + 8 * thread)[i] = output[i]; +} + +static __forceinline__ __device__ void fastkdf32_v1(uint32_t thread, const uint32_t nonce, const uint32_t * __restrict__ salt, const uint32_t *__restrict__ s_data, uint32_t &output) +{ + uint8_t bufidx; + uchar4 bufhelper; + uint32_t temp[9]; + +#define Bshift 16*thread + + uint32_t* const B0 = (uint32_t*)&B2[Bshift]; + const uint32_t cdata7 = s_data[7]; + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; + + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = {0}; + ((uint816*)input)[0] = ((uint816*)s_data)[0]; + ((uint48*)key)[0] = ((uint48*)salt)[0]; + uint32_t qbuf, rbuf, bitbuf; + +#pragma nounroll + for(int i = 0; i < 31; i++) + { + Blake2S(input, input, key); + + bufidx = 0; + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + for(int k = 0; k < 9; k++) + { + temp[k] = B0[(k + qbuf) & 0x0000003f]; + } + + 
((uint28*)temp)[0] ^= ((uint28*)shifted)[0]; + temp[8] ^= shifted[8]; + + uint32_t a = s_data[qbuf & 0x0000003f], b; + //#pragma unroll + for(int k = 0; k<16; k += 2) + { + b = s_data[(qbuf + k + 1) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20; + if(noncepos <= 16 && qbuf<60) + { + if(noncepos != 0) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + for(int k = 0; k < 9; k++) + { + B0[(k + qbuf) & 0x0000003f] = temp[k]; + } + } + + Blake2S(input, input, key); + + bufidx = 0; + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + for(int k = 7; k < 9; k++) + { + 
temp[k] = B0[(k + qbuf) & 0x0000003f]; + } + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + output ^= input[7] ^ cdata7; +} + +#else +static __forceinline__ __device__ void fastkdf256_v2(uint32_t thread, const uint32_t nonce, const uint32_t* __restrict__ s_data) //, vectypeS * output) +{ + vectypeS __align__(16) output[8]; + uint8_t bufidx; + uchar4 bufhelper; + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; + uint32_t input[16]; + uint32_t key[16] = {0}; + uint32_t qbuf, rbuf, bitbuf; + +#define Bshift 16*thread + + uint32_t *const B = (uint32_t*)&B2[Bshift]; + ((uintx64*)(B))[0] = ((uintx64*)s_data)[0]; + + B[19] = nonce; + B[39] = nonce; + B[59] = nonce; + + ((ulonglong4*)input)[0] = ((ulonglong4*)input_init)[0]; + ((uint28*)key)[0] = ((uint28*)key_init)[0]; + + +#pragma unroll 1 + for(int i = 0; i < 31; ++i) + { + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + uint32_t temp[9]; + + for(int k = 0; k < 9; ++k) + temp[k] = __ldg(&B[(k + qbuf) & 0x0000003f]) ^ shifted[k]; + + uint32_t a = s_data[qbuf & 0x0000003f], b; + //#pragma unroll + + for(int k = 0; k<16; k+=2) + { + b = s_data[(qbuf + k + 1) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20; + if(noncepos <= 16 && qbuf<60) + { + if(noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16) + asm("shf.r.clamp.b32 %0, %1, %2, 
%3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + Blake2S_v2(input, input, key); + + for(int k = 0; k < 9; k++) + B[(k + qbuf) & 0x0000003f] = temp[k]; + } + + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + for(int i = 0; i<64; i++) + { + const uint32_t a = (qbuf + i) & 0x0000003f, b = (qbuf + i + 1) & 0x0000003f; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(__ldg(&B[a])), "r"(__ldg(&B[b])), "r"(bitbuf)); + } + + output[0] ^= ((uint28*)input)[0]; + for(int i = 0; i<8; i++) + output[i] ^= ((uint28*)s_data)[i]; + // ((ulonglong16 *)output)[0] ^= ((ulonglong16*)s_data)[0]; + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce;; + ((ulonglong16 *)(Input + 8 * thread))[0] = ((ulonglong16*)output)[0]; +} + +static __forceinline__ __device__ void fastkdf32_v3(uint32_t thread, const uint32_t nonce, const uint32_t * __restrict__ salt, const uint32_t * __restrict__ s_data, 
uint32_t &output) +{ + uint32_t temp[9]; + uint8_t bufidx; + uchar4 bufhelper; + +#define Bshift 16*thread + + uint32_t*const B0 = (uint32_t*)&B2[Bshift]; + const uint32_t cdata7 = s_data[7]; + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; + + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = {0}; + ((uint816*)input)[0] = ((uint816*)s_data)[0]; + ((uint48*)key)[0] = ((uint48*)salt)[0]; + uint32_t qbuf, rbuf, bitbuf; + +#pragma nounroll + for(int i = 0; i < 31; i++) + { + Blake2S_v2(input, input, key); + + bufidx = 0; + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + for(int k = 0; k < 9; k++) + { + temp[k] = __ldg(&B0[(k + qbuf) & 0x0000003f]); + } + + ((uint28*)temp)[0] ^= ((uint28*)shifted)[0]; + temp[8] ^= shifted[8]; + + uint32_t a = s_data[qbuf & 0x0000003f], b; + //#pragma unroll + for(int k = 0; k<16; k += 2) + { + b = s_data[(qbuf + k + 1) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x0000003f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20; + if(noncepos <= 16 && qbuf<60) + { + if(noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : 
"r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + for(int k = 0; k < 9; k++) + { + B0[(k + qbuf) & 0x0000003f] = temp[k]; + } + } + + Blake2S_v2(input, input, key); + + bufidx = 0; + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + temp[7] = __ldg(&B0[(qbuf + 7) & 0x0000003f]); + temp[8] = __ldg(&B0[(qbuf + 8) & 0x0000003f]); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + output ^= input[7] ^ cdata7; +} + +#endif + + +#define SHIFT 128 +#define TPB 128 +#define TPB2 64 + +__global__ __launch_bounds__(TPB2, 1) void neoscrypt_gpu_hash_start(int stratum, uint32_t threads, uint32_t startNonce) +{ + __shared__ uint32_t s_data[64]; + +#if TPB2<64 +#error TPB2 too low +#else +#if TPB2>64 + if(threadIdx.x<64) +#endif +#endif + s_data[threadIdx.x] = c_data[threadIdx.x]; + __syncthreads(); + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t nonce = startNonce + thread; + + const uint32_t ZNonce = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!! 
+ +#if __CUDA_ARCH__ < 500 + fastkdf256_v1(thread, ZNonce, s_data); +#else + fastkdf256_v2(thread, ZNonce, s_data); +#endif + +} + +__global__ __launch_bounds__(TPB, 1) void neoscrypt_gpu_hash_chacha1_stream1(uint32_t threads, uint32_t startNonce) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const unsigned int shiftTr = 8 * thread; + + vectypeS __align__(16) X[8]; + for(int i = 0; i<8; i++) + X[i] = __ldg4(&(Input + shiftTr)[i]); + +#pragma nounroll + for(int i = 0; i < 128; ++i) + { + uint32_t offset = shift + i * 8; + for(int j = 0; j<8; j++) + (W + offset)[j] = X[j]; + neoscrypt_chacha((uint16*)X); + + } + for(int i = 0; i<8; i++) + (Tr + shiftTr)[i] = X[i]; +} + +__global__ __launch_bounds__(TPB, 1) void neoscrypt_gpu_hash_chacha2_stream1(uint32_t threads, uint32_t startNonce) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const int shiftTr = 8 * thread; + + vectypeS __align__(16) X[8]; +#pragma unroll + for(int i = 0; i<8; i++) + X[i] = __ldg4(&(Tr + shiftTr)[i]); + +#pragma nounroll + for(int t = 0; t < 128; t++) + { + int idx = (X[6].x.x & 0x7F) << 3; + + for(int j = 0; j<8; j++) + X[j] ^= __ldg4(&(W + shift + idx)[j]); + neoscrypt_chacha((uint16*)X); + } +#pragma unroll + for(int i = 0; i<8; i++) + (Tr + shiftTr)[i] = X[i]; // best checked +} + +__global__ __launch_bounds__(TPB, 1) void neoscrypt_gpu_hash_salsa1_stream1(uint32_t threads, uint32_t startNonce) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const int shiftTr = 8 * thread; + + vectypeS __align__(16) Z[8]; +#pragma unroll + for(int i = 0; i<8; i++) + Z[i] = __ldg4(&(Input + shiftTr)[i]); + +#pragma nounroll + for(int i = 0; i < 128; ++i) + { + for(int j = 0; j<8; j++) + (W2 + shift + i * 8)[j] = Z[j]; + neoscrypt_salsa((uint16*)Z); + } +#pragma unroll + for(int i = 0; i<8; i++) + (Tr2 + shiftTr)[i] = Z[i]; 
+} + +__global__ __launch_bounds__(TPB, 1) void neoscrypt_gpu_hash_salsa2_stream1(uint32_t threads, uint32_t startNonce) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const int shiftTr = 8 * thread; + + vectypeS __align__(16) X[8]; +#pragma unroll + for(int i = 0; i<8; i++) + X[i] = __ldg4(&(Tr2 + shiftTr)[i]); + +#pragma nounroll + for(int t = 0; t < 128; t++) + { + int idx = (X[6].x.x & 0x7F) << 3; + + for(int j = 0; j<8; j++) + X[j] ^= __ldg4(&(W2 + shift + idx)[j]); + neoscrypt_salsa((uint16*)X); + } +#pragma unroll + for(int i = 0; i<8; i++) + (Tr2 + shiftTr)[i] = X[i]; // best checked +} + +__global__ __launch_bounds__(TPB2, 8) void neoscrypt_gpu_hash_ending(int stratum, uint32_t threads, uint32_t startNonce, uint32_t *nonceVector) +{ + __shared__ uint32_t s_data[64]; + +#if TPB2<64 +#error TPB2 too low +#else +#if TPB2>64 + if(threadIdx.x<64) +#endif +#endif + s_data[threadIdx.x] = c_data[threadIdx.x]; + __syncthreads(); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t nonce = startNonce + thread; + + const int shiftTr = 8 * thread; + vectypeS __align__(16) Z[8]; + uint32_t outbuf; + + const uint32_t ZNonce = (stratum) ? 
cuda_swab32(nonce) : nonce; + +#pragma unroll + for(int i = 0; i<8; i++) + Z[i] = (Tr2 + shiftTr)[i] ^ (Tr + shiftTr)[i]; + +#if __CUDA_ARCH__ < 500 + fastkdf32_v1(thread, ZNonce, (uint32_t*)Z, s_data, outbuf); +#else + fastkdf32_v3(thread, ZNonce, (uint32_t*)Z, s_data, outbuf); +#endif + if(outbuf <= pTarget[7]) + { + uint32_t tmp = atomicExch(nonceVector, nonce); + if(tmp != 0xffffffff) + nonceVector[1] = tmp; + } +} + +void neoscrypt_cpu_init_2stream(int thr_id, uint32_t threads) +{ + uint32_t *hash1; + uint32_t *hash2; // 2 streams + uint32_t *Trans1; + uint32_t *Trans2; // 2 streams + uint32_t *Trans3; // 2 streams + uint32_t *Bhash; + + CUDA_SAFE_CALL(cudaStreamCreate(&stream[0])); + CUDA_SAFE_CALL(cudaStreamCreate(&stream[1])); + + CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&hash2, 32 * 128 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans1, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans2, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans3, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Bhash, 128 * sizeof(uint32_t) * threads)); + + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(B2, &Bhash, sizeof(uint28*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(W, &hash1, sizeof(uint28*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(W2, &hash2, sizeof(uint28*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(Tr, &Trans1, sizeof(uint28*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(Tr2, &Trans2, sizeof(uint28*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(Input, &Trans3, sizeof(uint28*), 0, cudaMemcpyHostToDevice, stream[0])); +} + +__host__ void neoscrypt_cpu_hash_k4_2stream(bool stratum, int 
thr_id, uint32_t threads, uint32_t startNounce, uint32_t *result) +{ + const uint32_t threadsperblock = TPB; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + const uint32_t threadsperblock2 = TPB2; + dim3 grid2((threads + threadsperblock2 - 1) / threadsperblock2); + dim3 block2(threadsperblock2); + + neoscrypt_gpu_hash_start << > >(stratum, threads, startNounce); //fastkdf + + CUDA_SAFE_CALL(cudaStreamSynchronize(stream[0])); + + neoscrypt_gpu_hash_salsa1_stream1 << > >(threads, startNounce); //chacha + neoscrypt_gpu_hash_chacha1_stream1 << > >(threads, startNounce); //salsa + + neoscrypt_gpu_hash_salsa2_stream1 << > >(threads, startNounce); //chacha + neoscrypt_gpu_hash_chacha2_stream1 << > >(threads, startNounce); //salsa + + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + neoscrypt_gpu_hash_ending << > >(stratum, threads, startNounce, d_NNonce[thr_id]); //fastkdf+end + + CUDA_SAFE_CALL(cudaMemcpy(result, d_NNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); +} + +__host__ void neoscrypt_setBlockTarget(int thr_id, uint32_t* pdata, const void *target) +{ + uint32_t PaddedMessage[64]; + uint32_t input[16], key[16] = {0}; + + for(int i = 0; i < 19; i++) + { + PaddedMessage[i ] = pdata[i]; + PaddedMessage[i + 20] = pdata[i]; + PaddedMessage[i + 40] = pdata[i]; + } + for(int i = 0; i<4; i++) + PaddedMessage[i + 60] = pdata[i]; + + PaddedMessage[19] = 0; + PaddedMessage[39] = 0; + PaddedMessage[59] = 0; + + for(int i = 0; i < 16; i++) + input[i] = pdata[i]; + for(int i = 0; i < 8; i++) + key[i] = pdata[i]; + + Blake2Shost(input, key); + + cudaMemcpyToSymbolAsync(pTarget, target, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream[1]); + cudaMemcpyToSymbolAsync(input_init, input, 16 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream[0]); + cudaMemcpyToSymbolAsync(key_init, key, 16 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream[1]); + cudaMemcpyToSymbolAsync(c_data, PaddedMessage, 64 * 
sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream[0]); + + CUDA_SAFE_CALL(cudaMemsetAsync(d_NNonce[thr_id], 0xff, 2 * sizeof(uint32_t), stream[1])); +} diff --git a/neoscrypt/cuda_neoscrypt_tpruvot.cu b/neoscrypt/cuda_neoscrypt_tpruvot.cu new file mode 100644 index 0000000000..6edd267729 --- /dev/null +++ b/neoscrypt/cuda_neoscrypt_tpruvot.cu @@ -0,0 +1,1559 @@ +// originally from djm34 - github.com/djm34/ccminer-sp-neoscrypt +// kernel code from Nanashi Meiyo-Meijin 1.7.6-r10 (July 2016) +// modified by tpruvot + +#include +#include +#include "cuda_helper.h" +#include "cuda_vector_uint2x4.cuh" +#include "cuda_vector_tpruvot.cuh" +#include "miner.h" + +#ifdef _MSC_VER +#define THREAD __declspec(thread) +#else +#define THREAD __thread +#endif + +#define rotate ROTL32 +#define rotateR ROTR32 +#define rotateL ROTL32 + +typedef uint48 uint4x2; + +static uint32_t* d_NNonce[MAX_GPUS]; + +__device__ uint2x4* W; +__device__ uint2x4* Tr; +__device__ uint2x4* Tr2; +__device__ uint2x4* Input; + +__constant__ uint32_t c_data[64]; +__constant__ uint32_t c_target[2]; +__constant__ uint32_t key_init[16]; +__constant__ uint32_t input_init[16]; + +static const __constant__ uint8 BLAKE2S_IV_Vec = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint8 BLAKE2S_IV_Vechost = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint32_t BLAKE2S_SIGMA_host[10][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 
8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, +}; + +__constant__ uint32_t BLAKE2S_SIGMA[10][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, +}; + +#define BLOCK_SIZE 64U +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U + +#define SALSA(a,b,c,d) { \ + t = rotateL(a + d, 7U); b ^= t; \ + t = rotateL(b + a, 9U); c ^= t; \ + t = rotateL(c + b, 13U); d ^= t; \ + t = rotateL(d + c, 18U); a ^= t; \ +} + +#define shf_r_clamp32(out,a,b,shift) \ + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(out) : "r"(a), "r"(b), "r"(shift)); + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ void WarpShuffle3(uint32_t &a1, uint32_t &a2, uint32_t &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + __shared__ uint32_t shared_mem[32]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + shared_mem[thread] = a; + __threadfence_block(); + + uint32_t result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + __threadfence_block(); + + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint32_t &a1, uint32_t &a2, uint32_t &a3, uint32_t b1, uint32_t b2, uint32_t 
b3, uint32_t c) +{ + __shared__ uint32_t shared_mem[32]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + shared_mem[thread] = a1; + __threadfence_block(); + + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + + shared_mem[thread] = a2; + __threadfence_block(); + + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + + shared_mem[thread] = a3; + __threadfence_block(); + + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + __threadfence_block(); +} + +#endif + +#define CHACHA_STEP(a,b,c,d) { \ + a += b; d = __byte_perm(d ^ a, 0, 0x1032); \ + c += d; b = rotateL(b ^ c, 12); \ + a += b; d = __byte_perm(d ^ a, 0, 0x2103); \ + c += d; b = rotateL(b ^ c, 7); \ +} + +#if __CUDA_ARCH__ < 500 + +__device__ __forceinline__ +static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2) +{ +#if __CUDA_ARCH__ >= 320 + uint32_t shift = 32U - shift2; + asm("shf.r.clamp.b32 %0, 0, %1, %2;" : "=r"(ret[0]) : "r"(vec4.s0), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift)); + asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift)); +#else + // to check + shift256R(ret, vec4, shift2); +#endif +} + +#define BLAKE(a, b, c, d, key1, key2) { \ + a += key1; \ + a += b; d = rotateL(d ^ a, 16); \ + c += d; b = rotateR(b ^ c, 12); \ + a += key2; \ + a 
+= b; d = rotateR(d ^ a, 8); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = rotate(d ^ a, 16); \ + c += d; b = rotateR(b ^ c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \ + a += b; d = rotateR(d ^ a, 8); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = rotateL(d ^ a, 16); \ + c += d; b = rotateR(b ^ c, 12); \ + a += key[idx1]; \ + a += b; d = rotateR(d ^ a, 8); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE_G_PRE0(idx0, idx1, a, b, c, d, key) { \ + a += b; d = rotateL(d ^ a, 16); \ + c += d; b = rotateR(b ^ c, 12); \ + a += b; d = rotateR(d ^ a, 8); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE_G_PRE1(idx0, idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = rotateL(d ^ a, 16); \ + c += d; b = rotateR(b ^ c, 12); \ + a += b; d = rotateR(d ^ a, 8); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE_G_PRE2(idx0, idx1, a, b, c, d, key) { \ + a += b; d = rotateL(d ^ a, 16); \ + c += d; b = rotateR(b ^ c, 12); \ + a += key[idx1]; \ + a += b; d = rotateR(d ^ a, 8); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +static __forceinline__ __device__ +void Blake2S(uint32_t *out, const uint32_t* const __restrict__ inout, const uint32_t * const __restrict__ TheKey) +{ + uint16 V; + uint32_t idx; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, 
V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, 
TheKey); + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + 
BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + V.lo ^= V.hi ^ tmpblock; + + V.hi = BLAKE2S_IV_Vec; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, 
V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, 
V.lo.s4, V.hi.s1, V.hi.s6, inout); + + for(int x = 4; x < 10; x++) + { + BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)out)[0] = V.lo; +} +#endif + +#if __CUDA_ARCH__ >= 500 + +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = __byte_perm(d ^ a, 0, 0x1032); \ + c += d; b = rotateR(b ^ c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + a += b; d = __byte_perm(d ^ a, 0, 0x0321); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE(a, b, c, d, key1,key2) { \ + a += key1; \ + a += b; d = __byte_perm(d ^ a, 0, 0x1032); \ + c += d; b = rotateR(b ^ c, 12); \ + a += key2; \ + a += b; d = __byte_perm(d ^ a, 0, 0x0321); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE_G_PRE(idx0,idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = __byte_perm(d ^ a, 0, 0x1032); \ + c += d; b = rotateR(b ^ c, 12); \ + a += key[idx1]; \ + a += b; d = __byte_perm(d ^ a, 0, 0x0321); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define BLAKE_G_PRE0(idx0,idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d ^ a, 0, 0x1032); \ + c += d; b = rotateR(b ^ c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE1(idx0,idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = __byte_perm(d ^ a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = __byte_perm(d ^ a, 0, 0x0321); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +#define 
BLAKE_G_PRE2(idx0,idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d ^ a, 0, 0x1032); \ + c += d; b = rotateR(b ^ c, 12); \ + a += key[idx1]; \ + a += b; d = __byte_perm(d ^ a, 0, 0x0321); \ + c += d; b = rotateR(b ^ c, 7); \ +} + +static __forceinline__ __device__ +void Blake2S_v2(uint32_t *out, const uint32_t* __restrict__ inout, const uint32_t * __restrict__ TheKey) +{ + uint16 V; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, 
V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, 
V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 
}, + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vec; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, 
V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[9], inout[0]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[5], inout[7]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[2], inout[4]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[10], inout[15]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[14], inout[1]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[11], inout[12]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[6], inout[8]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[3], inout[13]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[2], inout[12]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[6], inout[10]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[0], inout[11]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[8], inout[3]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[4], inout[13]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[7], inout[5]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[15], inout[14]); + 
BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[1], inout[9]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[12], inout[5]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[1], inout[15]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[14], inout[13]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[4], inout[10]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[0], inout[7]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[6], inout[3]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[9], inout[2]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[8], inout[11]); + // 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[13], inout[11]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[7], inout[14]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[12], inout[1]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[3], inout[9]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[5], inout[0]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[15], inout[4]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[8], inout[6]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[2], inout[10]); + // 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[6], inout[15]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[14], inout[9]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[11], inout[3]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[0], inout[8]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[12], inout[2]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[13], inout[7]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[1], inout[4]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[10], inout[5]); + // 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[10], inout[2]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[8], inout[4]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, 
inout[7], inout[6]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[1], inout[5]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[15], inout[11]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[9], inout[14]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[3], inout[12]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[13], inout[0]); + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + ((uint8*)out)[0] = V.lo; +} + +#endif /* __CUDA_ARCH__ >= 500 */ + +#define SALSA_CORE(state) { \ + uint32_t t; \ + SALSA(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1,4); \ + SALSA(state.x, state.w, state.z, state.y); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3,4); \ +} + +#define CHACHA_CORE_PARALLEL(state) { \ + CHACHA_STEP(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3,4); \ + CHACHA_STEP(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1,4); \ +} + +__forceinline__ __device__ +uint4 salsa_small_scalar_rnd(const uint4 X) +{ + uint4 state = X; + +#pragma nounroll + for(int i = 0; i < 10; i++) + { + SALSA_CORE(state); + } + + return (X + state); +} + +__device__ __forceinline__ +uint4 chacha_small_parallel_rnd(const uint4 X) +{ + uint4 state = X; + +#pragma nounroll + for(int i = 0; i < 10; i++) + { + CHACHA_CORE_PARALLEL(state); + } + return (X + state); +} + +__device__ __forceinline__ +void neoscrypt_chacha(uint4 XV[4]) +{ + uint4 temp; + + XV[0] = chacha_small_parallel_rnd(XV[0] ^ XV[3]); + temp = chacha_small_parallel_rnd(XV[1] ^ XV[0]); + XV[1] = chacha_small_parallel_rnd(XV[2] ^ temp); + XV[3] = chacha_small_parallel_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + +__device__ __forceinline__ +void neoscrypt_salsa(uint4 XV[4]) +{ + uint4 temp; + + XV[0] = salsa_small_scalar_rnd(XV[0] ^ 
XV[3]); + temp = salsa_small_scalar_rnd(XV[1] ^ XV[0]); + XV[1] = salsa_small_scalar_rnd(XV[2] ^ temp); + XV[3] = salsa_small_scalar_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + + +#if __CUDA_ARCH__ < 500 +static __forceinline__ __device__ +void fastkdf256_v1(const uint32_t thread, const uint32_t nonce, uint32_t* const s_data) +{ + uint2x4 output[8]; + uint32_t* B = (uint32_t*)&s_data[threadIdx.x * 64U]; + uint32_t qbuf, rbuf, bitbuf; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = {0}; + + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + + ((uintx64*)(B))[0] = ((uintx64*)c_data)[0]; + ((uint32_t*)B)[19] = nonce; + ((uint32_t*)B)[39] = nonce; + ((uint32_t*)B)[59] = nonce; + + ((uint816*)input)[0] = ((uint816*)input_init)[0]; + ((uint4x2*)key)[0] = ((uint4x2*)key_init)[0]; + +#pragma unroll 1 + for(int i = 0; i < 31; i++) + { + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t shifted[9]; + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + uint32_t temp[9]; + //#pragma unroll + for(int k = 0; k < 9; k++) + { + uint32_t indice = (k + qbuf) & 0x3f; + temp[k] = B[indice] ^ shifted[k]; + B[indice] = temp[k]; + } +#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__) + uint32_t a = c_data[qbuf & 0x3f], b; + //#pragma unroll + for(int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19U - qbuf % 20U; + if(noncepos <= 16U && qbuf < 60U) + { + if(noncepos != 0) + 
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + for(int k = 0; k<8; k++) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[k]) : "r"(temp[k]), "r"(temp[k + 1]), "r"(bitbuf)); +#else + //#error SM 3.0 code missing here + printf("", data18, data20); +#endif + Blake2S(input, input, key); + } + + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + +#if __CUDA_ARCH__ >= 320 + for(int i = 0; i<64; i++) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[(qbuf + i) & 0x3f]), "r"(B[(qbuf + i + 1) & 0x3f4]), "r"(bitbuf)); +#endif + + ((ulonglong4*)output)[0] ^= ((ulonglong4*)input)[0]; + ((uintx64*)output)[0] ^= ((uintx64*)c_data)[0]; + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce; + + for(int i = 0; i<8; i++) + (Input + 8U * thread)[i] = output[i]; +} +#endif + +#if __CUDA_ARCH__ >= 500 +static __forceinline__ __device__ +void fastkdf256_v2(const uint32_t thread, const uint32_t nonce, uint32_t* const s_data) +{ + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + uint32_t input[16]; + uint32_t key[16] = {0}; + uint32_t qbuf, rbuf, bitbuf; + + uint32_t* B = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)(B))[0] = ((uintx64*)c_data)[0]; + + B[19] = nonce; + B[39] = nonce; + B[59] = nonce; + + { + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input_init[x] & 0x00ff00ff) + ((input_init[x] & 0xff00ff00) >> 8); + bufhelper = 
bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t temp[9]; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input_init[0]), "r"(bitbuf)); + temp[0] = B[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[0]), "r"(input_init[1]), "r"(shift)); + temp[1] = B[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[1]), "r"(input_init[2]), "r"(shift)); + temp[2] = B[(2 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[2]), "r"(input_init[3]), "r"(shift)); + temp[3] = B[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[3]), "r"(input_init[4]), "r"(shift)); + temp[4] = B[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[4]), "r"(input_init[5]), "r"(shift)); + temp[5] = B[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[5]), "r"(input_init[6]), "r"(shift)); + temp[6] = B[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[6]), "r"(input_init[7]), "r"(shift)); + temp[7] = B[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input_init[7]), "r"(shift)); + temp[8] = B[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; + +#pragma unroll + for(int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20U; + if(noncepos <= 16U && qbuf < 60U) + { + 
if(noncepos) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + Blake2S_v2(input, input, key); + +#pragma unroll + for(int k = 0; k < 9; k++) + B[(k + qbuf) & 0x3f] = temp[k]; + } + + for(int i = 1; i < 31; i++) + { + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t temp[9]; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[0]), "r"(bitbuf)); + temp[0] = B[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[0]), "r"(input[1]), "r"(shift)); + temp[1] = B[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[1]), "r"(input[2]), "r"(shift)); + temp[2] = B[(2 + 
qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[2]), "r"(input[3]), "r"(shift)); + temp[3] = B[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[3]), "r"(input[4]), "r"(shift)); + temp[4] = B[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[4]), "r"(input[5]), "r"(shift)); + temp[5] = B[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[5]), "r"(input[6]), "r"(shift)); + temp[6] = B[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[6]), "r"(input[7]), "r"(shift)); + temp[7] = B[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[7]), "r"(shift)); + temp[8] = B[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; + +#pragma unroll + for(int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20U; + if(noncepos <= 16U && qbuf < 60U) + { + if(noncepos) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, 
%3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + Blake2S_v2(input, input, key); + +#pragma unroll + for(int k = 0; k < 9; k++) + B[(k + qbuf) & 0x3f] = temp[k]; + } + + { + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + } + + uint2x4 output[8]; + for(int i = 0; i<64; i++) + { + const uint32_t a = (qbuf + i) & 0x3f, b = (qbuf + i + 1) & 0x3f; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[a]), "r"(B[b]), "r"(bitbuf)); + } + + output[0] ^= ((uint2x4*)input)[0]; +#pragma unroll + for(int i = 0; i<8; i++) + output[i] ^= ((uint2x4*)c_data)[i]; + + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce;; + ((ulonglong16 *)(Input + 8U * thread))[0] = ((ulonglong16*)output)[0]; +} +#endif + +#if __CUDA_ARCH__ < 500 +static __forceinline__ __device__ +uint32_t fastkdf32_v1(uint32_t thread, const uint32_t nonce, uint32_t* const salt, uint32_t* const s_data) +{ + const uint32_t cdata7 = c_data[7]; + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + + uint32_t* B0 = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; + + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + ((uint816*)input)[0] = ((uint816*)c_data)[0]; + + uint32_t key[BLAKE2S_BLOCK_SIZE / 4]; + ((uint4x2*)key)[0] = ((uint4x2*)salt)[0]; + ((uint4*)key)[2] = make_uint4(0, 0, 0, 0); + 
((uint4*)key)[3] = make_uint4(0, 0, 0, 0); + + uint32_t qbuf, rbuf, bitbuf; + uint32_t temp[9]; + +#pragma nounroll + for(int i = 0; i < 31; i++) + { + Blake2S(input, input, key); + + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + for(int k = 0; k < 9; k++) + { + temp[k] = B0[(k + qbuf) & 0x3f]; + } + + ((uint2x4*)temp)[0] ^= ((uint2x4*)shifted)[0]; + temp[8] ^= shifted[8]; + +#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__) + uint32_t a = c_data[qbuf & 0x3f], b; + //#pragma unroll + for(int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19U - qbuf % 20U; + if(noncepos <= 16U && qbuf < 60U) + { + if(noncepos != 0) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16U) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + 
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); +#else + //#error SM 3.0 code missing here + printf("", data18, data20); +#endif + for(int k = 0; k < 9; k++) + { + B0[(k + qbuf) & 0x3f] = temp[k]; + } + } + + Blake2S(input, input, key); + + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + for(int k = 7; k < 9; k++) + { + temp[k] = B0[(k + qbuf) & 0x3f]; + } + + uint32_t output; +#if __CUDA_ARCH__ >= 320 + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); +#else + output = (MAKE_ULONGLONG(temp[7], temp[8]) >> bitbuf); // to check maybe 7/8 reversed +#endif + output ^= input[7] ^ cdata7; + return output; +} +#endif + +#if __CUDA_ARCH__ >= 500 +static __forceinline__ __device__ +uint32_t fastkdf32_v3(uint32_t thread, const uint32_t nonce, uint32_t* const salt, uint32_t* const s_data) +{ + const uint32_t cdata7 = c_data[7]; + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + + uint32_t* B0 = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; + + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + ((uint816*)input)[0] = ((uint816*)c_data)[0]; + + uint32_t key[BLAKE2S_BLOCK_SIZE / 4]; + ((uint4x2*)key)[0] = ((uint4x2*)salt)[0]; + ((uint4*)key)[2] = make_uint4(0, 0, 0, 0); + ((uint4*)key)[3] = make_uint4(0, 0, 0, 0); + + uint32_t qbuf, rbuf, bitbuf; + uint32_t temp[9]; + +#pragma nounroll + for(int i = 0; i < 31; i++) + { + Blake2S_v2(input, input, key); + + 
uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[0]), "r"(bitbuf)); + temp[0] = B0[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[0]), "r"(input[1]), "r"(shift)); + temp[1] = B0[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[1]), "r"(input[2]), "r"(shift)); + temp[2] = B0[(2 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[2]), "r"(input[3]), "r"(shift)); + temp[3] = B0[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[3]), "r"(input[4]), "r"(shift)); + temp[4] = B0[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[4]), "r"(input[5]), "r"(shift)); + temp[5] = B0[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[5]), "r"(input[6]), "r"(shift)); + temp[6] = B0[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[6]), "r"(input[7]), "r"(shift)); + temp[7] = B0[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[7]), "r"(shift)); + temp[8] = B0[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; +#pragma unroll + for(int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } 
+ + const uint32_t noncepos = 19U - qbuf % 20U; + if(noncepos <= 16U && qbuf < 60U) + { + if(noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + +#pragma unroll + for(int k = 0; k < 9; k++) + { + B0[(k + qbuf) & 0x3f] = temp[k]; + } + } + + Blake2S_v2(input, input, key); + + uint32_t bufidx = 0; +#pragma unroll + for(int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + temp[7] = B0[(qbuf + 7) & 0x3f]; + temp[8] = B0[(qbuf + 8) & 0x3f]; + + uint32_t output; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + output ^= input[7] ^ cdata7; + return output; +} +#endif + + +#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ + a += b; d = ROTR32(d ^ a, 
16); \ + c += d; b = ROTR32(b ^ c, 12); \ + idx = BLAKE2S_SIGMA_host[idx0][idx1 + 1]; a += key[idx]; \ + a += b; d = ROTR32(d ^ a, 8); \ + c += d; b = ROTR32(b ^ c, 7); \ +} + +static void Blake2Shost(uint32_t * inout, const uint32_t * inkey) +{ + uint16 V; + uint32_t idx; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + V.lo = BLAKE2S_IV_Vechost; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + for(int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey); + } + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + for(int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)inout)[0] = V.lo; +} + + +#define SHIFT 128U +#define TPB 32 +#define TPB2 64 + +__global__ +__launch_bounds__(TPB2, 1) +void neoscrypt_gpu_hash_start(const int stratum, const uint32_t startNonce) +{ + 
__shared__ uint32_t s_data[64 * TPB2]; + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t nonce = startNonce + thread; + const uint32_t ZNonce = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!! + + __syncthreads(); +#if __CUDA_ARCH__ < 500 + fastkdf256_v1(thread, ZNonce, s_data); +#else + fastkdf256_v2(thread, ZNonce, s_data); +#endif +} + +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_chacha1() +{ + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + const uint32_t shift = SHIFT * 8U * (thread & 8191); + const uint32_t shiftTr = 8U * thread; + + uint4 X[4]; + for(int i = 0; i < 4; i++) + { + X[i].x = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 0 * 4 + threadIdx.x); + X[i].y = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 1 * 4 + threadIdx.x); + X[i].z = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 2 * 4 + threadIdx.x); + X[i].w = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 3 * 4 + threadIdx.x); + } + +#pragma nounroll + for(int i = 0; i < 128; i++) + { + uint32_t offset = shift + i * 8U; + for(int j = 0; j < 4; j++) + ((uint4*)(W + offset))[j * 4 + threadIdx.x] = X[j]; + neoscrypt_chacha(X); + } + +#pragma nounroll + for(int t = 0; t < 128; t++) + { + uint32_t offset = shift + (WarpShuffle(X[3].x, 0, 4) & 0x7F) * 8U; + for(int j = 0; j < 4; j++) + X[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x]; + neoscrypt_chacha(X); + } + +#pragma unroll + for(int i = 0; i < 4; i++) + { + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 0 * 4 + threadIdx.x) = X[i].x; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 1 * 4 + threadIdx.x) = X[i].y; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 2 * 4 + threadIdx.x) = X[i].z; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 3 * 4 + threadIdx.x) = X[i].w; + } +} + +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_salsa1() +{ + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + const uint32_t shift = SHIFT * 8U * (thread & 8191); + const uint32_t shiftTr = 
8U * thread; + + uint4 Z[4]; + for(int i = 0; i < 4; i++) + { + Z[i].x = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((0 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].y = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((1 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].z = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((2 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].w = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((3 + threadIdx.x) & 3) * 4 + threadIdx.x); + } + +#pragma nounroll + for(int i = 0; i < 128; i++) + { + uint32_t offset = shift + i * 8U; + for(int j = 0; j < 4; j++) + ((uint4*)(W + offset))[j * 4 + threadIdx.x] = Z[j]; + neoscrypt_salsa(Z); + } + +#pragma nounroll + for(int t = 0; t < 128; t++) + { + uint32_t offset = shift + (WarpShuffle(Z[3].x, 0, 4) & 0x7F) * 8U; + for(int j = 0; j < 4; j++) + Z[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x]; + neoscrypt_salsa(Z); + } +#pragma unroll + for(int i = 0; i < 4; i++) + { + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((0 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].x; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((1 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].y; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((2 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].z; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((3 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].w; + } +} + +__global__ +__launch_bounds__(TPB2, 8) +void neoscrypt_gpu_hash_ending(const int stratum, const uint32_t startNonce, uint32_t *resNonces) +{ + __shared__ uint32_t s_data[64 * TPB2]; + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t shiftTr = thread * 8U; + const uint32_t nonce = startNonce + thread; + const uint32_t ZNonce = (stratum) ? 
cuda_swab32(nonce) : nonce; + + __syncthreads(); + + uint2x4 Z[8]; +#pragma unroll + for(int i = 0; i<8; i++) + Z[i] = __ldg4(&(Tr2 + shiftTr)[i]) ^ __ldg4(&(Tr + shiftTr)[i]); + +#if __CUDA_ARCH__ < 500 + uint32_t outbuf = fastkdf32_v1(thread, ZNonce, (uint32_t*)Z, s_data); +#else + uint32_t outbuf = fastkdf32_v3(thread, ZNonce, (uint32_t*)Z, s_data); +#endif + + if(outbuf <= c_target[1]) + { + resNonces[0] = nonce; + //uint32_t tmp = atomicExch(resNonces, nonce); + //if(tmp != UINT32_MAX) + // resNonces[1] = tmp; + } +} + +static THREAD uint32_t *hash1 = NULL; +static THREAD uint32_t *Trans1 = NULL; +static THREAD uint32_t *Trans2 = NULL; // 2 streams +static THREAD uint32_t *Trans3 = NULL; // 2 streams + +__host__ +void neoscrypt_init(int thr_id, uint32_t threads) +{ + CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * min(8192, threads))); + CUDA_SAFE_CALL(cudaMalloc(&Trans1, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans2, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans3, 32 * sizeof(uint64_t) * threads)); + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(W, &hash1, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Tr, &Trans1, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Tr2, &Trans2, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Input, &Trans3, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); +} +/* +__host__ +void neoscrypt_free(int thr_id) +{ + cudaFree(d_NNonce[thr_id]); + + cudaFree(hash1); + cudaFree(Trans1); + cudaFree(Trans2); + cudaFree(Trans3); +} +*/ +__host__ +void neoscrypt_hash_tpruvot(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum) +{ + CUDA_SAFE_CALL(cudaMemset(d_NNonce[thr_id], 0xff, 2 * sizeof(uint32_t))); + + const int threadsperblock2 = TPB2; + dim3 grid2((threads + threadsperblock2 
- 1) / threadsperblock2);
+	dim3 block2(threadsperblock2);
+
+	const int threadsperblock = TPB;
+	dim3 grid3((threads * 4 + threadsperblock - 1) / threadsperblock);
+	dim3 block3(4, threadsperblock >> 2);
+
+	neoscrypt_gpu_hash_start <<<grid2, block2>>> (stratum, startNounce); //fastkdf
+
+	neoscrypt_gpu_hash_salsa1 <<<grid3, block3>>> ();
+	neoscrypt_gpu_hash_chacha1 <<<grid3, block3>>> ();
+
+	neoscrypt_gpu_hash_ending <<<grid2, block2>>> (stratum, startNounce, d_NNonce[thr_id]); //fastkdf+end
+
+	CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_NNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
+}
+
+__host__
+void neoscrypt_setBlockTarget_tpruvot(uint32_t* const pdata, uint32_t* const target)
+{
+	uint32_t PaddedMessage[64];
+	uint32_t input[16], key[16] = {0};
+
+	for(int i = 0; i < 19; i++)
+	{
+		PaddedMessage[i] = pdata[i];
+		PaddedMessage[i + 20] = pdata[i];
+		PaddedMessage[i + 40] = pdata[i];
+	}
+	for(int i = 0; i<4; i++)
+		PaddedMessage[i + 60] = pdata[i];
+
+	PaddedMessage[19] = 0;
+	PaddedMessage[39] = 0;
+	PaddedMessage[59] = 0;
+
+	((uint16*)input)[0] = ((uint16*)pdata)[0];
+	((uint8*)key)[0] = ((uint8*)pdata)[0];
+
+	Blake2Shost(input, key);
+
+	cudaMemcpyToSymbol(input_init, input, 64, 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(key_init, key, 64, 0, cudaMemcpyHostToDevice);
+
+	cudaMemcpyToSymbol(c_target, &target[6], 2 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(c_data, PaddedMessage, 64 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+	CUDA_SAFE_CALL(cudaGetLastError());
+}
diff --git a/neoscrypt/cuda_vector_tpruvot.cuh b/neoscrypt/cuda_vector_tpruvot.cuh
new file mode 100644
index 0000000000..c9e09411a2
--- /dev/null
+++ b/neoscrypt/cuda_vector_tpruvot.cuh
@@ -0,0 +1,720 @@
+#ifndef CUDA_VECTOR_H
+#define CUDA_VECTOR_H
+
+
+///////////////////////////////////////////////////////////////////////////////////
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
+#define __LDG_PTR "l"
+#else
+#define __LDG_PTR "r"
+#endif
+
+#include "cuda_helper.h"
+
+//typedef __device_builtin__ struct ulong16 ulong16; + + +typedef struct __align__(32) uint8 +{ + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; +} uint8; + +typedef struct __align__(64) ulonglong2to8 +{ + ulonglong2 l0, l1, l2, l3; +} ulonglong2to8; + +typedef struct __align__(128) ulonglong8to16 +{ + ulonglong2to8 lo, hi; +} ulonglong8to16; + +typedef struct __align__(256) ulonglong16to32 +{ + ulonglong8to16 lo, hi; +} ulonglong16to32; + +typedef struct __align__(512) ulonglong32to64 +{ + ulonglong16to32 lo, hi; +} ulonglong32to64; + + + +typedef struct __align__(1024) ulonglonglong +{ + ulonglong8to16 s0, s1, s2, s3, s4, s5, s6, s7; +} ulonglonglong; + + + + +typedef struct __align__(64) uint16 +{ + union + { + struct + { + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; + }; + uint8 lo; + }; + union + { + struct + { + unsigned int s8, s9, sa, sb, sc, sd, se, sf; + }; + uint8 hi; + }; +} uint16; + +typedef struct __align__(128) uint32 +{ + + uint16 lo, hi; +} uint32; + + + +struct __align__(128) ulong8 +{ + ulonglong4 s0, s1, s2, s3; +}; +typedef __device_builtin__ struct ulong8 ulong8; + + +typedef struct __align__(256) ulonglong16 +{ + ulonglong2 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf; +} ulonglong16; + +typedef struct __align__(32) uint48 +{ + uint4 s0, s1; + +} uint48; + +typedef struct __align__(64) uint816 +{ + uint48 s0, s1; + +} uint816; + +typedef struct __align__(128) uint1632 +{ + uint816 s0, s1; + +} uint1632; + +typedef struct __align__(256) uintx64 +{ + uint1632 s0, s1; + +} uintx64; + +typedef struct __align__(512) uintx128 +{ + uintx64 s0, s1; + +} uintx128; + +typedef struct __align__(1024) uintx256 +{ + uintx128 s0, s1; + +} uintx256; + + + +typedef struct __align__(256) uint4x16 +{ + uint4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; +} uint4x16; + +static __inline__ __device__ ulonglong2to8 make_ulonglong2to8(ulonglong2 s0, ulonglong2 s1, ulonglong2 s2, ulonglong2 s3) +{ + ulonglong2to8 t; t.l0 = 
s0; t.l1 = s1; t.l2 = s2; t.l3 = s3; + return t; +} + +static __inline__ __device__ ulonglong8to16 make_ulonglong8to16(const ulonglong2to8 &s0, const ulonglong2to8 &s1) +{ + ulonglong8to16 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong16to32 make_ulonglong16to32(const ulonglong8to16 &s0, const ulonglong8to16 &s1) +{ + ulonglong16to32 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong32to64 make_ulonglong32to64(const ulonglong16to32 &s0, const ulonglong16to32 &s1) +{ + ulonglong32to64 t; t.lo = s0; t.hi = s1; + return t; +} + + +static __inline__ __host__ __device__ ulonglonglong make_ulonglonglong( + const ulonglong8to16 &s0, const ulonglong8to16 &s1, const ulonglong8to16 &s2, const ulonglong8to16 &s3, + const ulonglong8to16 &s4, const ulonglong8to16 &s5, const ulonglong8to16 &s6, const ulonglong8to16 &s7) +{ + ulonglonglong t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __inline__ __device__ uint48 make_uint48(uint4 s0, uint4 s1) +{ + uint48 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uint816 make_uint816(const uint48 &s0, const uint48 &s1) +{ + uint816 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uint1632 make_uint1632(const uint816 &s0, const uint816 &s1) +{ + uint1632 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx64 make_uintx64(const uint1632 &s0, const uint1632 &s1) +{ + uintx64 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx128 make_uintx128(const uintx64 &s0, const uintx64 &s1) +{ + uintx128 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx256 make_uintx256(const uintx128 &s0, const uintx128 &s1) +{ + uintx256 t; t.s0 = s0; t.s1 = s1; + return t; +} + + +static __inline__ __device__ uintx256 make_uintx64(const uintx128 &s0, const uintx128 &s1) +{ + uintx256 t; t.s0 = s0; t.s1 = s1; 
+ return t; +} + + +static __inline__ __host__ __device__ uint4x16 make_uint4x16( + uint4 s0, uint4 s1, uint4 s2, uint4 s3, uint4 s4, uint4 s5, uint4 s6, uint4 s7, + uint4 s8, uint4 s9, uint4 sa, uint4 sb, uint4 sc, uint4 sd, uint4 se, uint4 sf) +{ + uint4x16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.s10 = sa; t.s11 = sb; t.s12 = sc; t.s13 = sd; t.s14 = se; t.s15 = sf; + return t; +} + + + + +static __inline__ __host__ __device__ uint16 make_uint16( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7, + unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf) +{ + uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b) +{ + uint16 t; t.lo = a; t.hi = b; return t; +} + +static __inline__ __host__ __device__ uint32 make_uint32(const uint16 &a, const uint16 &b) +{ + uint32 t; t.lo = a; t.hi = b; return t; +} + + +static __inline__ __host__ __device__ uint8 make_uint8( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7) +{ + uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __inline__ __host__ __device__ ulonglong16 make_ulonglong16(const ulonglong2 &s0, const ulonglong2 &s1, + const ulonglong2 &s2, const ulonglong2 &s3, const ulonglong2 &s4, const ulonglong2 &s5, const ulonglong2 &s6, const ulonglong2 &s7, + const ulonglong2 &s8, const ulonglong2 &s9, + const ulonglong2 &sa, const ulonglong2 &sb, const ulonglong2 &sc, const ulonglong2 &sd, 
const ulonglong2 &se, const ulonglong2 &sf +) +{ + ulonglong16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + + + +static __inline__ __host__ __device__ ulong8 make_ulong8( + ulonglong4 s0, ulonglong4 s1, ulonglong4 s2, ulonglong4 s3) +{ + ulong8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3;// t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +/* +static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b) +{ + return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} +static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b) +{ + return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + + + + + +static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) +{ + return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} +static __forceinline__ __device__ uint4 operator+ (uint4 a, uint4 b) +{ + return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +*/ + +static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) +{ + return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} +static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) +{ + return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) +{ + return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); +} +static __forceinline__ __device__ ulonglong2 operator+ (ulonglong2 a, ulonglong2 b) +{ + return make_ulonglong2(a.x + b.x, a.y + b.y); +} + +static __forceinline__ __device__ ulong8 operator^ (const ulong8 &a, const ulong8 &b) +{ + return make_ulong8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3); +} //, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ ulong8 operator+ (const ulong8 &a, 
const ulong8 &b) +{ + return make_ulong8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3); +} //, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + + +static __forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) +{ + return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) +{ + return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + +////////////// mess++ ////// + +static __forceinline__ __device__ uint48 operator^ (const uint48 &a, const uint48 &b) +{ + return make_uint48(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uint816 operator^ (const uint816 &a, const uint816 &b) +{ + return make_uint816(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uint1632 operator^ (const uint1632 &a, const uint1632 &b) +{ + return make_uint1632(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + + +static __forceinline__ __device__ uintx64 operator^ (const uintx64 &a, const uintx64 &b) +{ + return make_uintx64(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uintx128 operator^ (const uintx128 &a, const uintx128 &b) +{ + return make_uintx128(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uintx256 operator^ (const uintx256 &a, const uintx256 &b) +{ + return make_uintx256(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +///////////////////////// + +static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) +{ + return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) +{ 
+ return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uint32 operator^ (const uint32 &a, const uint32 &b) +{ + return make_uint32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ uint32 operator+ (const uint32 &a, const uint32 &b) +{ + return make_uint32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ ulonglong16 operator^ (const ulonglong16 &a, const ulonglong16 &b) +{ + return make_ulonglong16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf + ); +} + +static __forceinline__ __device__ ulonglong16 operator+ (const ulonglong16 &a, const ulonglong16 &b) +{ + return make_ulonglong16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf + ); +} + +static __forceinline__ __device__ void operator^= (ulong8 &a, const ulong8 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator^= (uintx64 &a, const uintx64 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator^= (uintx128 &a, const uintx128 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator^= (uintx256 &a, const uintx256 &b) +{ + a = a ^ b; +} + + +static __forceinline__ __device__ void operator^= (uint816 &a, const uint816 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator^= (uint48 &a, const uint48 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator^= (uint32 &a, const uint32 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (uint32 
&a, const uint32 &b) +{ + a = a + b; +} + +/* +static __forceinline__ __device__ void operator^= (uint4 &a, uint4 b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) +{ + a = a ^ b; +} +*/ +static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator^= (ulonglong16 &a, const ulonglong16 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (ulonglong2 &a, const ulonglong2 &b) +{ + a = a + b; +} + +static __forceinline__ __device__ +ulonglong2to8 operator^ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 ^ b.l0, a.l1 ^ b.l1, a.l2 ^ b.l2, a.l3 ^ b.l3); +} +static __forceinline__ __device__ +ulonglong2to8 operator+ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 + b.l0, a.l1 + b.l1, a.l2 + b.l2, a.l3 + b.l3); +} + + +static __forceinline__ __device__ +ulonglong8to16 operator^ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong8to16 operator+ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator^ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator+ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo + b.lo, a.hi + b.hi); +} + +static 
__forceinline__ __device__ +ulonglong32to64 operator^ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator+ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo + b.lo, a.hi + b.hi); +} + + +static __forceinline__ __device__ ulonglonglong operator^ (const ulonglonglong &a, const ulonglonglong &b) +{ + return make_ulonglonglong(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ ulonglonglong operator+ (const ulonglonglong &a, const ulonglonglong &b) +{ + return make_ulonglonglong(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + + +static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulonglong2to8 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) +{ + a = a + b; +} +static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) +{ + a = a + b; +} +static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator+= (ulonglong8to16 &a, const ulonglong8to16 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (ulonglong8to16 &a, const ulonglong8to16 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (ulonglong16to32 &a, const ulonglong16to32 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (ulonglong16to32 &a, const 
ulonglong16to32 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (ulonglong32to64 &a, const ulonglong32to64 &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (ulonglong32to64 &a, const ulonglong32to64 &b) +{ + a = a ^ b; +} + +static __forceinline__ __device__ void operator+= (ulonglonglong &a, const ulonglonglong &b) +{ + a = a + b; +} +static __forceinline__ __device__ void operator^= (ulonglonglong &a, const ulonglonglong &b) +{ + a = a ^ b; +} + +#if __CUDA_ARCH__ < 320 + +#define rotateL ROTL32 +#define rotateR ROTR32 + +#else + +static __forceinline__ __device__ uint32_t rotateL(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + +#endif + +#if __CUDA_ARCH__ < 320 + +// right shift a 64-bytes integer (256-bits) by 0 8 16 24 bits +// require a uint32_t[9] ret array +// note: djm neoscrypt implementation is near the limits of gpu capabilities +// and weird behaviors can happen when tuning device functions code... +__device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) +{ + uint8_t *v = (uint8_t*)&vec4.s0; + uint8_t *r = (uint8_t*)ret; + uint8_t bytes = (uint8_t)(shift >> 3); + ret[0] = 0; + for(uint8_t i = bytes; i<32; i++) + r[i] = v[i - bytes]; + ret[8] = vec4.s7 >> (32 - shift); // shuffled part required +} + +#else + +// same for SM 3.5+, really faster ? 
+__device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) +{ + uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[8] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s6); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[7] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s5); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[6] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s4); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[5] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s3); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[4] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s2); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[3] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s1); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[2] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s0); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[1] = cuda_swab32(truc); + asm("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); + ret[0] = cuda_swab32(truc); +} +#endif + +#if __CUDA_ARCH__ < 320 + +// copy 256 bytes +static __device__ __inline__ uintx64 ldg256(const uint4 *ptr) +{ + uintx64 ret; + uint32_t *dst = (uint32_t*)&ret.s0; + uint32_t *src = (uint32_t*)&ptr[0].x; + for(int i = 0; i < (256 / sizeof(uint32_t)); i++) + { + dst[i] = src[i]; + } + return ret; +} + +#else + +// complicated way to copy 256 bytes ;) +static __device__ __inline__ uintx64 ldg256(const uint4 *ptr) +{ + uintx64 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), 
"=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s1.s0.s0.x), "=r"(ret.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s1.z), "=r"(ret.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 
{%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + return ret; +} +#endif + +#endif // #ifndef CUDA_VECTOR_H \ No newline at end of file diff --git a/neoscrypt/cuda_vector_uint2x4.cuh b/neoscrypt/cuda_vector_uint2x4.cuh new file mode 100644 index 0000000000..780fb67077 --- /dev/null +++ b/neoscrypt/cuda_vector_uint2x4.cuh @@ -0,0 +1,72 @@ +// used in tpruvot's neoscrypt code + +#ifndef CUDA_VECTOR_UINT2x4_H +#define CUDA_VECTOR_UINT2x4_H + +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#include "cuda_helper.h" + +typedef struct __align__(16) uint2x4 +{ + uint2 x, y, z, w; +} uint2x4; + + +static __inline__ __device__ uint2x4 make_uint2x4(uint2 s0, uint2 s1, uint2 s2, uint2 s3) +{ + uint2x4 t; + t.x = s0; t.y = s1; t.z = s2; t.w = s3; + return t; +} + +static __forceinline__ __device__ uint2x4 operator^ (const uint2x4 &a, const uint2x4 &b) +{ + return make_uint2x4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +static __forceinline__ __device__ uint2x4 operator+ (const uint2x4 &a, const uint2x4 &b) +{ + return make_uint2x4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +///////////////////////// + +static __forceinline__ __device__ void operator^= (uint2x4 &a, const uint2x4 &b) +{ + a = a ^ b; +} +static __forceinline__ __device__ void operator+= (uint2x4 &a, const uint2x4 
&b) +{ + a = a + b; +} + +#if __CUDA_ARCH__ >= 320 + +static __device__ __inline__ uint2x4 __ldg4(const uint2x4 *ptr) +{ + uint2x4 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x.x), "=r"(ret.x.y), "=r"(ret.y.x), "=r"(ret.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.z.x), "=r"(ret.z.y), "=r"(ret.w.x), "=r"(ret.w.y) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ void ldg4(const uint2x4 *ptr, uint2x4 *ret) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); +} +#elif !defined(__ldg4) +#define __ldg4(x) (*(x)) +#define ldg4(ptr, ret) { *(ret) = (*(ptr)); } +#endif + +#endif // H \ No newline at end of file diff --git a/neoscrypt/neoscrypt.cu b/neoscrypt/neoscrypt.cu new file mode 100644 index 0000000000..9d7e9a6272 --- /dev/null +++ b/neoscrypt/neoscrypt.cu @@ -0,0 +1,201 @@ +#include +#include "cuda_helper.h" +#include "miner.h" +#include "sph/neoscrypt.h" + +extern void neoscrypt_setBlockTarget(int thr_id, uint32_t* pdata, const void *target); +extern void neoscrypt_cpu_init_2stream(int thr_id, uint32_t threads); +extern void neoscrypt_cpu_hash_k4_2stream(bool stratum, int thr_id, uint32_t 
threads, uint32_t startNounce, uint32_t *result); +//extern void neoscrypt_cpu_hash_k4_52(int stratum, int thr_id, int threads, uint32_t startNounce, int order, uint32_t* foundnonce); +void neoscrypt_init(int thr_id, uint32_t threads); +void neoscrypt_setBlockTarget_tpruvot(uint32_t* const pdata, uint32_t* const target); +void neoscrypt_hash_tpruvot(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum); + +int scanhash_neoscrypt(bool stratum, int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + uint32_t throughput; + static THREAD uint32_t throughputmax; + + static THREAD volatile bool init = false; + static THREAD uint32_t hw_errors = 0; + static THREAD uint32_t *foundNonce = nullptr; + static THREAD bool use_tpruvot = false; + + if(opt_benchmark) + { + ptarget[7] = 0x01ff; + stratum = 0; + } + + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device_map[thr_id]); + unsigned int cc = props.major * 10 + props.minor; + if(cc < 32) + { + applog(LOG_ERR, "GPU #%d: this gpu is not supported", device_map[thr_id]); + mining_has_stopped[thr_id] = true; + proper_exit(2); + } + unsigned int intensity = (256 * 64 * 1); // -i 14 + if(strstr(props.name, "1080 Ti")) + { + intensity = 256 * 64 * 5; + use_tpruvot = true; + } + else if(strstr(props.name, "1080")) + { + intensity = 256 * 64 * 5; + } + else if(strstr(props.name, "1070")) + { + intensity = 256 * 64 * 5; + } + else if(strstr(props.name, "970")) + { + intensity = (256 * 64 * 5); + } + else if(strstr(props.name, "980")) + { + intensity = (256 * 64 * 5); + } + else if(strstr(props.name, "980 Ti")) + { + intensity = (256 * 64 * 5); + } + else if(strstr(props.name, "750 Ti")) + { + intensity = (256 * 64 * 3); + } + else if(strstr(props.name, "750")) + { + intensity = (256 * 64 * 1); + } + else if(strstr(props.name, "960")) + 
{ + intensity = (256 * 64 * 2); + } + else if(strstr(props.name, "950")) + { + intensity = (256 * 64 * 2); + } + + throughputmax = device_intensity(device_map[thr_id], __func__, intensity) / 2; + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + // cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaMallocHost(&foundNonce, 2 * 4)); + +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (32 * 128 * sizeof(uint64_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + proper_exit(2); + } +#endif + if(use_tpruvot) + neoscrypt_init(thr_id, throughputmax); + else + neoscrypt_cpu_init_2stream(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; + init = true; + } + throughput = min(throughputmax, (max_nonce - first_nonce) / 2) & 0xffffff00; + + uint32_t endiandata[20]; + for(int k = 0; k < 20; k++) + { + if(stratum) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + else endiandata[k] = pdata[k]; + } + if(use_tpruvot) + neoscrypt_setBlockTarget_tpruvot(endiandata, ptarget); + else + neoscrypt_setBlockTarget(thr_id, endiandata, ptarget); + + + do + { + if(use_tpruvot) + neoscrypt_hash_tpruvot(thr_id, throughput, pdata[19], foundNonce, stratum); + else + neoscrypt_cpu_hash_k4_2stream(stratum, thr_id, throughput, pdata[19], foundNonce); + if(stop_mining) + { + mining_has_stopped[thr_id] = true; pthread_exit(nullptr); + } + if(foundNonce[0] != 0xffffffff) + { + uint32_t vhash64[8]={0}; + if(opt_verify) + { + if(stratum) + be32enc(&endiandata[19], foundNonce[0]); + else + endiandata[19] = foundNonce[0]; + neoscrypt((unsigned char*)endiandata, (unsigned char*)vhash64, 0x80000620); + } + if(vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) + { + *hashes_done = pdata[19] - first_nonce + throughput; + int res = 1; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d Found nonce %08x", device_map[thr_id], foundNonce[0]); + pdata[19] = foundNonce[0]; + 
if(foundNonce[1] != 0xffffffff) + { + if(opt_verify) + { + if(stratum) + { + be32enc(&endiandata[19], foundNonce[1]); + } + else + { + endiandata[19] = foundNonce[1]; + } + neoscrypt((unsigned char*)endiandata, (unsigned char*)vhash64, 0x80000620); + } + if(vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) + { + pdata[21] = foundNonce[1]; + res++; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d: Found second nonce %08x", device_map[thr_id], foundNonce[1]); + } + else + { + if(vhash64[7] != ptarget[7]) + { + applog(LOG_WARNING, "GPU #%d: Second nonce $%08X does not validate on CPU!", device_map[thr_id], foundNonce[1]); + hw_errors++; + } + } + + } + return res; + } + else + { + if(vhash64[7] != ptarget[7]) + { + applog(LOG_WARNING, "GPU #%d: Nonce $%08X does not validate on CPU!", device_map[thr_id], foundNonce[0]); + hw_errors++; + } + } +// if(hw_errors > 0) applog(LOG_WARNING, "Hardware errors: %u", hw_errors); + } + pdata[19] += throughput; + } while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + *hashes_done = pdata[19] - first_nonce ; + return 0; +} + diff --git a/nvml.cpp b/nvml.cpp index 8be79b09ce..593abe73ce 100644 --- a/nvml.cpp +++ b/nvml.cpp @@ -15,20 +15,16 @@ * */ +#include #include #include #include -#ifndef _MSC_VER -#include -#endif +#include #include "miner.h" #include "nvml.h" #include "cuda_runtime.h" -// cuda.cpp -int cuda_num_devices(); - #ifdef USE_WRAPNVML extern nvml_handle *hnvml; @@ -36,6 +32,15 @@ extern char driver_version[32]; static uint32_t device_bus_ids[MAX_GPUS] = { 0 }; +extern uint32_t device_gpu_clocks[MAX_GPUS]; +extern uint32_t device_mem_clocks[MAX_GPUS]; +extern uint32_t device_plimit[MAX_GPUS]; +extern int8_t device_pstate[MAX_GPUS]; + +uint32_t clock_prev[MAX_GPUS] = { 0 }; +uint32_t clock_prev_mem[MAX_GPUS] = { 0 }; +uint32_t limit_prev[MAX_GPUS] = { 0 }; + /* * Wrappers to emulate dlopen() on other systems like Windows */ @@ -110,14 +115,12 @@ 
nvml_handle * nvml_create() nvmlh->nvml_dll = nvml_dll; - nvmlh->nvmlInit = (nvmlReturn_t (*)(void)) - wrap_dlsym(nvmlh->nvml_dll, "nvmlInit_v2"); - if (!nvmlh->nvmlInit) { - nvmlh->nvmlInit = (nvmlReturn_t (*)(void)) - wrap_dlsym(nvmlh->nvml_dll, "nvmlInit"); - } - nvmlh->nvmlDeviceGetCount = (nvmlReturn_t (*)(int *)) - wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCount_v2"); + nvmlh->nvmlInit = (nvmlReturn_t (*)(void)) wrap_dlsym(nvmlh->nvml_dll, "nvmlInit_v2"); + if (!nvmlh->nvmlInit) + nvmlh->nvmlInit = (nvmlReturn_t (*)(void)) wrap_dlsym(nvmlh->nvml_dll, "nvmlInit"); + nvmlh->nvmlDeviceGetCount = (nvmlReturn_t (*)(int *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCount_v2"); + if (!nvmlh->nvmlDeviceGetCount) + nvmlh->nvmlDeviceGetCount = (nvmlReturn_t (*)(int *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCount"); nvmlh->nvmlDeviceGetHandleByIndex = (nvmlReturn_t (*)(int, nvmlDevice_t *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetHandleByIndex_v2"); nvmlh->nvmlDeviceGetAPIRestriction = (nvmlReturn_t (*)(nvmlDevice_t, nvmlRestrictedAPI_t, nvmlEnableState_t *)) @@ -130,10 +133,37 @@ nvml_handle * nvml_create() wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetApplicationsClock"); nvmlh->nvmlDeviceSetApplicationsClocks = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int mem, unsigned int gpu)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceSetApplicationsClocks"); + nvmlh->nvmlDeviceResetApplicationsClocks = (nvmlReturn_t (*)(nvmlDevice_t)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceResetApplicationsClocks"); + nvmlh->nvmlDeviceGetSupportedGraphicsClocks = (nvmlReturn_t (*)(nvmlDevice_t, uint32_t mem, uint32_t *num, uint32_t *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetSupportedGraphicsClocks"); + nvmlh->nvmlDeviceGetSupportedMemoryClocks = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *count, unsigned int *clocksMHz)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetSupportedMemoryClocks"); nvmlh->nvmlDeviceGetClockInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlClockType_t, unsigned int *clock)) 
wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetClockInfo"); - nvmlh->nvmlDeviceGetPciInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlPciInfo_t *)) - wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPciInfo"); + nvmlh->nvmlDeviceGetMaxClockInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlClockType_t, unsigned int *clock)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetMaxClockInfo"); + nvmlh->nvmlDeviceGetPciInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlPciInfo_t *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPciInfo_v2"); + if (!nvmlh->nvmlDeviceGetPciInfo) + nvmlh->nvmlDeviceGetPciInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlPciInfo_t *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPciInfo"); + nvmlh->nvmlDeviceGetCurrPcieLinkGeneration = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *gen)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCurrPcieLinkGeneration"); + nvmlh->nvmlDeviceGetCurrPcieLinkWidth = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *width)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCurrPcieLinkWidth"); + nvmlh->nvmlDeviceGetMaxPcieLinkGeneration = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *gen)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetMaxPcieLinkGeneration"); + nvmlh->nvmlDeviceGetMaxPcieLinkWidth = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *width)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetMaxPcieLinkWidth"); + nvmlh->nvmlDeviceGetPowerUsage = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPowerUsage"); + nvmlh->nvmlDeviceGetPowerManagementDefaultLimit = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *limit)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPowerManagementDefaultLimit"); + nvmlh->nvmlDeviceGetPowerManagementLimit = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *limit)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPowerManagementLimit"); + nvmlh->nvmlDeviceGetPowerManagementLimitConstraints = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *min, unsigned int *max)) + wrap_dlsym(nvmlh->nvml_dll, 
"nvmlDeviceGetPowerManagementLimitConstraints"); + nvmlh->nvmlDeviceSetPowerManagementLimit = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int limit)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceSetPowerManagementLimit"); nvmlh->nvmlDeviceGetName = (nvmlReturn_t (*)(nvmlDevice_t, char *, int)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetName"); nvmlh->nvmlDeviceGetTemperature = (nvmlReturn_t (*)(nvmlDevice_t, int, unsigned int *)) @@ -141,7 +171,7 @@ nvml_handle * nvml_create() nvmlh->nvmlDeviceGetFanSpeed = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetFanSpeed"); nvmlh->nvmlDeviceGetPerformanceState = (nvmlReturn_t (*)(nvmlDevice_t, int *)) - wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPowerUsage"); + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPerformanceState"); /* or nvmlDeviceGetPowerState */ nvmlh->nvmlDeviceGetSerial = (nvmlReturn_t (*)(nvmlDevice_t, char *, unsigned int)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetSerial"); nvmlh->nvmlDeviceGetUUID = (nvmlReturn_t (*)(nvmlDevice_t, char *, unsigned int)) @@ -154,17 +184,26 @@ nvml_handle * nvml_create() wrap_dlsym(nvmlh->nvml_dll, "nvmlErrorString"); nvmlh->nvmlShutdown = (nvmlReturn_t (*)()) wrap_dlsym(nvmlh->nvml_dll, "nvmlShutdown"); + // v331 + nvmlh->nvmlDeviceGetEnforcedPowerLimit = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *limit)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetEnforcedPowerLimit"); + // v340 + /* NVML_ERROR_NOT_SUPPORTED + nvmlh->nvmlDeviceGetAutoBoostedClocksEnabled = (nvmlReturn_t (*)(nvmlDevice_t, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetAutoBoostedClocksEnabled"); + nvmlh->nvmlDeviceSetAutoBoostedClocksEnabled = (nvmlReturn_t (*)(nvmlDevice_t, nvmlEnableState_t enabled)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceSetAutoBoostedClocksEnabled"); */ + // v346 + nvmlh->nvmlDeviceGetPcieThroughput = (nvmlReturn_t (*)(nvmlDevice_t, nvmlPcieUtilCounter_t, unsigned int *value)) + 
wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPcieThroughput"); if (nvmlh->nvmlInit == NULL || nvmlh->nvmlShutdown == NULL || nvmlh->nvmlErrorString == NULL || - nvmlh->nvmlSystemGetDriverVersion == NULL || nvmlh->nvmlDeviceGetCount == NULL || nvmlh->nvmlDeviceGetHandleByIndex == NULL || nvmlh->nvmlDeviceGetPciInfo == NULL || - nvmlh->nvmlDeviceGetName == NULL || - nvmlh->nvmlDeviceGetTemperature == NULL || - nvmlh->nvmlDeviceGetFanSpeed == NULL) + nvmlh->nvmlDeviceGetName == NULL) { if (opt_debug) applog(LOG_DEBUG, "Failed to obtain required NVML function pointers"); @@ -172,10 +211,20 @@ nvml_handle * nvml_create() free(nvmlh); return NULL; } + nvmlReturn_t rc; + rc = nvmlh->nvmlInit(); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "nvmlInit() failed: %s", nvmlh->nvmlErrorString(rc)); + return NULL; + } - nvmlh->nvmlInit(); - nvmlh->nvmlSystemGetDriverVersion(driver_version, sizeof(driver_version)); - nvmlh->nvmlDeviceGetCount(&nvmlh->nvml_gpucount); + rc = nvmlh->nvmlSystemGetDriverVersion(driver_version, sizeof(driver_version)); + if(rc != NVML_SUCCESS) + applog(LOG_WARNING, "nvmlSystemGetDriverVersion() failed: %s", nvmlh->nvmlErrorString(rc)); + rc = nvmlh->nvmlDeviceGetCount(&nvmlh->nvml_gpucount); + if(rc != NVML_SUCCESS) + applog(LOG_WARNING, "nvmlDeviceGetCount() failed: %s", nvmlh->nvmlErrorString(rc)); /* Query CUDA device count, in case it doesn't agree with NVML, since */ /* CUDA will only report GPUs with compute capability greater than 1.0 */ @@ -197,8 +246,11 @@ nvml_handle * nvml_create() nvmlh->app_clocks = (nvmlEnableState_t*) calloc(nvmlh->nvml_gpucount, sizeof(nvmlEnableState_t)); /* Obtain GPU device handles we're going to need repeatedly... 
*/ - for (i=0; invml_gpucount; i++) { - nvmlh->nvmlDeviceGetHandleByIndex(i, &nvmlh->devs[i]); + for (i=0; invml_gpucount; i++) + { + rc = nvmlh->nvmlDeviceGetHandleByIndex(i, &nvmlh->devs[i]); + if(rc != NVML_SUCCESS) + applog(LOG_WARNING, "GPU %d: nvmlDeviceGetHandleByIndex() failed: %s", i, nvmlh->nvmlErrorString(rc)); } /* Query PCI info for each NVML device, and build table for mapping of */ @@ -210,32 +262,23 @@ nvml_handle * nvml_create() nvmlh->nvml_pci_domain_id[i] = pciinfo.domain; nvmlh->nvml_pci_bus_id[i] = pciinfo.bus; nvmlh->nvml_pci_device_id[i] = pciinfo.device; - nvmlh->nvml_pci_subsys_id[i] = pciinfo.pci_device_id; + nvmlh->nvml_pci_subsys_id[i] = pciinfo.pci_subsystem_id; nvmlh->app_clocks[i] = NVML_FEATURE_UNKNOWN; - if (nvmlh->nvmlDeviceSetAPIRestriction) { - nvmlh->nvmlDeviceSetAPIRestriction(nvmlh->devs[i], NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, + if (nvmlh->nvmlDeviceSetAPIRestriction) + { + rc = nvmlh->nvmlDeviceSetAPIRestriction(nvmlh->devs[i], NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, NVML_FEATURE_ENABLED); + if(rc != NVML_SUCCESS && opt_debug) + applog(LOG_WARNING, "Device %d: nvmlDeviceSetAPIRestriction() failed: %s", nvmlh->devs[i], nvmlh->nvmlErrorString(rc)); /* there is only this API_SET_APPLICATION_CLOCKS on the 750 Ti (340.58) */ } - if (nvmlh->nvmlDeviceGetAPIRestriction) { - nvmlh->nvmlDeviceGetAPIRestriction(nvmlh->devs[i], NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, + if (nvmlh->nvmlDeviceGetAPIRestriction) + { + rc = nvmlh->nvmlDeviceGetAPIRestriction(nvmlh->devs[i], NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &nvmlh->app_clocks[i]); - if (nvmlh->app_clocks[i] == NVML_FEATURE_ENABLED && opt_debug) { - applog(LOG_DEBUG, "NVML application clock feature is allowed"); -#if 0 - uint32_t mem; - nvmlReturn_t rc; - rc = nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[i], NVML_CLOCK_MEM, &mem); - if (rc == NVML_SUCCESS) - applog(LOG_DEBUG, "nvmlDeviceGetDefaultApplicationsClock: mem %u", mem); - else - 
applog(LOG_DEBUG, "nvmlDeviceGetDefaultApplicationsClock: %s", nvmlh->nvmlErrorString(rc)); - rc = nvmlh->nvmlDeviceSetApplicationsClocks(nvmlh->devs[i], mem, 1228000); - if (rc != NVML_SUCCESS) - applog(LOG_DEBUG, "nvmlDeviceSetApplicationsClocks: %s", nvmlh->nvmlErrorString(rc)); -#endif - } + if(rc != NVML_SUCCESS) + applog(LOG_WARNING, "Device %d: nvmlDeviceGetAPIRestriction() failed: %s", nvmlh->devs[i], nvmlh->nvmlErrorString(rc)); } } @@ -254,7 +297,7 @@ nvml_handle * nvml_create() (nvmlh->nvml_pci_bus_id[j] == (uint32_t) props.pciBusID) && (nvmlh->nvml_pci_device_id[j] == (uint32_t) props.pciDeviceID)) { if (opt_debug) - applog(LOG_DEBUG, "CUDA GPU#%d matches NVML GPU %d by busId %u", + applog(LOG_DEBUG, "CUDA GPU %d matches NVML GPU %d by busId %u", i, j, (uint32_t) props.pciBusID); nvmlh->nvml_cuda_device_id[j] = i; nvmlh->cuda_nvml_device_id[i] = j; @@ -266,6 +309,279 @@ nvml_handle * nvml_create() return nvmlh; } +#define MAXCLOCKS 255 +/* apply config clocks to an used device */ +int nvml_set_clocks(nvml_handle *nvmlh, int dev_id) +{ + nvmlReturn_t rc; + uint32_t gpu_clk = 0, mem_clk = 0; + int n = nvmlh->cuda_nvml_device_id[dev_id]; + if (n < 0 || n >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (!device_gpu_clocks[dev_id] && !device_mem_clocks[dev_id]) + return 0; // nothing to do + + if (nvmlh->app_clocks[n] != NVML_FEATURE_ENABLED) { + applog(LOG_WARNING, "GPU #%d: NVML application clock feature is not allowed!", dev_id); + return -EPERM; + } + + uint32_t mem_prev = clock_prev_mem[dev_id]; + if(!mem_prev) + { + rc = nvmlh->nvmlDeviceGetApplicationsClock(nvmlh->devs[n], NVML_CLOCK_MEM, &mem_prev); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: unable to query memory clock", dev_id); + return -1; + } + } + uint32_t gpu_prev = clock_prev[dev_id]; + if(!gpu_prev) + { + rc = nvmlh->nvmlDeviceGetApplicationsClock(nvmlh->devs[n], NVML_CLOCK_GRAPHICS, &gpu_prev); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: unable to query 
graphics clock", dev_id); + return -1; + } + } + + rc = nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[n], NVML_CLOCK_MEM, &mem_clk); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: unable to query default memory clock", dev_id); + return -1; + } + rc = nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[n], NVML_CLOCK_GRAPHICS, &gpu_clk); + if (rc != NVML_SUCCESS) { + applog(LOG_WARNING, "GPU #%d: unable to query default graphics clock", dev_id); + return -1; + } + + if (opt_debug) + applog(LOG_DEBUG, "GPU #%d: default application clocks are %u/%u", dev_id, mem_clk, gpu_clk); + + // get application config values + if (device_mem_clocks[dev_id]) mem_clk = device_mem_clocks[dev_id]; + if (device_gpu_clocks[dev_id]) gpu_clk = device_gpu_clocks[dev_id]; + + // these functions works for the 960 and the 970 (346.72+), not for the 750 Ti + uint32_t nclocks = MAXCLOCKS; + uint32_t clocks[MAXCLOCKS] = {0}; + + rc = nvmlh->nvmlDeviceGetSupportedMemoryClocks(nvmlh->devs[n], &nclocks, clocks); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: unable to query supported memory clocks", dev_id); + return -1; + } + for (uint8_t u=0; u < nclocks; u++) { + // ordered by pstate (so highest is first memory clock - P0) + if(clocks[u] <= mem_clk) + { + mem_clk = clocks[u]; + break; + } + } + + nclocks = MAXCLOCKS; + rc = nvmlh->nvmlDeviceGetSupportedGraphicsClocks(nvmlh->devs[n], mem_clk, &nclocks, clocks); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: unable to query supported graphics clocks", dev_id); + return -1; + } + for (uint8_t u=0; u < nclocks; u++) { + // ordered desc, so get first + if (clocks[u] <= gpu_clk) { + gpu_clk = clocks[u]; + break; + } + } + + rc = nvmlh->nvmlDeviceSetApplicationsClocks(nvmlh->devs[n], mem_clk, gpu_clk); + if (rc == NVML_SUCCESS) + applog(LOG_INFO, "GPU #%d: application clocks set to %u/%u", dev_id, mem_clk, gpu_clk); + else { + applog(LOG_WARNING, "GPU #%d: %u/%u - %s", dev_id, mem_clk, 
gpu_clk, nvmlh->nvmlErrorString(rc)); + return -1; + } + + // store previous clocks for reset on exit (or during wait...) + clock_prev[dev_id] = gpu_prev; + clock_prev_mem[dev_id] = mem_prev; + return 1; +} + +/* reset default app clocks and limits on exit */ +int nvml_reset_clocks(nvml_handle *nvmlh, int dev_id) +{ + int ret = 0; + nvmlReturn_t rc; + uint32_t gpu_clk = 0, mem_clk = 0; + int n = nvmlh->cuda_nvml_device_id[dev_id]; + if (n < 0 || n >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (clock_prev[dev_id]) { + rc = nvmlh->nvmlDeviceResetApplicationsClocks(nvmlh->devs[n]); + if (rc != NVML_SUCCESS) { + applog(LOG_WARNING, "GPU #%d: unable to reset application clocks", dev_id); + } + clock_prev[dev_id] = 0; + ret = 1; + } + + if (limit_prev[dev_id]) { + uint32_t plimit = limit_prev[dev_id]; + if (nvmlh->nvmlDeviceGetPowerManagementDefaultLimit && !plimit) { + rc = nvmlh->nvmlDeviceGetPowerManagementDefaultLimit(nvmlh->devs[n], &plimit); + } else if (plimit) { + rc = NVML_SUCCESS; + } + if (rc == NVML_SUCCESS) + nvmlh->nvmlDeviceSetPowerManagementLimit(nvmlh->devs[n], plimit); + ret = 1; + } + return ret; +} + + +/** + * Set power state of a device (9xx) + * Code is similar as clocks one, which allow the change of the pstate + */ +int nvml_set_pstate(nvml_handle *nvmlh, int dev_id) +{ + nvmlReturn_t rc; + uint32_t gpu_clk = 0, mem_clk = 0; + int n = nvmlh->cuda_nvml_device_id[dev_id]; + if (n < 0 || n >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (device_pstate[dev_id] < 0) + return 0; + + if (nvmlh->app_clocks[n] != NVML_FEATURE_ENABLED) { + applog(LOG_WARNING, "GPU #%d: NVML app. 
clock feature is not allowed!", dev_id); + return -EPERM; + } + + rc = nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[n], NVML_CLOCK_MEM, &mem_clk); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: nvmlDeviceGetDefaultApplicationsClock: %s", dev_id, nvmlh->nvmlErrorString(rc)); + return -1; + } + rc = nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[n], NVML_CLOCK_GRAPHICS, &gpu_clk); + if (rc != NVML_SUCCESS) { + applog(LOG_WARNING, "GPU #%d: unable to query application clocks", dev_id); + return -EINVAL; + } + + // get application config values + if (device_mem_clocks[dev_id]) mem_clk = device_mem_clocks[dev_id]; + if (device_gpu_clocks[dev_id]) gpu_clk = device_gpu_clocks[dev_id]; + + // these functions works for the 960 and the 970 (346.72+), not for the 750 Ti + uint32_t clocks[MAXCLOCKS] = {0}; + uint32_t nclocks = MAXCLOCKS; + int8_t wanted_pstate = device_pstate[dev_id]; + rc = nvmlh->nvmlDeviceGetSupportedMemoryClocks(nvmlh->devs[n], &nclocks, clocks); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: nvmlDeviceGetSupportedMemoryClocks: %s", dev_id, nvmlh->nvmlErrorString(rc)); + return -1; + } + if(wanted_pstate < 0) + return -1; + if(wanted_pstate < nclocks) + { + mem_clk = clocks[wanted_pstate]; + } + else + { + applog(LOG_WARNING, "GPU #%d: pstate %d is unsupported"); + return -1; + } + + nclocks = MAXCLOCKS; + rc = nvmlh->nvmlDeviceGetSupportedGraphicsClocks(nvmlh->devs[n], mem_clk, &nclocks, clocks); + if(rc != NVML_SUCCESS) + { + applog(LOG_WARNING, "GPU #%d: nvmlDeviceGetSupportedGraphicsClocks: %s", dev_id, nvmlh->nvmlErrorString(rc)); + return -1; + } + if(device_gpu_clocks[dev_id] == 0) + gpu_clk = 9999; + for(uint8_t u = 0; u < nclocks; u++) + { + // ordered desc, so get first + if(clocks[u] <= gpu_clk) + { + gpu_clk = clocks[u]; + break; + } + } + + rc = nvmlh->nvmlDeviceSetApplicationsClocks(nvmlh->devs[n], mem_clk, gpu_clk); + if (rc != NVML_SUCCESS) { + applog(LOG_WARNING, "GPU #%d: pstate %s", 
dev_id, nvmlh->nvmlErrorString(rc)); + return -1; + } + + if (!opt_quiet) + applog(LOG_INFO, "GPU #%d: app clocks set to P%d (%u/%u)", dev_id, (int)wanted_pstate, mem_clk, gpu_clk); + + clock_prev[dev_id] = 1; + return 1; +} + +int nvml_set_plimit(nvml_handle *nvmlh, int dev_id) +{ + nvmlReturn_t rc = NVML_ERROR_UNKNOWN; + uint32_t gpu_clk = 0, mem_clk = 0; + int n = nvmlh->cuda_nvml_device_id[dev_id]; + if (n < 0 || n >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (!device_plimit[dev_id]) + return 0; // nothing to do + + if (!nvmlh->nvmlDeviceSetPowerManagementLimit) + return -ENOSYS; + + uint32_t plimit = device_plimit[dev_id] * 1000; + uint32_t pmin = 1000, pmax = 0, prev_limit = 0; + if (nvmlh->nvmlDeviceGetPowerManagementLimitConstraints) + rc = nvmlh->nvmlDeviceGetPowerManagementLimitConstraints(nvmlh->devs[n], &pmin, &pmax); + + if (rc != NVML_SUCCESS) { + if (!nvmlh->nvmlDeviceGetPowerManagementLimit) + return -ENOSYS; + } + nvmlh->nvmlDeviceGetPowerManagementLimit(nvmlh->devs[n], &prev_limit); + if (!pmax) pmax = prev_limit; + + plimit = min(plimit, pmax); + plimit = max(plimit, pmin); + rc = nvmlh->nvmlDeviceSetPowerManagementLimit(nvmlh->devs[n], plimit); + if (rc != NVML_SUCCESS) { + applog(LOG_WARNING, "GPU #%d: plimit %s", dev_id, nvmlh->nvmlErrorString(rc)); + return -1; + } + + if (!opt_quiet) { + applog(LOG_INFO, "GPU #%d: power limit set to %uW (allowed range is %u-%u)", + dev_id, plimit/1000U, pmin/1000U, pmax/1000U); + } + + limit_prev[dev_id] = prev_limit; + return 1; +} + int nvml_get_gpucount(nvml_handle *nvmlh, int *gpucount) { *gpucount = nvmlh->nvml_gpucount; @@ -283,7 +599,10 @@ int nvml_get_gpu_name(nvml_handle *nvmlh, int cudaindex, char *namebuf, int bufs { int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetName) + return -ENOSYS; if (nvmlh->nvmlDeviceGetName(nvmlh->devs[gpuindex], namebuf, bufsize) != NVML_SUCCESS) 
return -1; @@ -297,7 +616,10 @@ int nvml_get_tempC(nvml_handle *nvmlh, int cudaindex, unsigned int *tempC) nvmlReturn_t rc; int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetTemperature) + return -ENOSYS; rc = nvmlh->nvmlDeviceGetTemperature(nvmlh->devs[gpuindex], 0u /* NVML_TEMPERATURE_GPU */, tempC); if (rc != NVML_SUCCESS) { @@ -313,7 +635,10 @@ int nvml_get_fanpcnt(nvml_handle *nvmlh, int cudaindex, unsigned int *fanpcnt) nvmlReturn_t rc; int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetFanSpeed) + return -ENOSYS; rc = nvmlh->nvmlDeviceGetFanSpeed(nvmlh->devs[gpuindex], fanpcnt); if (rc != NVML_SUCCESS) { @@ -328,12 +653,15 @@ int nvml_get_power_usage(nvml_handle *nvmlh, int cudaindex, unsigned int *milliw { int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetPowerUsage) + return -ENOSYS; nvmlReturn_t res = nvmlh->nvmlDeviceGetPowerUsage(nvmlh->devs[gpuindex], milliwatts); if (res != NVML_SUCCESS) { - if (opt_debug) - applog(LOG_DEBUG, "nvmlDeviceGetPowerUsage: %s", nvmlh->nvmlErrorString(res)); + //if (opt_debug) + // applog(LOG_DEBUG, "nvmlDeviceGetPowerUsage: %s", nvmlh->nvmlErrorString(res)); return -1; } @@ -345,7 +673,10 @@ int nvml_get_pstate(nvml_handle *nvmlh, int cudaindex, int *pstate) { int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetPerformanceState) + return -ENOSYS; nvmlReturn_t res = nvmlh->nvmlDeviceGetPerformanceState(nvmlh->devs[gpuindex], pstate); if (res != NVML_SUCCESS) { @@ -361,7 +692,7 @@ int nvml_get_busid(nvml_handle *nvmlh, int cudaindex, int *busid) { int gpuindex = 
nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; (*busid) = nvmlh->nvml_pci_bus_id[gpuindex]; return 0; @@ -374,13 +705,17 @@ int nvml_get_serial(nvml_handle *nvmlh, int cudaindex, char *sn, int maxlen) int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; nvmlReturn_t res; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; - res = nvmlh->nvmlDeviceGetSerial(nvmlh->devs[gpuindex], sn, maxlen); - if (res == NVML_SUCCESS) { - return 0; + if (nvmlh->nvmlDeviceGetSerial) { + res = nvmlh->nvmlDeviceGetSerial(nvmlh->devs[gpuindex], sn, maxlen); + if (res == NVML_SUCCESS) + return 0; } + if (!nvmlh->nvmlDeviceGetUUID) + return -ENOSYS; + // nvmlDeviceGetUUID: GPU-f2bd642c-369f-5a14-e0b4-0d22dfe9a1fc // use a part of uuid to generate an unique serial // todo: check if there is vendor id is inside @@ -401,7 +736,10 @@ int nvml_get_bios(nvml_handle *nvmlh, int cudaindex, char *desc, int maxlen) uint32_t subids = 0; int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetVbiosVersion) + return -ENOSYS; nvmlReturn_t res = nvmlh->nvmlDeviceGetVbiosVersion(nvmlh->devs[gpuindex], desc, maxlen); if (res != NVML_SUCCESS) { @@ -412,16 +750,17 @@ int nvml_get_bios(nvml_handle *nvmlh, int cudaindex, char *desc, int maxlen) return 0; } -int nvml_get_info(nvml_handle *nvmlh, int cudaindex, uint16_t *vid, uint16_t *pid) +int nvml_get_info(nvml_handle *nvmlh, int cudaindex, uint16_t &vid, uint16_t &pid) { uint32_t subids = 0; int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) - return -1; + return -ENODEV; subids = nvmlh->nvml_pci_subsys_id[gpuindex]; - (*pid) = subids >> 16; - (*vid) = subids & 0xFFFF; + if (!subids) subids = nvmlh->nvml_pci_device_id[gpuindex]; + pid = subids >> 16; + vid = subids & 0xFFFF; return 
0; } @@ -461,7 +800,7 @@ int nvapi_temperature(unsigned int devNum, unsigned int *temperature) NvAPI_Status ret; if (devNum >= nvapi_dev_cnt) - return -1; + return -ENODEV; NV_GPU_THERMAL_SETTINGS thermal; thermal.version = NV_GPU_THERMAL_SETTINGS_VER; @@ -484,7 +823,7 @@ int nvapi_fanspeed(unsigned int devNum, unsigned int *speed) NvAPI_Status ret; if (devNum >= nvapi_dev_cnt) - return -1; + return -ENODEV; NvU32 fanspeed = 0; ret = NvAPI_GPU_GetTachReading(phys[devNum], &fanspeed); @@ -506,7 +845,7 @@ int nvapi_getpstate(unsigned int devNum, unsigned int *power) NvAPI_Status ret; if (devNum >= nvapi_dev_cnt) - return -1; + return -ENODEV; NV_GPU_PERF_PSTATE_ID CurrentPstate = NVAPI_GPU_PERF_PSTATE_UNDEFINED; /* 16 */ ret = NvAPI_GPU_GetCurrentPstate(phys[devNum], &CurrentPstate); @@ -531,7 +870,7 @@ int nvapi_getusage(unsigned int devNum, unsigned int *pct) NvAPI_Status ret; if (devNum >= nvapi_dev_cnt) - return -1; + return -ENODEV; NV_GPU_DYNAMIC_PSTATES_INFO_EX info; info.version = NV_GPU_DYNAMIC_PSTATES_INFO_EX_VER; @@ -551,13 +890,13 @@ int nvapi_getusage(unsigned int devNum, unsigned int *pct) return 0; } -int nvapi_getinfo(unsigned int devNum, uint16_t *vid, uint16_t *pid) +int nvapi_getinfo(unsigned int devNum, uint16_t &vid, uint16_t &pid) { NvAPI_Status ret; NvU32 pDeviceId, pSubSystemId, pRevisionId, pExtDeviceId; if (devNum >= nvapi_dev_cnt) - return -1; + return -ENODEV; ret = NvAPI_GPU_GetPCIIdentifiers(phys[devNum], &pDeviceId, &pSubSystemId, &pRevisionId, &pExtDeviceId); if (ret != NVAPI_OK) { @@ -568,8 +907,12 @@ int nvapi_getinfo(unsigned int devNum, uint16_t *vid, uint16_t *pid) return -1; } - (*pid) = pDeviceId >> 16; - (*vid) = pDeviceId & 0xFFFF; + pid = pDeviceId >> 16; + vid = pDeviceId & 0xFFFF; + if (vid == 0x10DE && pSubSystemId) { + vid = pSubSystemId & 0xFFFF; + pid = pSubSystemId >> 16; + } return 0; } @@ -578,7 +921,7 @@ int nvapi_getserial(unsigned int devNum, char *serial, unsigned int maxlen) { // NvAPI_Status ret; if (devNum >= 
nvapi_dev_cnt) - return -1; + return -ENODEV; sprintf(serial, ""); @@ -602,7 +945,7 @@ int nvapi_getbios(unsigned int devNum, char *desc, unsigned int maxlen) { NvAPI_Status ret; if (devNum >= nvapi_dev_cnt) - return -1; + return -ENODEV; if (maxlen < 64) // Short String return -1; @@ -657,7 +1000,7 @@ int nvapi_init() if (ret == NVAPI_OK && busId == device_bus_ids[g]) { nvapi_dev_map[g] = i; if (opt_debug) - applog(LOG_DEBUG, "CUDA GPU#%d matches NVAPI GPU %d by busId %u", + applog(LOG_DEBUG, "CUDA GPU %d matches NVAPI GPU %d by busId %u", g, i, busId); break; } @@ -679,7 +1022,7 @@ int nvapi_init() NvAPI_ShortString str; ret = NvAPI_SYS_GetDriverAndBranchVersion(&udv, str); if (ret == NVAPI_OK) { - sprintf(driver_version,"%d.%d", udv/100, udv % 100); + sprintf(driver_version,"%d.%02d", udv / 100, udv % 100); } return 0; @@ -785,12 +1128,125 @@ unsigned int gpu_power(struct cgpu_info *gpu) mw = pct; // to fix } #endif + if(gpu->gpu_power > 0) + { + // average + mw = (gpu->gpu_power + mw) / 2; + } return mw; } +static int translate_vendor_id(uint16_t vid, char *vendorname) +{ + struct VENDORS { + const uint16_t vid; + const char *name; + } vendors[] = { + { 0x1043, "ASUS" }, + { 0x107D, "Leadtek" }, + { 0x10B0, "Gainward" }, + // { 0x10DE, "NVIDIA" }, + { 0x1458, "Gigabyte" }, + { 0x1462, "MSI" }, + { 0x154B, "PNY" }, + { 0x1682, "XFX" }, + { 0x196D, "Club3D" }, + { 0x19DA, "Zotac" }, + { 0x19F1, "BFG" }, + { 0x1ACC, "PoV" }, + { 0x1B4C, "KFA2" }, + { 0x3842, "EVGA" }, + { 0x7377, "Colorful" }, + { 0, "" } + }; + + if (!vendorname) + return -EINVAL; + + for(int v=0; v < ARRAY_SIZE(vendors); v++) { + if (vid == vendors[v].vid) { + strcpy(vendorname, vendors[v].name); + return vid; + } + } + if (opt_debug && vid != 0x10DE) + applog(LOG_DEBUG, "nvml: Unknown vendor %04x\n", vid); + return 0; +} + +#ifdef HAVE_PCIDEV +extern "C" { +#include +} +static int linux_gpu_vendor(uint8_t pci_bus_id, char* vendorname, uint16_t &pid) +{ + uint16_t subvendor = 0; + struct 
pci_access *pci; + struct pci_dev *dev; + uint16_t subdevice; + + if (!vendorname) + return -EINVAL; + + pci = pci_alloc(); + if (!pci) + return -ENODEV; + + pci_init(pci); + pci_scan_bus(pci); + + for(dev = pci->devices; dev; dev = dev->next) + { + if (dev->bus == pci_bus_id && dev->vendor_id == 0x10DE) + { + if (!(dev->known_fields & PCI_FILL_CLASS)) + pci_fill_info(dev, PCI_FILL_CLASS); + if (dev->device_class != PCI_CLASS_DISPLAY_VGA) + continue; + subvendor = pci_read_word(dev, PCI_SUBSYSTEM_VENDOR_ID); + subdevice = pci_read_word(dev, PCI_SUBSYSTEM_ID); // model + + translate_vendor_id(subvendor, vendorname); + } + } + pci_cleanup(pci); + return (int) subvendor; +} +#endif + +int gpu_vendor(uint8_t pci_bus_id, char *vendorname) +{ +#ifdef HAVE_PCIDEV + uint16_t pid = 0; + return linux_gpu_vendor(pci_bus_id, vendorname, pid); +#else + uint16_t vid = 0, pid = 0; + if (hnvml) { // may not be initialized on start... + for (int id=0; id < hnvml->nvml_gpucount; id++) { + if (hnvml->nvml_pci_bus_id[id] == pci_bus_id) { + int dev_id = hnvml->nvml_cuda_device_id[id]; + nvml_get_info(hnvml, dev_id, vid, pid); + } + } + } else { +#ifdef WIN32 + for (unsigned id = 0; id < nvapi_dev_cnt; id++) { + if (device_bus_ids[id] == pci_bus_id) { + nvapi_getinfo(nvapi_dev_map[id], vid, pid); + break; + } + } +#endif + } + return translate_vendor_id(vid, vendorname); +#endif +} + int gpu_info(struct cgpu_info *gpu) { + char vendorname[32] = { 0 }; int id = gpu->gpu_id; + uint8_t bus_id = 0; gpu->nvml_id = -1; gpu->nvapi_id = -1; @@ -800,13 +1256,19 @@ int gpu_info(struct cgpu_info *gpu) if (hnvml) { gpu->nvml_id = (int8_t) hnvml->cuda_nvml_device_id[id]; - nvml_get_info(hnvml, id, &gpu->gpu_vid, &gpu->gpu_pid); +#ifdef HAVE_PCIDEV + gpu->gpu_vid = linux_gpu_vendor(hnvml->nvml_pci_bus_id[id], vendorname, gpu->gpu_pid); + if (!gpu->gpu_vid || !gpu->gpu_pid) + nvml_get_info(hnvml, id, gpu->gpu_vid, gpu->gpu_pid); +#else + nvml_get_info(hnvml, id, gpu->gpu_vid, gpu->gpu_pid); +#endif 
nvml_get_serial(hnvml, id, gpu->gpu_sn, sizeof(gpu->gpu_sn)); nvml_get_bios(hnvml, id, gpu->gpu_desc, sizeof(gpu->gpu_desc)); } #ifdef WIN32 gpu->nvapi_id = (int8_t) nvapi_dev_map[id]; - nvapi_getinfo(nvapi_dev_map[id], &gpu->gpu_vid, &gpu->gpu_pid); + nvapi_getinfo(nvapi_dev_map[id], gpu->gpu_vid, gpu->gpu_pid); nvapi_getserial(nvapi_dev_map[id], gpu->gpu_sn, sizeof(gpu->gpu_sn)); nvapi_getbios(nvapi_dev_map[id], gpu->gpu_desc, sizeof(gpu->gpu_desc)); #endif diff --git a/nvml.h b/nvml.h index d9fa5e4a08..4e1df9ff3d 100644 --- a/nvml.h +++ b/nvml.h @@ -66,6 +66,20 @@ enum nvmlClockType_t { NVML_CLOCK_MEM = 2 }; +enum nvmlPcieUtilCounter_t { + NVML_PCIE_UTIL_TX_BYTES = 0, + NVML_PCIE_UTIL_RX_BYTES = 1, + NVML_PCIE_UTIL_COUNT +}; + +enum nvmlValueType_t { + NVML_VALUE_TYPE_DOUBLE = 0, + NVML_VALUE_TYPE_UNSIGNED_INT = 1, + NVML_VALUE_TYPE_UNSIGNED_LONG = 2, + NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, + NVML_VALUE_TYPE_COUNT +}; + #define NVML_DEVICE_SERIAL_BUFFER_SIZE 30 #define NVML_DEVICE_UUID_BUFFER_SIZE 80 #define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE 32 @@ -94,8 +108,20 @@ typedef struct { nvmlReturn_t (*nvmlDeviceGetDefaultApplicationsClock)(nvmlDevice_t, nvmlClockType_t, unsigned int *); nvmlReturn_t (*nvmlDeviceGetApplicationsClock)(nvmlDevice_t, nvmlClockType_t, unsigned int *); nvmlReturn_t (*nvmlDeviceSetApplicationsClocks)(nvmlDevice_t, unsigned int, unsigned int); + nvmlReturn_t (*nvmlDeviceResetApplicationsClocks)(nvmlDevice_t); + nvmlReturn_t (*nvmlDeviceGetSupportedGraphicsClocks)(nvmlDevice_t, uint32_t mem, uint32_t *num, uint32_t *arr); + nvmlReturn_t (*nvmlDeviceGetSupportedMemoryClocks)(nvmlDevice_t, unsigned int *count, unsigned int *clocksMHz); nvmlReturn_t (*nvmlDeviceGetClockInfo)(nvmlDevice_t, nvmlClockType_t, unsigned int *); + nvmlReturn_t (*nvmlDeviceGetMaxClockInfo)(nvmlDevice_t, nvmlClockType_t, unsigned int *); + nvmlReturn_t (*nvmlDeviceGetPowerManagementDefaultLimit)(nvmlDevice_t, unsigned int *limit); + nvmlReturn_t 
(*nvmlDeviceGetPowerManagementLimit)(nvmlDevice_t, unsigned int *limit); + nvmlReturn_t (*nvmlDeviceGetPowerManagementLimitConstraints)(nvmlDevice_t, unsigned int *min, unsigned int *max); + nvmlReturn_t (*nvmlDeviceSetPowerManagementLimit)(nvmlDevice_t device, unsigned int limit); nvmlReturn_t (*nvmlDeviceGetPciInfo)(nvmlDevice_t, nvmlPciInfo_t *); + nvmlReturn_t (*nvmlDeviceGetCurrPcieLinkGeneration)(nvmlDevice_t device, unsigned int *gen); + nvmlReturn_t (*nvmlDeviceGetCurrPcieLinkWidth)(nvmlDevice_t device, unsigned int *width); + nvmlReturn_t (*nvmlDeviceGetMaxPcieLinkGeneration)(nvmlDevice_t device, unsigned int *gen); + nvmlReturn_t (*nvmlDeviceGetMaxPcieLinkWidth)(nvmlDevice_t device, unsigned int *width); nvmlReturn_t (*nvmlDeviceGetName)(nvmlDevice_t, char *, int); nvmlReturn_t (*nvmlDeviceGetTemperature)(nvmlDevice_t, int, unsigned int *); nvmlReturn_t (*nvmlDeviceGetFanSpeed)(nvmlDevice_t, unsigned int *); @@ -107,6 +133,15 @@ typedef struct { nvmlReturn_t (*nvmlSystemGetDriverVersion)(char *version, unsigned int len); char* (*nvmlErrorString)(nvmlReturn_t); nvmlReturn_t (*nvmlShutdown)(void); + // v331 + nvmlReturn_t (*nvmlDeviceGetEnforcedPowerLimit)(nvmlDevice_t, unsigned int *limit); + // v340 + //nvmlReturn_t (*nvmlDeviceGetCpuAffinity)(nvmlDevice_t, unsigned int cpuSetSize, unsigned long* cpuSet); + //nvmlReturn_t (*nvmlDeviceSetCpuAffinity)(nvmlDevice_t); + //nvmlReturn_t (*nvmlDeviceGetAutoBoostedClocksEnabled)(nvmlDevice_t, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled); + //nvmlReturn_t (*nvmlDeviceSetAutoBoostedClocksEnabled)(nvmlDevice_t, nvmlEnableState_t enabled); + // v346 + nvmlReturn_t (*nvmlDeviceGetPcieThroughput)(nvmlDevice_t, nvmlPcieUtilCounter_t, unsigned int *value); } nvml_handle; @@ -118,43 +153,11 @@ int nvml_destroy(nvml_handle *nvmlh); */ int nvml_get_gpucount(nvml_handle *nvmlh, int *gpucount); -/* - * Query the number of GPUs seen by CUDA - */ -int cuda_get_gpucount(nvml_handle *nvmlh, int *gpucount); 
+int nvml_set_plimit(nvml_handle *nvmlh, int dev_id); +int nvml_set_pstate(nvml_handle *nvmlh, int dev_id); - -/* - * query the name of the GPU model from the CUDA device ID - * - */ -int nvml_get_gpu_name(nvml_handle *nvmlh, - int gpuindex, - char *namebuf, - int bufsize); - -/* - * Query the current GPU temperature (Celsius), from the CUDA device ID - */ -int nvml_get_tempC(nvml_handle *nvmlh, - int gpuindex, unsigned int *tempC); - -/* - * Query the current GPU fan speed (percent) from the CUDA device ID - */ -int nvml_get_fanpcnt(nvml_handle *nvmlh, - int gpuindex, unsigned int *fanpcnt); - -/* - * Query the current GPU power usage in millwatts from the CUDA device ID - * - * This feature is only available on recent GPU generations and may be - * limited in some cases only to Tesla series GPUs. - * If the query is run on an unsupported GPU, this routine will return -1. - */ -int nvml_get_power_usage(nvml_handle *nvmlh, - int gpuindex, - unsigned int *milliwatts); +int nvml_set_clocks(nvml_handle *nvmlh, int dev_id); +int nvml_reset_clocks(nvml_handle *nvmlh, int dev_id); /* api functions */ @@ -162,12 +165,13 @@ unsigned int gpu_fanpercent(struct cgpu_info *gpu); unsigned int gpu_fanrpm(struct cgpu_info *gpu); float gpu_temp(struct cgpu_info *gpu); unsigned int gpu_power(struct cgpu_info *gpu); -unsigned int gpu_usage(struct cgpu_info *gpu); int gpu_pstate(struct cgpu_info *gpu); int gpu_busid(struct cgpu_info *gpu); /* pid/vid, sn and bios rev */ int gpu_info(struct cgpu_info *gpu); +int gpu_vendor(uint8_t pci_bus_id, char *vendorname); + /* nvapi functions */ #ifdef WIN32 diff --git a/pentablake.cu b/pentablake.cu index 3b184e53af..0c791dad8b 100644 --- a/pentablake.cu +++ b/pentablake.cu @@ -8,15 +8,21 @@ extern "C" { #include "sph/sph_blake.h" +} +#ifdef __cplusplus +#include +#else #include +#endif #include -} + + /* threads per block */ #define TPB 192 /* hash by cpu with blake 256 */ -extern "C" void pentablakehash(void *output, const void *input) +void 
pentablakehash(void *output, const void *input) { unsigned char hash[128]; #define hashB hash + 64 @@ -49,10 +55,9 @@ static uint32_t __align__(32) c_Target[8]; __constant__ static uint64_t __align__(32) c_data[32]; -static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_resNounce[MAX_GPUS]; static uint32_t *h_resNounce[MAX_GPUS]; -static uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX }; +static uint32_t extra_results[MAX_GPUS][2] = { UINT32_MAX }; /* prefer uint32_t to prevent size conversions = speed +5/10 % */ __constant__ @@ -103,7 +108,7 @@ const uint64_t c_u512[16] = #define G(a,b,c,d,x) { \ uint32_t idx1 = c_sigma[i][x]; \ - uint32_t idx2 = c_sigma[i][x+1]; \ + uint32_t idx2 = c_sigma[i][x + 1]; \ v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \ v[d] = SWAPDWORDS(v[d] ^ v[a]); \ v[c] += v[d]; \ @@ -188,7 +193,7 @@ void pentablake_compress(uint64_t *h, const uint64_t *block, const uint32_t T0) __global__ void pentablake_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { const uint32_t nounce = startNounce + thread; @@ -268,12 +273,12 @@ void pentablake_compress(uint64_t *h, const uint64_t *block, const uint64_t T0) __global__ void pentablake_gpu_hash_80(uint32_t threads, const uint32_t startNounce, void *outputHash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { uint64_t h[8]; uint64_t buf[16]; - uint32_t nounce = startNounce + thread; + const uint32_t nounce = startNounce + thread; //#pragma unroll 8 for(int i=0; i<8; i++) @@ -288,39 +293,27 @@ void pentablake_gpu_hash_80(uint32_t threads, const uint32_t startNounce, void * pentablake_compress(h, buf, 640ULL); -#if __CUDA_ARCH__ < 300 - uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; - #pragma unroll 8 - for (uint32_t 
i=0; i < 8; i++) { - outHash[2*i] = cuda_swab32( _HIWORD(h[i]) ); - outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) ); - } -#else uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; for (uint32_t i=0; i < 8; i++) { outHash[i] = cuda_swab64( h[i] ); } -#endif - } } __host__ -void pentablake_cpu_hash_80(int thr_id, uint32_t threads, const uint32_t startNounce, uint32_t *d_outputHash, int order) +void pentablake_cpu_hash_80(int thr_id, uint32_t threads, const uint32_t startNounce, uint32_t *d_outputHash) { dim3 grid((threads + TPB-1)/TPB); dim3 block(TPB); - pentablake_gpu_hash_80 <<>> (threads, startNounce, d_outputHash); - - //MyStreamSynchronize(NULL, order, thr_id); + pentablake_gpu_hash_80 <<>> (threads, startNounce, d_outputHash); } __global__ void pentablake_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { @@ -344,31 +337,20 @@ void pentablake_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_ // Ending round pentablake_compress(h, buf, 512); -#if __CUDA_ARCH__ < 300 - uint32_t *outHash = (uint32_t*)&g_hash[thread<<3]; - #pragma unroll 8 - for (int i=0; i < 8; i++) { - outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) ); - outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) ); - } -#else uint64_t *outHash = &g_hash[thread<<3]; for (int i=0; i < 8; i++) { outHash[i] = cuda_swab64(h[i]); } -#endif } } __host__ -void pentablake_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order) +void pentablake_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { dim3 grid((threads + TPB - 1) / TPB); dim3 block(TPB); - pentablake_gpu_hash_64 <<>> (threads, startNounce, (uint64_t*)d_outputHash); - - //MyStreamSynchronize(NULL, order, thr_id); + pentablake_gpu_hash_64 <<>> (threads, startNounce, (uint64_t*)d_outputHash); } #if 
0 @@ -385,11 +367,11 @@ uint32_t pentablake_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess) return result; - pentablake_gpu_hash_80<<>>(threads, startNounce, d_resNounce[thr_id]); + pentablake_gpu_hash_80<<>>(threads, startNounce, d_resNounce[thr_id]); cudaDeviceSynchronize(); - if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { + if (cudaSuccess == cudaMemcpyAsync(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { result = h_resNounce[thr_id][0]; - extra_results[0] = h_resNounce[thr_id][1]; + extra_results[thr_id][0] = h_resNounce[thr_id][1]; } return result; } @@ -398,11 +380,11 @@ uint32_t pentablake_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun __global__ void pentablake_gpu_check_hash(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *resNounce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = startNounce + thread; - uint32_t *inpHash = &g_hash[thread<<4]; + const uint32_t nounce = startNounce + thread; + const uint32_t *const inpHash = &g_hash[thread<<4]; if (cuda_hashisbelowtarget(inpHash, c_Target)) { @@ -414,7 +396,7 @@ void pentablake_gpu_check_hash(uint32_t threads, uint32_t startNounce, uint32_t } __host__ static -uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, int order) +uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash) { uint32_t result = UINT32_MAX; @@ -422,21 +404,21 @@ uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounc dim3 block(TPB); /* Check error on Ctrl+C or kill to prevent segfaults on exit */ - if (cudaMemset(d_resNounce[thr_id], 0xff, 
2*sizeof(uint32_t)) != cudaSuccess) + if (cudaMemsetAsync(d_resNounce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]) != cudaSuccess) return result; - pentablake_gpu_check_hash <<>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]); + pentablake_gpu_check_hash <<>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]); - if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { - result = h_resNounce[thr_id][0]; - extra_results[0] = h_resNounce[thr_id][1]; - } + CUDA_SAFE_CALL(cudaMemcpyAsync(h_resNounce[thr_id], d_resNounce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); + cudaStreamSynchronize(gpustream[thr_id]); + result = h_resNounce[thr_id][0]; + extra_results[thr_id][0] = h_resNounce[thr_id][1]; return result; } __host__ -void pentablake_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget) +void pentablake_cpu_setBlock_80(int thr_id, uint32_t *pdata, const uint32_t *ptarget) { uint8_t data[128]; memcpy((void*) data, (void*) pdata, 80); @@ -448,32 +430,47 @@ void pentablake_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget) data[126] = 0x02; data[127] = 0x80; - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } -static bool init[MAX_GPUS] = { 0 }; +static volatile bool init[MAX_GPUS] = { false }; -extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, 
const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) +extern int scanhash_pentablake(int thr_id, uint32_t *pdata, uint32_t *ptarget, + uint32_t max_nonce, uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + const uint32_t first_nonce = pdata[19]; uint32_t endiandata[20]; int rc = 0; - uint32_t throughput = device_intensity(thr_id, __func__, 128U * 2560); // 18.5 - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 128U * 2560); // 18.5 + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000F; + ptarget[7] = 0x000F; - if (!init[thr_id]) { - if (active_gpus > 1) { - CUDA_CALL_OR_RET_X(cudaSetDevice(device_map[thr_id]), 0); + if (!init[thr_id]) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / 64) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); } - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput)); +#endif + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 64 * throughputmax)); CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t))); CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t))); + mining_has_stopped[thr_id] = false; init[thr_id] = true; } @@ -481,52 +478,55 @@ extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t * for (int k=0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); - pentablake_cpu_setBlock_80(endiandata, ptarget); + pentablake_cpu_setBlock_80(thr_id, endiandata, ptarget); do { - int order = 0; // GPU HASH - pentablake_cpu_hash_80(thr_id, 
throughput, pdata[19], d_hash[thr_id], order++); - - pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - uint32_t foundNonce = pentablake_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - if (foundNonce != UINT32_MAX) + pentablake_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + + pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + CUDA_SAFE_CALL(cudaGetLastError()); + uint32_t foundNonce = pentablake_check_hash(thr_id, throughput, pdata[19], d_hash); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce != UINT32_MAX) { const uint32_t Htarg = ptarget[7]; - uint32_t vhashcpu[8]; + uint32_t vhashcpu[8] = { 0 }; - be32enc(&endiandata[19], foundNonce); - pentablakehash(vhashcpu, endiandata); - - if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { + if(opt_verify) + { + be32enc(&endiandata[19], foundNonce); + pentablakehash(vhashcpu, endiandata); + } + if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) + { rc = 1; *hashes_done = pdata[19] - first_nonce + throughput; - if (extra_results[0] != UINT32_MAX) { + if (extra_results[thr_id][0] != UINT32_MAX) { // Rare but possible if the throughput is big applog(LOG_NOTICE, "GPU found more than one result yippee!"); - pdata[21] = extra_results[0]; - extra_results[0] = UINT32_MAX; + pdata[21] = extra_results[thr_id][0]; + extra_results[thr_id][0] = UINT32_MAX; rc++; } pdata[19] = foundNonce; return rc; } else if (vhashcpu[7] > Htarg) { - 
applog(LOG_WARNING, "GPU #%d: result for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg); + applog(LOG_WARNING, "GPU #%d: result for nounce %08x is not in range: %x > %x", device_map[thr_id], foundNonce, vhashcpu[7], Htarg); } else { - applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce); + applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", device_map[thr_id], foundNonce); } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return rc; } diff --git a/quark/animecoin.cu b/quark/animecoin.cu index 4994e0a831..426d5e85d8 100644 --- a/quark/animecoin.cu +++ b/quark/animecoin.cu @@ -13,42 +13,39 @@ extern "C" static uint32_t *d_hash[MAX_GPUS]; // Speicher zur Generierung der Noncevektoren für die bedingten Hashes -static uint32_t *d_animeNonces[MAX_GPUS]; static uint32_t *d_branch1Nonces[MAX_GPUS]; static uint32_t *d_branch2Nonces[MAX_GPUS]; static uint32_t *d_branch3Nonces[MAX_GPUS]; -extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_setBlock_80(void *pdata); -extern void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); +extern void quark_bmw512_cpu_setBlock_80(int thr_id, void *pdata); 
+extern void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_keccak512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_keccak512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void 
quark_jh512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_jh512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, - uint32_t *d_nonces2, uint32_t *nrm2, - int order); + uint32_t *d_nonces2, uint32_t *nrm2); extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, uint32_t *nrm1, - int order); + uint32_t *d_nonces1, uint32_t *nrm1); -extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); -extern void cuda_check_quarkcoin(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order, uint32_t *foundnonces); +extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash); +extern void cuda_check_quarkcoin(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, uint32_t *foundnonces); /* CPU Hash */ extern "C" void animehash(void *state, const void *input) @@ -151,7 +148,7 @@ struct HashPredicate __device__ bool operator()(const uint32_t x) { - uint32_t *hash = &m_hashes[(x - m_startNonce)*16]; + uint32_t *const Hash = &m_hashes[(x - m_startNonce)*16]; return hash[0] & 0x8; } @@ -160,25 +157,26 @@ struct HashPredicate }; */ -static bool init[MAX_GPUS] = 
{ 0 }; +static volatile bool init[MAX_GPUS] = { false }; -extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_anime(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8 - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughput = device_intensity(device_map[thr_id], __func__, 1 << 20); // 256*256*8 + throughput = min(throughput, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00ff; + ptarget[7] = 0x00ff; if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); @@ -188,7 +186,6 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, cuda_check_cpu_init(thr_id, throughput); quark_compactTest_cpu_init(thr_id, throughput); - CUDA_SAFE_CALL(cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput)); CUDA_SAFE_CALL(cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput)); CUDA_SAFE_CALL(cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput)); CUDA_SAFE_CALL(cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput)); @@ -198,92 +195,95 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - quark_bmw512_cpu_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + quark_bmw512_cpu_setBlock_80(thr_id, 
(void*)endiandata); + cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; uint32_t nrm1=0, nrm2=0, nrm3=0; // erstes BMW512 Hash mit CUDA - quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); // das ist der unbedingte Branch für Blake512 - quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id]); quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, - d_branch3Nonces[thr_id], &nrm3, - order++); + d_branch3Nonces[thr_id], &nrm3); // nur den Skein Branch weiterverfolgen - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id]); // das ist der unbedingte Branch für Groestl512 - quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id]); // das ist der unbedingte Branch für JH512 - quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id]); // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + d_branch2Nonces[thr_id], &nrm2); // das ist der bedingte Branch für Blake512 - quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id]); // das ist der bedingte 
Branch für Bmw512 - quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id]); // das ist der unbedingte Branch für Keccak512 - quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id]); // das ist der unbedingte Branch für Skein512 - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id]); // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + d_branch2Nonces[thr_id], &nrm2); - quark_keccak512_cpu_hash_64_final(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_jh512_cpu_hash_64_final(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64_final(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id]); + quark_jh512_cpu_hash_64_final(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id]); uint32_t foundnonces[2]; - cuda_check_quarkcoin(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++, foundnonces); - if (foundnonces[0] != 0xffffffff) + cuda_check_quarkcoin(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], foundnonces); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundnonces[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; -/* uint32_t vhash64[8]; + uint32_t vhash64[8]; be32enc(&endiandata[19], foundnonces[0]); animehash(vhash64, 
endiandata); if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) -*/ { + { int res = 1; *hashes_done = pdata[19] - first_nonce + throughput; // check if there was some other ones... if (foundnonces[1] != 0xffffffff) { - pdata[21] = foundnonces[1]; - res++; - if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found second nonce $%08X", thr_id, foundnonces[1]); + be32enc(&endiandata[19], foundnonces[1]); + animehash(vhash64, endiandata); + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = foundnonces[1]; + res++; + if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found second nonce $%08X", device_map[thr_id], foundnonces[1]); + } } pdata[19] = foundnonces[0]; - if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found nonce $%08X", thr_id, foundnonces[0]); + if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found nonce $%08X", device_map[thr_id], foundnonces[0]); return res; } -/* else + else { if (vhash64[7] != Htarg) // don't show message if it is equal but fails fulltest - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundnonces[0]); + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundnonces[0]); } -*/ } - pdata[19] += throughput; + } + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); *hashes_done = pdata[19] - first_nonce + 1; diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu index a425e9667d..1640630f4d 100644 --- a/quark/cuda_bmw512.cu +++ b/quark/cuda_bmw512.cu @@ -2,13 +2,8 @@ #include #include "cuda_helper.h" +#include "cuda_vector.h" - -// die Message it Padding zur Berechnung auf der GPU -__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) - -//#define SHL(x, n) ((x) << (n)) -//#define SHR(x, n) ((x) >> (n)) #define SHR(x, n) SHR2(x, n) #define SHL(x, n) SHL2(x, n) @@ -21,185 +16,78 @@ 
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + paddi q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \ q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) -__device__ void Compression512_64_first(uint2 *msg, uint2 *hash) -{ - // Compression ref. implementation - uint2 q[32]; - uint2 tmp; - - tmp = (msg[5] ^ hash[5]) - (msg[7] ^ hash[7]) + (hash[10]) + (hash[13]) + (hash[14]); - q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[1]; - tmp = (msg[6] ^ hash[6]) - (msg[8] ^ hash[8]) + (hash[11]) + (hash[14]) - (msg[15] ^ hash[15]); - q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[2]; - tmp = (msg[0] ^ hash[0]) + (msg[7] ^ hash[7]) + (hash[9]) - (hash[12]) + (msg[15] ^ hash[15]); - q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[3]; - tmp = (msg[0] ^ hash[0]) - (msg[1] ^ hash[1]) + (msg[8] ^ hash[8]) - (hash[10]) + (hash[13]); - q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[4]; - tmp = (msg[1] ^ hash[1]) + (msg[2] ^ hash[2]) + (hash[9]) - (hash[11]) - (hash[14]); - q[4] = (SHR(tmp, 1) ^ tmp) + hash[5]; - tmp = (msg[3] ^ hash[3]) - (msg[2] ^ hash[2]) + (hash[10]) - (hash[12]) + (msg[15] ^ hash[15]); - q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[6]; - tmp = (msg[4] ^ hash[4]) - (msg[0] ^ hash[0]) - (msg[3] ^ hash[3]) - (hash[11]) + (hash[13]); - q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[7]; - tmp = (msg[1] ^ hash[1]) - (msg[4] ^ hash[4]) - (msg[5] ^ hash[5]) - (hash[12]) - (hash[14]); - q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[8]; - tmp = (msg[2] ^ hash[2]) - (msg[5] ^ hash[5]) - (msg[6] ^ hash[6]) + (hash[13]) - (msg[15] ^ hash[15]); - q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[9]; - tmp = (msg[0] ^ hash[0]) - (msg[3] ^ hash[3]) + (msg[6] 
^ hash[6]) - (msg[7] ^ hash[7]) + (hash[14]); - q[9] = (SHR(tmp, 1) ^ tmp) + hash[10]; - tmp = (msg[8] ^ hash[8]) - (msg[1] ^ hash[1]) - (msg[4] ^ hash[4]) - (msg[7] ^ hash[7]) + (msg[15] ^ hash[15]); - q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[11]; - tmp = (msg[8] ^ hash[8]) - (msg[0] ^ hash[0]) - (msg[2] ^ hash[2]) - (msg[5] ^ hash[5]) + (hash[9]); - q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[12]; - tmp = (msg[1] ^ hash[1]) + (msg[3] ^ hash[3]) - (msg[6] ^ hash[6]) - (hash[9]) + (hash[10]); - q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[13]; - tmp = (msg[2] ^ hash[2]) + (msg[4] ^ hash[4]) + (msg[7] ^ hash[7]) + (hash[10]) + (hash[11]); - q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[14]; - tmp = (msg[3] ^ hash[3]) - (msg[5] ^ hash[5]) + (msg[8] ^ hash[8]) - (hash[11]) - (hash[12]); - q[14] = (SHR(tmp, 1) ^ tmp) + hash[15]; - tmp = (msg[12] ^ hash[12]) - (msg[4] ^ hash[4]) - (msg[6] ^ hash[6]) - (hash[9]) + (hash[13]); - q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[0]; - - q[0 + 16] = - (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROTL64(q[0], 13) ^ ROTL64(q[0], 43)) + - (SHR(q[0 + 1], 2) ^ SHL(q[0 + 1], 1) ^ ROTL64(q[0 + 1], 19) ^ ROTL64(q[0 + 1], 53)) + - (SHR(q[0 + 2], 2) ^ SHL(q[0 + 2], 2) ^ ROTL64(q[0 + 2], 28) ^ ROTL64(q[0 + 2], 59)) + - (SHR(q[0 + 3], 1) ^ SHL(q[0 + 3], 3) ^ ROTL64(q[0 + 3], 4) ^ ROTL64(q[0 + 3], 37)) + - (SHR(q[0 + 4], 1) ^ SHL(q[0 + 4], 2) ^ ROTL64(q[0 + 4], 13) ^ ROTL64(q[0 + 4], 43)) + - (SHR(q[0 + 5], 2) ^ SHL(q[0 + 5], 1) ^ ROTL64(q[0 + 5], 19) ^ ROTL64(q[0 + 5], 53)) + - (SHR(q[0 + 6], 2) ^ SHL(q[0 + 6], 2) ^ ROTL64(q[0 + 6], 28) ^ ROTL64(q[0 + 6], 59)) + - (SHR(q[0 + 7], 1) ^ SHL(q[0 + 7], 3) ^ ROTL64(q[0 + 7], 4) ^ ROTL64(q[0 + 7], 37)) + - (SHR(q[0 + 8], 1) ^ SHL(q[0 + 8], 2) ^ ROTL64(q[0 + 8], 13) ^ ROTL64(q[0 + 8], 43)) + - (SHR(q[0 + 9], 2) ^ SHL(q[0 + 9], 1) ^ ROTL64(q[0 
+ 9], 19) ^ ROTL64(q[0 + 9], 53)) + - (SHR(q[0 + 10], 2) ^ SHL(q[0 + 10], 2) ^ ROTL64(q[0 + 10], 28) ^ ROTL64(q[0 + 10], 59)) + - (SHR(q[0 + 11], 1) ^ SHL(q[0 + 11], 3) ^ ROTL64(q[0 + 11], 4) ^ ROTL64(q[0 + 11], 37)) + - (SHR(q[0 + 12], 1) ^ SHL(q[0 + 12], 2) ^ ROTL64(q[0 + 12], 13) ^ ROTL64(q[0 + 12], 43)) + - (SHR(q[0 + 13], 2) ^ SHL(q[0 + 13], 1) ^ ROTL64(q[0 + 13], 19) ^ ROTL64(q[0 + 13], 53)) + - (SHR(q[0 + 14], 2) ^ SHL(q[0 + 14], 2) ^ ROTL64(q[0 + 14], 28) ^ ROTL64(q[0 + 14], 59)) + - (SHR(q[0 + 15], 1) ^ SHL(q[0 + 15], 3) ^ ROTL64(q[0 + 15], 4) ^ ROTL64(q[0 + 15], 37)) + - ((make_uint2(0x55555550ul,0x55555555) + ROTL64(msg[0], 0 + 1) + - ROTL64(msg[0 + 3], 0 + 4)) ^ hash[0 + 7]); - q[1 + 16] = - (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROTL64(q[1], 13) ^ ROTL64(q[1], 43)) + - (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROTL64(q[1 + 1], 19) ^ ROTL64(q[1 + 1], 53)) + - (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROTL64(q[1 + 2], 28) ^ ROTL64(q[1 + 2], 59)) + - (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROTL64(q[1 + 3], 4) ^ ROTL64(q[1 + 3], 37)) + - (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROTL64(q[1 + 4], 13) ^ ROTL64(q[1 + 4], 43)) + - (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROTL64(q[1 + 5], 19) ^ ROTL64(q[1 + 5], 53)) + - (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROTL64(q[1 + 6], 28) ^ ROTL64(q[1 + 6], 59)) + - (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROTL64(q[1 + 7], 4) ^ ROTL64(q[1 + 7], 37)) + - (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROTL64(q[1 + 8], 13) ^ ROTL64(q[1 + 8], 43)) + - (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROTL64(q[1 + 9], 19) ^ ROTL64(q[1 + 9], 53)) + - (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROTL64(q[1 + 10], 28) ^ ROTL64(q[1 + 10], 59)) + - (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROTL64(q[1 + 11], 4) ^ ROTL64(q[1 + 11], 37)) + - (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROTL64(q[1 + 12], 13) ^ ROTL64(q[1 + 12], 43)) + - (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROTL64(q[1 + 13], 19) ^ ROTL64(q[1 + 13], 53)) + - (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROTL64(q[1 + 
14], 28) ^ ROTL64(q[1 + 14], 59)) + - (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROTL64(q[1 + 15], 4) ^ ROTL64(q[1 + 15], 37)) + - ((make_uint2(0xAAAAAAA5, 0x5AAAAAAA) + ROTL64(msg[1], 1 + 1) + - ROTL64(msg[1 + 3], 1 + 4)) ^ hash[1 + 7]); - - q[2 + 16] = CONST_EXP2(2) + - ((make_uint2(0xFFFFFFFA, 0x5FFFFFFF) + ROTL64(msg[2], 2 + 1) + - ROTL64(msg[2 + 3], 2 + 4) - ROTL64(msg[2 + 10], 2 + 11)) ^ hash[2 + 7]); - q[3 + 16] = CONST_EXP2(3) + - ((make_uint2(0x5555554F, 0x65555555) + ROTL64(msg[3], 3 + 1) + - ROTL64(msg[3 + 3], 3 + 4) - ROTL64(msg[3 + 10], 3 + 11)) ^ hash[3 + 7]); - q[4 + 16] = CONST_EXP2(4) + - ((make_uint2(0xAAAAAAA4, 0x6AAAAAAA) +ROTL64(msg[4], 4 + 1) + - ROTL64(msg[4 + 3], 4 + 4) - ROTL64(msg[4 + 10], 4 + 11)) ^ hash[4 + 7]); - q[5 + 16] = CONST_EXP2(5) + - ((make_uint2(0xFFFFFFF9, 0x6FFFFFFF) + ROTL64(msg[5], 5 + 1) + - ROTL64(msg[5 + 3], 5 + 4) - ROTL64(msg[5 + 10], 5 + 11)) ^ hash[5 + 7]); - - -#pragma unroll 3 - for (int i = 6; i<9; i++) { - q[i + 16] = CONST_EXP2(i) + - ((vectorize((i + 16)*(0x0555555555555555ull)) + ROTL64(msg[i], i + 1) - - ROTL64(msg[i - 6], (i - 6) + 1)) ^ hash[i + 7]); - } - -#pragma unroll 4 - for (int i = 9; i<13; i++) { - q[i + 16] = CONST_EXP2(i) + - ((vectorize((i + 16)*(0x0555555555555555ull)) + - ROTL64(msg[i + 3], i + 4) - ROTL64(msg[i - 6], (i - 6) + 1)) ^ hash[i - 9]); - } - - q[13 + 16] = CONST_EXP2(13) + - ((make_uint2(0xAAAAAAA1, 0x9AAAAAAA) + ROTL64(msg[13], 13 + 1) + - ROTL64(msg[13 - 13], (13 - 13) + 1) - ROTL64(msg[13 - 6], (13 - 6) + 1)) ^ hash[13 - 9]); - q[14 + 16] = CONST_EXP2(14) + - ((make_uint2(0xFFFFFFF6, 0x9FFFFFFF) + ROTL64(msg[14], 14 + 1) + - ROTL64(msg[14 - 13], (14 - 13) + 1) - ROTL64(msg[14 - 6], (14 - 6) + 1)) ^ hash[14 - 9]); - q[15 + 16] = CONST_EXP2(15) + - ((make_uint2(0x5555554B, 0xA5555555) + ROTL64(msg[15], 15 + 1) + - ROTL64(msg[15 - 13], (15 - 13) + 1) - ROTL64(msg[15 - 6], (15 - 6) + 1)) ^ hash[15 - 9]); +#define CONST_EXP3(i) ROTL64(q[i+1], 5) + ROTL64(q[i+3], 11) + \ + 
ROTL64(q[i+5], 27) + SWAPDWORDS2(q[i+7]) + \ + ROTL64(q[i+9], 37) + ROTL64(q[i+11], 43) + \ + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) +__device__ __forceinline__ void Compression512(const uint2 *msg, uint2 *hash) +{ + const uint2 precalc[16] = + { + { 0x55555550, 0x55555555 }, + { 0xAAAAAAA5, 0x5AAAAAAA }, + { 0xFFFFFFFA, 0x5FFFFFFF }, + { 0x5555554F, 0x65555555 }, + { 0xAAAAAAA4, 0x6AAAAAAA }, + { 0xFFFFFFF9, 0x6FFFFFFF }, + { 0x5555554E, 0x75555555 }, + { 0xAAAAAAA3, 0x7AAAAAAA }, + { 0xFFFFFFF8, 0x7FFFFFFF }, + { 0x5555554D, 0x85555555 }, + { 0xAAAAAAA2, 0x8AAAAAAA }, + { 0xFFFFFFF7, 0x8FFFFFFF }, + { 0x5555554C, 0x95555555 }, + { 0xAAAAAAA1, 0x9AAAAAAA }, + { 0xFFFFFFF6, 0x9FFFFFFF }, + { 0x5555554B, 0xA5555555 }, + }; - uint2 XL64 = q[16] ^ q[17] ^ q[18] ^ q[19] ^ q[20] ^ q[21] ^ q[22] ^ q[23]; - uint2 XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31]; - hash[0] = (SHL(XH64, 5) ^ SHR(q[16], 5) ^ msg[0]) + (XL64 ^ q[24] ^ q[0]); - hash[1] = (SHR(XH64, 7) ^ SHL(q[17], 8) ^ msg[1]) + (XL64 ^ q[25] ^ q[1]); - hash[2] = (SHR(XH64, 5) ^ SHL(q[18], 5) ^ msg[2]) + (XL64 ^ q[26] ^ q[2]); - hash[3] = (SHR(XH64, 1) ^ SHL(q[19], 5) ^ msg[3]) + (XL64 ^ q[27] ^ q[3]); - hash[4] = (SHR(XH64, 3) ^ q[20] ^ msg[4]) + (XL64 ^ q[28] ^ q[4]); - hash[5] = (SHL(XH64, 6) ^ SHR(q[21], 6) ^ msg[5]) + (XL64 ^ q[29] ^ q[5]); - hash[6] = (SHR(XH64, 4) ^ SHL(q[22], 6) ^ msg[6]) + (XL64 ^ q[30] ^ q[6]); - hash[7] = (SHR(XH64, 11) ^ SHL(q[23], 2) ^ msg[7]) + (XL64 ^ q[31] ^ q[7]); - - hash[8] = ROTL64(hash[4], 9) + (XH64 ^ q[24] ^ msg[8]) + (SHL(XL64, 8) ^ q[23] ^ q[8]); - hash[9] = ROTL64(hash[5], 10) + (XH64 ^ q[25]) + (SHR(XL64, 6) ^ q[16] ^ q[9]); - hash[10] = ROTL64(hash[6], 11) + (XH64 ^ q[26]) + (SHL(XL64, 6) ^ q[17] ^ q[10]); - hash[11] = ROTL64(hash[7], 12) + (XH64 ^ q[27]) + (SHL(XL64, 4) ^ q[18] ^ q[11]); - hash[12] = ROTL64(hash[0], 13) + (XH64 ^ q[28]) + (SHR(XL64, 3) ^ q[19] ^ q[12]); - hash[13] = ROTL64(hash[1], 14) 
+ (XH64 ^ q[29]) + (SHR(XL64, 4) ^ q[20] ^ q[13]); - hash[14] = ROTL64(hash[2], 15) + (XH64 ^ q[30]) + (SHR(XL64, 7) ^ q[21] ^ q[14]); - hash[15] = ROTL64(hash[3], 16) + (XH64 ^ q[31] ^ msg[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); -} - -__device__ void Compression512(uint2 *msg, uint2 *hash) -{ - // Compression ref. implementation + // Compression ref. implementation uint2 q[32]; uint2 tmp; +// const uint2 pre = (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]); + const uint2 pre2 = (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); + const uint2 pre3 = (msg[14] ^ hash[14]) - (msg[7] ^ hash[7]); + const uint2 pre4 = (msg[6] ^ hash[6]) + (msg[9] ^ hash[9]); + const uint2 pre5 = (msg[8] ^ hash[8]) - (msg[5] ^ hash[5]); + const uint2 pre6 = (msg[1] ^ hash[1]) - (msg[14] ^ hash[14]); + const uint2 pre7 = (msg[8] ^ hash[8]) - (msg[1] ^ hash[1]); - tmp = (msg[ 5] ^ hash[ 5]) - (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]) + (msg[14] ^ hash[14]); + tmp = (msg[5] ^ hash[5]) + (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]) + pre3; q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[1]; - tmp = (msg[ 6] ^ hash[ 6]) - (msg[ 8] ^ hash[ 8]) + (msg[11] ^ hash[11]) + (msg[14] ^ hash[14]) - (msg[15] ^ hash[15]); + tmp = (msg[6] ^ hash[6]) - (msg[8] ^ hash[8]) + pre2 - (msg[15] ^ hash[15]); q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[2]; - tmp = (msg[ 0] ^ hash[ 0]) + (msg[ 7] ^ hash[ 7]) + (msg[ 9] ^ hash[ 9]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); + tmp = (msg[ 0] ^ hash[ 0]) + (msg[ 7] ^ hash[ 7]) + (msg[ 9] ^ hash[ 9]) - pre2; q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[3]; - tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 1] ^ hash[ 1]) + (msg[ 8] ^ hash[ 8]) - (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]); + tmp = (msg[0] ^ hash[0]) + pre7 - (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]); q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[4]; - tmp = (msg[ 1] ^ 
hash[ 1]) + (msg[ 2] ^ hash[ 2]) + (msg[ 9] ^ hash[ 9]) - (msg[11] ^ hash[11]) - (msg[14] ^ hash[14]); + tmp = pre6 + (msg[2] ^ hash[2]) + (msg[9] ^ hash[9]) - (msg[11] ^ hash[11]); q[4] = (SHR(tmp, 1) ^ tmp) + hash[5]; - tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 2] ^ hash[ 2]) + (msg[10] ^ hash[10]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); + tmp = (msg[3] ^ hash[3]) - (msg[2] ^ hash[2]) + (msg[10] ^ hash[10]) - pre2; q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[6]; tmp = (msg[ 4] ^ hash[ 4]) - (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) - (msg[11] ^ hash[11]) + (msg[13] ^ hash[13]); q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[7]; - tmp = (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 5] ^ hash[ 5]) - (msg[12] ^ hash[12]) - (msg[14] ^ hash[14]); + tmp = pre6 - (msg[ 4] ^ hash[ 4]) - (msg[ 5] ^ hash[ 5]) - (msg[12] ^ hash[12]); q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[8]; tmp = (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) - (msg[ 6] ^ hash[ 6]) + (msg[13] ^ hash[13]) - (msg[15] ^ hash[15]); q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[9]; - tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) + (msg[ 6] ^ hash[ 6]) - (msg[ 7] ^ hash[ 7]) + (msg[14] ^ hash[14]); + tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) + (msg[ 6] ^ hash[ 6])+pre3; q[9] = (SHR(tmp, 1) ^ tmp) + hash[10]; - tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 7] ^ hash[ 7]) + (msg[15] ^ hash[15]); + tmp = pre7 - (msg[4] ^ hash[4]) - (msg[7] ^ hash[7]) + (msg[15] ^ hash[15]); q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[11]; - tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 0] ^ hash[ 0]) - (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) + (msg[ 9] ^ hash[ 9]); + tmp = pre5 - (msg[ 0] ^ hash[ 0]) - (msg[ 2] ^ hash[ 2]) + (msg[ 9] ^ hash[ 9]); q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + 
hash[12]; - tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 3] ^ hash[ 3]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[10] ^ hash[10]); + tmp = (msg[1] ^ hash[1]) + (msg[3] ^ hash[3]) - pre4 + (msg[10] ^ hash[10]); q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[13]; tmp = (msg[ 2] ^ hash[ 2]) + (msg[ 4] ^ hash[ 4]) + (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[11] ^ hash[11]); q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[14]; - tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 5] ^ hash[ 5]) + (msg[ 8] ^ hash[ 8]) - (msg[11] ^ hash[11]) - (msg[12] ^ hash[12]); + tmp = (msg[ 3] ^ hash[ 3]) +pre5 - (msg[11] ^ hash[11]) - (msg[12] ^ hash[12]); q[14] = (SHR(tmp, 1) ^ tmp) + hash[15]; - tmp = (msg[12] ^ hash[12]) - (msg[ 4] ^ hash[ 4]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[13] ^ hash[13]); + tmp = (msg[12] ^ hash[12]) - (msg[4] ^ hash[4]) - pre4 + (msg[13] ^ hash[13]); q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[0]; - q[0+16] = (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROTL64(q[0], 13) ^ ROTL64(q[0], 43)) + (SHR(q[0+1], 2) ^ SHL(q[0+1], 1) ^ ROTL64(q[0+1], 19) ^ ROTL64(q[0+1], 53)) + @@ -217,7 +105,7 @@ __device__ void Compression512(uint2 *msg, uint2 *hash) (SHR(q[0+13], 2) ^ SHL(q[0+13], 1) ^ ROTL64(q[0+13], 19) ^ ROTL64(q[0+13], 53)) + (SHR(q[0+14], 2) ^ SHL(q[0+14], 2) ^ ROTL64(q[0+14], 28) ^ ROTL64(q[0+14], 59)) + (SHR(q[0+15], 1) ^ SHL(q[0+15], 3) ^ ROTL64(q[0+15], 4) ^ ROTL64(q[0+15], 37)) + - ((make_uint2(0x55555550ul, 0x55555555) + ROTL64(msg[0], 0 + 1) + + ((precalc[0] + ROTL64(msg[0], 0 + 1) + ROTL64(msg[0+3], 0+4) - ROTL64(msg[0+10], 0+11) ) ^ hash[0+7]); q[1 + 16] = (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROTL64(q[1], 13) ^ ROTL64(q[1], 43)) + @@ -236,55 +124,56 @@ __device__ void Compression512(uint2 *msg, uint2 *hash) (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROTL64(q[1 + 13], 19) ^ ROTL64(q[1 + 13], 53)) + (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROTL64(q[1 + 14], 28) ^ 
ROTL64(q[1 + 14], 59)) + (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROTL64(q[1 + 15], 4) ^ ROTL64(q[1 + 15], 37)) + - ((make_uint2(0xAAAAAAA5, 0x5AAAAAAA) + ROTL64(msg[1], 1 + 1) + + ((precalc[1] + ROTL64(msg[1], 1 + 1) + ROTL64(msg[1 + 3], 1 + 4) - ROTL64(msg[1 + 10], 1 + 11)) ^ hash[1 + 7]); q[2 + 16] = CONST_EXP2(2) + - ((make_uint2(0xFFFFFFFA, 0x5FFFFFFF) + ROTL64(msg[2], 2 + 1) + + ((precalc[2] + ROTL64(msg[2], 2 + 1) + ROTL64(msg[2+3], 2+4) - ROTL64(msg[2+10], 2+11) ) ^ hash[2+7]); q[3 + 16] = CONST_EXP2(3) + - ((make_uint2(0x5555554F, 0x65555555) + ROTL64(msg[3], 3 + 1) + + ((precalc[3] + ROTL64(msg[3], 3 + 1) + ROTL64(msg[3 + 3], 3 + 4) - ROTL64(msg[3 + 10], 3 + 11)) ^ hash[3 + 7]); q[4 + 16] = CONST_EXP2(4) + - ((make_uint2(0xAAAAAAA4, 0x6AAAAAAA) + ROTL64(msg[4], 4 + 1) + - ROTL64(msg[4 + 3], 4 + 4) - ROTL64(msg[4 + 10], 4 + 11)) ^ hash[4 + 7]); + ((precalc[4] + ROTL64(msg[4], 4 + 1) + + ROL8(msg[4 + 3]) - ROTL64(msg[4 + 10], 4 + 11)) ^ hash[4 + 7]); q[5 + 16] = CONST_EXP2(5) + - ((make_uint2(0xFFFFFFF9, 0x6FFFFFFF) + ROTL64(msg[5], 5 + 1) + - ROTL64(msg[5 + 3], 5 + 4) - ROTL64(msg[5 + 10], 5 + 11)) ^ hash[5 + 7]); + ((precalc[5] + ROTL64(msg[5], 5 + 1) + + ROTL64(msg[5 + 3], 5 + 4) - ROL16(msg[5 + 10])) ^ hash[5 + 7]); + q[6 + 16] = CONST_EXP2(6) + - ((make_uint2(0x5555554E, 0x75555555)+ ROTL64(msg[6], 6 + 1) + + ((precalc[6]+ ROTL64(msg[6], 6 + 1) + ROTL64(msg[6 + 3], 6 + 4) - ROTL64(msg[6 - 6], (6 - 6) + 1)) ^ hash[6 + 7]); q[7 + 16] = CONST_EXP2(7) + - ((make_uint2(0xAAAAAAA3, 0x7AAAAAAA) + ROTL64(msg[7], 7 + 1) + + ((precalc[7] + ROL8(msg[7]) + ROTL64(msg[7 + 3], 7 + 4) - ROTL64(msg[7 - 6], (7 - 6) + 1)) ^ hash[7 + 7]); q[8 + 16] = CONST_EXP2(8) + - ((make_uint2(0xFFFFFFF8, 0x7FFFFFFF) + ROTL64(msg[8], 8 + 1) + + ((precalc[8] + ROTL64(msg[8], 8 + 1) + ROTL64(msg[8 + 3], 8 + 4) - ROTL64(msg[8 - 6], (8 - 6) + 1)) ^ hash[8 + 7]); q[9 + 16] = CONST_EXP2(9) + - ((make_uint2(0x5555554D, 0x85555555) + ROTL64(msg[9], 9 + 1) + + ((precalc[9] + ROTL64(msg[9], 9 
+ 1) + ROTL64(msg[9 + 3], 9 + 4) - ROTL64(msg[9 - 6], (9 - 6) + 1)) ^ hash[9 - 9]); q[10 + 16] = CONST_EXP2(10) + - ((make_uint2(0xAAAAAAA2, 0x8AAAAAAA) + ROTL64(msg[10], 10 + 1) + + ((precalc[10] + ROTL64(msg[10], 10 + 1) + ROTL64(msg[10 + 3], 10 + 4) - ROTL64(msg[10 - 6], (10 - 6) + 1)) ^ hash[10 - 9]); q[11 + 16] = CONST_EXP2(11) + - ((make_uint2(0xFFFFFFF7, 0x8FFFFFFF) + ROTL64(msg[11], 11 + 1) + + ((precalc[11] + ROTL64(msg[11], 11 + 1) + ROTL64(msg[11 + 3], 11 + 4) - ROTL64(msg[11 - 6], (11 - 6) + 1)) ^ hash[11 - 9]); q[12 + 16] = CONST_EXP2(12) + - ((make_uint2(0x5555554C, 0x95555555) + ROTL64(msg[12], 12 + 1) + - ROTL64(msg[12 + 3], 12 + 4) - ROTL64(msg[12 - 6], (12 - 6) + 1)) ^ hash[12 - 9]); + ((precalc[12] + ROTL64(msg[12], 12 + 1) + + ROL16(msg[12 + 3]) - ROTL64(msg[12 - 6], (12 - 6) + 1)) ^ hash[12 - 9]); q[13 + 16] = CONST_EXP2(13) + - ((make_uint2(0xAAAAAAA1, 0x9AAAAAAA) + ROTL64(msg[13], 13 + 1) + - ROTL64(msg[13 - 13], (13 - 13) + 1) - ROTL64(msg[13 - 6], (13 - 6) + 1)) ^ hash[13 - 9]); + ((precalc[13] + ROTL64(msg[13], 13 + 1) + + ROTL64(msg[13 - 13], (13 - 13) + 1) - ROL8(msg[13 - 6])) ^ hash[13 - 9]); q[14 + 16] = CONST_EXP2(14) + - ((make_uint2(0xFFFFFFF6, 0x9FFFFFFF) + ROTL64(msg[14], 14 + 1) + + ((precalc[14] + ROTL64(msg[14], 14 + 1) + ROTL64(msg[14 - 13], (14 - 13) + 1) - ROTL64(msg[14 - 6], (14 - 6) + 1)) ^ hash[14 - 9]); q[15 + 16] = CONST_EXP2(15) + - ((make_uint2(0x5555554B, 0xA5555555) + ROTL64(msg[15], 15 + 1) + + ((precalc[15] + ROL16(msg[15]) + ROTL64(msg[15 - 13], (15 - 13) + 1) - ROTL64(msg[15 - 6], (15 - 6) + 1)) ^ hash[15 - 9]); uint2 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; @@ -306,27 +195,22 @@ __device__ void Compression512(uint2 *msg, uint2 *hash) hash[12] = ROTL64(hash[0],13) + ( XH64 ^ q[28] ^ msg[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); hash[13] = ROTL64(hash[1],14) + ( XH64 ^ q[29] ^ msg[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); hash[14] = ROTL64(hash[2],15) + ( XH64 ^ q[30] ^ msg[14]) + (SHR(XL64,7) ^ q[21] ^ 
q[14]); - hash[15] = ROTL64(hash[3],16) + (XH64 ^ q[31] ^ msg[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); + hash[15] = ROL16(hash[3]) + (XH64 ^ q[31] ^ msg[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); } - -__global__ -#if __CUDA_ARCH__ > 500 -__launch_bounds__(32, 16) -#else -__launch_bounds__(64, 8) -#endif -void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ __launch_bounds__(32, 16) +void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + + const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[8 * hashPosition]; + const uint32_t hashPosition = nounce - startNounce; + uint64_t *const inpHash = &g_hash[8 * hashPosition]; - // Init - uint2 h[16] = { + const uint2 hash[16] = + { { 0x84858687UL, 0x80818283UL }, { 0x8C8D8E8FUL, 0x88898A8BUL }, { 0x94959697UL, 0x90919293UL }, @@ -335,8 +219,8 @@ void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * { 0xACADAEAFUL, 0xA8A9AAABUL }, { 0xB4B5B6B7UL, 0xB0B1B2B3UL }, { 0xBCBDBEBFUL, 0xB8B9BABBUL }, - { 0xC4C5C6C7UL, 0xC0C1C2C3UL, }, - { 0xCCCDCECFUL, 0xC8C9CACBUL, }, + { 0xC4C5C6C7UL, 0xC0C1C2C3UL }, + { 0xCCCDCECFUL, 0xC8C9CACBUL }, { 0xD4D5D6D7UL, 0xD0D1D2D3UL }, { 0xDCDDDEDFUL, 0xD8D9DADBUL }, { 0xE4E5E6E7UL, 0xE0E1E2E3UL }, @@ -344,52 +228,470 @@ void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * { 0xF4F5F6F7UL, 0xF0F1F2F3UL }, { 0xFCFDFEFFUL, 0xF8F9FAFBUL } }; - // Nachricht kopieren (Achtung, die Nachricht hat 
64 Byte, - // BMW arbeitet mit 128 Byte!!! - uint2 message[16]; + + const uint64_t hash2[16] = + { + 0x8081828384858687, + 0x88898A8B8C8D8E8F, + 0x9091929394959697, + 0x98999A9B9C9D9E9F, + 0xA0A1A2A3A4A5A6A7, + 0xA8A9AAABACADAEAF, + 0xB0B1B2B3B4B5B6B7, + 0xB8B9BABBBCBDBEBF, + 0xC0C1C2C3C4C5C6C7^0x80, + 0xC8C9CACBCCCDCECF, + 0xD0D1D2D3D4D5D6D7, + 0xD8D9DADBDCDDDEDF, + 0xE0E1E2E3E4E5E6E7, + 0xE8E9EAEBECEDEEEF, + 0xF0F1F2F3F4F5F6F7, + 0xF8F9FAFBFCFDFEFF + }; + + uint64_t msg[16]; + uint2 msg2[16]; + uint64_t mxh[8]; + uint2 h[16]; + + uint28 *phash = (uint28*)inpHash; + uint28 *outpt = (uint28*)msg2; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + #pragma unroll 8 - for(int i=0;i<8;i++) - message[i] = vectorize(inpHash[i]); -#pragma unroll 6 - for(int i=9;i<15;i++) - message[i] = make_uint2(0,0); + for (int i = 0; i < 8; i++) + { + msg[i] = devectorize(msg2[i]); + } + + + mxh[0] = msg[0] ^ hash2[0]; + mxh[1] = msg[1] ^ hash2[1]; + mxh[2] = msg[2] ^ hash2[2]; + mxh[3] = msg[3] ^ hash2[3]; + mxh[4] = msg[4] ^ hash2[4]; + mxh[5] = msg[5] ^ hash2[5]; + mxh[6] = msg[6] ^ hash2[6]; + mxh[7] = msg[7] ^ hash2[7]; + + const uint2 precalcf[9] = + { + { 0x55555550ul, 0x55555555 }, + { 0xAAAAAAA5, 0x5AAAAAAA }, + { 0xFFFFFFFA, 0x5FFFFFFF }, + { 0x5555554F, 0x65555555 }, + { 0xAAAAAAA4, 0x6AAAAAAA }, + { 0xFE00FFF9, 0x6FFFFFFF }, + { 0xAAAAAAA1, 0x9AAAAAAA }, + { 0xFFFEFFF6, 0x9FFFFFFF }, + { 0x5755554B, 0xA5555555 }, + }; - // Padding einfügen (Byteorder?!?) - message[8] = make_uint2(0x80,0); - // Länge (in Bits, d.h. 
64 Byte * 8 = 512 Bits - message[15] = make_uint2(512,0); + uint2 q[32]; - // Compression 1 - Compression512_64_first(message, h); + uint2 tmp; + tmp = vectorize((mxh[5]) - (mxh[7]) + (hash2[10] + hash2[13] + hash2[14])); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[1]; + tmp = vectorize((mxh[6]) + (hash2[11] + hash2[14] - (512 ^ hash2[15]) - hash2[8])); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[2]; + tmp = vectorize((mxh[0] + mxh[7]) + hash2[9] - hash2[12] + (512 ^ hash2[15])); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[3]; + tmp = vectorize((mxh[0] - mxh[1]) + hash2[8] - hash2[10] + hash2[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[4]; + tmp = vectorize((mxh[1] + mxh[2]) + hash2[9] - hash2[11] - hash2[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + hash[5]; + tmp = vectorize((mxh[3] - mxh[2] + hash2[10] - hash2[12] + (512 ^ hash2[15]))); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[6]; + tmp = vectorize((mxh[4]) - (mxh[0]) - (mxh[3]) + hash2[13] - hash2[11]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[7]; + tmp = vectorize((mxh[1]) - (mxh[4]) - (mxh[5]) - hash2[12] - hash2[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[8]; + tmp = vectorize((mxh[2]) - (mxh[5]) - (mxh[6]) + hash2[13] - (512 ^ hash2[15])); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[9]; + tmp = vectorize((mxh[0]) - (mxh[3]) + (mxh[6]) - (mxh[7]) + (hash2[14])); + q[9] = (SHR(tmp, 1) ^ tmp) + hash[10]; + tmp = vectorize((512 ^ hash2[15]) + hash2[8] - (mxh[1]) - (mxh[4]) - (mxh[7])); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[11]; + tmp = vectorize(hash2[9] + hash2[8] - (mxh[0]) - (mxh[2]) - (mxh[5])); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + 
hash[12]; + tmp = vectorize((mxh[1]) + (mxh[3]) - (mxh[6]) + hash2[10] - hash2[9]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[13]; + tmp = vectorize((mxh[2]) + (mxh[4]) + (mxh[7]) + hash2[10] + hash2[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[14]; + tmp = vectorize((mxh[3]) - (mxh[5]) + hash2[8] - hash2[11] - hash2[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + hash[15]; + tmp = vectorize(hash2[12] - hash2[9] + hash2[13] - (mxh[4]) - (mxh[6])); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[0]; + + q[0 + 16] = + (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROTL64(q[0], 13) ^ ROTL64(q[0], 43)) + + (SHR(q[0 + 1], 2) ^ SHL(q[0 + 1], 1) ^ ROTL64(q[0 + 1], 19) ^ ROTL64(q[0 + 1], 53)) + + (SHR(q[0 + 2], 2) ^ SHL(q[0 + 2], 2) ^ ROTL64(q[0 + 2], 28) ^ ROTL64(q[0 + 2], 59)) + + (SHR(q[0 + 3], 1) ^ SHL(q[0 + 3], 3) ^ ROTL64(q[0 + 3], 4) ^ ROTL64(q[0 + 3], 37)) + + (SHR(q[0 + 4], 1) ^ SHL(q[0 + 4], 2) ^ ROTL64(q[0 + 4], 13) ^ ROTL64(q[0 + 4], 43)) + + (SHR(q[0 + 5], 2) ^ SHL(q[0 + 5], 1) ^ ROTL64(q[0 + 5], 19) ^ ROTL64(q[0 + 5], 53)) + + (SHR(q[0 + 6], 2) ^ SHL(q[0 + 6], 2) ^ ROTL64(q[0 + 6], 28) ^ ROTL64(q[0 + 6], 59)) + + (SHR(q[0 + 7], 1) ^ SHL(q[0 + 7], 3) ^ ROTL64(q[0 + 7], 4) ^ ROTL64(q[0 + 7], 37)) + + (SHR(q[0 + 8], 1) ^ SHL(q[0 + 8], 2) ^ ROTL64(q[0 + 8], 13) ^ ROTL64(q[0 + 8], 43)) + + (SHR(q[0 + 9], 2) ^ SHL(q[0 + 9], 1) ^ ROTL64(q[0 + 9], 19) ^ ROTL64(q[0 + 9], 53)) + + (SHR(q[0 + 10], 2) ^ SHL(q[0 + 10], 2) ^ ROTL64(q[0 + 10], 28) ^ ROTL64(q[0 + 10], 59)) + + (SHR(q[0 + 11], 1) ^ SHL(q[0 + 11], 3) ^ ROTL64(q[0 + 11], 4) ^ ROTL64(q[0 + 11], 37)) + + (SHR(q[0 + 12], 1) ^ SHL(q[0 + 12], 2) ^ ROTL64(q[0 + 12], 13) ^ ROTL64(q[0 + 12], 43)) + + (SHR(q[0 + 13], 2) ^ SHL(q[0 + 13], 1) ^ ROTL64(q[0 + 13], 19) ^ ROTL64(q[0 + 13], 53)) + + (SHR(q[0 + 14], 2) ^ SHL(q[0 + 14], 2) ^ ROTL64(q[0 + 14], 28) ^ ROTL64(q[0 + 14], 59)) + + (SHR(q[0 + 15], 1) ^ SHL(q[0 + 15], 3) ^ ROTL64(q[0 + 
15], 4) ^ ROTL64(q[0 + 15], 37)) + + ((precalcf[0] + ROTL64(msg2[0], 0 + 1) + + ROTL64(msg2[0 + 3], 0 + 4)) ^ hash[0 + 7]); + q[1 + 16] = + (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROTL64(q[1], 13) ^ ROTL64(q[1], 43)) + + (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROTL64(q[1 + 1], 19) ^ ROTL64(q[1 + 1], 53)) + + (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROTL64(q[1 + 2], 28) ^ ROTL64(q[1 + 2], 59)) + + (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROTL64(q[1 + 3], 4) ^ ROTL64(q[1 + 3], 37)) + + (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROTL64(q[1 + 4], 13) ^ ROTL64(q[1 + 4], 43)) + + (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROTL64(q[1 + 5], 19) ^ ROTL64(q[1 + 5], 53)) + + (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROTL64(q[1 + 6], 28) ^ ROTL64(q[1 + 6], 59)) + + (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROTL64(q[1 + 7], 4) ^ ROTL64(q[1 + 7], 37)) + + (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROTL64(q[1 + 8], 13) ^ ROTL64(q[1 + 8], 43)) + + (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROTL64(q[1 + 9], 19) ^ ROTL64(q[1 + 9], 53)) + + (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROTL64(q[1 + 10], 28) ^ ROTL64(q[1 + 10], 59)) + + (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROTL64(q[1 + 11], 4) ^ ROTL64(q[1 + 11], 37)) + + (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROTL64(q[1 + 12], 13) ^ ROTL64(q[1 + 12], 43)) + + (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROTL64(q[1 + 13], 19) ^ ROTL64(q[1 + 13], 53)) + + (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROTL64(q[1 + 14], 28) ^ ROTL64(q[1 + 14], 59)) + + (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROTL64(q[1 + 15], 4) ^ ROTL64(q[1 + 15], 37)) + + ((precalcf[1] + ROTL64(msg2[1], 1 + 1) + + ROTL64(msg2[1 + 3], 1 + 4)) ^ hash[1 + 7]); + + uint2 pre1 = q[2 + 0] + q[2 + 2] + q[2 + 4] + q[2 + 6] + q[2 + 8] + q[2 + 10] + q[2 + 12]; + uint2 pre2 = q[3 + 0] + q[3 + 2] + q[3 + 4] + q[3 + 6] + q[3 + 8] + q[3 + 10] + q[3 + 12]; + + q[2 + 16] = pre1 + CONST_EXP3(2) + + ((precalcf[2] + ROTL64(msg2[2], 2 + 1) + + ROTL64(msg2[2 + 3], 2 + 4)) ^ hash[2 + 7]); + q[3 + 16] = pre2 + CONST_EXP3(3) + + 
((precalcf[3] + ROTL64(msg2[3], 3 + 1) + + ROTL64(msg2[3 + 3], 3 + 4)) ^ hash[3 + 7]); + pre1 = pre1 - q[2 + 0] + q[2 + 14]; + pre2 = pre2 - q[3 + 0] + q[3 + 14]; + + q[4 + 16] = pre1 + CONST_EXP3(4) + + ((precalcf[4] + ROTL64(msg2[4], 4 + 1) + + ROL8(msg2[4 + 3])) ^ hash[4 + 7]); + q[5 + 16] = pre2 + CONST_EXP3(5) + + ((precalcf[5] + ROTL64(msg2[5], 5 + 1)) + ^ hash[5 + 7]); + + pre1 = pre1 - q[4 + 0] + q[4 + 14]; + pre2 = pre2 - q[5 + 0] + q[5 + 14]; + + + q[6 + 16] = pre1 + CONST_EXP3(6) + + ((vectorize((6 + 16)*(0x0555555555555555ull)) + ROTL64(msg2[6], 6 + 1) - + ROTL64(msg2[6 - 6], (6 - 6) + 1)) ^ hash[13]); + q[7 + 16] = pre2 + CONST_EXP3(7) + + ((vectorize((7 + 16)*(0x0555555555555555ull)) + ROTL64(msg2[7], 7 + 1) - + ROTL64(msg2[7 - 6], (7 - 6) + 1)) ^ hash[14]); + + pre1 = pre1 - q[6 + 0] + q[6 + 14]; + pre2 = pre2 - q[7 + 0] + q[7 + 14]; + + q[8 + 16] = pre1 + CONST_EXP3(8) + + ((vectorize((8 + 16)*(0x0555555555555555ull) + 0x10000) - + ROTL64(msg2[8 - 6], (8 - 6) + 1)) ^ hash[15]); + q[25] = pre2 + CONST_EXP3(9) + + ((vectorize((25)*(0x0555555555555555ull)) - ROTL64(msg2[3], 4)) ^ hash[0]); + + pre1 = pre1 - q[8 + 0] + q[8 + 14]; + pre2 = pre2 - q[9 + 0] + q[9 + 14]; + + q[26] = pre1 + CONST_EXP3(10) + + ((vectorize((26)*(0x0555555555555555ull)) - ROTL64(msg2[4], 5)) ^ hash[1]); + q[27] = pre2 + CONST_EXP3(11) + + ((vectorize((27)*(0x0555555555555555ull)) - ROTL64(msg2[5], 6)) ^ hash[2]); + + pre1 = pre1 - q[10 + 0] + q[10 + 14]; + pre2 = pre2 - q[11 + 0] + q[11 + 14]; + + q[28] = pre1 + CONST_EXP3(12) + + ((vectorize(0x955555555755554C) - ROTL64(msg2[6], 7)) ^ hash[3]); + q[13 + 16] = pre2 + CONST_EXP3(13) + + ((precalcf[6] + + ROTL64(msg2[13 - 13], (13 - 13) + 1) - ROL8(msg2[13 - 6])) ^ hash[13 - 9]); + + pre1 = pre1 - q[12 + 0] + q[12 + 14]; + pre2 = pre2 - q[13 + 0] + q[13 + 14]; + + q[14 + 16] = pre1 + CONST_EXP3(14) + + ((precalcf[7] + + ROTL64(msg2[14 - 13], (14 - 13) + 1)) ^ hash[14 - 9]); + q[15 + 16] = pre2 + CONST_EXP3(15) + + ((precalcf[8] + 
+ ROTL64(msg2[15 - 13], (15 - 13) + 1)) ^ hash[15 - 9]); + + + uint2 XL64 = q[16] ^ q[17] ^ q[18] ^ q[19] ^ q[20] ^ q[21] ^ q[22] ^ q[23]; + uint2 XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31]; + + h[0] = (SHL(XH64, 5) ^ SHR(q[16], 5) ^ msg2[0]) + (XL64 ^ q[24] ^ q[0]); + h[1] = (SHR(XH64, 7) ^ SHL(q[17], 8) ^ msg2[1]) + (XL64 ^ q[25] ^ q[1]); + h[2] = (SHR(XH64, 5) ^ SHL(q[18], 5) ^ msg2[2]) + (XL64 ^ q[26] ^ q[2]); + h[3] = (SHR(XH64, 1) ^ SHL(q[19], 5) ^ msg2[3]) + (XL64 ^ q[27] ^ q[3]); + h[4] = (SHR(XH64, 3) ^ q[20] ^ msg2[4]) + (XL64 ^ q[28] ^ q[4]); + h[5] = (SHL(XH64, 6) ^ SHR(q[21], 6) ^ msg2[5]) + (XL64 ^ q[29] ^ q[5]); + h[6] = (SHR(XH64, 4) ^ SHL(q[22], 6) ^ msg2[6]) + (XL64 ^ q[30] ^ q[6]); + h[7] = (SHR(XH64, 11) ^ SHL(q[23], 2) ^ msg2[7]) + (XL64 ^ q[31] ^ q[7]); + + h[8] = ROTL64(h[4], 9) + (XH64 ^ q[24] ^ 0x80) + (SHL(XL64, 8) ^ q[23] ^ q[8]); + h[9] = ROTL64(h[5], 10) + (XH64 ^ q[25]) + (SHR(XL64, 6) ^ q[16] ^ q[9]); + h[10] = ROTL64(h[6], 11) + (XH64 ^ q[26]) + (SHL(XL64, 6) ^ q[17] ^ q[10]); + h[11] = ROTL64(h[7], 12) + (XH64 ^ q[27]) + (SHL(XL64, 4) ^ q[18] ^ q[11]); + h[12] = ROTL64(h[0], 13) + (XH64 ^ q[28]) + (SHR(XL64, 3) ^ q[19] ^ q[12]); + h[13] = ROTL64(h[1], 14) + (XH64 ^ q[29]) + (SHR(XL64, 4) ^ q[20] ^ q[13]); + h[14] = ROTL64(h[2], 15) + (XH64 ^ q[30]) + (SHR(XL64, 7) ^ q[21] ^ q[14]); + h[15] = ROL16(h[3]) + (XH64 ^ q[31] ^ (512)) + (SHR(XL64, 2) ^ q[22] ^ q[15]); + + const uint2 cmsg[16] = + { + 0xaaaaaaa0, 0xaaaaaaaa, + 0xaaaaaaa1, 0xaaaaaaaa, + 0xaaaaaaa2, 0xaaaaaaaa, + 0xaaaaaaa3, 0xaaaaaaaa, + 0xaaaaaaa4, 0xaaaaaaaa, + 0xaaaaaaa5, 0xaaaaaaaa, + 0xaaaaaaa6, 0xaaaaaaaa, + 0xaaaaaaa7, 0xaaaaaaaa, + 0xaaaaaaa8, 0xaaaaaaaa, + 0xaaaaaaa9, 0xaaaaaaaa, + 0xaaaaaaaa, 0xaaaaaaaa, + 0xaaaaaaab, 0xaaaaaaaa, + 0xaaaaaaac, 0xaaaaaaaa, + 0xaaaaaaad, 0xaaaaaaaa, + 0xaaaaaaae, 0xaaaaaaaa, + 0xaaaaaaaf, 0xaaaaaaaa + }; - // Final #pragma unroll 16 - for(int i=0;i<16;i++) + for (int i = 0; i < 16; i++) { - message[i].y 
= 0xaaaaaaaa; - message[i].x = 0xaaaaaaa0ul + (uint32_t)i; + msg[i] = devectorize(cmsg[i] ^ h[i]); } - Compression512(h, message); - // fertig - uint64_t *outpHash = &g_hash[8 * hashPosition]; -#pragma unroll 8 - for(int i=0;i<8;i++) - outpHash[i] = devectorize(message[i+8]); - } + const uint2 precalc[16] = + { + { 0x55555550, 0x55555555 }, + { 0xAAAAAAA5, 0x5AAAAAAA }, + { 0xFFFFFFFA, 0x5FFFFFFF }, + { 0x5555554F, 0x65555555 }, + { 0xAAAAAAA4, 0x6AAAAAAA }, + { 0xFFFFFFF9, 0x6FFFFFFF }, + { 0x5555554E, 0x75555555 }, + { 0xAAAAAAA3, 0x7AAAAAAA }, + { 0xFFFFFFF8, 0x7FFFFFFF }, + { 0x5555554D, 0x85555555 }, + { 0xAAAAAAA2, 0x8AAAAAAA }, + { 0xFFFFFFF7, 0x8FFFFFFF }, + { 0x5555554C, 0x95555555 }, + { 0xAAAAAAA1, 0x9AAAAAAA }, + { 0xFFFFFFF6, 0x9FFFFFFF }, + { 0x5555554B, 0xA5555555 }, + }; + + const uint64_t p2 = msg[15] - msg[12]; + const uint64_t p3 = msg[14] - msg[7]; + const uint64_t p4 = msg[6] + msg[9]; + const uint64_t p5 = msg[8] - msg[5]; + const uint64_t p6 = msg[1] - msg[14]; + const uint64_t p7 = msg[8] - msg[1]; + const uint64_t p8 = msg[3] + msg[10]; + + + tmp = vectorize((msg[5]) + (msg[10]) + (msg[13]) + p3); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[1]; + tmp = vectorize((msg[6]) - (msg[8]) + (msg[11]) + (msg[14]) - (msg[15])); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + cmsg[2]; + tmp = vectorize((msg[0]) + (msg[7]) + (msg[9]) + p2); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + cmsg[3]; + tmp = vectorize((msg[0]) + p7 - (msg[10]) + (msg[13])); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + cmsg[4]; + tmp = vectorize((msg[2]) + (msg[9]) - (msg[11]) + p6); + q[4] = (SHR(tmp, 1) ^ tmp) + cmsg[5]; + tmp = vectorize(p8 + p2 - (msg[2])); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[6]; + tmp = vectorize((msg[4]) - (msg[0]) - (msg[3]) - (msg[11]) + (msg[13])); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ 
ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + cmsg[7]; + tmp = vectorize(p6 - (msg[4]) - (msg[5]) - (msg[12])); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + cmsg[8]; + tmp = vectorize((msg[2]) - (msg[5]) - (msg[6]) + (msg[13]) - (msg[15])); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + cmsg[9]; + tmp = vectorize((msg[0]) - (msg[3]) + (msg[6]) + p3); + q[9] = (SHR(tmp, 1) ^ tmp) + cmsg[10]; + tmp = vectorize(p7 - (msg[4]) - (msg[7]) + (msg[15])); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[11]; + tmp = vectorize(p5 - (msg[0]) - (msg[2]) + (msg[9])); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + cmsg[12]; + tmp = vectorize(p8+msg[1] - p4 ); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + cmsg[13]; + tmp = vectorize((msg[2]) + (msg[4]) + (msg[7]) + (msg[10]) + (msg[11])); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + cmsg[14]; + tmp = vectorize((msg[3]) + p5 - (msg[11]) - (msg[12])); + q[14] = (SHR(tmp, 1) ^ tmp) + cmsg[15]; + tmp = vectorize((msg[12]) - (msg[4]) - p4 + (msg[13])); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[0]; + + q[0 + 16] = + (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROTL64(q[0], 13) ^ ROTL64(q[0], 43)) + + (SHR(q[0 + 1], 2) ^ SHL(q[0 + 1], 1) ^ ROTL64(q[0 + 1], 19) ^ ROTL64(q[0 + 1], 53)) + + (SHR(q[0 + 2], 2) ^ SHL(q[0 + 2], 2) ^ ROTL64(q[0 + 2], 28) ^ ROTL64(q[0 + 2], 59)) + + (SHR(q[0 + 3], 1) ^ SHL(q[0 + 3], 3) ^ ROTL64(q[0 + 3], 4) ^ ROTL64(q[0 + 3], 37)) + + (SHR(q[0 + 4], 1) ^ SHL(q[0 + 4], 2) ^ ROTL64(q[0 + 4], 13) ^ ROTL64(q[0 + 4], 43)) + + (SHR(q[0 + 5], 2) ^ SHL(q[0 + 5], 1) ^ ROTL64(q[0 + 5], 19) ^ ROTL64(q[0 + 5], 53)) + + (SHR(q[0 + 6], 2) ^ SHL(q[0 + 6], 2) ^ ROTL64(q[0 + 6], 28) ^ ROTL64(q[0 + 6], 59)) + + (SHR(q[0 + 7], 1) ^ SHL(q[0 + 7], 3) ^ ROTL64(q[0 + 7], 4) ^ ROTL64(q[0 + 7], 37)) + + (SHR(q[0 + 8], 1) ^ SHL(q[0 + 8], 2) ^ 
ROTL64(q[0 + 8], 13) ^ ROTL64(q[0 + 8], 43)) + + (SHR(q[0 + 9], 2) ^ SHL(q[0 + 9], 1) ^ ROTL64(q[0 + 9], 19) ^ ROTL64(q[0 + 9], 53)) + + (SHR(q[0 + 10], 2) ^ SHL(q[0 + 10], 2) ^ ROTL64(q[0 + 10], 28) ^ ROTL64(q[0 + 10], 59)) + + (SHR(q[0 + 11], 1) ^ SHL(q[0 + 11], 3) ^ ROTL64(q[0 + 11], 4) ^ ROTL64(q[0 + 11], 37)) + + (SHR(q[0 + 12], 1) ^ SHL(q[0 + 12], 2) ^ ROTL64(q[0 + 12], 13) ^ ROTL64(q[0 + 12], 43)) + + (SHR(q[0 + 13], 2) ^ SHL(q[0 + 13], 1) ^ ROTL64(q[0 + 13], 19) ^ ROTL64(q[0 + 13], 53)) + + (SHR(q[0 + 14], 2) ^ SHL(q[0 + 14], 2) ^ ROTL64(q[0 + 14], 28) ^ ROTL64(q[0 + 14], 59)) + + (SHR(q[0 + 15], 1) ^ SHL(q[0 + 15], 3) ^ ROTL64(q[0 + 15], 4) ^ ROTL64(q[0 + 15], 37)) + + ((precalc[0] + ROTL64(h[0], 0 + 1) + + ROTL64(h[0 + 3], 0 + 4) - ROTL64(h[0 + 10], 0 + 11)) ^ cmsg[0 + 7]); + q[1 + 16] = + (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROTL64(q[1], 13) ^ ROTL64(q[1], 43)) + + (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROTL64(q[1 + 1], 19) ^ ROTL64(q[1 + 1], 53)) + + (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROTL64(q[1 + 2], 28) ^ ROTL64(q[1 + 2], 59)) + + (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROTL64(q[1 + 3], 4) ^ ROTL64(q[1 + 3], 37)) + + (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROTL64(q[1 + 4], 13) ^ ROTL64(q[1 + 4], 43)) + + (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROTL64(q[1 + 5], 19) ^ ROTL64(q[1 + 5], 53)) + + (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROTL64(q[1 + 6], 28) ^ ROTL64(q[1 + 6], 59)) + + (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROTL64(q[1 + 7], 4) ^ ROTL64(q[1 + 7], 37)) + + (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROTL64(q[1 + 8], 13) ^ ROTL64(q[1 + 8], 43)) + + (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROTL64(q[1 + 9], 19) ^ ROTL64(q[1 + 9], 53)) + + (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROTL64(q[1 + 10], 28) ^ ROTL64(q[1 + 10], 59)) + + (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROTL64(q[1 + 11], 4) ^ ROTL64(q[1 + 11], 37)) + + (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROTL64(q[1 + 12], 13) ^ ROTL64(q[1 + 12], 43)) + + (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROTL64(q[1 
+ 13], 19) ^ ROTL64(q[1 + 13], 53)) + + (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROTL64(q[1 + 14], 28) ^ ROTL64(q[1 + 14], 59)) + + (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROTL64(q[1 + 15], 4) ^ ROTL64(q[1 + 15], 37)) + + ((precalc[1] + ROTL64(h[1], 1 + 1) + + ROTL64(h[1 + 3], 1 + 4) - ROTL64(h[1 + 10], 1 + 11)) ^ cmsg[1 + 7]); + + pre1 = q[2] + q[4] + q[6] + q[8] + q[10] + q[12] + q[14]; + pre2 = q[3] + q[5] + q[7] + q[9] + q[11] + q[13] + q[15]; + + q[2 + 16] = pre1 + CONST_EXP3(2) + + ((precalc[2] + ROTL64(h[2], 2 + 1) + + ROTL64(h[2 + 3], 2 + 4) - ROTL64(h[2 + 10], 2 + 11)) ^ cmsg[2 + 7]); + q[3 + 16] = pre2 + CONST_EXP3(3) + + ((precalc[3] + ROTL64(h[3], 3 + 1) + + ROTL64(h[3 + 3], 3 + 4) - ROTL64(h[3 + 10], 3 + 11)) ^ cmsg[3 + 7]); + + pre1 = pre1 - q[2 + 0] + q[2 + 14]; + pre2 = pre2 - q[3 + 0] + q[3 + 14]; + + q[4 + 16] = pre1 + CONST_EXP3(4) + + ((precalc[4] + ROTL64(h[4], 4 + 1) + + ROL8(h[4 + 3]) - ROTL64(h[4 + 10], 4 + 11)) ^ cmsg[4 + 7]); + q[5 + 16] = pre2 + CONST_EXP3(5) + + ((precalc[5] + ROTL64(h[5], 5 + 1) + + ROTL64(h[5 + 3], 5 + 4) - ROL16(h[5 + 10])) ^ cmsg[5 + 7]); + + pre1 = pre1 - q[4 + 0] + q[4 + 14]; + pre2 = pre2 - q[5 + 0] + q[5 + 14]; + + q[6 + 16] = pre1 + CONST_EXP3(6) + + ((precalc[6] + ROTL64(h[6], 6 + 1) + + ROTL64(h[6 + 3], 6 + 4) - ROTL64(h[6 - 6], (6 - 6) + 1)) ^ cmsg[6 + 7]); + q[7 + 16] = pre2 + CONST_EXP3(7) + + ((precalc[7] + ROL8(h[7]) + + ROTL64(h[7 + 3], 7 + 4) - ROTL64(h[7 - 6], (7 - 6) + 1)) ^ cmsg[7 + 7]); + + pre1 = pre1 - q[6 + 0] + q[6 + 14]; + pre2 = pre2 - q[7 + 0] + q[7 + 14]; + + q[8 + 16] = pre1 + CONST_EXP3(8) + + ((precalc[8] + ROTL64(h[8], 8 + 1) + + ROTL64(h[8 + 3], 8 + 4) - ROTL64(h[8 - 6], (8 - 6) + 1)) ^ cmsg[8 + 7]); + q[9 + 16] = pre2 + CONST_EXP3(9) + + ((precalc[9] + ROTL64(h[9], 9 + 1) + + ROTL64(h[9 + 3], 9 + 4) - ROTL64(h[9 - 6], (9 - 6) + 1)) ^ cmsg[9 - 9]); + + pre1 = pre1 - q[8 + 0] + q[8 + 14]; + pre2 = pre2 - q[9 + 0] + q[9 + 14]; + + q[10 + 16] = pre1 + CONST_EXP3(10) + + ((precalc[10] + 
ROTL64(h[10], 10 + 1) + + ROTL64(h[10 + 3], 10 + 4) - ROTL64(h[10 - 6], (10 - 6) + 1)) ^ cmsg[10 - 9]); + q[11 + 16] = pre2 + CONST_EXP3(11) + + ((precalc[11] + ROTL64(h[11], 11 + 1) + + ROTL64(h[11 + 3], 11 + 4) - ROTL64(h[11 - 6], (11 - 6) + 1)) ^ cmsg[11 - 9]); + + pre1 = pre1 - q[10 + 0] + q[10 + 14]; + pre2 = pre2 - q[11 + 0] + q[11 + 14]; + + q[12 + 16] = pre1 + CONST_EXP3(12) + + ((precalc[12] + ROTL64(h[12], 12 + 1) + + ROL16(h[12 + 3]) - ROTL64(h[12 - 6], (12 - 6) + 1)) ^ cmsg[12 - 9]); + q[13 + 16] = pre2 + CONST_EXP3(13) + + ((precalc[13] + ROTL64(h[13], 13 + 1) + + ROTL64(h[13 - 13], (13 - 13) + 1) - ROL8(h[13 - 6])) ^ cmsg[13 - 9]); + + pre1 = pre1 - q[12 + 0] + q[12 + 14]; + pre2 = pre2 - q[13 + 0] + q[13 + 14]; + + q[14 + 16] = pre1 + CONST_EXP3(14) + + ((precalc[14] + ROTL64(h[14], 14 + 1) + + ROTL64(h[14 - 13], (14 - 13) + 1) - ROTL64(h[14 - 6], (14 - 6) + 1)) ^ cmsg[14 - 9]); + q[15 + 16] = pre2 + CONST_EXP3(15) + + ((precalc[15] + ROL16(h[15]) + + ROTL64(h[15 - 13], (15 - 13) + 1) - ROTL64(h[15 - 6], (15 - 6) + 1)) ^ cmsg[15 - 9]); + + XL64 = q[16] ^ q[17] ^ q[18] ^ q[19] ^ q[20] ^ q[21] ^ q[22] ^ q[23]; + XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31]; + + msg2[0] = (SHL(XH64, 5) ^ SHR(q[16], 5) ^ h[0]) + (XL64 ^ q[24] ^ q[0]); + msg2[1] = (SHR(XH64, 7) ^ SHL(q[17], 8) ^ h[1]) + (XL64 ^ q[25] ^ q[1]); + msg2[2] = (SHR(XH64, 5) ^ SHL(q[18], 5) ^ h[2]) + (XL64 ^ q[26] ^ q[2]); + msg2[3] = (SHR(XH64, 1) ^ SHL(q[19], 5) ^ h[3]) + (XL64 ^ q[27] ^ q[3]); + msg2[4] = (SHR(XH64, 3) ^ q[20] ^ h[4]) + (XL64 ^ q[28] ^ q[4]); + msg2[5] = (SHL(XH64, 6) ^ SHR(q[21], 6) ^ h[5]) + (XL64 ^ q[29] ^ q[5]); + msg2[6] = (SHR(XH64, 4) ^ SHL(q[22], 6) ^ h[6]) + (XL64 ^ q[30] ^ q[6]); + msg2[7] = (SHR(XH64, 11) ^ SHL(q[23], 2) ^ h[7]) + (XL64 ^ q[31] ^ q[7]); + msg2[8] = ROTL64(msg2[4], 9) + (XH64 ^ q[24] ^ h[8]) + (SHL(XL64, 8) ^ q[23] ^ q[8]); + + msg2[9] = ROTL64(msg2[5], 10) + (XH64 ^ q[25] ^ h[9]) + (SHR(XL64, 6) ^ q[16] ^ q[9]); + 
msg2[10] = ROTL64(msg2[6], 11) + (XH64 ^ q[26] ^ h[10]) + (SHL(XL64, 6) ^ q[17] ^ q[10]); + msg2[11] = ROTL64(msg2[7], 12) + (XH64 ^ q[27] ^ h[11]) + (SHL(XL64, 4) ^ q[18] ^ q[11]); + uint28 *phash2 = (uint28*)inpHash; + phash2[0] = make_uint28(msg2[8], msg2[9], msg2[10], msg2[11]); + + msg2[12] = ROTL64(msg2[0], 13) + (XH64 ^ q[28] ^ h[12]) + (SHR(XL64, 3) ^ q[19] ^ q[12]); + msg2[13] = ROTL64(msg2[1], 14) + (XH64 ^ q[29] ^ h[13]) + (SHR(XL64, 4) ^ q[20] ^ q[13]); + msg2[14] = ROTL64(msg2[2], 15) + (XH64 ^ q[30] ^ h[14]) + (SHR(XL64, 7) ^ q[21] ^ q[14]); + msg2[15] = ROL16(msg2[3]) + (XH64 ^ q[31] ^ h[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); + + phash2[1] = make_uint28(msg2[8], msg2[9], msg2[10], msg2[11]); + phash2[1] = make_uint28(msg2[12], msg2[13], msg2[14], msg2[15]); + + } } -__global__ __launch_bounds__(256, 2) -void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +__global__ __launch_bounds__(32, 16) +void quark_bmw512_gpu_hash_64_quark(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, uint32_t *const __restrict__ g_nonceVector) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = startNounce + thread; + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if(thread < threads) + { + const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + + const int hashPosition = nounce - startNounce; + uint64_t *const inpHash = &g_hash[8 * hashPosition]; - // Init - uint2 h[16] = { + const uint2 hash[16] = + { { 0x84858687UL, 0x80818283UL }, { 0x8C8D8E8FUL, 0x88898A8BUL }, { 0x94959697UL, 0x90919293UL }, @@ -398,8 +700,8 @@ void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t * { 0xACADAEAFUL, 0xA8A9AAABUL }, { 0xB4B5B6B7UL, 0xB0B1B2B3UL }, { 0xBCBDBEBFUL, 0xB8B9BABBUL }, - { 0xC4C5C6C7UL, 0xC0C1C2C3UL, }, - { 0xCCCDCECFUL, 0xC8C9CACBUL, }, + { 0xC4C5C6C7UL, 0xC0C1C2C3UL }, + { 0xCCCDCECFUL, 0xC8C9CACBUL }, { 0xD4D5D6D7UL, 0xD0D1D2D3UL }, { 0xDCDDDEDFUL, 0xD8D9DADBUL }, { 0xE4E5E6E7UL, 0xE0E1E2E3UL }, @@ -407,32 +709,460 @@ void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t * { 0xF4F5F6F7UL, 0xF0F1F2F3UL }, { 0xFCFDFEFFUL, 0xF8F9FAFBUL } }; - // Nachricht kopieren (Achtung, die Nachricht hat 64 Byte, - // BMW arbeitet mit 128 Byte!!! 
- uint2 message[16]; -#pragma unroll 16 - for(int i=0;i<16;i++) - message[i] = vectorize(c_PaddedMessage80[i]); - // die Nounce durch die thread-spezifische ersetzen - message[9].x = cuda_swab32(nounce); //REPLACE_HIWORD(message[9], cuda_swab32(nounce)); - // Compression 1 - Compression512(message, h); + const uint64_t hash2[16] = + { + 0x8081828384858687, + 0x88898A8B8C8D8E8F, + 0x9091929394959697, + 0x98999A9B9C9D9E9F, + 0xA0A1A2A3A4A5A6A7, + 0xA8A9AAABACADAEAF, + 0xB0B1B2B3B4B5B6B7, + 0xB8B9BABBBCBDBEBF, + 0xC0C1C2C3C4C5C6C7, + 0xC8C9CACBCCCDCECF, + 0xD0D1D2D3D4D5D6D7, + 0xD8D9DADBDCDDDEDF, + 0xE0E1E2E3E4E5E6E7, + 0xE8E9EAEBECEDEEEF, + 0xF0F1F2F3F4F5F6F7, + 0xF8F9FAFBFCFDFEFF + }; + + uint64_t msg[16]; + uint2 msg2[16]; + uint64_t mxh[8]; + uint2 h[16]; + + uint28 *phash = (uint28*)inpHash; + uint28 *outpt = (uint28*)msg2; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + +#pragma unroll 8 + for(int i = 0; i < 8; i++) + { + msg[i] = devectorize(msg2[i]); + } + + + mxh[0] = msg[0] ^ hash2[0]; + mxh[1] = msg[1] ^ hash2[1]; + mxh[2] = msg[2] ^ hash2[2]; + mxh[3] = msg[3] ^ hash2[3]; + mxh[4] = msg[4] ^ hash2[4]; + mxh[5] = msg[5] ^ hash2[5]; + mxh[6] = msg[6] ^ hash2[6]; + mxh[7] = msg[7] ^ hash2[7]; + + const uint2 precalcf[9] = + { + { 0x55555550ul, 0x55555555 }, + { 0xAAAAAAA5, 0x5AAAAAAA }, + { 0xFFFFFFFA, 0x5FFFFFFF }, + { 0x5555554F, 0x65555555 }, + { 0xAAAAAAA4, 0x6AAAAAAA }, + { 0xFE00FFF9, 0x6FFFFFFF }, + { 0xAAAAAAA1, 0x9AAAAAAA }, + { 0xFFFEFFF6, 0x9FFFFFFF }, + { 0x5755554B, 0xA5555555 }, + }; + + uint2 q[32]; + + uint2 tmp; + tmp = vectorize((mxh[5]) - (mxh[7]) + (hash2[10] + hash2[13] + hash2[14])); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[1]; + tmp = vectorize((mxh[6]) + (hash2[11] + hash2[14] - (512 ^ hash2[15]) - (0x80 ^ hash2[8]))); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[2]; + tmp = vectorize((mxh[0] + mxh[7]) + hash2[9] - hash2[12] + (512 ^ hash2[15])); + q[2] = 
(SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[3]; + tmp = vectorize((mxh[0] - mxh[1]) + (0x80 ^ hash2[8]) - hash2[10] + hash2[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[4]; + tmp = vectorize((mxh[1] + mxh[2]) + hash2[9] - hash2[11] - hash2[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + hash[5]; + tmp = vectorize((mxh[3] - mxh[2] + hash2[10] - hash2[12] + (512 ^ hash2[15]))); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[6]; + tmp = vectorize((mxh[4]) - (mxh[0]) - (mxh[3]) + hash2[13] - hash2[11]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[7]; + tmp = vectorize((mxh[1]) - (mxh[4]) - (mxh[5]) - hash2[12] - hash2[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[8]; + tmp = vectorize((mxh[2]) - (mxh[5]) - (mxh[6]) + hash2[13] - (512 ^ hash2[15])); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[9]; + tmp = vectorize((mxh[0]) - (mxh[3]) + (mxh[6]) - (mxh[7]) + (hash2[14])); + q[9] = (SHR(tmp, 1) ^ tmp) + hash[10]; + tmp = vectorize((512 ^ hash2[15]) + (0x80 ^ hash2[8]) - (mxh[1]) - (mxh[4]) - (mxh[7])); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[11]; + tmp = vectorize(hash2[9] + (0x80 ^ hash2[8]) - (mxh[0]) - (mxh[2]) - (mxh[5])); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[12]; + tmp = vectorize((mxh[1]) + (mxh[3]) - (mxh[6]) + hash2[10] - hash2[9]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[13]; + tmp = vectorize((mxh[2]) + (mxh[4]) + (mxh[7]) + hash2[10] + hash2[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[14]; + tmp = vectorize((mxh[3]) - (mxh[5]) + (0x80 ^ hash2[8]) - hash2[11] - hash2[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + hash[15]; + tmp = vectorize(hash2[12] - hash2[9] + hash2[13] - (mxh[4]) - (mxh[6])); + q[15] = 
(SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[0]; + + q[0 + 16] = + (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROTL64(q[0], 13) ^ ROTL64(q[0], 43)) + + (SHR(q[0 + 1], 2) ^ SHL(q[0 + 1], 1) ^ ROTL64(q[0 + 1], 19) ^ ROTL64(q[0 + 1], 53)) + + (SHR(q[0 + 2], 2) ^ SHL(q[0 + 2], 2) ^ ROTL64(q[0 + 2], 28) ^ ROTL64(q[0 + 2], 59)) + + (SHR(q[0 + 3], 1) ^ SHL(q[0 + 3], 3) ^ ROTL64(q[0 + 3], 4) ^ ROTL64(q[0 + 3], 37)) + + (SHR(q[0 + 4], 1) ^ SHL(q[0 + 4], 2) ^ ROTL64(q[0 + 4], 13) ^ ROTL64(q[0 + 4], 43)) + + (SHR(q[0 + 5], 2) ^ SHL(q[0 + 5], 1) ^ ROTL64(q[0 + 5], 19) ^ ROTL64(q[0 + 5], 53)) + + (SHR(q[0 + 6], 2) ^ SHL(q[0 + 6], 2) ^ ROTL64(q[0 + 6], 28) ^ ROTL64(q[0 + 6], 59)) + + (SHR(q[0 + 7], 1) ^ SHL(q[0 + 7], 3) ^ ROTL64(q[0 + 7], 4) ^ ROTL64(q[0 + 7], 37)) + + (SHR(q[0 + 8], 1) ^ SHL(q[0 + 8], 2) ^ ROTL64(q[0 + 8], 13) ^ ROTL64(q[0 + 8], 43)) + + (SHR(q[0 + 9], 2) ^ SHL(q[0 + 9], 1) ^ ROTL64(q[0 + 9], 19) ^ ROTL64(q[0 + 9], 53)) + + (SHR(q[0 + 10], 2) ^ SHL(q[0 + 10], 2) ^ ROTL64(q[0 + 10], 28) ^ ROTL64(q[0 + 10], 59)) + + (SHR(q[0 + 11], 1) ^ SHL(q[0 + 11], 3) ^ ROTL64(q[0 + 11], 4) ^ ROTL64(q[0 + 11], 37)) + + (SHR(q[0 + 12], 1) ^ SHL(q[0 + 12], 2) ^ ROTL64(q[0 + 12], 13) ^ ROTL64(q[0 + 12], 43)) + + (SHR(q[0 + 13], 2) ^ SHL(q[0 + 13], 1) ^ ROTL64(q[0 + 13], 19) ^ ROTL64(q[0 + 13], 53)) + + (SHR(q[0 + 14], 2) ^ SHL(q[0 + 14], 2) ^ ROTL64(q[0 + 14], 28) ^ ROTL64(q[0 + 14], 59)) + + (SHR(q[0 + 15], 1) ^ SHL(q[0 + 15], 3) ^ ROTL64(q[0 + 15], 4) ^ ROTL64(q[0 + 15], 37)) + + ((precalcf[0] + ROTL64(msg2[0], 0 + 1) + + ROTL64(msg2[0 + 3], 0 + 4)) ^ hash[0 + 7]); + q[1 + 16] = + (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROTL64(q[1], 13) ^ ROTL64(q[1], 43)) + + (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROTL64(q[1 + 1], 19) ^ ROTL64(q[1 + 1], 53)) + + (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROTL64(q[1 + 2], 28) ^ ROTL64(q[1 + 2], 59)) + + (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROTL64(q[1 + 3], 4) ^ ROTL64(q[1 + 3], 37)) + + (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROTL64(q[1 + 4], 
13) ^ ROTL64(q[1 + 4], 43)) + + (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROTL64(q[1 + 5], 19) ^ ROTL64(q[1 + 5], 53)) + + (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROTL64(q[1 + 6], 28) ^ ROTL64(q[1 + 6], 59)) + + (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROTL64(q[1 + 7], 4) ^ ROTL64(q[1 + 7], 37)) + + (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROTL64(q[1 + 8], 13) ^ ROTL64(q[1 + 8], 43)) + + (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROTL64(q[1 + 9], 19) ^ ROTL64(q[1 + 9], 53)) + + (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROTL64(q[1 + 10], 28) ^ ROTL64(q[1 + 10], 59)) + + (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROTL64(q[1 + 11], 4) ^ ROTL64(q[1 + 11], 37)) + + (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROTL64(q[1 + 12], 13) ^ ROTL64(q[1 + 12], 43)) + + (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROTL64(q[1 + 13], 19) ^ ROTL64(q[1 + 13], 53)) + + (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROTL64(q[1 + 14], 28) ^ ROTL64(q[1 + 14], 59)) + + (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROTL64(q[1 + 15], 4) ^ ROTL64(q[1 + 15], 37)) + + ((precalcf[1] + ROTL64(msg2[1], 1 + 1) + + ROTL64(msg2[1 + 3], 1 + 4)) ^ hash[1 + 7]); + + uint2 pre1 = q[2 + 0] + q[2 + 2] + q[2 + 4] + q[2 + 6] + q[2 + 8] + q[2 + 10] + q[2 + 12]; + uint2 pre2 = q[3 + 0] + q[3 + 2] + q[3 + 4] + q[3 + 6] + q[3 + 8] + q[3 + 10] + q[3 + 12]; + + q[2 + 16] = pre1 + CONST_EXP3(2) + + ((precalcf[2] + ROTL64(msg2[2], 2 + 1) + + ROTL64(msg2[2 + 3], 2 + 4)) ^ hash[2 + 7]); + q[3 + 16] = pre2 + CONST_EXP3(3) + + ((precalcf[3] + ROTL64(msg2[3], 3 + 1) + + ROTL64(msg2[3 + 3], 3 + 4)) ^ hash[3 + 7]); + pre1 = pre1 - q[2 + 0] + q[2 + 14]; + pre2 = pre2 - q[3 + 0] + q[3 + 14]; + + q[4 + 16] = pre1 + CONST_EXP3(4) + + ((precalcf[4] + ROTL64(msg2[4], 4 + 1) + + ROL8(msg2[4 + 3])) ^ hash[4 + 7]); + q[5 + 16] = pre2 + CONST_EXP3(5) + + ((precalcf[5] + ROTL64(msg2[5], 5 + 1)) + ^ hash[5 + 7]); + + pre1 = pre1 - q[4 + 0] + q[4 + 14]; + pre2 = pre2 - q[5 + 0] + q[5 + 14]; + + + q[6 + 16] = pre1 + CONST_EXP3(6) + + ((vectorize((6 + 
16)*(0x0555555555555555ull)) + ROTL64(msg2[6], 6 + 1) - + ROTL64(msg2[6 - 6], (6 - 6) + 1)) ^ hash[6 + 7]); + q[7 + 16] = pre2 + CONST_EXP3(7) + + ((vectorize((7 + 16)*(0x0555555555555555ull)) + ROTL64(msg2[7], 7 + 1) - + ROTL64(msg2[7 - 6], (7 - 6) + 1)) ^ hash[7 + 7]); + + pre1 = pre1 - q[6 + 0] + q[6 + 14]; + pre2 = pre2 - q[7 + 0] + q[7 + 14]; + + q[8 + 16] = pre1 + CONST_EXP3(8) + + ((vectorize((8 + 16)*(0x0555555555555555ull) + 0x10000) - + ROTL64(msg2[8 - 6], (8 - 6) + 1)) ^ hash[8 + 7]); + q[25] = pre2 + CONST_EXP3(9) + + ((vectorize((25)*(0x0555555555555555ull)) - ROTL64(msg2[3], 4)) ^ hash[0]); + + pre1 = pre1 - q[8 + 0] + q[8 + 14]; + pre2 = pre2 - q[9 + 0] + q[9 + 14]; + + q[26] = pre1 + CONST_EXP3(10) + + ((vectorize((26)*(0x0555555555555555ull)) - ROTL64(msg2[4], 5)) ^ hash[1]); + q[27] = pre2 + CONST_EXP3(11) + + ((vectorize((27)*(0x0555555555555555ull)) - ROTL64(msg2[5], 6)) ^ hash[2]); + + pre1 = pre1 - q[10 + 0] + q[10 + 14]; + pre2 = pre2 - q[11 + 0] + q[11 + 14]; + + q[28] = pre1 + CONST_EXP3(12) + + ((vectorize(0x955555555755554C) - ROTL64(msg2[6], 7)) ^ hash[3]); + q[13 + 16] = pre2 + CONST_EXP3(13) + + ((precalcf[6] + + ROTL64(msg2[13 - 13], (13 - 13) + 1) - ROL8(msg2[13 - 6])) ^ hash[13 - 9]); + + pre1 = pre1 - q[12 + 0] + q[12 + 14]; + pre2 = pre2 - q[13 + 0] + q[13 + 14]; + + q[14 + 16] = pre1 + CONST_EXP3(14) + + ((precalcf[7] + + ROTL64(msg2[14 - 13], (14 - 13) + 1)) ^ hash[14 - 9]); + q[15 + 16] = pre2 + CONST_EXP3(15) + + ((precalcf[8] + + ROTL64(msg2[15 - 13], (15 - 13) + 1)) ^ hash[15 - 9]); + + + uint2 XL64 = q[16] ^ q[17] ^ q[18] ^ q[19] ^ q[20] ^ q[21] ^ q[22] ^ q[23]; + uint2 XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31]; + + h[0] = (SHL(XH64, 5) ^ SHR(q[16], 5) ^ msg2[0]) + (XL64 ^ q[24] ^ q[0]); + h[1] = (SHR(XH64, 7) ^ SHL(q[17], 8) ^ msg2[1]) + (XL64 ^ q[25] ^ q[1]); + h[2] = (SHR(XH64, 5) ^ SHL(q[18], 5) ^ msg2[2]) + (XL64 ^ q[26] ^ q[2]); + h[3] = (SHR(XH64, 1) ^ SHL(q[19], 5) ^ msg2[3]) + (XL64 
^ q[27] ^ q[3]); + h[4] = (SHR(XH64, 3) ^ q[20] ^ msg2[4]) + (XL64 ^ q[28] ^ q[4]); + h[5] = (SHL(XH64, 6) ^ SHR(q[21], 6) ^ msg2[5]) + (XL64 ^ q[29] ^ q[5]); + h[6] = (SHR(XH64, 4) ^ SHL(q[22], 6) ^ msg2[6]) + (XL64 ^ q[30] ^ q[6]); + h[7] = (SHR(XH64, 11) ^ SHL(q[23], 2) ^ msg2[7]) + (XL64 ^ q[31] ^ q[7]); + + h[8] = ROTL64(h[4], 9) + (XH64 ^ q[24] ^ 0x80) + (SHL(XL64, 8) ^ q[23] ^ q[8]); + h[9] = ROTL64(h[5], 10) + (XH64 ^ q[25]) + (SHR(XL64, 6) ^ q[16] ^ q[9]); + h[10] = ROTL64(h[6], 11) + (XH64 ^ q[26]) + (SHL(XL64, 6) ^ q[17] ^ q[10]); + h[11] = ROTL64(h[7], 12) + (XH64 ^ q[27]) + (SHL(XL64, 4) ^ q[18] ^ q[11]); + h[12] = ROTL64(h[0], 13) + (XH64 ^ q[28]) + (SHR(XL64, 3) ^ q[19] ^ q[12]); + h[13] = ROTL64(h[1], 14) + (XH64 ^ q[29]) + (SHR(XL64, 4) ^ q[20] ^ q[13]); + h[14] = ROTL64(h[2], 15) + (XH64 ^ q[30]) + (SHR(XL64, 7) ^ q[21] ^ q[14]); + h[15] = ROL16(h[3]) + (XH64 ^ q[31] ^ (512)) + (SHR(XL64, 2) ^ q[22] ^ q[15]); + + const uint2 cmsg[16] = + { + 0xaaaaaaa0, 0xaaaaaaaa, + 0xaaaaaaa1, 0xaaaaaaaa, + 0xaaaaaaa2, 0xaaaaaaaa, + 0xaaaaaaa3, 0xaaaaaaaa, + 0xaaaaaaa4, 0xaaaaaaaa, + 0xaaaaaaa5, 0xaaaaaaaa, + 0xaaaaaaa6, 0xaaaaaaaa, + 0xaaaaaaa7, 0xaaaaaaaa, + 0xaaaaaaa8, 0xaaaaaaaa, + 0xaaaaaaa9, 0xaaaaaaaa, + 0xaaaaaaaa, 0xaaaaaaaa, + 0xaaaaaaab, 0xaaaaaaaa, + 0xaaaaaaac, 0xaaaaaaaa, + 0xaaaaaaad, 0xaaaaaaaa, + 0xaaaaaaae, 0xaaaaaaaa, + 0xaaaaaaaf, 0xaaaaaaaa + }; #pragma unroll 16 - for(int i=0;i<16;i++) - message[i] = make_uint2(0xaaaaaaa0+i,0xaaaaaaaa); + for(int i = 0; i < 16; i++) + { + msg[i] = devectorize(cmsg[i] ^ h[i]); + } - Compression512(h, message); + const uint2 precalc[16] = + { + { 0x55555550, 0x55555555 }, + { 0xAAAAAAA5, 0x5AAAAAAA }, + { 0xFFFFFFFA, 0x5FFFFFFF }, + { 0x5555554F, 0x65555555 }, + { 0xAAAAAAA4, 0x6AAAAAAA }, + { 0xFFFFFFF9, 0x6FFFFFFF }, + { 0x5555554E, 0x75555555 }, + { 0xAAAAAAA3, 0x7AAAAAAA }, + { 0xFFFFFFF8, 0x7FFFFFFF }, + { 0x5555554D, 0x85555555 }, + { 0xAAAAAAA2, 0x8AAAAAAA }, + { 0xFFFFFFF7, 0x8FFFFFFF }, + { 
0x5555554C, 0x95555555 }, + { 0xAAAAAAA1, 0x9AAAAAAA }, + { 0xFFFFFFF6, 0x9FFFFFFF }, + { 0x5555554B, 0xA5555555 }, + }; - // fertig - uint64_t *outpHash = &g_hash[8 * thread]; + const uint64_t p2 = msg[15] - msg[12]; + const uint64_t p3 = msg[14] - msg[7]; + const uint64_t p4 = msg[6] + msg[9]; + const uint64_t p5 = msg[8] - msg[5]; + const uint64_t p6 = msg[1] - msg[14]; + const uint64_t p7 = msg[8] - msg[1]; + const uint64_t p8 = msg[3] + msg[10]; -#pragma unroll 8 - for(int i=0;i<8;i++) - outpHash[i] = devectorize(message[i+8]); - } + + tmp = vectorize((msg[5]) + (msg[10]) + (msg[13]) + p3); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[1]; + tmp = vectorize((msg[6]) - (msg[8]) + (msg[11]) + (msg[14]) - (msg[15])); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + cmsg[2]; + tmp = vectorize((msg[0]) + (msg[7]) + (msg[9]) + p2); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + cmsg[3]; + tmp = vectorize((msg[0]) + p7 - (msg[10]) + (msg[13])); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + cmsg[4]; + tmp = vectorize((msg[2]) + (msg[9]) - (msg[11]) + p6); + q[4] = (SHR(tmp, 1) ^ tmp) + cmsg[5]; + tmp = vectorize(p8 + p2 - (msg[2])); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[6]; + tmp = vectorize((msg[4]) - (msg[0]) - (msg[3]) - (msg[11]) + (msg[13])); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + cmsg[7]; + tmp = vectorize(p6 - (msg[4]) - (msg[5]) - (msg[12])); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + cmsg[8]; + tmp = vectorize((msg[2]) - (msg[5]) - (msg[6]) + (msg[13]) - (msg[15])); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + cmsg[9]; + tmp = vectorize((msg[0]) - (msg[3]) + (msg[6]) + p3); + q[9] = (SHR(tmp, 1) ^ tmp) + cmsg[10]; + tmp = vectorize(p7 - (msg[4]) - (msg[7]) + (msg[15])); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ 
ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[11]; + tmp = vectorize(p5 - (msg[0]) - (msg[2]) + (msg[9])); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + cmsg[12]; + tmp = vectorize(p8 + msg[1] - p4); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + cmsg[13]; + tmp = vectorize((msg[2]) + (msg[4]) + (msg[7]) + (msg[10]) + (msg[11])); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + cmsg[14]; + tmp = vectorize((msg[3]) + p5 - (msg[11]) - (msg[12])); + q[14] = (SHR(tmp, 1) ^ tmp) + cmsg[15]; + tmp = vectorize((msg[12]) - (msg[4]) - p4 + (msg[13])); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + cmsg[0]; + + q[0 + 16] = + (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROTL64(q[0], 13) ^ ROTL64(q[0], 43)) + + (SHR(q[0 + 1], 2) ^ SHL(q[0 + 1], 1) ^ ROTL64(q[0 + 1], 19) ^ ROTL64(q[0 + 1], 53)) + + (SHR(q[0 + 2], 2) ^ SHL(q[0 + 2], 2) ^ ROTL64(q[0 + 2], 28) ^ ROTL64(q[0 + 2], 59)) + + (SHR(q[0 + 3], 1) ^ SHL(q[0 + 3], 3) ^ ROTL64(q[0 + 3], 4) ^ ROTL64(q[0 + 3], 37)) + + (SHR(q[0 + 4], 1) ^ SHL(q[0 + 4], 2) ^ ROTL64(q[0 + 4], 13) ^ ROTL64(q[0 + 4], 43)) + + (SHR(q[0 + 5], 2) ^ SHL(q[0 + 5], 1) ^ ROTL64(q[0 + 5], 19) ^ ROTL64(q[0 + 5], 53)) + + (SHR(q[0 + 6], 2) ^ SHL(q[0 + 6], 2) ^ ROTL64(q[0 + 6], 28) ^ ROTL64(q[0 + 6], 59)) + + (SHR(q[0 + 7], 1) ^ SHL(q[0 + 7], 3) ^ ROTL64(q[0 + 7], 4) ^ ROTL64(q[0 + 7], 37)) + + (SHR(q[0 + 8], 1) ^ SHL(q[0 + 8], 2) ^ ROTL64(q[0 + 8], 13) ^ ROTL64(q[0 + 8], 43)) + + (SHR(q[0 + 9], 2) ^ SHL(q[0 + 9], 1) ^ ROTL64(q[0 + 9], 19) ^ ROTL64(q[0 + 9], 53)) + + (SHR(q[0 + 10], 2) ^ SHL(q[0 + 10], 2) ^ ROTL64(q[0 + 10], 28) ^ ROTL64(q[0 + 10], 59)) + + (SHR(q[0 + 11], 1) ^ SHL(q[0 + 11], 3) ^ ROTL64(q[0 + 11], 4) ^ ROTL64(q[0 + 11], 37)) + + (SHR(q[0 + 12], 1) ^ SHL(q[0 + 12], 2) ^ ROTL64(q[0 + 12], 13) ^ ROTL64(q[0 + 12], 43)) + + (SHR(q[0 + 13], 2) ^ SHL(q[0 + 13], 1) ^ ROTL64(q[0 + 13], 19) ^ ROTL64(q[0 + 13], 53)) + + (SHR(q[0 + 14], 2) ^ SHL(q[0 + 14], 2) 
^ ROTL64(q[0 + 14], 28) ^ ROTL64(q[0 + 14], 59)) + + (SHR(q[0 + 15], 1) ^ SHL(q[0 + 15], 3) ^ ROTL64(q[0 + 15], 4) ^ ROTL64(q[0 + 15], 37)) + + ((precalc[0] + ROTL64(h[0], 0 + 1) + + ROTL64(h[0 + 3], 0 + 4) - ROTL64(h[0 + 10], 0 + 11)) ^ cmsg[0 + 7]); + q[1 + 16] = + (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROTL64(q[1], 13) ^ ROTL64(q[1], 43)) + + (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROTL64(q[1 + 1], 19) ^ ROTL64(q[1 + 1], 53)) + + (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROTL64(q[1 + 2], 28) ^ ROTL64(q[1 + 2], 59)) + + (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROTL64(q[1 + 3], 4) ^ ROTL64(q[1 + 3], 37)) + + (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROTL64(q[1 + 4], 13) ^ ROTL64(q[1 + 4], 43)) + + (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROTL64(q[1 + 5], 19) ^ ROTL64(q[1 + 5], 53)) + + (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROTL64(q[1 + 6], 28) ^ ROTL64(q[1 + 6], 59)) + + (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROTL64(q[1 + 7], 4) ^ ROTL64(q[1 + 7], 37)) + + (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROTL64(q[1 + 8], 13) ^ ROTL64(q[1 + 8], 43)) + + (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROTL64(q[1 + 9], 19) ^ ROTL64(q[1 + 9], 53)) + + (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROTL64(q[1 + 10], 28) ^ ROTL64(q[1 + 10], 59)) + + (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROTL64(q[1 + 11], 4) ^ ROTL64(q[1 + 11], 37)) + + (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROTL64(q[1 + 12], 13) ^ ROTL64(q[1 + 12], 43)) + + (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROTL64(q[1 + 13], 19) ^ ROTL64(q[1 + 13], 53)) + + (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROTL64(q[1 + 14], 28) ^ ROTL64(q[1 + 14], 59)) + + (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROTL64(q[1 + 15], 4) ^ ROTL64(q[1 + 15], 37)) + + ((precalc[1] + ROTL64(h[1], 1 + 1) + + ROTL64(h[1 + 3], 1 + 4) - ROTL64(h[1 + 10], 1 + 11)) ^ cmsg[1 + 7]); + + pre1 = q[2] + q[4] + q[6] + q[8] + q[10] + q[12] + q[14]; + pre2 = q[3] + q[5] + q[7] + q[9] + q[11] + q[13] + q[15]; + + q[2 + 16] = pre1 + CONST_EXP3(2) + + ((precalc[2] + ROTL64(h[2], 2 + 1) + + 
ROTL64(h[2 + 3], 2 + 4) - ROTL64(h[2 + 10], 2 + 11)) ^ cmsg[2 + 7]); + q[3 + 16] = pre2 + CONST_EXP3(3) + + ((precalc[3] + ROTL64(h[3], 3 + 1) + + ROTL64(h[3 + 3], 3 + 4) - ROTL64(h[3 + 10], 3 + 11)) ^ cmsg[3 + 7]); + + pre1 = pre1 - q[2 + 0] + q[2 + 14]; + pre2 = pre2 - q[3 + 0] + q[3 + 14]; + + q[4 + 16] = pre1 + CONST_EXP3(4) + + ((precalc[4] + ROTL64(h[4], 4 + 1) + + ROL8(h[4 + 3]) - ROTL64(h[4 + 10], 4 + 11)) ^ cmsg[4 + 7]); + q[5 + 16] = pre2 + CONST_EXP3(5) + + ((precalc[5] + ROTL64(h[5], 5 + 1) + + ROTL64(h[5 + 3], 5 + 4) - ROL16(h[5 + 10])) ^ cmsg[5 + 7]); + + pre1 = pre1 - q[4 + 0] + q[4 + 14]; + pre2 = pre2 - q[5 + 0] + q[5 + 14]; + + q[6 + 16] = pre1 + CONST_EXP3(6) + + ((precalc[6] + ROTL64(h[6], 6 + 1) + + ROTL64(h[6 + 3], 6 + 4) - ROTL64(h[6 - 6], (6 - 6) + 1)) ^ cmsg[6 + 7]); + q[7 + 16] = pre2 + CONST_EXP3(7) + + ((precalc[7] + ROL8(h[7]) + + ROTL64(h[7 + 3], 7 + 4) - ROTL64(h[7 - 6], (7 - 6) + 1)) ^ cmsg[7 + 7]); + + pre1 = pre1 - q[6 + 0] + q[6 + 14]; + pre2 = pre2 - q[7 + 0] + q[7 + 14]; + + q[8 + 16] = pre1 + CONST_EXP3(8) + + ((precalc[8] + ROTL64(h[8], 8 + 1) + + ROTL64(h[8 + 3], 8 + 4) - ROTL64(h[8 - 6], (8 - 6) + 1)) ^ cmsg[8 + 7]); + q[9 + 16] = pre2 + CONST_EXP3(9) + + ((precalc[9] + ROTL64(h[9], 9 + 1) + + ROTL64(h[9 + 3], 9 + 4) - ROTL64(h[9 - 6], (9 - 6) + 1)) ^ cmsg[9 - 9]); + + pre1 = pre1 - q[8 + 0] + q[8 + 14]; + pre2 = pre2 - q[9 + 0] + q[9 + 14]; + + q[10 + 16] = pre1 + CONST_EXP3(10) + + ((precalc[10] + ROTL64(h[10], 10 + 1) + + ROTL64(h[10 + 3], 10 + 4) - ROTL64(h[10 - 6], (10 - 6) + 1)) ^ cmsg[10 - 9]); + q[11 + 16] = pre2 + CONST_EXP3(11) + + ((precalc[11] + ROTL64(h[11], 11 + 1) + + ROTL64(h[11 + 3], 11 + 4) - ROTL64(h[11 - 6], (11 - 6) + 1)) ^ cmsg[11 - 9]); + + pre1 = pre1 - q[10 + 0] + q[10 + 14]; + pre2 = pre2 - q[11 + 0] + q[11 + 14]; + + q[12 + 16] = pre1 + CONST_EXP3(12) + + ((precalc[12] + ROTL64(h[12], 12 + 1) + + ROL16(h[12 + 3]) - ROTL64(h[12 - 6], (12 - 6) + 1)) ^ cmsg[12 - 9]); + q[13 + 16] = pre2 + 
CONST_EXP3(13) + + ((precalc[13] + ROTL64(h[13], 13 + 1) + + ROTL64(h[13 - 13], (13 - 13) + 1) - ROL8(h[13 - 6])) ^ cmsg[13 - 9]); + + pre1 = pre1 - q[12 + 0] + q[12 + 14]; + pre2 = pre2 - q[13 + 0] + q[13 + 14]; + + q[14 + 16] = pre1 + CONST_EXP3(14) + + ((precalc[14] + ROTL64(h[14], 14 + 1) + + ROTL64(h[14 - 13], (14 - 13) + 1) - ROTL64(h[14 - 6], (14 - 6) + 1)) ^ cmsg[14 - 9]); + q[15 + 16] = pre2 + CONST_EXP3(15) + + ((precalc[15] + ROL16(h[15]) + + ROTL64(h[15 - 13], (15 - 13) + 1) - ROTL64(h[15 - 6], (15 - 6) + 1)) ^ cmsg[15 - 9]); + + XL64 = q[16] ^ q[17] ^ q[18] ^ q[19] ^ q[20] ^ q[21] ^ q[22] ^ q[23]; + XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31]; + + msg2[4] = (SHR(XH64, 3) ^ q[20] ^ h[4]) + (XL64 ^ q[28] ^ q[4]); + msg2[8] = ROTL64(msg2[4], 9) + (XH64 ^ q[24] ^ h[8]) + (SHL(XL64, 8) ^ q[23] ^ q[8]); + + inpHash[0] = devectorize(msg2[8]); + + if(((msg2[8].x) & 0x8)) return; + { + + msg2[0] = (SHL(XH64, 5) ^ SHR(q[16], 5) ^ h[0]) + (XL64 ^ q[24] ^ q[0]); + msg2[1] = (SHR(XH64, 7) ^ SHL(q[17], 8) ^ h[1]) + (XL64 ^ q[25] ^ q[1]); + msg2[2] = (SHR(XH64, 5) ^ SHL(q[18], 5) ^ h[2]) + (XL64 ^ q[26] ^ q[2]); + msg2[3] = (SHR(XH64, 1) ^ SHL(q[19], 5) ^ h[3]) + (XL64 ^ q[27] ^ q[3]); + msg2[5] = (SHL(XH64, 6) ^ SHR(q[21], 6) ^ h[5]) + (XL64 ^ q[29] ^ q[5]); + msg2[6] = (SHR(XH64, 4) ^ SHL(q[22], 6) ^ h[6]) + (XL64 ^ q[30] ^ q[6]); + msg2[7] = (SHR(XH64, 11) ^ SHL(q[23], 2) ^ h[7]) + (XL64 ^ q[31] ^ q[7]); + + msg2[9] = ROTL64(msg2[5], 10) + (XH64 ^ q[25] ^ h[9]) + (SHR(XL64, 6) ^ q[16] ^ q[9]); + msg2[10] = ROTL64(msg2[6], 11) + (XH64 ^ q[26] ^ h[10]) + (SHL(XL64, 6) ^ q[17] ^ q[10]); + msg2[11] = ROTL64(msg2[7], 12) + (XH64 ^ q[27] ^ h[11]) + (SHL(XL64, 4) ^ q[18] ^ q[11]); + msg2[12] = ROTL64(msg2[0], 13) + (XH64 ^ q[28] ^ h[12]) + (SHR(XL64, 3) ^ q[19] ^ q[12]); + msg2[13] = ROTL64(msg2[1], 14) + (XH64 ^ q[29] ^ h[13]) + (SHR(XL64, 4) ^ q[20] ^ q[13]); + msg2[14] = ROTL64(msg2[2], 15) + (XH64 ^ q[30] ^ h[14]) + (SHR(XL64, 7) ^ q[21] 
^ q[14]); + msg2[15] = ROL16(msg2[3]) + (XH64 ^ q[31] ^ h[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); + + uint28 *phash2 = (uint28*)inpHash; + phash2[0] = make_uint28(msg2[8], msg2[9], msg2[10], msg2[11]); + phash2[1] = make_uint28(msg2[12], msg2[13], msg2[14], msg2[15]); + + } + } } // Setup-Funktionen @@ -440,44 +1170,25 @@ __host__ void quark_bmw512_cpu_init(int thr_id, uint32_t threads) { } -// Bmw512 für 80 Byte grosse Eingangsdaten -__host__ void quark_bmw512_cpu_setBlock_80(void *pdata) -{ - // Message mit Padding bereitstellen - // lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen. - unsigned char PaddedMessage[128]; - memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 48); - uint64_t *message = (uint64_t*)PaddedMessage; - // Padding einfügen (Byteorder?!?) - message[10] = SPH_C64(0x80); - // Länge (in Bits, d.h. 80 Byte * 8 = 640 Bits - message[15] = SPH_C64(640); - - // die Message zur Berechnung auf der GPU - cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); -} -__host__ void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) { - const uint32_t threadsperblock = 32; + const uint32_t threadsperblock = 32; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - quark_bmw512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); -// MyStreamSynchronize(NULL, order, thr_id); + quark_bmw512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + CUDA_SAFE_CALL(cudaGetLastError()); } - -__host__ void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) +__host__ void quark_bmw512_cpu_hash_64_quark(int thr_id, 
uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) { - const uint32_t threadsperblock = 128; + const uint32_t threadsperblock = 32; - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); - quark_bmw512_gpu_hash_80<<>>(threads, startNounce, (uint64_t*)d_hash); + quark_bmw512_gpu_hash_64_quark << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); } - diff --git a/quark/cuda_jh512.cu b/quark/cuda_jh512.cu index 7809443d02..f0c26f2dc8 100644 --- a/quark/cuda_jh512.cu +++ b/quark/cuda_jh512.cu @@ -1,69 +1,124 @@ #include "cuda_helper.h" +#include "cuda_vector.h" +__constant__ static __align__(16) uint32_t c_E8_bslice32[42][8] = { + // Round 0 (Function0) + { 0xa2ded572, 0x90d6ab81, 0x67f815df, 0xf6875a4d, 0x0a15847b, 0xc54f9f4e, 0x571523b7, 0x402bd1c3 }, + { 0xe03a98ea, 0xb4960266, 0x9cfa455c, 0x8a53bbf2, 0x99d2c503, 0x1a1456b5, 0x9a99b266, 0x31a2db88 }, // 1 + { 0x5c5aa303, 0x8019051c, 0xdb0e199a, 0x1d959e84, 0x0ab23f40, 0xadeb336f, 0x1044c187, 0xdccde75e }, // 2 + { 0x9213ba10, 0x39812c0a, 0x416bbf02, 0x5078aa37, 0x156578dc, 0xd2bf1a3f, 0xd027bbf7, 0xd3910041 }, // 3 + { 0x0d5a2d42, 0x0ba75c18, 0x907eccf6, 0xac442bc7, 0x9c9f62dd, 0xd665dfd1, 0xce97c092, 0x23fcc663 }, // 4 + { 0x036c6e97, 0xbb03f1ee, 0x1ab8e09e, 0xfa618e5d, 0x7e450521, 0xb29796fd, 0xa8ec6c44, 0x97818394 }, // 5 + { 0x37858e4a, 0x8173fe8a, 0x2f3003db, 0x6c69b8f8, 0x2d8d672a, 0x4672c78a, 0x956a9ffb, 0x14427fc0 }, // 6 + // Round 7 (Function0) + { 0x8f15f4c5, 0xb775de52, 0xc45ec7bd, 0xbc88e4ae, 0xa76f4475, 0x1e00b882, 0x80bb118f, 0xf4a3a698 }, + { 0x338ff48e, 0x20edf1b6, 0x1563a3a9, 0xfde05a7c, 0x24565faa, 0x5ae9ca36, 0x89f9b7d5, 0x362c4206 }, + { 0x433529ce, 0x591ff5d0, 0x3d98fe4e, 0x86814e6f, 0x74f93a53, 0x81ad9d0e, 0xa74b9a73, 
0x9f5ad8af }, + { 0x670605a7, 0x26077447, 0x6a6234ee, 0x3f1080c6, 0xbe280b8b, 0x6f7ea0e0, 0x2717b96e, 0x7b487ec6 }, + { 0xa50a550d, 0x81727686, 0xc0a4f84a, 0xd48d6050, 0x9fe7e391, 0x415a9e7e, 0x9ef18e97, 0x62b0e5f3 }, + { 0xec1f9ffc, 0xf594d74f, 0x7a205440, 0xd895fa9d, 0x001ae4e3, 0x117e2e55, 0x84c9f4ce, 0xa554c324 }, + { 0x2872df5b, 0xef7c8905, 0x286efebd, 0x2ed349ee, 0xe27ff578, 0x85937e44, 0xb2c4a50f, 0x7f5928eb }, + // Round 14 (Function0) + { 0x37695f70, 0x04771bc7, 0x4a3124b3, 0xe720b951, 0xf128865e, 0xe843fe74, 0x65e4d61d, 0x8a87d423 }, + { 0xa3e8297d, 0xfb301b1d, 0xf2947692, 0xe01bdc5b, 0x097acbdd, 0x4f4924da, 0xc1d9309b, 0xbf829cf2 }, + { 0x31bae7a4, 0x32fcae3b, 0xffbf70b4, 0x39d3bb53, 0x0544320d, 0xc1c39f45, 0x48bcf8de, 0xa08b29e0 }, + { 0xfd05c9e5, 0x01b771a2, 0x0f09aef7, 0x95ed44e3, 0x12347094, 0x368e3be9, 0x34f19042, 0x4a982f4f }, + { 0x631d4088, 0xf14abb7e, 0x15f66ca0, 0x30c60ae2, 0x4b44c147, 0xc5b67046, 0xffaf5287, 0xe68c6ecc }, + { 0x56a4d5a4, 0x45ce5773, 0x00ca4fbd, 0xadd16430, 0x4b849dda, 0x68cea6e8, 0xae183ec8, 0x67255c14 }, + { 0xf28cdaa3, 0x20b2601f, 0x16e10ecb, 0x7b846fc2, 0x5806e933, 0x7facced1, 0x9a99949a, 0x1885d1a0 }, + // Round 21 (Function0) + { 0xa15b5932, 0x67633d9f, 0xd319dd8d, 0xba6b04e4, 0xc01c9a50, 0xab19caf6, 0x46b4a5aa, 0x7eee560b }, + { 0xea79b11f, 0x5aac571d, 0x742128a9, 0x76d35075, 0x35f7bde9, 0xfec2463a, 0xee51363b, 0x01707da3 }, + { 0xafc135f7, 0x15638341, 0x42d8a498, 0xa8db3aea, 0x20eced78, 0x4d3bc3fa, 0x79676b9e, 0x832c8332 }, + { 0x1f3b40a7, 0x6c4e3ee7, 0xf347271c, 0xfd4f21d2, 0x34f04059, 0x398dfdb8, 0x9a762db7, 0xef5957dc }, + { 0x490c9b8d, 0xd0ae3b7d, 0xdaeb492b, 0x84558d7a, 0x49d7a25b, 0xf0e9a5f5, 0x0d70f368, 0x658ef8e4 }, + { 0xf4a2b8a0, 0x92946891, 0x533b1036, 0x4f88e856, 0x9e07a80c, 0x555cb05b, 0x5aec3e75, 0x4cbcbaf8 }, + { 0x993bbbe3, 0x28acae64, 0x7b9487f3, 0x6db334dc, 0xd6f4da75, 0x50a5346c, 0x5d1c6b72, 0x71db28b8 }, + // Round 28 (Function0) + { 0xf2e261f8, 0xf1bcac1c, 0x2a518d10, 0xa23fce43, 0x3364dbe3, 
0x3cd1bb67, 0xfc75dd59, 0xb043e802 }, + { 0xca5b0a33, 0xc3943b92, 0x75a12988, 0x1e4d790e, 0x4d19347f, 0xd7757479, 0x5c5316b4, 0x3fafeeb6 }, + { 0xf7d4a8ea, 0x5324a326, 0x21391abe, 0xd23c32ba, 0x097ef45c, 0x4a17a344, 0x5127234c, 0xadd5a66d }, + { 0xa63e1db5, 0xa17cf84c, 0x08c9f2af, 0x4d608672, 0x983d5983, 0xcc3ee246, 0x563c6b91, 0xf6c76e08 }, + { 0xb333982f, 0xe8b6f406, 0x5e76bcb1, 0x36d4c1be, 0xa566d62b, 0x1582ee74, 0x2ae6c4ef, 0x6321efbc }, + { 0x0d4ec1fd, 0x1614c17e, 0x69c953f4, 0x16fae006, 0xc45a7da7, 0x3daf907e, 0x26585806, 0x3f9d6328 }, + { 0xe3f2c9d2, 0x16512a74, 0x0cd29b00, 0x9832e0f2, 0x30ceaa5f, 0xd830eb0d, 0x300cd4b7, 0x9af8cee3 }, + // Round 35 (Function0) + { 0x7b9ec54b, 0x574d239b, 0x9279f1b5, 0x316796e6, 0x6ee651ff, 0xf3a6e6cc, 0xd3688604, 0x05750a17 }, + { 0xd98176b1, 0xb3cb2bf4, 0xce6c3213, 0x47154778, 0x8452173c, 0x825446ff, 0x62a205f8, 0x486a9323 }, + { 0x0758df38, 0x442e7031, 0x65655e4e, 0x86ca0bd0, 0x897cfcf2, 0xa20940f0, 0x8e5086fc, 0x4e477830 }, + { 0x39eea065, 0x26b29721, 0x8338f7d1, 0x6ff81301, 0x37e95ef7, 0xd1ed44a3, 0xbd3a2ce4, 0xe7de9fef }, + { 0x15dfa08b, 0x7ceca7d8, 0xd9922576, 0x7eb027ab, 0xf6f7853c, 0xda7d8d53, 0xbe42dc12, 0xdea83eaa }, + { 0x93ce25aa, 0xdaef5fc0, 0xd86902bd, 0xa5194a17, 0xfd43f65a, 0x33664d97, 0xf908731a, 0x6a21fd4c }, + { 0x3198b435, 0xa163d09a, 0x701541db, 0x72409751, 0xbb0f1eea, 0xbf9d75f6, 0x9b54cded, 0xe26f4791 } + // 42 rounds... 
+}; + +static uint32_t *d_found[MAX_GPUS]; + +#ifndef NOASM +__device__ __forceinline__ +static void SWAP4(uint32_t *x) { +#pragma nounroll + // y is used as tmp register too + for (uint32_t y = 0; y<4; y++, ++x) { + asm("and.b32 %1, %0, 0xF0F0F0F0;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 4;" + "vshl.u32.u32.u32.clamp.add %0, %0, 4, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} -__constant__ unsigned char c_E8_bitslice_roundconstant[42][32] = { - { 0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40 }, - { 0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31 }, - { 0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc }, - { 0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3 }, - { 0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23 }, - { 0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97 }, - { 0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14 }, - { 0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 
0xa3, 0xf4 }, - { 0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36 }, - { 0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f }, - { 0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b }, - { 0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62 }, - { 0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5 }, - { 0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f }, - { 0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a }, - { 0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf }, - { 0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0 }, - { 0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a }, - { 0x88, 0x40, 0x1d, 
0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6 }, - { 0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67 }, - { 0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18 }, - { 0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e }, - { 0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1 }, - { 0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83 }, - { 0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef }, - { 0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65 }, - { 0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c }, - { 0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71 }, - { 0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 
0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0 }, - { 0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f }, - { 0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad }, - { 0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6 }, - { 0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63 }, - { 0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f }, - { 0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a }, - { 0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5 }, - { 0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48 }, - { 0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e }, - { 0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 
0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7 }, - { 0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde }, - { 0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a }, - { 0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2 } }; - -#define SWAP4(x,y)\ - y = (x & 0xf0f0f0f0UL); \ - x = (x ^ y); \ - y = (y >> 4); \ - x = (x << 4); \ - x= x | y; - -#define SWAP2(x,y)\ - y = (x & 0xccccccccUL); \ - x = (x ^ y); \ - y = (y >> 2); \ - x = (x << 2); \ - x= x | y; - -#define SWAP1(x,y)\ - y = (x & 0xaaaaaaaaUL); \ - x = (x ^ y); \ - y = (y >> 1); \ - x = x + x; \ - x= x | y; +__device__ __forceinline__ +static void SWAP2(uint32_t *x) { +#pragma nounroll + // y is used as tmp register too + for (uint32_t y = 0; y<4; y++, ++x) { + asm("and.b32 %1, %0, 0xCCCCCCCC;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 2;" + "vshl.u32.u32.u32.clamp.add %0, %0, 2, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} + +__device__ __forceinline__ +static void SWAP1(uint32_t *x) { +#pragma nounroll + // y is used as tmp register too + for (uint32_t y = 0; y<4; y++, ++x) { + asm("and.b32 %1, %0, 0xAAAAAAAA;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 1;" + "vshl.u32.u32.u32.clamp.add %0, %0, 1, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} +#else +__device__ __forceinline__ +static void SWAP4(uint32_t *x) +{ + x[0] = ((x[0] & 0x0f0f0f0fu) << 4) | (x[0] & 0xF0F0F0F0u) >> 4; + x[1] = ((x[1] & 0x0f0f0f0fu) << 4) | (x[1] & 0xF0F0F0F0u) >> 4; + x[2] = ((x[2] & 0x0f0f0f0fu) << 4) | (x[2] & 0xF0F0F0F0u) >> 4; + x[3] = ((x[3] & 0x0f0f0f0fu) << 4) | (x[3] 
& 0xF0F0F0F0u) >> 4; +} +__device__ __forceinline__ +static void SWAP2(uint32_t *x) +{ + x[0] = ((x[0] & 0x33333333u) << 2) | (x[0] & 0xCCCCCCCCu) >> 2; + x[1] = ((x[1] & 0x33333333u) << 2) | (x[1] & 0xCCCCCCCCu) >> 2; + x[2] = ((x[2] & 0x33333333u) << 2) | (x[2] & 0xCCCCCCCCu) >> 2; + x[3] = ((x[3] & 0x33333333u) << 2) | (x[3] & 0xCCCCCCCCu) >> 2; +} +__device__ __forceinline__ +static void SWAP1(uint32_t *x) +{ + x[0] = ((x[0] & 0x55555555u) << 1) | (x[0] & 0xAAAAAAAAu) >> 1; + x[1] = ((x[1] & 0x55555555u) << 1) | (x[1] & 0xAAAAAAAAu) >> 1; + x[2] = ((x[2] & 0x55555555u) << 1) | (x[2] & 0xAAAAAAAAu) >> 1; + x[3] = ((x[3] & 0x55555555u) << 1) | (x[3] & 0xAAAAAAAAu) >> 1; +} +#endif /*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 32-bit x*/ //#define SWAP8(x) (x) = ((((x) & 0x00ff00ffUL) << 8) | (((x) & 0xff00ff00UL) >> 8)); #define SWAP8(x) (x) = __byte_perm(x, x, 0x2301); @@ -96,17 +151,17 @@ __constant__ unsigned char c_E8_bitslice_roundconstant[42][32] = { m1 ^= (temp0 & (m0)); \ m2 ^= temp0; -static __device__ __forceinline__ void Sbox_and_MDS_layer(uint32_t x[8][4], uint32_t roundnumber) +__device__ __forceinline__ +static void Sbox_and_MDS_layer(uint32_t x[8][4], const int rnd) { - uint32_t temp0; - uint32_t cc0, cc1; - //Sbox and MDS layer -#pragma unroll 4 - for (int i = 0; i < 4; i++) { - cc0 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i]; - cc1 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i + 4]; - Sbox(x[0][i], x[2][i], x[4][i], x[6][i], cc0); - Sbox(x[1][i], x[3][i], x[5][i], x[7][i], cc1); + uint2* cc = (uint2*)&c_E8_bslice32[rnd]; + + // Sbox and MDS layer +#pragma unroll + for (int i = 0; i < 4; i++, ++cc) { + uint32_t temp0; + Sbox(x[0][i], x[2][i], x[4][i], x[6][i], cc->x); + Sbox(x[1][i], x[3][i], x[5][i], x[7][i], cc->y); L(x[0][i], x[2][i], x[4][i], x[6][i], x[1][i], x[3][i], x[5][i], x[7][i]); } } @@ -118,11 +173,7 @@ static __device__ __forceinline__ void RoundFunction0(uint32_t 
x[8][4], uint32_t #pragma unroll 4 for (int j = 1; j < 8; j = j + 2) { - uint32_t y; - SWAP1(x[j][0], y); - SWAP1(x[j][1], y); - SWAP1(x[j][2], y); - SWAP1(x[j][3], y); + SWAP1(x[j]); } } @@ -133,11 +184,7 @@ static __device__ __forceinline__ void RoundFunction1(uint32_t x[8][4], uint32_t #pragma unroll 4 for (int j = 1; j < 8; j = j + 2) { - uint32_t y; - SWAP2(x[j][0], y); - SWAP2(x[j][1], y); - SWAP2(x[j][2], y); - SWAP2(x[j][3], y); + SWAP2(x[j]); } } @@ -148,11 +195,7 @@ static __device__ __forceinline__ void RoundFunction2(uint32_t x[8][4], uint32_t #pragma unroll 4 for (int j = 1; j < 8; j = j + 2) { - uint32_t y; - SWAP4(x[j][0], y); - SWAP4(x[j][1], y); - SWAP4(x[j][2], y); - SWAP4(x[j][3], y); + SWAP4(x[j]); } } @@ -245,14 +288,14 @@ static __device__ __forceinline__ void F8(uint32_t x[8][4], const uint32_t buffe // Die Hash-Funktion __global__ __launch_bounds__(256, 4) -void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) +void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - uint32_t hashPosition = nounce - startNounce; - uint32_t *Hash = &g_hash[16 * hashPosition]; + const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = &g_hash[16 * hashPosition]; uint32_t x[8][4] = { { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, @@ -263,11 +306,18 @@ void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; + uint32_t msg[16]; + + uint28 *phash = (uint28*)Hash; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + #pragma unroll 16 - for (int i = 0; i < 16; i++) x[i >> 2][i & 3] ^= ((uint32_t*)Hash)[i]; + for (int i = 0; i < 16; i++) x[i >> 2][i & 3] ^= msg[i]; E8(x); #pragma unroll 16 - for (int i = 0; i < 16; i++) x[(16 + i) >> 2][(16 + i) & 3] ^= ((uint32_t*)Hash)[i]; + for (int i = 0; i < 16; i++) x[(16 + i) >> 2][(16 + i) & 3] ^= msg[i]; x[0 >> 2][0 & 3] ^= 0x80; x[15 >> 2][15 & 3] ^= 0x00020000; @@ -296,16 +346,25 @@ void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g // Die Hash-Funktion #define TPB2 256 -__global__ __launch_bounds__(TPB2, 4) -void quark_jh512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) +__global__ __launch_bounds__(TPB2, 2) +void quark_jh512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector, uint32_t *const __restrict__ d_found, uint32_t target) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = (uint32_t*)&g_hash[8 * hashPosition]; + + + uint32_t msg[16]; + + uint28 *phash = (uint28*)Hash; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; uint32_t x[8][4] = { { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, @@ -317,12 +376,12 @@ void quark_jh512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; - F8(x, Hash); + F8(x, msg); x[0][0] ^= 0x80U; x[3][3] ^= 0x00020000U; - for (int i = 0; i < 42; i += 7) + for (int i = 0; i < 35; i += 7) { RoundFunction0(x, i); RoundFunction1(x, i + 1); @@ -332,31 +391,45 @@ void quark_jh512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 RoundFunction5(x, i + 5); RoundFunction6(x, i + 6); } - - Hash[7] = x[5][3]; + RoundFunction0(x, 35); + RoundFunction1(x, 35 + 1); + RoundFunction2(x, 35 + 2); + RoundFunction3(x, 35 + 3); + RoundFunction4(x, 35 + 4); + RoundFunction5(x, 35 + 5); + RoundFunction6(x, 35 + 6); + + if(x[5][3] <= target) + { + uint32_t tmp = atomicExch(&(d_found[0]), nounce); + if(tmp != 0xffffffff) + d_found[1] = tmp; + } } } -__host__ void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) { const uint32_t threadsperblock = 32; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - quark_jh512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); + quark_jh512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); } // 
Setup-Funktionen -__host__ void quark_jh512_cpu_init(int thr_id, uint32_t threads) +__host__ void quark_jh512_cpu_init(int thr_id) { + cudaMalloc(&(d_found[thr_id]), 2 * sizeof(uint32_t)); } -__host__ void quark_jh512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void quark_jh512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t target, uint32_t *h_found) { dim3 grid((threads + TPB2 - 1) / TPB2); dim3 block(TPB2); - - quark_jh512_gpu_hash_64_final << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + cudaMemsetAsync(d_found[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); + quark_jh512_gpu_hash_64_final << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_found[thr_id], target); + cudaMemcpyAsync(h_found, d_found[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); } diff --git a/quark/cuda_jh512__4.cu b/quark/cuda_jh512__4.cu new file mode 100644 index 0000000000..b0290033e5 --- /dev/null +++ b/quark/cuda_jh512__4.cu @@ -0,0 +1,362 @@ +#include "cuda_helper.h" + +__constant__ unsigned char c_E8_bitslice_roundconstant[42][32] = { + { 0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40 }, + { 0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31 }, + { 0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc }, + { 0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 
0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3 }, + { 0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23 }, + { 0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97 }, + { 0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14 }, + { 0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4 }, + { 0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36 }, + { 0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f }, + { 0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b }, + { 0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62 }, + { 0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5 }, + { 0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 
0x7f }, + { 0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a }, + { 0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf }, + { 0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0 }, + { 0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a }, + { 0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6 }, + { 0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67 }, + { 0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18 }, + { 0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e }, + { 0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1 }, + { 0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83 }, + { 0xa7, 0x40, 0x3b, 
0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef }, + { 0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65 }, + { 0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c }, + { 0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71 }, + { 0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0 }, + { 0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f }, + { 0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad }, + { 0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6 }, + { 0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63 }, + { 0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f }, + { 0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 
0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a }, + { 0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5 }, + { 0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48 }, + { 0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e }, + { 0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7 }, + { 0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde }, + { 0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a }, + { 0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2 } }; + +#define SWAP4(x,y)\ + y = (x & 0xf0f0f0f0UL); \ + x = (x ^ y); \ + y = (y >> 4); \ + x = (x << 4); \ + x= x | y; + +#define SWAP2(x,y)\ + y = (x & 0xccccccccUL); \ + x = (x ^ y); \ + y = (y >> 2); \ + x = (x << 2); \ + x= x | y; + +#define SWAP1(x,y)\ + y = (x & 0xaaaaaaaaUL); \ + x = (x ^ y); \ + y = (y >> 1); \ + x = x + x; \ + x= x | y; +/*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 32-bit x*/ +//#define SWAP8(x) (x) = ((((x) & 
0x00ff00ffUL) << 8) | (((x) & 0xff00ff00UL) >> 8)); +#define SWAP8(x) (x) = __byte_perm(x, x, 0x2301); +/*swapping bits 32i||32i+1||......||32i+15 with bits 32i+16||32i+17||......||32i+31 of 32-bit x*/ +//#define SWAP16(x) (x) = ((((x) & 0x0000ffffUL) << 16) | (((x) & 0xffff0000UL) >> 16)); +#define SWAP16(x) (x) = __byte_perm(x, x, 0x1032); + +/*The MDS transform*/ +#define L(m0,m1,m2,m3,m4,m5,m6,m7) \ + (m4) ^= (m1); \ + (m5) ^= (m2); \ + (m6) ^= (m0) ^ (m3); \ + (m7) ^= (m0); \ + (m0) ^= (m5); \ + (m1) ^= (m6); \ + (m2) ^= (m4) ^ (m7); \ + (m3) ^= (m4); + +/*The Sbox*/ +#define Sbox(m0,m1,m2,m3,cc) \ + m3 = ~(m3); \ + m0 ^= ((~(m2)) & (cc)); \ + temp0 = (cc) ^ ((m0) & (m1));\ + m0 ^= ((m2) & (m3)); \ + m3 ^= ((~(m1)) & (m2)); \ + m1 ^= ((m0) & (m2)); \ + m2 ^= ((m0) & (~(m3))); \ + m0 ^= ((m1) | (m3)); \ + m3 ^= ((m1) & (m2)); \ + m1 ^= (temp0 & (m0)); \ + m2 ^= temp0; + +static __device__ __forceinline__ void Sbox_and_MDS_layer(uint32_t x[8][4], uint32_t roundnumber) +{ + uint32_t temp0; + uint32_t cc0, cc1; + //Sbox and MDS layer +#pragma unroll 4 + for (int i = 0; i < 4; i++) { + cc0 = c_E8_bitslice_roundconstant[roundnumber][i]; + cc1 = c_E8_bitslice_roundconstant[roundnumber][i + 4]; + Sbox(x[0][i], x[2][i], x[4][i], x[6][i], cc0); + Sbox(x[1][i], x[3][i], x[5][i], x[7][i], cc1); + L(x[0][i], x[2][i], x[4][i], x[6][i], x[1][i], x[3][i], x[5][i], x[7][i]); + } +} + +static __device__ __forceinline__ void RoundFunction0(uint32_t x[8][4], uint32_t roundnumber) +{ + Sbox_and_MDS_layer(x, roundnumber); + +#pragma unroll 4 + for (int j = 1; j < 8; j = j + 2) + { + uint32_t y; + SWAP1(x[j][0], y); + SWAP1(x[j][1], y); + SWAP1(x[j][2], y); + SWAP1(x[j][3], y); + } +} + +static __device__ __forceinline__ void RoundFunction1(uint32_t x[8][4], uint32_t roundnumber) +{ + Sbox_and_MDS_layer(x, roundnumber); + +#pragma unroll 4 + for (int j = 1; j < 8; j = j + 2) + { + uint32_t y; + SWAP2(x[j][0], y); + SWAP2(x[j][1], y); + SWAP2(x[j][2], y); + SWAP2(x[j][3], y); + } +} 
+ +static __device__ __forceinline__ void RoundFunction2(uint32_t x[8][4], uint32_t roundnumber) +{ + Sbox_and_MDS_layer(x, roundnumber); + +#pragma unroll 4 + for (int j = 1; j < 8; j = j + 2) + { + uint32_t y; + SWAP4(x[j][0], y); + SWAP4(x[j][1], y); + SWAP4(x[j][2], y); + SWAP4(x[j][3], y); + } +} + +static __device__ __forceinline__ void RoundFunction3(uint32_t x[8][4], uint32_t roundnumber) +{ + Sbox_and_MDS_layer(x, roundnumber); + +#pragma unroll 4 + for (int j = 1; j < 8; j = j + 2) + { +#pragma unroll 4 + for (int i = 0; i < 4; i++) SWAP8(x[j][i]); + } +} + +static __device__ __forceinline__ void RoundFunction4(uint32_t x[8][4], uint32_t roundnumber) +{ + Sbox_and_MDS_layer(x, roundnumber); + +#pragma unroll 4 + for (int j = 1; j < 8; j = j + 2) + { +#pragma unroll 4 + for (int i = 0; i < 4; i++) SWAP16(x[j][i]); + } +} + +static __device__ __forceinline__ void RoundFunction5(uint32_t x[8][4], uint32_t roundnumber) +{ + uint32_t temp0; + + Sbox_and_MDS_layer(x, roundnumber); + +#pragma unroll 4 + for (int j = 1; j < 8; j = j + 2) + { +#pragma unroll 2 + for (int i = 0; i < 4; i = i + 2) { + temp0 = x[j][i]; x[j][i] = x[j][i + 1]; x[j][i + 1] = temp0; + } + } +} + +static __device__ __forceinline__ void RoundFunction6(uint32_t x[8][4], uint32_t roundnumber) +{ + uint32_t temp0; + + Sbox_and_MDS_layer(x, roundnumber); + +#pragma unroll 4 + for (int j = 1; j < 8; j = j + 2) + { +#pragma unroll 2 + for (int i = 0; i < 2; i++) { + temp0 = x[j][i]; x[j][i] = x[j][i + 2]; x[j][i + 2] = temp0; + } + } +} + +/*The bijective function E8, in bitslice form */ +static __device__ __forceinline__ void E8(uint32_t x[8][4]) +{ + /*perform 6 rounds*/ + //#pragma unroll 6 + for (int i = 0; i < 42; i += 7) + { + RoundFunction0(x, i); + RoundFunction1(x, i + 1); + RoundFunction2(x, i + 2); + RoundFunction3(x, i + 3); + RoundFunction4(x, i + 4); + RoundFunction5(x, i + 5); + RoundFunction6(x, i + 6); + } +} + +static __device__ __forceinline__ void F8(uint32_t x[8][4], const 
uint32_t buffer[16]) +{ + /*xor the 512-bit message with the fist half of the 1024-bit hash state*/ +#pragma unroll 16 + for (int i = 0; i < 16; i++) x[i >> 2][i & 3] ^= ((uint32_t*)buffer)[i]; + + /*the bijective function E8 */ + E8(x); + + /*xor the 512-bit message with the second half of the 1024-bit hash state*/ +#pragma unroll 16 + for (int i = 0; i < 16; i++) x[(16 + i) >> 2][(16 + i) & 3] ^= ((uint32_t*)buffer)[i]; +} + +// Die Hash-Funktion +__global__ __launch_bounds__(256, 4) +void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + uint32_t hashPosition = nounce - startNounce; + uint32_t *Hash = &g_hash[16 * hashPosition]; + uint32_t x[8][4] = { + { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, + { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, + { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea }, + { 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba }, + { 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e }, + { 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d }, + { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, + { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; + +#pragma unroll 16 + for (int i = 0; i < 16; i++) x[i >> 2][i & 3] ^= ((uint32_t*)Hash)[i]; + E8(x); +#pragma unroll 16 + for (int i = 0; i < 16; i++) x[(16 + i) >> 2][(16 + i) & 3] ^= ((uint32_t*)Hash)[i]; + + x[0 >> 2][0 & 3] ^= 0x80; + x[15 >> 2][15 & 3] ^= 0x00020000; + E8(x); + x[(16 + 0) >> 2][(16 + 0) & 3] ^= 0x80; + x[(16 + 15) >> 2][(16 + 15) & 3] ^= 0x00020000; + + Hash[0] = x[4][0]; + Hash[1] = x[4][1]; + Hash[2] = x[4][2]; + Hash[3] = x[4][3]; + Hash[4] = x[5][0]; + Hash[5] = x[5][1]; + Hash[6] = x[5][2]; + Hash[7] = x[5][3]; + Hash[8] = x[6][0]; + Hash[9] = x[6][1]; + Hash[10] = x[6][2]; + Hash[11] = x[6][3]; + Hash[12] = x[7][0]; + 
Hash[13] = x[7][1]; + Hash[14] = x[7][2]; + Hash[15] = x[7][3]; + } +} + +// Die Hash-Funktion +#define TPB2 256 +__global__ __launch_bounds__(TPB2, 4) +void quark_jh512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + + int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + + uint32_t x[8][4] = { + { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, + { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, + { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea }, + { 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba }, + { 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e }, + { 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d }, + { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, + { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; + + F8(x, Hash); + + x[0][0] ^= 0x80U; + x[3][3] ^= 0x00020000U; + + for (int i = 0; i < 42; i += 7) + { + RoundFunction0(x, i); + RoundFunction1(x, i + 1); + RoundFunction2(x, i + 2); + RoundFunction3(x, i + 3); + RoundFunction4(x, i + 4); + RoundFunction5(x, i + 5); + RoundFunction6(x, i + 6); + } + + Hash[7] = x[5][3]; + } +} + + +__host__ void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 32; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + quark_jh512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); +} + +// Setup-Funktionen +__host__ void quark_jh512_cpu_init(int thr_id, uint32_t threads) +{ +} + +__host__ void quark_jh512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t 
*d_nonceVector, uint32_t *d_hash) +{ + dim3 grid((threads + TPB2 - 1) / TPB2); + dim3 block(TPB2); + + quark_jh512_gpu_hash_64_final << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); +} diff --git a/quark/cuda_jh512keccak512.cu b/quark/cuda_jh512keccak512.cu index 14489f390e..a0663d3e83 100644 --- a/quark/cuda_jh512keccak512.cu +++ b/quark/cuda_jh512keccak512.cu @@ -1,5 +1,5 @@ #include "cuda_helper.h" - +#include "cuda_vector.h" #ifdef _MSC_VER #define UINT2(x,y) { x, y } @@ -7,81 +7,131 @@ #define UINT2(x,y) (uint2) { x, y } #endif -/*42 round constants, each round constant is 32-byte (256-bit)*/ -__constant__ uint32_t c_INIT_bitslice[8][4] = { - { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, - { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, - { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea }, - { 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba }, - { 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e }, - { 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d }, - { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, - { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; - -__constant__ unsigned char c_E8_bitslice_roundconstant[42][32] = { - { 0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40 }, - { 0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31 }, - { 0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc }, - { 0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3 }, - { 0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 
0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23 }, - { 0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97 }, - { 0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14 }, - { 0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4 }, - { 0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36 }, - { 0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f }, - { 0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b }, - { 0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62 }, - { 0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5 }, - { 0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f }, - { 0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 
0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a }, - { 0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf }, - { 0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0 }, - { 0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a }, - { 0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6 }, - { 0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67 }, - { 0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18 }, - { 0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e }, - { 0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1 }, - { 0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83 }, - { 0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 
0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef }, - { 0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65 }, - { 0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c }, - { 0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71 }, - { 0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0 }, - { 0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f }, - { 0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad }, - { 0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6 }, - { 0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63 }, - { 0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f }, - { 0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 
0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a }, - { 0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5 }, - { 0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48 }, - { 0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e }, - { 0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7 }, - { 0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde }, - { 0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a }, - { 0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2 } }; - -#define SWAP4(x,y)\ - y = (x & 0xf0f0f0f0UL); \ - x = (x ^ y); \ - y = (y >> 4); \ - x = (x << 4); \ - x= x | y; - -#define SWAP2(x,y)\ - y = (x & 0xccccccccUL); \ - x = (x ^ y); \ - y = (y >> 2); \ - x = (x << 2); \ - x= x | y; - -#define SWAP1(x,y)\ - y = (x & 0xaaaaaaaaUL); \ - x = (x ^ y); \ - y = (y >> 1); \ - x = x + x; \ - x= x | y; +__constant__ static __align__(16) uint32_t c_E8_bslice32[42][8] = { + // Round 0 (Function0) + { 0xa2ded572, 0x90d6ab81, 0x67f815df, 0xf6875a4d, 0x0a15847b, 0xc54f9f4e, 0x571523b7, 0x402bd1c3 }, + { 0xe03a98ea, 0xb4960266, 
0x9cfa455c, 0x8a53bbf2, 0x99d2c503, 0x1a1456b5, 0x9a99b266, 0x31a2db88 }, // 1 + { 0x5c5aa303, 0x8019051c, 0xdb0e199a, 0x1d959e84, 0x0ab23f40, 0xadeb336f, 0x1044c187, 0xdccde75e }, // 2 + { 0x9213ba10, 0x39812c0a, 0x416bbf02, 0x5078aa37, 0x156578dc, 0xd2bf1a3f, 0xd027bbf7, 0xd3910041 }, // 3 + { 0x0d5a2d42, 0x0ba75c18, 0x907eccf6, 0xac442bc7, 0x9c9f62dd, 0xd665dfd1, 0xce97c092, 0x23fcc663 }, // 4 + { 0x036c6e97, 0xbb03f1ee, 0x1ab8e09e, 0xfa618e5d, 0x7e450521, 0xb29796fd, 0xa8ec6c44, 0x97818394 }, // 5 + { 0x37858e4a, 0x8173fe8a, 0x2f3003db, 0x6c69b8f8, 0x2d8d672a, 0x4672c78a, 0x956a9ffb, 0x14427fc0 }, // 6 + // Round 7 (Function0) + { 0x8f15f4c5, 0xb775de52, 0xc45ec7bd, 0xbc88e4ae, 0xa76f4475, 0x1e00b882, 0x80bb118f, 0xf4a3a698 }, + { 0x338ff48e, 0x20edf1b6, 0x1563a3a9, 0xfde05a7c, 0x24565faa, 0x5ae9ca36, 0x89f9b7d5, 0x362c4206 }, + { 0x433529ce, 0x591ff5d0, 0x3d98fe4e, 0x86814e6f, 0x74f93a53, 0x81ad9d0e, 0xa74b9a73, 0x9f5ad8af }, + { 0x670605a7, 0x26077447, 0x6a6234ee, 0x3f1080c6, 0xbe280b8b, 0x6f7ea0e0, 0x2717b96e, 0x7b487ec6 }, + { 0xa50a550d, 0x81727686, 0xc0a4f84a, 0xd48d6050, 0x9fe7e391, 0x415a9e7e, 0x9ef18e97, 0x62b0e5f3 }, + { 0xec1f9ffc, 0xf594d74f, 0x7a205440, 0xd895fa9d, 0x001ae4e3, 0x117e2e55, 0x84c9f4ce, 0xa554c324 }, + { 0x2872df5b, 0xef7c8905, 0x286efebd, 0x2ed349ee, 0xe27ff578, 0x85937e44, 0xb2c4a50f, 0x7f5928eb }, + // Round 14 (Function0) + { 0x37695f70, 0x04771bc7, 0x4a3124b3, 0xe720b951, 0xf128865e, 0xe843fe74, 0x65e4d61d, 0x8a87d423 }, + { 0xa3e8297d, 0xfb301b1d, 0xf2947692, 0xe01bdc5b, 0x097acbdd, 0x4f4924da, 0xc1d9309b, 0xbf829cf2 }, + { 0x31bae7a4, 0x32fcae3b, 0xffbf70b4, 0x39d3bb53, 0x0544320d, 0xc1c39f45, 0x48bcf8de, 0xa08b29e0 }, + { 0xfd05c9e5, 0x01b771a2, 0x0f09aef7, 0x95ed44e3, 0x12347094, 0x368e3be9, 0x34f19042, 0x4a982f4f }, + { 0x631d4088, 0xf14abb7e, 0x15f66ca0, 0x30c60ae2, 0x4b44c147, 0xc5b67046, 0xffaf5287, 0xe68c6ecc }, + { 0x56a4d5a4, 0x45ce5773, 0x00ca4fbd, 0xadd16430, 0x4b849dda, 0x68cea6e8, 0xae183ec8, 0x67255c14 }, + { 
0xf28cdaa3, 0x20b2601f, 0x16e10ecb, 0x7b846fc2, 0x5806e933, 0x7facced1, 0x9a99949a, 0x1885d1a0 }, + // Round 21 (Function0) + { 0xa15b5932, 0x67633d9f, 0xd319dd8d, 0xba6b04e4, 0xc01c9a50, 0xab19caf6, 0x46b4a5aa, 0x7eee560b }, + { 0xea79b11f, 0x5aac571d, 0x742128a9, 0x76d35075, 0x35f7bde9, 0xfec2463a, 0xee51363b, 0x01707da3 }, + { 0xafc135f7, 0x15638341, 0x42d8a498, 0xa8db3aea, 0x20eced78, 0x4d3bc3fa, 0x79676b9e, 0x832c8332 }, + { 0x1f3b40a7, 0x6c4e3ee7, 0xf347271c, 0xfd4f21d2, 0x34f04059, 0x398dfdb8, 0x9a762db7, 0xef5957dc }, + { 0x490c9b8d, 0xd0ae3b7d, 0xdaeb492b, 0x84558d7a, 0x49d7a25b, 0xf0e9a5f5, 0x0d70f368, 0x658ef8e4 }, + { 0xf4a2b8a0, 0x92946891, 0x533b1036, 0x4f88e856, 0x9e07a80c, 0x555cb05b, 0x5aec3e75, 0x4cbcbaf8 }, + { 0x993bbbe3, 0x28acae64, 0x7b9487f3, 0x6db334dc, 0xd6f4da75, 0x50a5346c, 0x5d1c6b72, 0x71db28b8 }, + // Round 28 (Function0) + { 0xf2e261f8, 0xf1bcac1c, 0x2a518d10, 0xa23fce43, 0x3364dbe3, 0x3cd1bb67, 0xfc75dd59, 0xb043e802 }, + { 0xca5b0a33, 0xc3943b92, 0x75a12988, 0x1e4d790e, 0x4d19347f, 0xd7757479, 0x5c5316b4, 0x3fafeeb6 }, + { 0xf7d4a8ea, 0x5324a326, 0x21391abe, 0xd23c32ba, 0x097ef45c, 0x4a17a344, 0x5127234c, 0xadd5a66d }, + { 0xa63e1db5, 0xa17cf84c, 0x08c9f2af, 0x4d608672, 0x983d5983, 0xcc3ee246, 0x563c6b91, 0xf6c76e08 }, + { 0xb333982f, 0xe8b6f406, 0x5e76bcb1, 0x36d4c1be, 0xa566d62b, 0x1582ee74, 0x2ae6c4ef, 0x6321efbc }, + { 0x0d4ec1fd, 0x1614c17e, 0x69c953f4, 0x16fae006, 0xc45a7da7, 0x3daf907e, 0x26585806, 0x3f9d6328 }, + { 0xe3f2c9d2, 0x16512a74, 0x0cd29b00, 0x9832e0f2, 0x30ceaa5f, 0xd830eb0d, 0x300cd4b7, 0x9af8cee3 }, + // Round 35 (Function0) + { 0x7b9ec54b, 0x574d239b, 0x9279f1b5, 0x316796e6, 0x6ee651ff, 0xf3a6e6cc, 0xd3688604, 0x05750a17 }, + { 0xd98176b1, 0xb3cb2bf4, 0xce6c3213, 0x47154778, 0x8452173c, 0x825446ff, 0x62a205f8, 0x486a9323 }, + { 0x0758df38, 0x442e7031, 0x65655e4e, 0x86ca0bd0, 0x897cfcf2, 0xa20940f0, 0x8e5086fc, 0x4e477830 }, + { 0x39eea065, 0x26b29721, 0x8338f7d1, 0x6ff81301, 0x37e95ef7, 0xd1ed44a3, 0xbd3a2ce4, 
0xe7de9fef }, + { 0x15dfa08b, 0x7ceca7d8, 0xd9922576, 0x7eb027ab, 0xf6f7853c, 0xda7d8d53, 0xbe42dc12, 0xdea83eaa }, + { 0x93ce25aa, 0xdaef5fc0, 0xd86902bd, 0xa5194a17, 0xfd43f65a, 0x33664d97, 0xf908731a, 0x6a21fd4c }, + { 0x3198b435, 0xa163d09a, 0x701541db, 0x72409751, 0xbb0f1eea, 0xbf9d75f6, 0x9b54cded, 0xe26f4791 } + // 42 rounds... +}; + +#ifndef NOASM +__device__ __forceinline__ +static void SWAP4(uint32_t *x) +{ +#pragma nounroll + // y is used as tmp register too + for(uint32_t y = 0; y<4; y++, ++x) + { + asm("and.b32 %1, %0, 0xF0F0F0F0;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 4;" + "vshl.u32.u32.u32.clamp.add %0, %0, 4, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} + +__device__ __forceinline__ +static void SWAP2(uint32_t *x) +{ +#pragma nounroll + // y is used as tmp register too + for(uint32_t y = 0; y<4; y++, ++x) + { + asm("and.b32 %1, %0, 0xCCCCCCCC;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 2;" + "vshl.u32.u32.u32.clamp.add %0, %0, 2, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} + +__device__ __forceinline__ +static void SWAP1(uint32_t *x) +{ +#pragma nounroll + // y is used as tmp register too + for(uint32_t y = 0; y<4; y++, ++x) + { + asm("and.b32 %1, %0, 0xAAAAAAAA;" + "xor.b32 %0, %0, %1;" + "shr.b32 %1, %1, 1;" + "vshl.u32.u32.u32.clamp.add %0, %0, 1, %1;\n\t" + : "+r"(*x) : "r"(y)); + } +} +#else +__device__ __forceinline__ +static void SWAP4(uint32_t *x) +{ + x[0] = ((x[0] & 0x0f0f0f0fu) << 4) | (x[0] & 0xF0F0F0F0u) >> 4; + x[1] = ((x[1] & 0x0f0f0f0fu) << 4) | (x[1] & 0xF0F0F0F0u) >> 4; + x[2] = ((x[2] & 0x0f0f0f0fu) << 4) | (x[2] & 0xF0F0F0F0u) >> 4; + x[3] = ((x[3] & 0x0f0f0f0fu) << 4) | (x[3] & 0xF0F0F0F0u) >> 4; +} +__device__ __forceinline__ +static void SWAP2(uint32_t *x) +{ + x[0] = ((x[0] & 0x33333333u) << 2) | (x[0] & 0xCCCCCCCCu) >> 2; + x[1] = ((x[1] & 0x33333333u) << 2) | (x[1] & 0xCCCCCCCCu) >> 2; + x[2] = ((x[2] & 0x33333333u) << 2) | (x[2] & 0xCCCCCCCCu) >> 2; + x[3] = ((x[3] & 0x33333333u) << 2) | (x[3] & 0xCCCCCCCCu) >> 2; +} 
+__device__ __forceinline__ +static void SWAP1(uint32_t *x) +{ + x[0] = ((x[0] & 0x55555555u) << 1) | (x[0] & 0xAAAAAAAAu) >> 1; + x[1] = ((x[1] & 0x55555555u) << 1) | (x[1] & 0xAAAAAAAAu) >> 1; + x[2] = ((x[2] & 0x55555555u) << 1) | (x[2] & 0xAAAAAAAAu) >> 1; + x[3] = ((x[3] & 0x55555555u) << 1) | (x[3] & 0xAAAAAAAAu) >> 1; +} + +#endif + /*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 32-bit x*/ //#define SWAP8(x) (x) = ((((x) & 0x00ff00ffUL) << 8) | (((x) & 0xff00ff00UL) >> 8)); #define SWAP8(x) (x) = __byte_perm(x, x, 0x2301); @@ -114,19 +164,20 @@ __constant__ unsigned char c_E8_bitslice_roundconstant[42][32] = { m1 ^= (temp0 & (m0)); \ m2 ^= temp0; -static __device__ __forceinline__ void Sbox_and_MDS_layer(uint32_t x[8][4], uint32_t roundnumber) +__device__ __forceinline__ +static void Sbox_and_MDS_layer(uint32_t x[8][4], const int rnd) { - uint32_t temp0; - uint32_t cc0, cc1; - //Sbox and MDS layer -#pragma unroll 4 - for (int i = 0; i < 4; i++) { - cc0 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i]; - cc1 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i+4]; - Sbox(x[0][i],x[2][i], x[4][i], x[6][i], cc0); - Sbox(x[1][i],x[3][i], x[5][i], x[7][i], cc1); - L(x[0][i],x[2][i],x[4][i],x[6][i],x[1][i],x[3][i],x[5][i],x[7][i]); - } + uint2* cc = (uint2*)&c_E8_bslice32[rnd]; + + //Sbox and MDS layer +#pragma unroll + for(int i = 0; i < 4; i++, ++cc) + { + uint32_t temp0; + Sbox(x[0][i], x[2][i], x[4][i], x[6][i], cc->x); + Sbox(x[1][i], x[3][i], x[5][i], x[7][i], cc->y); + L(x[0][i], x[2][i], x[4][i], x[6][i], x[1][i], x[3][i], x[5][i], x[7][i]); + } } static __device__ __forceinline__ void RoundFunction0(uint32_t x[8][4], uint32_t roundnumber) @@ -134,13 +185,9 @@ static __device__ __forceinline__ void RoundFunction0(uint32_t x[8][4], uint32_t Sbox_and_MDS_layer(x, roundnumber); #pragma unroll 4 - for (int j = 1; j < 8; j = j+2) + for(int j = 1; j < 8; j = j + 2) { - uint32_t y; - SWAP1(x[j][0], y); - 
SWAP1(x[j][1], y); - SWAP1(x[j][2], y); - SWAP1(x[j][3], y); + SWAP1(x[j]); } } @@ -149,13 +196,9 @@ static __device__ __forceinline__ void RoundFunction1(uint32_t x[8][4], uint32_t Sbox_and_MDS_layer(x, roundnumber); #pragma unroll 4 - for (int j = 1; j < 8; j = j+2) + for(int j = 1; j < 8; j = j + 2) { - uint32_t y; - SWAP2(x[j][0], y); - SWAP2(x[j][1], y); - SWAP2(x[j][2], y); - SWAP2(x[j][3], y); + SWAP2(x[j]); } } @@ -164,13 +207,9 @@ static __device__ __forceinline__ void RoundFunction2(uint32_t x[8][4], uint32_t Sbox_and_MDS_layer(x, roundnumber); #pragma unroll 4 - for (int j = 1; j < 8; j = j+2) + for(int j = 1; j < 8; j = j + 2) { - uint32_t y; - SWAP4(x[j][0], y); - SWAP4(x[j][1], y); - SWAP4(x[j][2], y); - SWAP4(x[j][3], y); + SWAP4(x[j]); } } @@ -179,10 +218,10 @@ static __device__ __forceinline__ void RoundFunction3(uint32_t x[8][4], uint32_t Sbox_and_MDS_layer(x, roundnumber); #pragma unroll 4 - for (int j = 1; j < 8; j = j+2) + for(int j = 1; j < 8; j = j + 2) { #pragma unroll 4 - for (int i = 0; i < 4; i++) SWAP8(x[j][i]); + for(int i = 0; i < 4; i++) SWAP8(x[j][i]); } } @@ -191,10 +230,10 @@ static __device__ __forceinline__ void RoundFunction4(uint32_t x[8][4], uint32_t Sbox_and_MDS_layer(x, roundnumber); #pragma unroll 4 - for (int j = 1; j < 8; j = j+2) + for(int j = 1; j < 8; j = j + 2) { #pragma unroll 4 - for (int i = 0; i < 4; i++) SWAP16(x[j][i]); + for(int i = 0; i < 4; i++) SWAP16(x[j][i]); } } @@ -205,11 +244,12 @@ static __device__ __forceinline__ void RoundFunction5(uint32_t x[8][4], uint32_t Sbox_and_MDS_layer(x, roundnumber); #pragma unroll 4 - for (int j = 1; j < 8; j = j+2) + for(int j = 1; j < 8; j = j + 2) { #pragma unroll 2 - for (int i = 0; i < 4; i = i+2) { - temp0 = x[j][i]; x[j][i] = x[j][i+1]; x[j][i+1] = temp0; + for(int i = 0; i < 4; i = i + 2) + { + temp0 = x[j][i]; x[j][i] = x[j][i + 1]; x[j][i + 1] = temp0; } } } @@ -221,11 +261,12 @@ static __device__ __forceinline__ void RoundFunction6(uint32_t x[8][4], uint32_t 
Sbox_and_MDS_layer(x, roundnumber); #pragma unroll 4 - for (int j = 1; j < 8; j = j+2) + for(int j = 1; j < 8; j = j + 2) { #pragma unroll 2 - for (int i = 0; i < 2; i++) { - temp0 = x[j][i]; x[j][i] = x[j][i+2]; x[j][i+2] = temp0; + for(int i = 0; i < 2; i++) + { + temp0 = x[j][i]; x[j][i] = x[j][i + 2]; x[j][i + 2] = temp0; } } } @@ -233,9 +274,9 @@ static __device__ __forceinline__ void RoundFunction6(uint32_t x[8][4], uint32_t /*The bijective function E8, in bitslice form */ static __device__ __forceinline__ void E8(uint32_t x[8][4]) { - /*perform 6 rounds*/ -//#pragma unroll 6 - for (int i = 0; i < 42; i+=7) + /*perform 6 rounds*/ +#pragma unroll 1 + for(int i = 0; i < 42; i += 7) { RoundFunction0(x, i); RoundFunction1(x, i + 1); @@ -252,221 +293,213 @@ static __device__ __forceinline__ void E8(uint32_t x[8][4]) #define U64TO32_LE(p, v) \ *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); - + __constant__ uint2 c_keccak_round_constants[24] = { - { 0x00000001ul, 0x00000000 }, { 0x00008082ul, 0x00000000 }, - { 0x0000808aul, 0x80000000 }, { 0x80008000ul, 0x80000000 }, - { 0x0000808bul, 0x00000000 }, { 0x80000001ul, 0x00000000 }, - { 0x80008081ul, 0x80000000 }, { 0x00008009ul, 0x80000000 }, - { 0x0000008aul, 0x00000000 }, { 0x00000088ul, 0x00000000 }, - { 0x80008009ul, 0x00000000 }, { 0x8000000aul, 0x00000000 }, - { 0x8000808bul, 0x00000000 }, { 0x0000008bul, 0x80000000 }, - { 0x00008089ul, 0x80000000 }, { 0x00008003ul, 0x80000000 }, - { 0x00008002ul, 0x80000000 }, { 0x00000080ul, 0x80000000 }, - { 0x0000800aul, 0x00000000 }, { 0x8000000aul, 0x80000000 }, - { 0x80008081ul, 0x80000000 }, { 0x00008080ul, 0x80000000 }, - { 0x80000001ul, 0x00000000 }, { 0x80008008ul, 0x80000000 } + { 0x00000001ul, 0x00000000 }, { 0x00008082ul, 0x00000000 }, + { 0x0000808aul, 0x80000000 }, { 0x80008000ul, 0x80000000 }, + { 0x0000808bul, 0x00000000 }, { 0x80000001ul, 0x00000000 }, + { 0x80008081ul, 0x80000000 }, { 0x00008009ul, 0x80000000 }, + { 0x0000008aul, 0x00000000 }, { 
0x00000088ul, 0x00000000 }, + { 0x80008009ul, 0x00000000 }, { 0x8000000aul, 0x00000000 }, + { 0x8000808bul, 0x00000000 }, { 0x0000008bul, 0x80000000 }, + { 0x00008089ul, 0x80000000 }, { 0x00008003ul, 0x80000000 }, + { 0x00008002ul, 0x80000000 }, { 0x00000080ul, 0x80000000 }, + { 0x0000800aul, 0x00000000 }, { 0x8000000aul, 0x80000000 }, + { 0x80008081ul, 0x80000000 }, { 0x00008080ul, 0x80000000 }, + { 0x80000001ul, 0x00000000 }, { 0x80008008ul, 0x80000000 } }; -static __device__ __forceinline__ void -keccak_block(uint2 *s) { - int i; - uint2 t[5], u[5], v, w; - - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5]; - t[1] = s[1] ^ s[6]; - t[2] = s[2] ^ s[7]; - t[3] = s[3] ^ s[8]; - t[4] = s[4]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] = u[0]; s[15] = u[0]; s[20] = u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] = u[1]; s[16] = u[1]; s[21] = u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] = u[2]; s[17] = u[2]; s[22] = u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] = u[3]; s[18] = u[3]; s[23] = u[3]; - s[4] ^= u[4]; s[9] = u[4]; s[14] = u[4]; s[19] = u[4]; s[24] = u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) 
*/ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(u[4], 20); - s[9] = ROL2(u[2], 61); - s[22] = ROL2(u[4], 39); - s[14] = ROL2(u[0], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(u[2], 43); - s[12] = ROL2(u[3], 25); - s[13] = ROL2(u[4], 8); - s[19] = ROL2(u[3], 56); - s[23] = ROL2(u[0], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(u[4], 14); - s[24] = ROL2(u[1], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(u[1], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(u[3], 21); - s[18] = ROL2(u[2], 15); - s[17] = ROL2(u[1], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(u[0], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] = s[0]^1;//vectorize(c_keccak_round_constants[0]); - - for (i = 1; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. 
a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= c_keccak_round_constants[i]; - } -} +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) -__global__ __launch_bounds__(256,3) -void 
quark_jh512Keccak512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) +__global__ __launch_bounds__(256, 3) +void quark_jh512Keccak512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if(thread < threads) + { + const uint32_t nounce = (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = &g_hash[16 * hashPosition]; + const uint32_t hashPosition = nounce - startNounce; + uint32_t *Hash = &g_hash[16 * hashPosition]; uint32_t x[8][4] = { - { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, - { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, - { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea }, - { 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba }, - { 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e }, - { 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d }, - { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, - { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; + { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, + { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, + { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea }, + { 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba }, + { 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e }, + { 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d }, + { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, + { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } }; + + + uint32_t msg[16]; + + uint28 *phash = (uint28*)Hash; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; #pragma unroll 16 - for (int i = 0; i < 16; i++) x[i >> 2][i & 3] ^= ((uint32_t*)Hash)[i]; + for (int i = 0; i < 16; i++) x[i >> 2][i & 3] ^= (msg)[i]; E8(x); #pragma unroll 16 - for (int i = 0; i < 
16; i++) x[(16 + i) >> 2][(16 + i) & 3] ^= ((uint32_t*)Hash)[i]; + for (int i = 0; i < 16; i++) x[(16 + i) >> 2][(16 + i) & 3] ^= (msg)[i]; - x[0 >> 2][0 & 3] ^= 0x80; - x[15 >> 2][15 & 3] ^= 0x00020000; + x[0][0] ^= 0x80; + x[3][3] ^= 0x00020000; E8(x); - x[(16 + 0) >> 2][(16 + 0) & 3] ^= 0x80; - x[(16 + 15) >> 2][(16 + 15) & 3] ^= 0x00020000; - - uint2 keccak_gpu_state[25]; - - keccak_gpu_state[0].x = x[4][0]; - keccak_gpu_state[0].y = x[4][1]; - keccak_gpu_state[1].x = x[4][2]; - keccak_gpu_state[1].y = x[4][3]; - keccak_gpu_state[2].x = x[5][0]; - keccak_gpu_state[2].y = x[5][1]; - keccak_gpu_state[3].x = x[5][2]; - keccak_gpu_state[3].y = x[5][3]; - keccak_gpu_state[4].x = x[6][0]; - keccak_gpu_state[4].y = x[6][1]; - keccak_gpu_state[5].x = x[6][2]; - keccak_gpu_state[5].y = x[6][3]; - keccak_gpu_state[6].x = x[7][0]; - keccak_gpu_state[6].y = x[7][1]; - keccak_gpu_state[7].x = x[7][2]; - keccak_gpu_state[7].y = x[7][3]; - keccak_gpu_state[8] = make_uint2(0x00000001, 0x80000000); -#pragma unroll - for (int i = 9; i<25; i++) + x[4][0] ^= 0x80; + x[7][3] ^= 0x00020000; + + uint2 s[25] = + { + { x[4][0], x[4][1] }, { x[4][2], x[4][3] }, { x[5][0], x[5][1] }, { x[5][2], x[5][3] }, + { x[6][0], x[6][1] }, { x[6][2], x[6][3] }, { x[7][0], x[7][1] }, { x[7][2], x[7][3] }, + { 1, 0x80000000 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 } + }; + uint2 bc[5], tmpxor[5], tmp1, tmp2; + + tmpxor[0] = s[0] ^ s[5]; + tmpxor[1] = s[1] ^ s[6]; + tmpxor[2] = s[2] ^ s[7]; + tmpxor[3] = s[3] ^ s[8]; + tmpxor[4] = s[4]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] = s[0] ^ bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(bc[3], 20); + s[9] = ROL2(bc[1], 61); + 
s[22] = ROL2(bc[3], 39); + s[14] = ROL2(bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(bc[1], 43); + s[12] = ROL2(bc[2], 25); + s[13] = ROL8(bc[3]); + s[19] = ROR8(bc[2]); + s[23] = ROL2(bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(bc[3], 14); + s[24] = ROL2(bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(bc[2], 21); + s[18] = ROL2(bc[1], 15); + s[17] = ROL2(bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0].x ^= 1; +#pragma unroll 1 + for(int i = 1; i < 23; ++i) { - keccak_gpu_state[i] = make_uint2(0, 0); - } - 
keccak_block(keccak_gpu_state); - uint64_t *outputhash = (uint64_t *)Hash; -#pragma unroll 16 - for (int i = 0; i<8; i++) - outputhash[i] = devectorize(keccak_gpu_state[i]); +#pragma unroll + for(int x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], 
s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= c_keccak_round_constants[i]; + } +#pragma unroll + for(int x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[7] = ROL2(s[10] ^ bc[4], 3); + + uint2 *outputhash = (uint2 *)Hash; + + outputhash[0] = bitselect(s[0] ^ s[2], s[0], s[1]) ^ c_keccak_round_constants[23]; + outputhash[1] = bitselect(s[1] ^ s[3], s[1], s[2]); + outputhash[2] = bitselect(s[2] ^ s[4], s[2], s[3]); + outputhash[3] = bitselect(s[3] ^ s[0], s[3], s[4]); + outputhash[4] = bitselect(s[4] ^ s[1], s[4], s[0]); + outputhash[5] = bitselect(s[5] ^ s[7], s[5], s[6]); + outputhash[6] = bitselect(s[6] ^ s[8], s[6], s[7]); + outputhash[7] = bitselect(s[7] ^ s[9], s[7], s[8]); } } -__host__ void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void 
cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { - const uint32_t threadsperblock = 256; + const uint32_t threadsperblock = 256; - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); - quark_jh512Keccak512_gpu_hash_64 << > >(threads, startNounce, d_hash, d_nonceVector); -// MyStreamSynchronize(NULL, order, thr_id); + quark_jh512Keccak512_gpu_hash_64 << > >(threads, startNounce, d_hash); + CUDA_SAFE_CALL(cudaGetLastError()); } diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index 534280dd83..bc05bfe7ed 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -2,89 +2,213 @@ #include #include "cuda_helper.h" +#include "cuda_vector.h" #define ROTR(x,n) ROTR64(x,n) #define USE_SHUFFLE 0 -// die Message it Padding zur Berechnung auf der GPU -__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) +static uint2* c_PaddedMessage80[MAX_GPUS]; // padded message (80 bytes + padding) +__constant__ uint2 __align__(16) c_PaddedM[10]; +__constant__ uint28 Hostprecalc[4]; +__constant__ uint2 __align__(16) pre[224]; + + +__constant__ uint2 c_u512[16] = +{ + {0x85a308d3UL, 0x243f6a88}, {0x03707344UL, 0x13198a2e}, + {0x299f31d0UL, 0xa4093822}, {0xec4e6c89UL, 0x082efa98}, + {0x38d01377UL, 0x452821e6}, {0x34e90c6cUL, 0xbe5466cf}, + {0xc97c50ddUL, 0xc0ac29b7}, {0xb5470917UL, 0x3f84d5b5}, + {0x8979fb1bUL, 0x9216d5d9}, {0x98dfb5acUL, 0xd1310ba6}, + {0xd01adfb7UL, 0x2ffd72db}, {0x6a267e96UL, 0xb8e1afed}, + {0xf12c7f99UL, 0xba7c9045}, {0xb3916cf7UL, 0x24a19947}, + {0x858efc16UL, 0x0801f2e2}, {0x71574e69UL, 0x636920d8} +}; // ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------ 
+#define GSPREC_SP(a,b,c,d) { \ + v[a] += (pre[i++]) + v[b]; \ + v[d] = eorswap32( v[d] , v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 25); \ + v[a] += (pre[i++]) + v[b]; \ + v[d] = ROR16(v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 11); \ + } + +#define GSPREC_SP_HI(a,b,c,d,idx1,idx2) { \ + v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \ + v[d] = eorswap32( v[d] , v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 25); \ + v[a] += (pre[i++]) + v[b]; \ + v[d] = ROR16(v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 11); \ + } + +#define GSPREC_SP_LO(a,b,c,d,idx1,idx2) { \ + v[a] += (pre[i++]) + v[b]; \ + v[d] = eorswap32( v[d] , v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 25); \ + v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \ + v[d] = ROR16(v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 11); \ + } #define Gprecalc(a,b,c,d,idx1,idx2) { \ v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \ - v[d] = SWAPDWORDS2( v[d] ^ v[a]); \ + v[d] = eorswap32( v[d] , v[a]); \ v[c] += v[d]; \ v[b] = ROR2(v[b] ^ v[c], 25); \ v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \ - v[d] = ROR2(v[d] ^ v[a],16); \ + v[d] = ROR16(v[d] ^ v[a]); \ v[c] += v[d]; \ v[b] = ROR2(v[b] ^ v[c], 11); \ } -__global__ +#define RSPRECHOST(idx1,idx2) { \ + prehost[i++] = (block[idx2] ^ u512[idx1]); \ + prehost[i++] = (block[idx1] ^ u512[idx2]); \ + } + +#define RSPRECHOSTLO(idx1,idx2) { \ + prehost[i++] = (block[idx2] ^ u512[idx1]); \ + } +#define RSPRECHOSTHI(idx1,idx2) { \ + prehost[i++] = (block[idx1] ^ u512[idx2]); \ + } + + +#define GprecalcHost(a,b,c,d,idx1,idx2) { \ + v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \ + v[d] = ROTR64( v[d] ^ v[a],32); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 25); \ + v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \ + v[d] = ROTR64(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 11); \ + } + +__constant__ uint8_t c_sigma[16][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 
8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}}; + + +#define G(a,b,c,d,x) { \ + uint32_t idx1 = c_sigma[i][x]; \ + uint32_t idx2 = c_sigma[i][x+1]; \ + v[a] += (block[idx1] ^ c_u512[idx2]) + v[b]; \ + v[d] = eorswap32(v[d] , v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2( v[b] ^ v[c], 25); \ + v[a] += (block[idx2] ^ c_u512[idx1]) + v[b]; \ + v[d] = ROR16( v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2( v[b] ^ v[c], 11); \ +} + +__global__ #if __CUDA_ARCH__ > 500 - __launch_bounds__(256, 1) +__launch_bounds__(256, 1) #else - __launch_bounds__(256, 2) +__launch_bounds__(256, 2) #endif -void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint64_t *const __restrict__ g_hash) +void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2 *const __restrict__ g_hash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); #if USE_SHUFFLE - const int warpID = threadIdx.x & 0x0F; // 16 warps - const int warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke - const int 
maxHashPosition = thread<<3; + // const int warpID = threadIdx.x & 0x02F; // 16 warps + const int warpBlockID = (thread + 15) >> 5; // aufrunden auf volle Warp-Blöcke + // const int maxHashPosition = thread<<3; #endif #if USE_SHUFFLE - if (warpBlockID < ( (threads+15)>>4 )) + if(warpBlockID < ((threads + 15) >> 5)) #else - if (thread < threads) + if(thread < threads) #endif { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; + const int hashPosition = nounce - startNounce; + + uint2 block[16]; + uint2 msg[16]; + + uint28 *phash = (uint28*)&g_hash[hashPosition * 8]; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + block[0].x = cuda_swab32(msg[0].y); + block[0].y = cuda_swab32(msg[0].x); + block[1].x = cuda_swab32(msg[1].y); + block[1].y = cuda_swab32(msg[1].x); + block[2].x = cuda_swab32(msg[2].y); + block[2].y = cuda_swab32(msg[2].x); + block[3].x = cuda_swab32(msg[3].y); + block[3].y = cuda_swab32(msg[3].x); + block[4].x = cuda_swab32(msg[4].y); + block[4].y = cuda_swab32(msg[4].x); + block[5].x = cuda_swab32(msg[5].y); + block[5].y = cuda_swab32(msg[5].x); + block[6].x = cuda_swab32(msg[6].y); + block[6].y = cuda_swab32(msg[6].x); + block[7].x = cuda_swab32(msg[7].y); + block[7].y = cuda_swab32(msg[7].x); + + + block[8] = vectorizehigh(0x80000000); + block[9] = vectorizelow(0x0); + block[10] = vectorizelow(0x0); + block[11] = vectorizelow(0x0); + block[12] = vectorizelow(0x0); + block[13] = vectorizelow(0x1); + block[14] = vectorizelow(0x0); + block[15] = vectorizelow(0x200); - uint64_t *inpHash = &g_hash[hashPosition*8]; - uint2 block[16] = - { - vectorizeswap(inpHash[0]), vectorizeswap(inpHash[1]), vectorizeswap(inpHash[2]), vectorizeswap(inpHash[3]), - vectorizeswap(inpHash[4]), vectorizeswap(inpHash[5]), vectorizeswap(inpHash[6]), 
vectorizeswap(inpHash[7]) - }; - block[8] = make_uint2(0, 0x80000000UL); - block[9] = make_uint2(0,0); - block[10] = make_uint2(0,0); - block[11] = make_uint2(0,0); - block[12] = make_uint2(0,0); - block[13] = make_uint2(1,0); - block[14] = make_uint2(0,0); - block[15] = make_uint2(0x200,0); const uint2 h[8] = { - { 0xf3bcc908UL, 0x6a09e667UL }, - { 0x84caa73bUL, 0xbb67ae85UL }, - { 0xfe94f82bUL, 0x3c6ef372UL }, - { 0x5f1d36f1UL, 0xa54ff53aUL }, - { 0xade682d1UL, 0x510e527fUL }, - { 0x2b3e6c1fUL, 0x9b05688cUL }, - { 0xfb41bd6bUL, 0x1f83d9abUL }, - { 0x137e2179UL, 0x5be0cd19UL } + {0xf3bcc908UL, 0x6a09e667UL}, + {0x84caa73bUL, 0xbb67ae85UL}, + {0xfe94f82bUL, 0x3c6ef372UL}, + {0x5f1d36f1UL, 0xa54ff53aUL}, + {0xade682d1UL, 0x510e527fUL}, + {0x2b3e6c1fUL, 0x9b05688cUL}, + {0xfb41bd6bUL, 0x1f83d9abUL}, + {0x137e2179UL, 0x5be0cd19UL} }; const uint2 u512[16] = { - { 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e }, - { 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 }, - { 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf }, - { 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 }, - { 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 }, - { 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed }, - { 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 }, - { 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 } + {0x85a308d3UL, 0x243f6a88}, {0x03707344UL, 0x13198a2e}, + {0x299f31d0UL, 0xa4093822}, {0xec4e6c89UL, 0x082efa98}, + {0x38d01377UL, 0x452821e6}, {0x34e90c6cUL, 0xbe5466cf}, + {0xc97c50ddUL, 0xc0ac29b7}, {0xb5470917UL, 0x3f84d5b5}, + {0x8979fb1bUL, 0x9216d5d9}, {0x98dfb5acUL, 0xd1310ba6}, + {0xd01adfb7UL, 0x2ffd72db}, {0x6a267e96UL, 0xb8e1afed}, + {0xf12c7f99UL, 0xba7c9045}, {0xb3916cf7UL, 0x24a19947}, + {0x858efc16UL, 0x0801f2e2}, {0x71574e69UL, 0x636920d8} }; uint2 v[16] = @@ -94,412 +218,803 @@ void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t }; Gprecalc(0, 4, 8, 12, 0x1, 0x0) - Gprecalc(1, 5, 
9, 13, 0x3, 0x2) - Gprecalc(2, 6, 10, 14, 0x5, 0x4) - Gprecalc(3, 7, 11, 15, 0x7, 0x6) - Gprecalc(0, 5, 10, 15, 0x9, 0x8) - Gprecalc(1, 6, 11, 12, 0xb, 0xa) - Gprecalc(2, 7, 8, 13, 0xd, 0xc) - Gprecalc(3, 4, 9, 14, 0xf, 0xe) - - Gprecalc(0, 4, 8, 12, 0xa, 0xe) - Gprecalc(1, 5, 9, 13, 0x8, 0x4) - Gprecalc(2, 6, 10, 14, 0xf, 0x9) - Gprecalc(3, 7, 11, 15, 0x6, 0xd) - Gprecalc(0, 5, 10, 15, 0xc, 0x1) - Gprecalc(1, 6, 11, 12, 0x2, 0x0) - Gprecalc(2, 7, 8, 13, 0x7, 0xb) - Gprecalc(3, 4, 9, 14, 0x3, 0x5) - - Gprecalc(0, 4, 8, 12, 0x8, 0xb) - Gprecalc(1, 5, 9, 13, 0x0, 0xc) - Gprecalc(2, 6, 10, 14, 0x2, 0x5) - Gprecalc(3, 7, 11, 15, 0xd, 0xf) - Gprecalc(0, 5, 10, 15, 0xe, 0xa) - Gprecalc(1, 6, 11, 12, 0x6, 0x3) - Gprecalc(2, 7, 8, 13, 0x1, 0x7) - Gprecalc(3, 4, 9, 14, 0x4, 0x9) - - Gprecalc(0, 4, 8, 12, 0x9, 0x7) - Gprecalc(1, 5, 9, 13, 0x1, 0x3) - Gprecalc(2, 6, 10, 14, 0xc, 0xd) - Gprecalc(3, 7, 11, 15, 0xe, 0xb) - Gprecalc(0, 5, 10, 15, 0x6, 0x2) - Gprecalc(1, 6, 11, 12, 0xa, 0x5) - Gprecalc(2, 7, 8, 13, 0x0, 0x4) - Gprecalc(3, 4, 9, 14, 0x8, 0xf) - - Gprecalc(0, 4, 8, 12, 0x0, 0x9) - Gprecalc(1, 5, 9, 13, 0x7, 0x5) - Gprecalc(2, 6, 10, 14, 0x4, 0x2) - Gprecalc(3, 7, 11, 15, 0xf, 0xa) - Gprecalc(0, 5, 10, 15, 0x1, 0xe) - Gprecalc(1, 6, 11, 12, 0xc, 0xb) - Gprecalc(2, 7, 8, 13, 0x8, 0x6) - Gprecalc(3, 4, 9, 14, 0xd, 0x3) - - Gprecalc(0, 4, 8, 12, 0xc, 0x2) - Gprecalc(1, 5, 9, 13, 0xa, 0x6) - Gprecalc(2, 6, 10, 14, 0xb, 0x0) - Gprecalc(3, 7, 11, 15, 0x3, 0x8) - Gprecalc(0, 5, 10, 15, 0xd, 0x4) - Gprecalc(1, 6, 11, 12, 0x5, 0x7) - Gprecalc(2, 7, 8, 13, 0xe, 0xf) - Gprecalc(3, 4, 9, 14, 0x9, 0x1) - - Gprecalc(0, 4, 8, 12, 0x5, 0xc) - Gprecalc(1, 5, 9, 13, 0xf, 0x1) - Gprecalc(2, 6, 10, 14, 0xd, 0xe) - Gprecalc(3, 7, 11, 15, 0xa, 0x4) - Gprecalc(0, 5, 10, 15, 0x7, 0x0) - Gprecalc(1, 6, 11, 12, 0x3, 0x6) - Gprecalc(2, 7, 8, 13, 0x2, 0x9) - Gprecalc(3, 4, 9, 14, 0xb, 0x8) - - Gprecalc(0, 4, 8, 12, 0xb, 0xd) - Gprecalc(1, 5, 9, 13, 0xe, 0x7) - Gprecalc(2, 6, 10, 14, 0x1, 0xc) - 
Gprecalc(3, 7, 11, 15, 0x9, 0x3) - Gprecalc(0, 5, 10, 15, 0x0, 0x5) - Gprecalc(1, 6, 11, 12, 0x4, 0xf) - Gprecalc(2, 7, 8, 13, 0x6, 0x8) - Gprecalc(3, 4, 9, 14, 0xa, 0x2) - - Gprecalc(0, 4, 8, 12, 0xf, 0x6) - Gprecalc(1, 5, 9, 13, 0x9, 0xe) - Gprecalc(2, 6, 10, 14, 0x3, 0xb) - Gprecalc(3, 7, 11, 15, 0x8, 0x0) - Gprecalc(0, 5, 10, 15, 0x2, 0xc) - Gprecalc(1, 6, 11, 12, 0x7, 0xd) - Gprecalc(2, 7, 8, 13, 0x4, 0x1) - Gprecalc(3, 4, 9, 14, 0x5, 0xa) - - Gprecalc(0, 4, 8, 12, 0x2, 0xa) - Gprecalc(1, 5, 9, 13, 0x4, 0x8) - Gprecalc(2, 6, 10, 14, 0x6, 0x7) - Gprecalc(3, 7, 11, 15, 0x5, 0x1) - Gprecalc(0, 5, 10, 15, 0xb, 0xf) - Gprecalc(1, 6, 11, 12, 0xe, 0x9) - Gprecalc(2, 7, 8, 13, 0xc, 0x3) - Gprecalc(3, 4, 9, 14, 0x0, 0xd) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 
12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + Gprecalc(0, 4, 8, 12, 0x5, 0xc) + Gprecalc(1, 5, 9, 13, 0xf, 0x1) + Gprecalc(2, 6, 10, 14, 0xd, 0xe) + Gprecalc(3, 7, 11, 15, 0xa, 0x4) + Gprecalc(0, 5, 10, 15, 0x7, 0x0) + Gprecalc(1, 6, 11, 12, 0x3, 0x6) + Gprecalc(2, 7, 8, 13, 0x2, 0x9) + Gprecalc(3, 4, 9, 14, 0xb, 0x8) + + Gprecalc(0, 4, 8, 12, 0xb, 0xd) + Gprecalc(1, 5, 9, 13, 0xe, 0x7) + Gprecalc(2, 6, 10, 14, 0x1, 0xc) + Gprecalc(3, 7, 11, 15, 0x9, 0x3) + Gprecalc(0, 5, 10, 15, 0x0, 0x5) + Gprecalc(1, 6, 11, 12, 0x4, 0xf) + Gprecalc(2, 7, 8, 13, 0x6, 0x8) + Gprecalc(3, 4, 9, 14, 0xa, 0x2) + + Gprecalc(0, 4, 8, 12, 0xf, 0x6) + Gprecalc(1, 5, 9, 13, 0x9, 0xe) + Gprecalc(2, 6, 10, 14, 0x3, 0xb) + Gprecalc(3, 7, 11, 15, 0x8, 0x0) + Gprecalc(0, 5, 10, 15, 0x2, 0xc) + Gprecalc(1, 6, 11, 12, 0x7, 0xd) + Gprecalc(2, 7, 8, 13, 0x4, 0x1) + Gprecalc(3, 4, 9, 14, 0x5, 0xa) + + Gprecalc(0, 4, 8, 12, 0x2, 0xa) + Gprecalc(1, 5, 9, 13, 0x4, 0x8) + Gprecalc(2, 6, 10, 14, 0x6, 0x7) + Gprecalc(3, 7, 11, 15, 0x5, 0x1) + Gprecalc(0, 5, 10, 15, 0xb, 0xf) + Gprecalc(1, 6, 11, 12, 0xe, 0x9) + Gprecalc(2, 7, 8, 13, 0xc, 0x3) + Gprecalc(3, 4, 9, 14, 0x0, 0xd) + +#if __CUDA_ARCH__ == 500 + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + 
Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) - Gprecalc(0, 4, 8, 12, 0x1, 0x0) - Gprecalc(1, 5, 9, 13, 0x3, 0x2) - Gprecalc(2, 6, 10, 14, 0x5, 0x4) - Gprecalc(3, 7, 11, 15, 0x7, 0x6) - Gprecalc(0, 5, 10, 15, 0x9, 0x8) - Gprecalc(1, 6, 11, 12, 0xb, 0xa) - Gprecalc(2, 7, 8, 13, 0xd, 0xc) - Gprecalc(3, 4, 9, 14, 0xf, 0xe) - - Gprecalc(0, 4, 8, 12, 0xa, 0xe) - Gprecalc(1, 5, 9, 13, 0x8, 0x4) - Gprecalc(2, 6, 10, 14, 0xf, 0x9) - Gprecalc(3, 7, 11, 15, 0x6, 0xd) - Gprecalc(0, 5, 10, 15, 0xc, 0x1) - Gprecalc(1, 6, 11, 12, 0x2, 0x0) - Gprecalc(2, 7, 8, 13, 0x7, 0xb) - Gprecalc(3, 4, 9, 14, 0x3, 0x5) - - Gprecalc(0, 4, 8, 12, 0x8, 0xb) - Gprecalc(1, 5, 9, 13, 0x0, 0xc) - Gprecalc(2, 6, 10, 14, 0x2, 0x5) - Gprecalc(3, 7, 11, 15, 0xd, 0xf) - Gprecalc(0, 5, 10, 15, 0xe, 0xa) - Gprecalc(1, 6, 11, 12, 0x6, 0x3) - Gprecalc(2, 7, 8, 13, 0x1, 0x7) - Gprecalc(3, 4, 9, 14, 
0x4, 0x9) - - Gprecalc(0, 4, 8, 12, 0x9, 0x7) - Gprecalc(1, 5, 9, 13, 0x1, 0x3) - Gprecalc(2, 6, 10, 14, 0xc, 0xd) - Gprecalc(3, 7, 11, 15, 0xe, 0xb) - Gprecalc(0, 5, 10, 15, 0x6, 0x2) - Gprecalc(1, 6, 11, 12, 0xa, 0x5) - Gprecalc(2, 7, 8, 13, 0x0, 0x4) - Gprecalc(3, 4, 9, 14, 0x8, 0xf) - - Gprecalc(0, 4, 8, 12, 0x0, 0x9) - Gprecalc(1, 5, 9, 13, 0x7, 0x5) - Gprecalc(2, 6, 10, 14, 0x4, 0x2) - Gprecalc(3, 7, 11, 15, 0xf, 0xa) - Gprecalc(0, 5, 10, 15, 0x1, 0xe) - Gprecalc(1, 6, 11, 12, 0xc, 0xb) - Gprecalc(2, 7, 8, 13, 0x8, 0x6) - Gprecalc(3, 4, 9, 14, 0xd, 0x3) - - Gprecalc(0, 4, 8, 12, 0xc, 0x2) - Gprecalc(1, 5, 9, 13, 0xa, 0x6) - Gprecalc(2, 6, 10, 14, 0xb, 0x0) - Gprecalc(3, 7, 11, 15, 0x3, 0x8) - Gprecalc(0, 5, 10, 15, 0xd, 0x4) - Gprecalc(1, 6, 11, 12, 0x5, 0x7) - Gprecalc(2, 7, 8, 13, 0xe, 0xf) - Gprecalc(3, 4, 9, 14, 0x9, 0x1) - - uint64_t *outHash = &g_hash[8 * hashPosition]; - - outHash[0] = devectorizeswap(h[0] ^ v[0] ^ v[8]); - outHash[1] = devectorizeswap(h[1] ^ v[1] ^ v[9]); - outHash[2] = devectorizeswap(h[2] ^ v[2] ^ v[10]); - outHash[3] = devectorizeswap(h[3] ^ v[3] ^ v[11]); - outHash[4] = devectorizeswap(h[4] ^ v[4] ^ v[12]); - outHash[5] = devectorizeswap(h[5] ^ v[5] ^ v[13]); - outHash[6] = devectorizeswap(h[6] ^ v[6] ^ v[14]); - outHash[7] = devectorizeswap(h[7] ^ v[7] ^ v[15]); +#else + + for(int i = 10; i < 16; i++) + { + /* column step */ + G(0, 4, 8, 12, 0); + G(1, 5, 9, 13, 2); + G(2, 6, 10, 14, 4); + G(3, 7, 11, 15, 6); + /* diagonal step */ + G(0, 5, 10, 15, 8); + G(1, 6, 11, 12, 10); + G(2, 7, 8, 13, 12); + G(3, 4, 9, 14, 14); + } +#endif + + v[0] = cuda_swap(h[0] ^ v[0] ^ v[8]); + v[1] = cuda_swap(h[1] ^ v[1] ^ v[9]); + v[2] = cuda_swap(h[2] ^ v[2] ^ v[10]); + v[3] = cuda_swap(h[3] ^ v[3] ^ v[11]); + v[4] = cuda_swap(h[4] ^ v[4] ^ v[12]); + v[5] = cuda_swap(h[5] ^ v[5] ^ v[13]); + v[6] = cuda_swap(h[6] ^ v[6] ^ v[14]); + v[7] = cuda_swap(h[7] ^ v[7] ^ v[15]); + + phash = (uint28*)v; + outpt = (uint28*)&g_hash[hashPosition * 8]; + 
outpt[0] = phash[0]; + outpt[1] = phash[1]; } } -__global__ -#if __CUDA_ARCH__ > 500 -__launch_bounds__(256, 4) -#else -__launch_bounds__(32, 32) -#endif -void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *outputHash) +__global__ +void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint2 *outputHash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if(thread < threads) { - uint32_t nounce = startNounce + thread; - + const uint32_t nounce = startNounce + thread; uint2 block[16]; - // Message für die erste Runde in Register holen -#pragma unroll 16 - for (int i = 0; i < 16; ++i) - block[i] = vectorize(c_PaddedMessage80[i]); - // The test Nonce - // ((uint32_t*)block)[18] = nounce; + block[0] = c_PaddedM[0]; + block[1] = c_PaddedM[1]; + block[2] = c_PaddedM[2]; + block[3] = c_PaddedM[3]; + block[4] = c_PaddedM[4]; + block[5] = c_PaddedM[5]; + block[6] = c_PaddedM[6]; + block[7] = c_PaddedM[7]; + block[8] = c_PaddedM[8]; + block[9].y = c_PaddedM[9].y; + block[10] = vectorizehigh(0x80000000); + block[11] = vectorizelow(0); + block[12] = vectorizelow(0); + block[13] = vectorizelow(0x1); + block[14] = vectorizelow(0); + block[15] = vectorizelow(0x280); block[9].x = nounce; -// ((uint32_t*)block)[18] = nounce; const uint2 u512[16] = { - { 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e }, - { 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 }, - { 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf }, - { 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 }, - { 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 }, - { 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed }, - { 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 }, - { 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 } + {0x85a308d3UL, 0x243f6a88}, {0x03707344UL, 0x13198a2e}, + {0x299f31d0UL, 0xa4093822}, 
{0xec4e6c89UL, 0x082efa98}, + {0x38d01377UL, 0x452821e6}, {0x34e90c6cUL, 0xbe5466cf}, + {0xc97c50ddUL, 0xc0ac29b7}, {0xb5470917UL, 0x3f84d5b5}, + {0x8979fb1bUL, 0x9216d5d9}, {0x98dfb5acUL, 0xd1310ba6}, + {0xd01adfb7UL, 0x2ffd72db}, {0x6a267e96UL, 0xb8e1afed}, + {0xf12c7f99UL, 0xba7c9045}, {0xb3916cf7UL, 0x24a19947}, + {0x858efc16UL, 0x0801f2e2}, {0x71574e69UL, 0x636920d8} }; - const uint2 h[8] = { - { 0xf3bcc908UL,0x6a09e667UL }, - { 0x84caa73bUL ,0xbb67ae85UL }, - { 0xfe94f82bUL,0x3c6ef372UL }, - { 0x5f1d36f1UL,0xa54ff53aUL }, - { 0xade682d1UL,0x510e527fUL }, - { 0x2b3e6c1fUL,0x9b05688cUL }, - { 0xfb41bd6bUL,0x1f83d9abUL }, - { 0x137e2179UL,0x5be0cd19UL } - }; - - uint2 v[16] = + /* const uint2 u512[16] = { - h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], - u512[0], u512[1], u512[2], u512[3], u512[4] ^ 640, u512[5] ^ 640, u512[6], u512[7] + { 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e }, + { 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 }, + { 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf }, + { 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 }, + { 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 }, + { 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed }, + { 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 }, + { 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 } }; - - Gprecalc(0, 4, 8, 12, 0x1, 0x0) - Gprecalc(1, 5, 9, 13, 0x3, 0x2) - Gprecalc(2, 6, 10, 14, 0x5, 0x4) - Gprecalc(3, 7, 11, 15, 0x7, 0x6) - Gprecalc(0, 5, 10, 15, 0x9, 0x8) - Gprecalc(1, 6, 11, 12, 0xb, 0xa) - Gprecalc(2, 7, 8, 13, 0xd, 0xc) - Gprecalc(3, 4, 9, 14, 0xf, 0xe) - - Gprecalc(0, 4, 8, 12, 0xa, 0xe) - Gprecalc(1, 5, 9, 13, 0x8, 0x4) - Gprecalc(2, 6, 10, 14, 0xf, 0x9) - Gprecalc(3, 7, 11, 15, 0x6, 0xd) - Gprecalc(0, 5, 10, 15, 0xc, 0x1) - Gprecalc(1, 6, 11, 12, 0x2, 0x0) - Gprecalc(2, 7, 8, 13, 0x7, 0xb) - Gprecalc(3, 4, 9, 14, 0x3, 0x5) - - Gprecalc(0, 4, 8, 12, 0x8, 0xb) - Gprecalc(1, 5, 9, 13, 0x0, 0xc) - Gprecalc(2, 6, 10, 
14, 0x2, 0x5) - Gprecalc(3, 7, 11, 15, 0xd, 0xf) - Gprecalc(0, 5, 10, 15, 0xe, 0xa) - Gprecalc(1, 6, 11, 12, 0x6, 0x3) - Gprecalc(2, 7, 8, 13, 0x1, 0x7) - Gprecalc(3, 4, 9, 14, 0x4, 0x9) - - Gprecalc(0, 4, 8, 12, 0x9, 0x7) - Gprecalc(1, 5, 9, 13, 0x1, 0x3) - Gprecalc(2, 6, 10, 14, 0xc, 0xd) - Gprecalc(3, 7, 11, 15, 0xe, 0xb) - Gprecalc(0, 5, 10, 15, 0x6, 0x2) - Gprecalc(1, 6, 11, 12, 0xa, 0x5) - Gprecalc(2, 7, 8, 13, 0x0, 0x4) - Gprecalc(3, 4, 9, 14, 0x8, 0xf) - - Gprecalc(0, 4, 8, 12, 0x0, 0x9) - Gprecalc(1, 5, 9, 13, 0x7, 0x5) - Gprecalc(2, 6, 10, 14, 0x4, 0x2) - Gprecalc(3, 7, 11, 15, 0xf, 0xa) - Gprecalc(0, 5, 10, 15, 0x1, 0xe) - Gprecalc(1, 6, 11, 12, 0xc, 0xb) - Gprecalc(2, 7, 8, 13, 0x8, 0x6) - Gprecalc(3, 4, 9, 14, 0xd, 0x3) - - Gprecalc(0, 4, 8, 12, 0xc, 0x2) - Gprecalc(1, 5, 9, 13, 0xa, 0x6) - Gprecalc(2, 6, 10, 14, 0xb, 0x0) - Gprecalc(3, 7, 11, 15, 0x3, 0x8) - Gprecalc(0, 5, 10, 15, 0xd, 0x4) - Gprecalc(1, 6, 11, 12, 0x5, 0x7) - Gprecalc(2, 7, 8, 13, 0xe, 0xf) - Gprecalc(3, 4, 9, 14, 0x9, 0x1) - - Gprecalc(0, 4, 8, 12, 0x5, 0xc) - Gprecalc(1, 5, 9, 13, 0xf, 0x1) - Gprecalc(2, 6, 10, 14, 0xd, 0xe) - Gprecalc(3, 7, 11, 15, 0xa, 0x4) - Gprecalc(0, 5, 10, 15, 0x7, 0x0) - Gprecalc(1, 6, 11, 12, 0x3, 0x6) - Gprecalc(2, 7, 8, 13, 0x2, 0x9) - Gprecalc(3, 4, 9, 14, 0xb, 0x8) - - Gprecalc(0, 4, 8, 12, 0xb, 0xd) - Gprecalc(1, 5, 9, 13, 0xe, 0x7) - Gprecalc(2, 6, 10, 14, 0x1, 0xc) - Gprecalc(3, 7, 11, 15, 0x9, 0x3) - Gprecalc(0, 5, 10, 15, 0x0, 0x5) - Gprecalc(1, 6, 11, 12, 0x4, 0xf) - Gprecalc(2, 7, 8, 13, 0x6, 0x8) - Gprecalc(3, 4, 9, 14, 0xa, 0x2) - - Gprecalc(0, 4, 8, 12, 0xf, 0x6) - Gprecalc(1, 5, 9, 13, 0x9, 0xe) - Gprecalc(2, 6, 10, 14, 0x3, 0xb) - Gprecalc(3, 7, 11, 15, 0x8, 0x0) - Gprecalc(0, 5, 10, 15, 0x2, 0xc) - Gprecalc(1, 6, 11, 12, 0x7, 0xd) - Gprecalc(2, 7, 8, 13, 0x4, 0x1) - Gprecalc(3, 4, 9, 14, 0x5, 0xa) - - Gprecalc(0, 4, 8, 12, 0x2, 0xa) - Gprecalc(1, 5, 9, 13, 0x4, 0x8) - Gprecalc(2, 6, 10, 14, 0x6, 0x7) - Gprecalc(3, 7, 11, 15, 0x5, 0x1) - 
Gprecalc(0, 5, 10, 15, 0xb, 0xf) - Gprecalc(1, 6, 11, 12, 0xe, 0x9) - Gprecalc(2, 7, 8, 13, 0xc, 0x3) - Gprecalc(3, 4, 9, 14, 0x0, 0xd) - - Gprecalc(0, 4, 8, 12, 0x1, 0x0) - Gprecalc(1, 5, 9, 13, 0x3, 0x2) - Gprecalc(2, 6, 10, 14, 0x5, 0x4) - Gprecalc(3, 7, 11, 15, 0x7, 0x6) - Gprecalc(0, 5, 10, 15, 0x9, 0x8) - Gprecalc(1, 6, 11, 12, 0xb, 0xa) - Gprecalc(2, 7, 8, 13, 0xd, 0xc) - Gprecalc(3, 4, 9, 14, 0xf, 0xe) - - Gprecalc(0, 4, 8, 12, 0xa, 0xe) - Gprecalc(1, 5, 9, 13, 0x8, 0x4) - Gprecalc(2, 6, 10, 14, 0xf, 0x9) - Gprecalc(3, 7, 11, 15, 0x6, 0xd) - Gprecalc(0, 5, 10, 15, 0xc, 0x1) - Gprecalc(1, 6, 11, 12, 0x2, 0x0) - Gprecalc(2, 7, 8, 13, 0x7, 0xb) - Gprecalc(3, 4, 9, 14, 0x3, 0x5) - - Gprecalc(0, 4, 8, 12, 0x8, 0xb) - Gprecalc(1, 5, 9, 13, 0x0, 0xc) - Gprecalc(2, 6, 10, 14, 0x2, 0x5) - Gprecalc(3, 7, 11, 15, 0xd, 0xf) - Gprecalc(0, 5, 10, 15, 0xe, 0xa) - Gprecalc(1, 6, 11, 12, 0x6, 0x3) - Gprecalc(2, 7, 8, 13, 0x1, 0x7) - Gprecalc(3, 4, 9, 14, 0x4, 0x9) - - Gprecalc(0, 4, 8, 12, 0x9, 0x7) - Gprecalc(1, 5, 9, 13, 0x1, 0x3) - Gprecalc(2, 6, 10, 14, 0xc, 0xd) - Gprecalc(3, 7, 11, 15, 0xe, 0xb) - Gprecalc(0, 5, 10, 15, 0x6, 0x2) - Gprecalc(1, 6, 11, 12, 0xa, 0x5) - Gprecalc(2, 7, 8, 13, 0x0, 0x4) - Gprecalc(3, 4, 9, 14, 0x8, 0xf) - - Gprecalc(0, 4, 8, 12, 0x0, 0x9) - Gprecalc(1, 5, 9, 13, 0x7, 0x5) - Gprecalc(2, 6, 10, 14, 0x4, 0x2) - Gprecalc(3, 7, 11, 15, 0xf, 0xa) - Gprecalc(0, 5, 10, 15, 0x1, 0xe) - Gprecalc(1, 6, 11, 12, 0xc, 0xb) - Gprecalc(2, 7, 8, 13, 0x8, 0x6) - Gprecalc(3, 4, 9, 14, 0xd, 0x3) - - Gprecalc(0, 4, 8, 12, 0xc, 0x2) - Gprecalc(1, 5, 9, 13, 0xa, 0x6) - Gprecalc(2, 6, 10, 14, 0xb, 0x0) - Gprecalc(3, 7, 11, 15, 0x3, 0x8) - Gprecalc(0, 5, 10, 15, 0xd, 0x4) - Gprecalc(1, 6, 11, 12, 0x5, 0x7) - Gprecalc(2, 7, 8, 13, 0xe, 0xf) - Gprecalc(3, 4, 9, 14, 0x9, 0x1) - - uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; - outHash[0] = devectorizeswap(h[0] ^ v[0] ^ v[8]); - outHash[1] = devectorizeswap(h[1] ^ v[1] ^ v[9]); - outHash[2] = 
devectorizeswap(h[2] ^ v[2] ^ v[10]); - outHash[3] = devectorizeswap(h[3] ^ v[3] ^ v[11]); - outHash[4] = devectorizeswap(h[4] ^ v[4] ^ v[12]); - outHash[5] = devectorizeswap(h[5] ^ v[5] ^ v[13]); - outHash[6] = devectorizeswap(h[6] ^ v[6] ^ v[14]); - outHash[7] = devectorizeswap(h[7] ^ v[7] ^ v[15]); + */ + const uint2 h[8] = { + {0xf3bcc908UL, 0x6a09e667UL}, + {0x84caa73bUL, 0xbb67ae85UL}, + {0xfe94f82bUL, 0x3c6ef372UL}, + {0x5f1d36f1UL, 0xa54ff53aUL}, + {0xade682d1UL, 0x510e527fUL}, + {0x2b3e6c1fUL, 0x9b05688cUL}, + {0xfb41bd6bUL, 0x1f83d9abUL}, + {0x137e2179UL, 0x5be0cd19UL} + }; + uint2 v[16]; + uint28 *outpt = (uint28*)v; + outpt[0] = Hostprecalc[0]; + outpt[1] = Hostprecalc[1]; + outpt[2] = Hostprecalc[2]; + outpt[3] = Hostprecalc[3]; + + int i = 0; + + v[0] += (block[9] ^ c_u512[8]); + v[15] = ROR16(v[15] ^ v[0]); + v[10] += v[15]; + v[5] = ROR2(v[5] ^ v[10], 11); + + GSPREC_SP(0, 4, 8, 12) + + // Gprecalc(1, 5, 9, 13, 0x8, 0x4) + v[1] += v[5]; + v[13] = eorswap32(v[13], v[1]); + v[9] += v[13]; + + v[5] = ROR2(v[5] ^ v[9], 25); + v[1] += (pre[i++]) + v[5]; + v[13] = ROR16(v[13] ^ v[1]); + v[9] += v[13]; + v[5] = ROR2(v[5] ^ v[9], 11); + + // Gprecalc(2, 6, 10, 14, 0xf, 0x9) + v[2] += (block[9] ^ c_u512[0xf]); + v[14] = eorswap32(v[14], v[2]); + v[10] += v[14]; + v[6] = ROR2(v[6] ^ v[10], 25); + v[2] += pre[i++] + v[6]; + v[14] = ROR16(v[14] ^ v[2]); + v[10] += v[14]; + v[6] = ROR2(v[6] ^ v[10], 11); + + // Gprecalc(3, 7, 11, 15, 0x6, 0xd) + v[15] = eorswap32(v[15], v[3]); + v[11] += v[15]; + v[7] = ROR2(v[7] ^ v[11], 25); + v[3] += pre[i++] + v[7]; + v[15] = ROR16(v[15] ^ v[3]); + v[11] += v[15]; + v[7] = ROR2(v[7] ^ v[11], 11); + + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP_HI(3, 4, 9, 14, 0x4, 0x9) + + 
GSPREC_SP_LO(0, 4, 8, 12, 0x9, 0x7) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + + GSPREC_SP_HI(0, 4, 8, 12, 0x0, 0x9) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP_LO(3, 4, 9, 14, 0x9, 0x1) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP_HI(2, 7, 8, 13, 0x2, 0x9) + GSPREC_SP(3, 4, 9, 14) + + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP_LO(3, 7, 11, 15, 0x9, 0x3) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP_LO(1, 5, 9, 13, 0x9, 0xe) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP_HI(1, 6, 11, 12, 0xe, 0x9) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP_LO(0, 5, 10, 15, 0x9, 0x8) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP_HI(2, 6, 10, 14, 0xf, 0x9) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + 
GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP_HI(3, 4, 9, 14, 0x4, 0x9) + + GSPREC_SP_LO(0, 4, 8, 12, 0x9, 0x7) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP_HI(0, 4, 8, 12, 0x0, 0x9) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP(3, 4, 9, 14) + + GSPREC_SP(0, 4, 8, 12) + GSPREC_SP(1, 5, 9, 13) + GSPREC_SP(2, 6, 10, 14) + GSPREC_SP(3, 7, 11, 15) + GSPREC_SP(0, 5, 10, 15) + GSPREC_SP(1, 6, 11, 12) + GSPREC_SP(2, 7, 8, 13) + GSPREC_SP_LO(3, 4, 9, 14, 0x9, 0x1) + + + /* Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + 
Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + Gprecalc(0, 4, 8, 12, 0x5, 0xc) + Gprecalc(1, 5, 9, 13, 0xf, 0x1) + Gprecalc(2, 6, 10, 14, 0xd, 0xe) + Gprecalc(3, 7, 11, 15, 0xa, 0x4) + Gprecalc(0, 5, 10, 15, 0x7, 0x0) + Gprecalc(1, 6, 11, 12, 0x3, 0x6) + Gprecalc(2, 7, 8, 13, 0x2, 0x9) + Gprecalc(3, 4, 9, 14, 0xb, 0x8) + Gprecalc(0, 4, 8, 12, 0xb, 0xd) + Gprecalc(1, 5, 9, 13, 0xe, 0x7) + Gprecalc(2, 6, 10, 14, 0x1, 0xc) + Gprecalc(3, 7, 11, 15, 0x9, 0x3) + Gprecalc(0, 5, 10, 15, 0x0, 0x5) + Gprecalc(1, 6, 11, 12, 0x4, 0xf) + Gprecalc(2, 7, 8, 13, 0x6, 0x8) + Gprecalc(3, 4, 9, 14, 0xa, 0x2) + Gprecalc(0, 4, 8, 12, 0xf, 0x6) + Gprecalc(1, 5, 9, 13, 0x9, 0xe) + Gprecalc(2, 6, 10, 14, 0x3, 0xb) + Gprecalc(3, 7, 11, 15, 0x8, 0x0) + Gprecalc(0, 5, 10, 15, 0x2, 0xc) + Gprecalc(1, 6, 11, 12, 0x7, 0xd) + Gprecalc(2, 7, 8, 13, 0x4, 0x1) + Gprecalc(3, 4, 9, 14, 0x5, 0xa) + Gprecalc(0, 4, 8, 12, 0x2, 0xa) + Gprecalc(1, 5, 9, 13, 0x4, 0x8) + Gprecalc(2, 6, 10, 14, 0x6, 0x7) + Gprecalc(3, 7, 11, 15, 0x5, 0x1) + Gprecalc(0, 5, 10, 15, 0xb, 0xf) + Gprecalc(1, 6, 11, 12, 0xe, 0x9) + Gprecalc(2, 7, 8, 13, 0xc, 0x3) + Gprecalc(3, 4, 9, 14, 0x0, 0xd) + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) 
+ Gprecalc(3, 4, 9, 14, 0x4, 0x9) + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + */ + v[0] = cuda_swap(h[0] ^ v[0] ^ v[8]); + v[1] = cuda_swap(h[1] ^ v[1] ^ v[9]); + v[2] = cuda_swap(h[2] ^ v[2] ^ v[10]); + v[3] = cuda_swap(h[3] ^ v[3] ^ v[11]); + v[4] = cuda_swap(h[4] ^ v[4] ^ v[12]); + v[5] = cuda_swap(h[5] ^ v[5] ^ v[13]); + v[6] = cuda_swap(h[6] ^ v[6] ^ v[14]); + v[7] = cuda_swap(h[7] ^ v[7] ^ v[15]); + + uint28 *phash = (uint28*)v; + outpt = (uint28*)&outputHash[8 * thread]; + outpt[0] = phash[0]; + outpt[1] = phash[1]; } } + // ---------------------------- END CUDA quark_blake512 functions ------------------------------------ +__host__ void quark_blake512_cpu_init(int thr_id) +{ + CUDA_SAFE_CALL(cudaMalloc(&c_PaddedMessage80[thr_id], 10 * sizeof(uint2))); +} -// Blake512 für 80 Byte grosse Eingangsdaten -__host__ void quark_blake512_cpu_setBlock_80(void *pdata) +__host__ void quark_blake512_cpu_setBlock_80_multi(uint32_t thr_id, uint64_t *pdata) { - // Message mit Padding bereitstellen - // lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen. 
- unsigned char PaddedMessage[128]; - memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 48); - PaddedMessage[80] = 0x80; - PaddedMessage[111] = 1; - PaddedMessage[126] = 0x02; - PaddedMessage[127] = 0x80; - for (int i = 0; i < 16; i++) - ((uint64_t*)PaddedMessage)[i] = cuda_swab64(((uint64_t*)PaddedMessage)[i]); - CUDA_SAFE_CALL( - cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice) - ); + uint64_t PaddedMessage[10]; + for(int i = 0; i < 10; i++) + PaddedMessage[i] = cuda_swab64(pdata[i]); + CUDA_SAFE_CALL(cudaMemcpy(c_PaddedMessage80[thr_id], PaddedMessage, 10 * sizeof(uint64_t), cudaMemcpyHostToDevice)); } +__host__ void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata) +{ + const uint64_t u512[16] = + { + 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, + 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, + 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, + 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, + 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, + 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, + 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, + 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL + }; + + uint64_t h[8] = { + 0x6a09e667f3bcc908ULL, + 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, + 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, + 0x5be0cd19137e2179ULL + }; + + uint64_t v[16] = + { + h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], + u512[0], u512[1], u512[2], u512[3], u512[4] ^ 640, u512[5] ^ 640, u512[6], u512[7] + }; + + uint64_t PaddedMessage[10]; + uint64_t block[16]; + uint64_t prehost[224]; + + for(int i = 0; i < 10; i++) + { + PaddedMessage[i] = cuda_swab64(pdata[i]); + block[i] = PaddedMessage[i]; + } + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_PaddedM, PaddedMessage, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + + block[10] = 0x8000000000000000; + block[11] = 0; + block[12] = 0; + block[13] = 1; + block[14] 
= 0; + block[15] = 0x280; + + GprecalcHost(0, 4, 8, 12, 0x1, 0x0) + GprecalcHost(1, 5, 9, 13, 0x3, 0x2) + GprecalcHost(2, 6, 10, 14, 0x5, 0x4) + GprecalcHost(3, 7, 11, 15, 0x7, 0x6) + + GprecalcHost(1, 6, 11, 12, 0xb, 0xa) + GprecalcHost(2, 7, 8, 13, 0xd, 0xc) + + v[0] += (block[8] ^ u512[9]) + v[5]; + v[15] = ROTR64(v[15] ^ v[0], 32); + v[10] += v[15]; + v[5] = ROTR64(v[5] ^ v[10], 25); + v[0] += v[5]; + + GprecalcHost(3, 4, 9, 14, 0xf, 0xe); + + v[1] += (block[0x4] ^ u512[0x8]); + v[2] += v[6]; + v[3] += (block[0xd] ^ u512[6]) + v[7]; + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(Hostprecalc, v, 16 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + + int i = 0; + RSPRECHOST(0xa, 0xe); + prehost[i++] = block[8] ^ u512[4]; + prehost[i++] = block[0xf] ^ u512[9]; + prehost[i++] = block[6] ^ u512[0xd]; + + + RSPRECHOST(0xc, 0x1) + RSPRECHOST(0x2, 0x0) + RSPRECHOST(0x7, 0xb) + RSPRECHOST(0x3, 0x5) + + RSPRECHOST(0x8, 0xb) + RSPRECHOST(0x0, 0xc) + RSPRECHOST(0x2, 0x5) + RSPRECHOST(0xd, 0xf) + RSPRECHOST(0xe, 0xa) + RSPRECHOST(0x6, 0x3) + RSPRECHOST(0x1, 0x7) + RSPRECHOSTHI(0x4, 0x9) + + RSPRECHOSTLO(0x9, 0x7) + RSPRECHOST(0x1, 0x3) + RSPRECHOST(0xc, 0xd) + RSPRECHOST(0xe, 0xb) + RSPRECHOST(0x6, 0x2) + RSPRECHOST(0xa, 0x5) + RSPRECHOST(0x0, 0x4) + RSPRECHOST(0x8, 0xf) + + RSPRECHOSTHI(0, 0x9) + RSPRECHOST(0x7, 0x5) + RSPRECHOST(0x4, 0x2) + RSPRECHOST(0xf, 0xa) + RSPRECHOST(0x1, 0xe) + RSPRECHOST(0xc, 0xb) + RSPRECHOST(0x8, 0x6) + RSPRECHOST(0xd, 0x3) + + RSPRECHOST(0xc, 0x2) + RSPRECHOST(0xa, 0x6) + RSPRECHOST(0xb, 0x0) + RSPRECHOST(0x3, 0x8) + RSPRECHOST(0xd, 0x4) + RSPRECHOST(0x5, 0x7) + RSPRECHOST(0xe, 0xf) + RSPRECHOSTLO(0x9, 0x1) + + RSPRECHOST(0x5, 0xc) + RSPRECHOST(0xf, 0x1) + RSPRECHOST(0xd, 0xe) + RSPRECHOST(0xa, 0x4) + RSPRECHOST(0x7, 0x0) + RSPRECHOST(0x3, 0x6) + RSPRECHOSTHI(0x2, 0x9) + RSPRECHOST(0xb, 0x8) + + RSPRECHOST(0xb, 0xd) + RSPRECHOST(0xe, 0x7) + RSPRECHOST(0x1, 0xc) + RSPRECHOSTLO(0x9, 0x3) + RSPRECHOST(0x0, 0x5) + RSPRECHOST(0x4, 
0xf) + RSPRECHOST(0x6, 0x8) + RSPRECHOST(0xa, 0x2) + + RSPRECHOST(0xf, 0x6) + RSPRECHOSTLO(0x9, 0xe) + RSPRECHOST(0x3, 0xb) + RSPRECHOST(0x8, 0x0) + RSPRECHOST(0x2, 0xc) + RSPRECHOST(0x7, 0xd) + RSPRECHOST(0x4, 0x1) + RSPRECHOST(0x5, 0xa) + + RSPRECHOST(0x2, 0xa) + RSPRECHOST(0x4, 0x8) + RSPRECHOST(0x6, 0x7) + RSPRECHOST(0x5, 0x1) + RSPRECHOST(0xb, 0xf) + RSPRECHOSTHI(0xe, 0x9) + RSPRECHOST(0xc, 0x3) + RSPRECHOST(0x0, 0xd) + + RSPRECHOST(0x1, 0x0) + RSPRECHOST(0x3, 0x2) + RSPRECHOST(0x5, 0x4) + RSPRECHOST(0x7, 0x6) + RSPRECHOSTLO(0x9, 0x8) + RSPRECHOST(0xb, 0xa) + RSPRECHOST(0xd, 0xc) + RSPRECHOST(0xf, 0xe) + + RSPRECHOST(0xa, 0xe) + RSPRECHOST(0x8, 0x4) + RSPRECHOSTHI(0xf, 0x9) + RSPRECHOST(0x6, 0xd) + RSPRECHOST(0xc, 0x1) + RSPRECHOST(0x2, 0x0) + RSPRECHOST(0x7, 0xb) + RSPRECHOST(0x3, 0x5) + + RSPRECHOST(0x8, 0xb) + RSPRECHOST(0x0, 0xc) + RSPRECHOST(0x2, 0x5) + RSPRECHOST(0xd, 0xf) + RSPRECHOST(0xe, 0xa) + RSPRECHOST(0x6, 0x3) + RSPRECHOST(0x1, 0x7) + RSPRECHOSTHI(0x4, 0x9) + + RSPRECHOSTLO(0x9, 0x7) + RSPRECHOST(0x1, 0x3) + RSPRECHOST(0xc, 0xd) + RSPRECHOST(0xe, 0xb) + RSPRECHOST(0x6, 0x2) + RSPRECHOST(0xa, 0x5) + RSPRECHOST(0x0, 0x4) + RSPRECHOST(0x8, 0xf) + + RSPRECHOSTHI(0x0, 0x9) + RSPRECHOST(0x7, 0x5) + RSPRECHOST(0x4, 0x2) + RSPRECHOST(0xf, 0xa) + RSPRECHOST(0x1, 0xe) + RSPRECHOST(0xc, 0xb) + RSPRECHOST(0x8, 0x6) + RSPRECHOST(0xd, 0x3) + + RSPRECHOST(0xc, 0x2) + RSPRECHOST(0xa, 0x6) + RSPRECHOST(0xb, 0x0) + RSPRECHOST(0x3, 0x8) + RSPRECHOST(0xd, 0x4) + RSPRECHOST(0x5, 0x7) + RSPRECHOST(0xe, 0xf) + RSPRECHOSTLO(0x9, 0x1) + + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(pre, prehost, 224 * 8, 0, cudaMemcpyHostToDevice, gpustream[thr_id])); +} -__host__ void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order) +__host__ void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash) { const uint32_t 
threadsperblock = 32; // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - quark_blake512_gpu_hash_64<<>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash); + quark_blake512_gpu_hash_64 << > >(threads, startNounce, d_nonceVector, (uint2 *)d_outputHash); } -__host__ void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order) +__host__ void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { const uint32_t threadsperblock = 32; // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - quark_blake512_gpu_hash_80<<>>(threads, startNounce, d_outputHash); -// MyStreamSynchronize(NULL, order, thr_id); + quark_blake512_gpu_hash_80 << > >(threads, startNounce, (uint2 *)d_outputHash); + CUDA_SAFE_CALL(cudaGetLastError()); } diff --git a/quark/cuda_quark_blake512.cu.orig b/quark/cuda_quark_blake512.cu.orig new file mode 100644 index 0000000000..1a16b9e65b --- /dev/null +++ b/quark/cuda_quark_blake512.cu.orig @@ -0,0 +1,804 @@ +#include +#include + +#include "cuda_helper.h" + +#define ROTR(x,n) ROTR64(x,n) + +#define USE_SHUFFLE 0 + +__constant__ uint2 c_PaddedM[16]; +__constant__ uint2 Hostprecalc[16]; + +// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------ + +#define Gprecalc(a,b,c,d,idx1,idx2) { \ + v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \ + v[d] = SWAPDWORDS2( v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 25); \ + v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \ + v[d] = ROR16(v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 11); \ + } + + +#define GprecalcHost(a,b,c,d,idx1,idx2) { 
\ + v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \ + v[d] = ROTR64( v[d] ^ v[a],32); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 25); \ + v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \ + v[d] = ROTR64(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 11); \ + } + + +__global__ +#if __CUDA_ARCH__ > 500 + __launch_bounds__(256, 1) +#else + __launch_bounds__(256, 2) +#endif +void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint64_t *const __restrict__ g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + +#if USE_SHUFFLE + const int warpID = threadIdx.x & 0x0F; // 16 warps + const int warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke + const int maxhashPosition = thread<<3; +#endif + +#if USE_SHUFFLE + if (warpBlockID < ( (threads+15)>>4 )) +#else + if (thread < threads) +#endif + { + const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + + const uint32_t hashPosition = nounce - startNounce; + + uint64_t *inpHash = &g_hash[hashPosition*8]; + uint2 block[16]; + block[0] = vectorizeswap(inpHash[0]); + block[1] = vectorizeswap(inpHash[1]); + block[2] = vectorizeswap(inpHash[2]); + block[3] = vectorizeswap(inpHash[3]); + block[4] = vectorizeswap(inpHash[4]); + block[5] = vectorizeswap(inpHash[5]); + block[6] = vectorizeswap(inpHash[6]); + block[7] = vectorizeswap(inpHash[7]); + block[8] = vectorizehigh(0x80000000); + block[9] = vectorizelow(0x0); + block[10] = vectorizelow(0x0); + block[11] = vectorizelow(0x0); + block[12] = vectorizelow(0x0); + block[13] = vectorizelow(0x1); + block[14] = vectorizelow(0x0); + block[15] = vectorizelow(0x200); + + const uint2 h[8] = + { + { 0xf3bcc908UL, 0x6a09e667UL }, + { 0x84caa73bUL, 0xbb67ae85UL }, + { 0xfe94f82bUL, 0x3c6ef372UL }, + { 0x5f1d36f1UL, 0xa54ff53aUL }, + { 0xade682d1UL, 0x510e527fUL }, + { 0x2b3e6c1fUL, 0x9b05688cUL }, + { 0xfb41bd6bUL, 0x1f83d9abUL }, 
+ { 0x137e2179UL, 0x5be0cd19UL } + }; + const uint2 u512[16] = + { + { 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e }, + { 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 }, + { 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf }, + { 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 }, + { 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 }, + { 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed }, + { 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 }, + { 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 } + }; + + uint2 v[16] = + { + h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], + u512[0], u512[1], u512[2], u512[3], u512[4] ^ 512, u512[5] ^ 512, u512[6], u512[7] + }; + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 
10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + Gprecalc(0, 4, 8, 12, 0x5, 0xc) + Gprecalc(1, 5, 9, 13, 0xf, 0x1) + Gprecalc(2, 6, 10, 14, 0xd, 0xe) + Gprecalc(3, 7, 11, 15, 0xa, 0x4) + Gprecalc(0, 5, 10, 15, 0x7, 0x0) + Gprecalc(1, 6, 11, 12, 0x3, 0x6) + Gprecalc(2, 7, 8, 13, 0x2, 0x9) + Gprecalc(3, 4, 9, 14, 0xb, 0x8) + + Gprecalc(0, 4, 8, 12, 0xb, 0xd) + Gprecalc(1, 5, 9, 13, 0xe, 0x7) + Gprecalc(2, 6, 10, 14, 0x1, 0xc) + Gprecalc(3, 7, 11, 15, 0x9, 0x3) + Gprecalc(0, 5, 10, 15, 0x0, 0x5) + Gprecalc(1, 6, 11, 12, 0x4, 0xf) + Gprecalc(2, 7, 8, 13, 0x6, 0x8) + Gprecalc(3, 4, 9, 14, 0xa, 0x2) + + Gprecalc(0, 4, 8, 12, 0xf, 0x6) + Gprecalc(1, 5, 9, 13, 0x9, 0xe) + Gprecalc(2, 6, 10, 14, 0x3, 0xb) + Gprecalc(3, 7, 11, 15, 0x8, 0x0) + Gprecalc(0, 5, 10, 15, 0x2, 0xc) + Gprecalc(1, 6, 11, 12, 0x7, 0xd) + Gprecalc(2, 7, 8, 13, 0x4, 0x1) + Gprecalc(3, 4, 9, 14, 0x5, 0xa) + + Gprecalc(0, 4, 8, 12, 0x2, 0xa) + Gprecalc(1, 5, 9, 13, 0x4, 0x8) + Gprecalc(2, 6, 10, 14, 0x6, 0x7) + Gprecalc(3, 7, 11, 15, 0x5, 0x1) + Gprecalc(0, 5, 10, 15, 0xb, 0xf) + Gprecalc(1, 6, 11, 12, 0xe, 0x9) + Gprecalc(2, 7, 8, 13, 0xc, 0x3) + Gprecalc(3, 4, 9, 14, 0x0, 0xd) + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) 
+ Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + uint64_t *outHash = &g_hash[8 * hashPosition]; + + outHash[0] = devectorizeswap(h[0] ^ v[0] ^ v[8]); + outHash[1] = devectorizeswap(h[1] ^ v[1] ^ v[9]); + outHash[2] = devectorizeswap(h[2] ^ v[2] ^ v[10]); + outHash[3] = devectorizeswap(h[3] ^ v[3] ^ v[11]); + outHash[4] = devectorizeswap(h[4] ^ v[4] ^ v[12]); + outHash[5] = devectorizeswap(h[5] ^ v[5] ^ v[13]); + outHash[6] = devectorizeswap(h[6] ^ v[6] ^ v[14]); + outHash[7] = devectorizeswap(h[7] ^ v[7] ^ v[15]); + } +} + +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(256, 4) +#else +__launch_bounds__(32, 32) +#endif +void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *outputHash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nounce = 
startNounce + thread; + uint2 block[16]; + + block[0] = c_PaddedM[0]; + block[1] = c_PaddedM[1]; + block[2] = c_PaddedM[2]; + block[3] = c_PaddedM[3]; + block[4] = c_PaddedM[4]; + block[5] = c_PaddedM[5]; + block[6] = c_PaddedM[6]; + block[7] = c_PaddedM[7]; + block[8] = c_PaddedM[8]; + block[9] = c_PaddedM[9]; + block[10] = vectorizehigh(0x80000000); + block[11] = vectorize(0); + block[12] = vectorize(0); + block[13] = vectorize(0x0000000000000001); + block[14] = vectorize(0); + block[15] = vectorize(0x0000000000000280); + block[9].x = nounce; + const uint2 u512[16] = + { + { 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e }, + { 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 }, + { 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf }, + { 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 }, + { 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 }, + { 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed }, + { 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 }, + { 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 } + }; + + const uint2 h[8] = { + { 0xf3bcc908UL, 0x6a09e667UL }, + { 0x84caa73bUL, 0xbb67ae85UL }, + { 0xfe94f82bUL, 0x3c6ef372UL }, + { 0x5f1d36f1UL, 0xa54ff53aUL }, + { 0xade682d1UL, 0x510e527fUL }, + { 0x2b3e6c1fUL, 0x9b05688cUL }, + { 0xfb41bd6bUL, 0x1f83d9abUL }, + { 0x137e2179UL, 0x5be0cd19UL } + }; + + uint2 v[16] = + { + h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], + u512[0], u512[1], u512[2], u512[3], u512[4] ^ 640, u512[5] ^ 640, u512[6], u512[7] + }; + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + 
Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + Gprecalc(0, 4, 8, 12, 0x5, 0xc) + Gprecalc(1, 5, 9, 13, 0xf, 0x1) + Gprecalc(2, 6, 10, 14, 0xd, 0xe) + Gprecalc(3, 7, 11, 15, 0xa, 0x4) + Gprecalc(0, 5, 10, 15, 0x7, 0x0) + Gprecalc(1, 6, 11, 12, 0x3, 0x6) + Gprecalc(2, 7, 8, 13, 0x2, 0x9) + Gprecalc(3, 4, 9, 14, 0xb, 0x8) + + Gprecalc(0, 4, 8, 12, 0xb, 0xd) + Gprecalc(1, 5, 9, 13, 0xe, 0x7) + Gprecalc(2, 6, 10, 14, 0x1, 0xc) + Gprecalc(3, 7, 11, 15, 0x9, 0x3) + Gprecalc(0, 5, 10, 15, 0x0, 0x5) + Gprecalc(1, 6, 11, 12, 0x4, 0xf) + Gprecalc(2, 7, 8, 13, 0x6, 0x8) + Gprecalc(3, 4, 9, 14, 0xa, 0x2) + + Gprecalc(0, 4, 8, 12, 0xf, 0x6) + Gprecalc(1, 5, 9, 13, 0x9, 0xe) + Gprecalc(2, 6, 10, 14, 0x3, 0xb) + Gprecalc(3, 7, 11, 15, 0x8, 0x0) + Gprecalc(0, 5, 10, 15, 0x2, 0xc) + Gprecalc(1, 6, 11, 12, 0x7, 0xd) + Gprecalc(2, 7, 8, 
13, 0x4, 0x1) + Gprecalc(3, 4, 9, 14, 0x5, 0xa) + + Gprecalc(0, 4, 8, 12, 0x2, 0xa) + Gprecalc(1, 5, 9, 13, 0x4, 0x8) + Gprecalc(2, 6, 10, 14, 0x6, 0x7) + Gprecalc(3, 7, 11, 15, 0x5, 0x1) + Gprecalc(0, 5, 10, 15, 0xb, 0xf) + Gprecalc(1, 6, 11, 12, 0xe, 0x9) + Gprecalc(2, 7, 8, 13, 0xc, 0x3) + Gprecalc(3, 4, 9, 14, 0x0, 0xd) + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + 
uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; + outHash[0] = devectorizeswap(h[0] ^ v[0] ^ v[8]); + outHash[1] = devectorizeswap(h[1] ^ v[1] ^ v[9]); + outHash[2] = devectorizeswap(h[2] ^ v[2] ^ v[10]); + outHash[3] = devectorizeswap(h[3] ^ v[3] ^ v[11]); + outHash[4] = devectorizeswap(h[4] ^ v[4] ^ v[12]); + outHash[5] = devectorizeswap(h[5] ^ v[5] ^ v[13]); + outHash[6] = devectorizeswap(h[6] ^ v[6] ^ v[14]); + outHash[7] = devectorizeswap(h[7] ^ v[7] ^ v[15]); + } +} + +<<<<<<< HEAD +======= + + +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(32) +#else +__launch_bounds__(32, 16) +#endif +void quark_blake512_gpu_hash_80_multi(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ outputHash, const uint2*const __restrict__ c_PaddedMessage) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2 block[16]; + const uint32_t nounce = startNounce + thread; + + block[0] = c_PaddedMessage[0]; + block[1] = c_PaddedMessage[1]; + block[2] = c_PaddedMessage[2]; + block[3] = c_PaddedMessage[3]; + block[4] = c_PaddedMessage[4]; + block[5] = c_PaddedMessage[5]; + block[6] = c_PaddedMessage[6]; + block[7] = c_PaddedMessage[7]; + block[8] = c_PaddedMessage[8]; + block[9] = c_PaddedMessage[9]; + block[10] = vectorizehigh(0x80000000); + block[11] = vectorizelow(0); + block[12] = vectorizelow(0); + block[13] = vectorizelow(0x1); + block[14] = vectorizelow(0); + block[15] = vectorizelow(0x280); + block[9].x = nounce; + + const uint2 u512[16] = + { + { 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e }, + { 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 }, + { 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf }, + { 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 }, + { 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 }, + { 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed }, + { 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 }, + { 0x858efc16UL, 
0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 } + }; + + const uint2 h[8] = { + { 0xf3bcc908UL,0x6a09e667UL }, + { 0x84caa73bUL ,0xbb67ae85UL }, + { 0xfe94f82bUL,0x3c6ef372UL }, + { 0x5f1d36f1UL,0xa54ff53aUL }, + { 0xade682d1UL,0x510e527fUL }, + { 0x2b3e6c1fUL,0x9b05688cUL }, + { 0xfb41bd6bUL,0x1f83d9abUL }, + { 0x137e2179UL,0x5be0cd19UL } + }; + + uint2 v[16] = + { + Hostprecalc[0], Hostprecalc[1], Hostprecalc[2], Hostprecalc[3], Hostprecalc[4], Hostprecalc[5], + Hostprecalc[6], Hostprecalc[7], Hostprecalc[8], Hostprecalc[9], Hostprecalc[10], Hostprecalc[11], + Hostprecalc[12], Hostprecalc[13], Hostprecalc[14], Hostprecalc[15], + }; + +// Gprecalc(0, 4, 8, 12, 0x1, 0x0) +// Gprecalc(1, 5, 9, 13, 0x3, 0x2) +// Gprecalc(2, 6, 10, 14, 0x5, 0x4) +// Gprecalc(3, 7, 11, 15, 0x7, 0x6) + + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 
0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + Gprecalc(0, 4, 8, 12, 0x5, 0xc) + Gprecalc(1, 5, 9, 13, 0xf, 0x1) + Gprecalc(2, 6, 10, 14, 0xd, 0xe) + Gprecalc(3, 7, 11, 15, 0xa, 0x4) + Gprecalc(0, 5, 10, 15, 0x7, 0x0) + Gprecalc(1, 6, 11, 12, 0x3, 0x6) + Gprecalc(2, 7, 8, 13, 0x2, 0x9) + Gprecalc(3, 4, 9, 14, 0xb, 0x8) + + Gprecalc(0, 4, 8, 12, 0xb, 0xd) + Gprecalc(1, 5, 9, 13, 0xe, 0x7) + Gprecalc(2, 6, 10, 14, 0x1, 0xc) + Gprecalc(3, 7, 11, 15, 0x9, 0x3) + Gprecalc(0, 5, 10, 15, 0x0, 0x5) + Gprecalc(1, 6, 11, 12, 0x4, 0xf) + Gprecalc(2, 7, 8, 13, 0x6, 0x8) + Gprecalc(3, 4, 9, 14, 0xa, 0x2) + + Gprecalc(0, 4, 8, 12, 0xf, 0x6) + Gprecalc(1, 5, 9, 13, 0x9, 0xe) + Gprecalc(2, 6, 10, 14, 0x3, 0xb) + Gprecalc(3, 7, 11, 15, 0x8, 0x0) + Gprecalc(0, 5, 10, 15, 0x2, 0xc) + Gprecalc(1, 6, 11, 12, 0x7, 0xd) + Gprecalc(2, 7, 8, 13, 0x4, 0x1) + Gprecalc(3, 4, 9, 14, 0x5, 0xa) + + Gprecalc(0, 4, 8, 12, 0x2, 0xa) + Gprecalc(1, 5, 9, 13, 0x4, 0x8) + Gprecalc(2, 6, 10, 14, 0x6, 0x7) + Gprecalc(3, 7, 11, 15, 0x5, 0x1) + Gprecalc(0, 5, 10, 15, 0xb, 0xf) + Gprecalc(1, 6, 11, 12, 0xe, 0x9) + Gprecalc(2, 7, 8, 13, 0xc, 0x3) + Gprecalc(3, 4, 9, 14, 0x0, 0xd) + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + 
Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; + outHash[0] = devectorizeswap(h[0] ^ v[0] ^ v[8]); + outHash[1] = devectorizeswap(h[1] ^ v[1] ^ v[9]); + outHash[2] = devectorizeswap(h[2] ^ v[2] ^ v[10]); + outHash[3] = devectorizeswap(h[3] ^ v[3] ^ v[11]); + outHash[4] = devectorizeswap(h[4] ^ v[4] ^ v[12]); + outHash[5] = devectorizeswap(h[5] ^ v[5] ^ v[13]); + outHash[6] = devectorizeswap(h[6] ^ v[6] ^ v[14]); + outHash[7] = devectorizeswap(h[7] ^ v[7] ^ v[15]); + } +} + + +>>>>>>> 4221eab... Faster quark/x11 Precalculated 1/32 of blake with the cpu. 
+// ---------------------------- END CUDA quark_blake512 functions ------------------------------------ + +__host__ void quark_blake512_cpu_init(int thr_id) +{ +} + +__host__ void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata) +{ + uint64_t PaddedMessage[10]; + for (int i = 0; i < 10; i++) + PaddedMessage[i] = cuda_swab64(pdata[i]); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_PaddedM, PaddedMessage, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + + uint64_t block[16]; + + uint64_t *peker = (uint64_t *)&PaddedMessage[0]; + + block[0] = peker[0]; + block[1] = peker[1]; + block[2] = peker[2]; + block[3] = peker[3]; + block[4] = peker[4]; + block[5] = peker[5]; + block[6] = peker[6]; + block[7] = peker[7]; + block[8] = peker[8]; + block[9] = peker[9]; + block[10] = 0x8000000000000000; + block[11] = 0; + block[12] = 0; + block[13] = 1; + block[14] = 0; + block[15] = 280; + + const uint64_t u512[16] = + { + 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, + 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, + 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, + 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, + 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, + 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, + 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, + 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL + }; + + uint64_t h[8] = { + 0x6a09e667f3bcc908ULL, + 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, + 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, + 0x5be0cd19137e2179ULL + }; + + uint64_t v[16] = + { + h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], + u512[0], u512[1], u512[2], u512[3], u512[4] ^ 640, u512[5] ^ 640, u512[6], u512[7] + }; + + GprecalcHost(0, 4, 8, 12, 0x1, 0x0) + GprecalcHost(1, 5, 9, 13, 0x3, 0x2) + GprecalcHost(2, 6, 10, 14, 0x5, 0x4) + GprecalcHost(3, 7, 11, 15, 0x7, 0x6) + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Hostprecalc, &v[0], 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); + + 
+} + + +__host__ void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash) +{ + const uint32_t threadsperblock = 32; + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + quark_blake512_gpu_hash_64<<>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash); +} + +__host__ void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) +{ + + const uint32_t threadsperblock = 32; + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + quark_blake512_gpu_hash_80 << >>(threads, startNounce, d_outputHash); + // MyStreamSynchronize(NULL, order, thr_id); +} diff --git a/quark/cuda_quark_compactionTest.cu b/quark/cuda_quark_compactionTest.cu index acf7534c99..e89b058cf0 100644 --- a/quark/cuda_quark_compactionTest.cu +++ b/quark/cuda_quark_compactionTest.cu @@ -12,14 +12,14 @@ static uint32_t *d_partSum[2][MAX_GPUS]; // fuer bis zu vier partielle Summen // True/False tester -typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash); +typedef uint32_t(*cuda_compactTestFunction_t)(const uint32_t *inpHash); -__device__ uint32_t QuarkTrueTest(uint32_t *inpHash) +__device__ __forceinline__ uint32_t QuarkTrueTest(const uint32_t *inpHash) { return ((inpHash[0] & 0x08) == 0x08); } -__device__ uint32_t QuarkFalseTest(uint32_t *inpHash) +__device__ __forceinline__ uint32_t QuarkFalseTest(const uint32_t *inpHash) { return ((inpHash[0] & 0x08) == 0); } @@ -31,31 +31,23 @@ cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[M // Setup-Funktionen __host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads) { - cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t)); - 
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t)); + CUDA_SAFE_CALL(cudaMemcpyFromSymbolAsync(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t), 0, cudaMemcpyDeviceToHost, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMemcpyFromSymbolAsync(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t), 0, cudaMemcpyDeviceToHost, gpustream[thr_id])); // wir brauchen auch Speicherplatz auf dem Device - cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2); - cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t)); - cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2)); + CUDA_SAFE_CALL(cudaMalloc(&d_numValid[thr_id], 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMallocHost(&h_numValid[thr_id], 2 * sizeof(uint32_t))); uint32_t s1; s1 = (threads / 256) * 2; - cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) - cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) + CUDA_SAFE_CALL(cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1)); // BLOCKSIZE (Threads/Block) + CUDA_SAFE_CALL(cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1)); // BLOCKSIZE (Threads/Block) } -#if __CUDA_ARCH__ < 300 -/** - * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 - */ -#undef __shfl_up -#define __shfl_up(var, delta, width) (0) -#endif - // Die Summenfunktion (vom NVIDIA SDK) -__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) +__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, 
cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, const uint32_t *inpHashes=NULL, const uint32_t *d_validNonceTable=NULL) { __shared__ uint32_t sums[32]; int id = ((blockIdx.x * blockDim.x) + threadIdx.x); @@ -75,18 +67,16 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t * { if (id < threads) { - uint32_t *inpHash; if(d_validNonceTable == NULL) { // keine Nonce-Liste - inpHash = &inpHashes[id<<4]; + value = (*testFunc)(&inpHashes[id << 4]); }else { // Nonce-Liste verfügbar int nonce = d_validNonceTable[id] - startNounce; - inpHash = &inpHashes[nonce<<4]; + value = (*testFunc)(&inpHashes[nonce << 4]); } - value = (*testFunc)(inpHash); }else { value = 0; @@ -167,7 +157,7 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t * } // Uniform add: add partial sums array -__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len) +__global__ void quark_compactTest_gpu_ADD(uint32_t *data, const uint32_t *partial_sums, int len) { __shared__ uint32_t buf; int id = ((blockIdx.x * blockDim.x) + threadIdx.x); @@ -184,28 +174,26 @@ __global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums } // Der Scatter -__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) +__global__ void quark_compactTest_gpu_SCATTER(const uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, const uint32_t *inpHashes=NULL, const uint32_t *d_validNonceTable=NULL) { int id = ((blockIdx.x * blockDim.x) + threadIdx.x); uint32_t actNounce = id; uint32_t value; if (id < threads) { -// uint32_t nounce = startNounce + id; - uint32_t *inpHash; +// const uint32_t nounce = startNounce + id; if(d_validNonceTable == NULL) { // keine Nonce-Liste - 
inpHash = &inpHashes[id<<4]; + value = (*testFunc)(&inpHashes[id << 4]); }else { // Nonce-Liste verfügbar int nonce = d_validNonceTable[id] - startNounce; actNounce = nonce; - inpHash = &inpHashes[nonce<<4]; + value = (*testFunc)(&inpHashes[nonce << 4]); } - value = (*testFunc)(inpHash); }else { value = 0; @@ -235,7 +223,7 @@ __host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val) __host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1, cuda_compactTestFunction_t function, - uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) + uint32_t startNounce, const uint32_t *inpHashes, const uint32_t *d_validNonceTable) { int orgThreads = threads; threads = (int)quark_compactTest_roundUpExp((uint32_t)threads); @@ -251,50 +239,62 @@ __host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t thread bool callThrid = (thr2 > 0) ? true : false; // Erster Initialscan - quark_compactTest_gpu_SCAN<<>>( - d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable); + quark_compactTest_gpu_SCAN<<>>(d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable); + CUDA_SAFE_CALL(cudaGetLastError()); // weitere Scans if(callThrid) { - quark_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); - quark_compactTest_gpu_SCAN<<<1, thr2>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); - }else + quark_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); + CUDA_SAFE_CALL(cudaGetLastError()); + quark_compactTest_gpu_SCAN << <1, thr2, 0, gpustream[thr_id] >> >(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); + CUDA_SAFE_CALL(cudaGetLastError()); + } + else { - quark_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2); + quark_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], (blockSize2>32) ? 
32 : blockSize2); + CUDA_SAFE_CALL(cudaGetLastError()); } if(callThrid) - cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); + { + cudaMemcpyAsync(nrm, &(d_partSum[1][thr_id])[thr2 - 1], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + CUDA_SAFE_CALL(cudaGetLastError()); + } else - cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); - + { + cudaMemcpyAsync(nrm, &(d_partSum[0][thr_id])[nSummen - 1], sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + CUDA_SAFE_CALL(cudaGetLastError()); + } // Addieren if(callThrid) { - quark_compactTest_gpu_ADD<<>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2); + quark_compactTest_gpu_ADD<<>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2); + CUDA_SAFE_CALL(cudaGetLastError()); } - quark_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads); - + quark_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads); + CUDA_SAFE_CALL(cudaGetLastError()); + // Scatter - quark_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, + quark_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, function, orgThreads, startNounce, inpHashes, d_validNonceTable); + CUDA_SAFE_CALL(cudaGetLastError()); + cudaStreamSynchronize(gpustream[thr_id]); } ////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) __host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1, uint32_t *d_nonces2, - uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) + uint32_t startNounce, const uint32_t *inpHashes, const uint32_t *d_validNonceTable) { quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable); 
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); } -__host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, +__host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, const uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, - uint32_t *d_nonces2, uint32_t *nrm2, - int order) + uint32_t *d_nonces2, uint32_t *nrm2) { // Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind // "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! @@ -308,8 +308,7 @@ __host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32 } __host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, uint32_t *nrm1, - int order) + uint32_t *d_nonces1, uint32_t *nrm1) { // Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind // "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! 
diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu index b0d50f731d..a433236d4f 100644 --- a/quark/cuda_quark_groestl512.cu +++ b/quark/cuda_quark_groestl512.cu @@ -4,8 +4,9 @@ #include #include "cuda_helper.h" +#include "cuda_vector.h" -#define TPB 256 +#define TPB 512 #define THF 4 // aus cpu-miner.c @@ -19,20 +20,20 @@ #include "groestl_functions_quad.cu" #include "bitslice_transformations_quad.cu" -__global__ __launch_bounds__(TPB, THF) +__global__ __launch_bounds__(TPB, 2) void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) { - uint32_t msgBitsliced[8]; - uint32_t state[8]; - uint32_t hash[16]; + uint32_t __align__(16) msgBitsliced[8]; + uint32_t __align__(16) state[8]; + uint32_t __align__(16) hash[16]; // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; if (thread < threads) { // GROESTL - uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); - uint32_t hashPosition = nounce - startNounce; - uint32_t *inpHash = &g_hash[hashPosition * 16]; + const uint32_t nounce = g_nonceVector ? 
g_nonceVector[thread] : (startNounce + thread); + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const inpHash = &g_hash[hashPosition * 16]; const uint32_t thr = threadIdx.x & (THF-1); @@ -48,103 +49,33 @@ void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, u groestl512_progressMessage_quad(state, msgBitsliced); from_bitslice_quad(state, hash); + if (thr == 0) { - #pragma unroll - for (int k = 0; k < 16; k++) inpHash[k] = hash[k]; + uint28 *phash = (uint28*)hash; + uint28 *outpt = (uint28*)inpHash; /* var kept for hash align */ + outpt[0] = phash[0]; + outpt[1] = phash[1]; +// outpt[2] = phash[2]; +// outpt[3] = phash[3]; } } } -__global__ void __launch_bounds__(TPB, THF) -quark_doublegroestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, uint32_t * __restrict__ g_hash, uint32_t * __restrict__ g_nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2; - if (thread < threads) - { - // GROESTL - uint32_t message[8]; - uint32_t state[8]; - - uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t * inpHash = &g_hash[hashPosition<<4]; - const uint16_t thr = threadIdx.x & (THF-1); - - #pragma unroll - for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr]; - - #pragma unroll - for(int k=4;k<8;k++) message[k] = 0; - - if (thr == 0) message[4] = 0x80; - if (thr == 3) message[7] = 0x01000000; - - uint32_t msgBitsliced[8]; - to_bitslice_quad(message, msgBitsliced); - - for (int round=0; round<2; round++) - { - groestl512_progressMessage_quad(state, msgBitsliced); - - if (round < 1) - { - // Verkettung zweier Runden inclusive Padding. 
- msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + (((threadIdx.x&3)==3)<<13)); - msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341); - msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341); - msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341); - msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341); - msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341); - msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341); - msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + (((threadIdx.x&3)==0)<<4)); - } - } - - // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash - uint32_t *outpHash = inpHash; - uint32_t hash[16]; - from_bitslice_quad(state, hash); - - if (thr != 0) return; - - #pragma unroll - for(int k=0;k<16;k++) outpHash[k] = hash[k]; - } -} -// Setup-Funktionen __host__ void quark_groestl512_cpu_init(int thr_id, uint32_t threads) { // cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); } -__host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) { - // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle - // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl - const int factor = THF; // berechne wie viele Thread Blocks wir brauchen - dim3 grid(factor*((threads + TPB - 1) / TPB)); + dim3 grid(THF*((threads + TPB - 1) / TPB)); dim3 block(TPB); - quark_groestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); - - // Strategisches Sleep Kommando zur Senkung der CPU Last - //MyStreamSynchronize(NULL, order, thr_id); + quark_groestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); + CUDA_SAFE_CALL(cudaGetLastError()); } -__host__ void 
quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) -{ - // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle - // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl - const int factor = THF; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid(factor*((threads + TPB-1)/TPB)); - dim3 block(TPB); - - quark_doublegroestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); -} diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu index 6ae3a9acab..875f48e75b 100644 --- a/quark/cuda_quark_keccak512.cu +++ b/quark/cuda_quark_keccak512.cu @@ -2,6 +2,7 @@ #include #include "cuda_helper.h" +#include "cuda_vector.h" #ifdef _MSC_VER #define UINT2(x,y) { x, y } @@ -9,6 +10,8 @@ #define UINT2(x,y) (uint2) { x, y } #endif +static uint32_t *d_found[MAX_GPUS]; + __constant__ uint2 c_keccak_round_constants35[24] = { { 0x00000001ul, 0x00000000 }, { 0x00008082ul, 0x00000000 }, { 0x0000808aul, 0x80000000 }, { 0x80008000ul, 0x80000000 }, @@ -23,344 +26,1923 @@ __constant__ uint2 c_keccak_round_constants35[24] = { { 0x80008081ul, 0x80000000 }, { 0x00008080ul, 0x80000000 }, { 0x80000001ul, 0x00000000 }, { 0x80008008ul, 0x80000000 } }; +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) -static __device__ __forceinline__ void -keccak_block_35(uint2 *s) { - int i = 0; - uint2 t[5], u[5], v, w; - - t[0] = s[0] ^ s[5]; - t[1] = s[1] ^ s[6]; - t[2] = s[2] ^ s[7]; - t[3] = s[3] ^ s[8]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = s[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(s[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. 
a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] = s[0] ^ 1; //c_keccak_round_constants[0]); - - for (i = 1; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; +__global__ __launch_bounds__(128, 4) +void quark_keccak512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint2 *const __restrict__ g_hash, uint32_t *const __restrict__ g_nonceVector) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) + { + const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) 
*/ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= c_keccak_round_constants35[i]; - } -} + const uint32_t hashPosition = nounce - startNounce; + uint2 *const inpHash = &g_hash[8 * hashPosition]; -static __device__ __forceinline__ void -keccak_block_35_final(uint2 *s) -{ - int i = 0; - uint2 t[5], u[5], v, w; - - t[0] = s[0] ^ s[5]; - t[1] = s[1] ^ s[6]; - t[2] = s[2] ^ s[7]; - t[3] = s[3] ^ s[8]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = s[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(s[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], 
a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] = u[0]; s[15] = u[0]; s[20] = u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] = u[1]; s[16] = u[1]; s[21] = u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] = u[2]; s[17] = u[2]; s[22] = u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] = u[3]; s[18] = u[3]; s[23] = u[3]; - s[4] ^= u[4]; s[9] = u[4]; s[14] = u[4]; s[19] = u[4]; s[24] = u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] = s[0] ^ 1; //c_keccak_round_constants[0]); - - for (i = 1; i < 23; i++) - { - /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[1]; - s[1] = ROL2(s[6], 44); - s[6] = ROL2(s[9], 20); - s[9] = ROL2(s[22], 61); - s[22] = ROL2(s[14], 39); - s[14] = ROL2(s[20], 18); - s[20] = ROL2(s[2], 62); - s[2] = ROL2(s[12], 43); - s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); - s[23] = ROL2(s[15], 41); - s[15] = ROL2(s[4], 27); - s[4] = ROL2(s[24], 14); - s[24] = ROL2(s[21], 2); - s[21] = ROL2(s[8], 55); - s[8] = ROL2(s[16], 45); - s[16] = ROL2(s[5], 36); - s[5] = ROL2(s[3], 28); - s[3] = ROL2(s[18], 21); - s[18] = ROL2(s[17], 15); - s[17] = ROL2(s[11], 10); - s[11] = ROL2(s[7], 6); - s[7] = ROL2(s[10], 3); - s[10] = ROL2(v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = 
s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= c_keccak_round_constants35[i]; - } - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + uint2 msg[8]; - s[0] ^= t[4] ^ ROL2(t[1], 1); - s[18] ^= t[2] ^ ROL2(t[4], 1); - s[24] ^= t[3] ^ ROL2(t[0], 1); + ((uint28*)msg)[0] = ((uint28*)inpHash)[0]; + ((uint28*)msg)[1] = ((uint28*)inpHash)[1]; - s[3] = ROL2(s[18], 21) ^ ((~ROL2(s[24], 14)) & s[0]); -} + uint2 s[25]; + uint2 bc[5], tmpxor[5], tmp1, tmp2; -__global__ __launch_bounds__(256, 2) -void quark_keccak512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + tmpxor[0] = msg[0] ^ msg[5]; + tmpxor[1] = msg[1] ^ msg[6]; + tmpxor[2] = msg[2] ^ msg[7]; + tmpxor[3] = msg[3] ^ make_uint2(0x1, 0x80000000); + tmpxor[4] = msg[4]; - int hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[8 * hashPosition]; + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); - uint2 keccak_gpu_state[25]; -#pragma unroll - for (int i = 0; i<8; i++) + s[0] = msg[0] ^ bc[4]; + s[1] = ROL2(msg[6] ^ bc[0], 44); + s[6] = ROL2(bc[3], 20); + s[9] = ROL2(bc[1], 61); + s[22] = ROL2(bc[3], 39); + s[14] = ROL2(bc[4], 18); + s[20] = ROL2(msg[2] ^ bc[1], 62); + s[2] = ROL2(bc[1], 43); + s[12] = ROL2(bc[2], 25); + s[13] = ROL8(bc[3]); + s[19] = ROR8(bc[2]); + s[23] = ROL2(bc[4], 41); + s[15] = ROL2(msg[4] ^ bc[3], 27); + s[4] = ROL2(bc[3], 14); + s[24] = ROL2(bc[0], 2); + s[21] = ROL2(make_uint2(0x1, 0x80000000) ^ bc[2], 55); + s[8] = ROL2(bc[0], 45); + s[16] = ROL2(msg[5] ^ bc[4], 36); + s[5] = ROL2(msg[3] ^ bc[2], 28); + s[3] = ROL2(bc[2], 21); + s[18] = ROL2(bc[1], 15); + s[17] = ROL2(bc[0], 10); + s[11] = ROL2(msg[7] ^ bc[1], 6); + s[7] = ROL2(bc[4], 3); + s[10] = ROL2(msg[1] ^ bc[0], 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); 
s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0].x ^= 1; + +#pragma unroll 2 + for (int i = 1; i < 24; ++i) { - keccak_gpu_state[i] = vectorize(inpHash[i]); - } - keccak_gpu_state[8] = make_uint2(0x00000001UL, 0x80000000); //vectorize(0x8000000000000001ULL); #pragma unroll - for (int i=9; i<25; i++) - { - keccak_gpu_state[i] = make_uint2(0, 0); + for (int x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + 
s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= c_keccak_round_constants35[i]; } - keccak_block_35(keccak_gpu_state); #pragma unroll for(int i=0;i<8;i++) - inpHash[i] = devectorize(keccak_gpu_state[i]); + inpHash[i] = s[i]; } } -__global__ __launch_bounds__(256, 2) -void quark_keccak512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) + +__global__ __launch_bounds__(128, 6) +void quark_keccakskein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint2 *const __restrict__ g_hash, uint32_t *const __restrict__ g_nonceVector) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const 
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) + { + const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + + const uint32_t hashPosition = nounce - startNounce; + uint2 *const inpHash = &g_hash[8 * hashPosition]; + + uint2 s[25]; + uint2 bc[5], tmpxor[5], tmp1, tmp2; + + uint2 msg[8]; + + ((uint28*)msg)[0] = ((uint28*)inpHash)[0]; + ((uint28*)msg)[1] = ((uint28*)inpHash)[1]; + + tmpxor[0] = msg[0] ^ msg[5]; + tmpxor[1] = msg[1] ^ msg[6]; + tmpxor[2] = msg[2] ^ msg[7]; + tmpxor[3] = msg[3] ^ make_uint2(0x1, 0x80000000); + tmpxor[4] = msg[4]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + s[0] = inpHash[0] ^ bc[4]; + s[1] = ROL2(inpHash[6] ^ bc[0], 44); + s[6] = ROL2(bc[3], 20); + s[9] = ROL2(bc[1], 61); + s[22] = ROL2(bc[3], 39); + s[14] = ROL2(bc[4], 18); + s[20] = ROL2(inpHash[2] ^ bc[1], 62); + s[2] = ROL2(bc[1], 43); + s[12] = ROL2(bc[2], 25); + s[13] = ROL8(bc[3]); + s[19] = ROR8(bc[2]); + s[23] = ROL2(bc[4], 41); + s[15] = ROL2(inpHash[4] ^ bc[3], 27); + s[4] = ROL2(bc[3], 14); + s[24] = ROL2(bc[0], 2); + s[21] = ROL2(make_uint2(0x1, 0x80000000) ^ bc[2], 55); + s[8] = ROL2(bc[0], 45); + s[16] = ROL2(inpHash[5] ^ bc[4], 36); + s[5] = ROL2(inpHash[3] ^ bc[2], 28); + s[3] = ROL2(bc[2], 21); + s[18] = ROL2(bc[1], 15); + s[17] = ROL2(bc[0], 10); + s[11] = ROL2(inpHash[7] ^ bc[1], 6); + s[7] = ROL2(bc[4], 3); + s[10] = ROL2(inpHash[1] ^ bc[0], 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] 
^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0].x ^= 1; + +#pragma unroll 2 + for (int i = 1; i < 24; ++i) + { + +#pragma unroll + for (int x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = 
ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= c_keccak_round_constants35[i]; + } + +//#pragma unroll +// for (int i = 0; i<8; i++) +// inpHash[i] = s[i]; + uint2 skein_p[8], h[9]; + + h[0] = skein_p[0] = (s[0]); + h[1] = skein_p[1] = (s[1]); + h[2] = skein_p[2] = (s[2]); + h[3] = skein_p[3] = (s[3]); + h[4] = skein_p[4] = (s[4]); + h[5] = skein_p[5] = (s[5]); + h[6] = skein_p[6] = (s[6]); + h[7] = skein_p[7] = (s[7]); + + skein_p[0] += vectorize(0x4903ADFF749C51CEULL); + skein_p[1] += vectorize(0x0D95DE399746DF03ULL); + skein_p[2] += vectorize(0x8FD1934127C79BCEULL); + skein_p[3] += 
vectorize(0x9A255629FF352CB1ULL); + skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[5] += vectorize(0xEABE394CA9D5C434ULL); + skein_p[6] += vectorize(0x891112C71A75B523ULL); + skein_p[7] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x0D95DE399746DF03ULL); + skein_p[1] += vectorize(0x8FD1934127C79BCEULL); + skein_p[2] += vectorize(0x9A255629FF352CB1ULL); + skein_p[3] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[4] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[5] += vectorize(0x891112C71A75B523ULL); + skein_p[6] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[7] += vectorize(0xcab2076d98173ec4ULL + 1); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] 
= ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x8FD1934127C79BCEULL); + skein_p[1] += vectorize(0x9A255629FF352CB1ULL); + skein_p[2] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[3] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[4] += vectorize(0x991112C71A75B523ULL); + skein_p[5] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[6] += vectorize(0xCAB2076D98173F04ULL); + skein_p[7] += vectorize(0x4903ADFF749C51D0ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] 
+= skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x9A255629FF352CB1ULL); + skein_p[1] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[2] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[3] += vectorize(0x991112C71A75B523ULL); + skein_p[4] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[5] += vectorize(0xcab2076d98173f04ULL); + skein_p[6] += vectorize(0x3903ADFF749C51CEULL); + skein_p[7] += vectorize(0x0D95DE399746DF03ULL + 3); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = 
ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[1] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[2] += vectorize(0x991112C71A75B523ULL); + skein_p[3] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[4] += vectorize(0xcab2076d98173ec4ULL); + skein_p[5] += vectorize(0x3903ADFF749C51CEULL); + skein_p[6] += vectorize(0xFD95DE399746DF43ULL); + skein_p[7] += vectorize(0x8FD1934127C79BD2ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] 
+= skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[1] += vectorize(0x991112C71A75B523ULL); + skein_p[2] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[3] += vectorize(0xcab2076d98173ec4ULL); + skein_p[4] += vectorize(0x4903ADFF749C51CEULL); + skein_p[5] += vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); + skein_p[6] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x9A255629FF352CB1ULL + 5); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x991112C71A75B523ULL); + skein_p[1] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[2] += vectorize(0xcab2076d98173ec4ULL); + skein_p[3] += vectorize(0x4903ADFF749C51CEULL); + skein_p[4] += vectorize(0x0D95DE399746DF03ULL); 
+ skein_p[5] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[6] += vectorize(0x8A255629FF352CB1ULL); + skein_p[7] += vectorize(0x5DB62599DF6CA7B0ULL + 6); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[1] += vectorize(0xcab2076d98173ec4ULL); + skein_p[2] += vectorize(0x4903ADFF749C51CEULL); + skein_p[3] += vectorize(0x0D95DE399746DF03ULL); + skein_p[4] += vectorize(0x8FD1934127C79BCEULL); + skein_p[5] += vectorize(0x8A255629FF352CB1ULL); + skein_p[6] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[7] += vectorize(0xEABE394CA9D5C3F4ULL + 7); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += 
skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0xcab2076d98173ec4ULL); + skein_p[1] += vectorize(0x4903ADFF749C51CEULL); + skein_p[2] += vectorize(0x0D95DE399746DF03ULL); + skein_p[3] += vectorize(0x8FD1934127C79BCEULL); + skein_p[4] += vectorize(0x9A255629FF352CB1ULL); + skein_p[5] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[6] += vectorize(0xEABE394CA9D5C434ULL); + skein_p[7] += vectorize(0x991112C71A75B52BULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ 
skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x4903ADFF749C51CEULL); + skein_p[1] += vectorize(0x0D95DE399746DF03ULL); + skein_p[2] += vectorize(0x8FD1934127C79BCEULL); + skein_p[3] += vectorize(0x9A255629FF352CB1ULL); + skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[5] += vectorize(0xEABE394CA9D5C434ULL); + skein_p[6] += vectorize(0x891112C71A75B523ULL); + skein_p[7] += vectorize(0xAE18A40B660FCC33ULL + 9); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + 
skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x0D95DE399746DF03ULL); + skein_p[1] += vectorize(0x8FD1934127C79BCEULL); + skein_p[2] += vectorize(0x9A255629FF352CB1ULL); + skein_p[3] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[4] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[5] += vectorize(0x891112C71A75B523ULL); + skein_p[6] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[7] += vectorize(0xcab2076d98173eceULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + 
skein_p[0] += vectorize(0x8FD1934127C79BCEULL); + skein_p[1] += vectorize(0x9A255629FF352CB1ULL); + skein_p[2] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[3] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[4] += vectorize(0x991112C71A75B523ULL); + skein_p[5] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[6] += vectorize(0xcab2076d98173ec4ULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x4903ADFF749C51CEULL + 11); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x9A255629FF352CB1ULL); + skein_p[1] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[2] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[3] += vectorize(0x991112C71A75B523ULL); + skein_p[4] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[5] += vectorize(0xcab2076d98173ec4ULL + 0x0000000000000040ULL); + 
skein_p[6] += vectorize(0x3903ADFF749C51CEULL); + skein_p[7] += vectorize(0x0D95DE399746DF03ULL + 12); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[1] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[2] += vectorize(0x991112C71A75B523ULL); + skein_p[3] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[4] += vectorize(0xcab2076d98173ec4ULL); + skein_p[5] += vectorize(0x3903ADFF749C51CEULL); + skein_p[6] += vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); + skein_p[7] += vectorize(0x8FD1934127C79BCEULL + 13); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ 
skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[1] += vectorize(0x991112C71A75B523ULL); + skein_p[2] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[3] += vectorize(0xcab2076d98173ec4ULL); + skein_p[4] += vectorize(0x4903ADFF749C51CEULL); + skein_p[5] += vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); + skein_p[6] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x9A255629FF352CB1ULL + 14); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) 
^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x991112C71A75B523ULL); + skein_p[1] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[2] += vectorize(0xcab2076d98173ec4ULL); + skein_p[3] += vectorize(0x4903ADFF749C51CEULL); + skein_p[4] += vectorize(0x0D95DE399746DF03ULL); + skein_p[5] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[6] += vectorize(0x8A255629FF352CB1ULL); + skein_p[7] += vectorize(0x5DB62599DF6CA7B0ULL + 15); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + 
skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[1] += vectorize(0xcab2076d98173ec4ULL); + skein_p[2] += vectorize(0x4903ADFF749C51CEULL); + skein_p[3] += vectorize(0x0D95DE399746DF03ULL); + skein_p[4] += vectorize(0x8FD1934127C79BCEULL); + skein_p[5] += vectorize(0x8A255629FF352CB1ULL); + skein_p[6] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[7] += vectorize(0xEABE394CA9D5C3F4ULL + 16ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = 
ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0xcab2076d98173ec4ULL); + skein_p[1] += vectorize(0x4903ADFF749C51CEULL); + skein_p[2] += vectorize(0x0D95DE399746DF03ULL); + skein_p[3] += vectorize(0x8FD1934127C79BCEULL); + skein_p[4] += vectorize(0x9A255629FF352CB1ULL); + skein_p[5] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[6] += vectorize(0xEABE394CA9D5C3F4ULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x991112C71A75B523ULL + 17); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x4903ADFF749C51CEULL); + skein_p[1] += vectorize(0x0D95DE399746DF03ULL); + skein_p[2] += vectorize(0x8FD1934127C79BCEULL); + skein_p[3] += vectorize(0x9A255629FF352CB1ULL); + skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[5] += vectorize(0xEABE394CA9D5C3F4ULL + 
0x0000000000000040ULL); + skein_p[6] += vectorize(0x891112C71A75B523ULL); + skein_p[7] += vectorize(0xAE18A40B660FCC33ULL + 18); + +#define h0 skein_p[0] +#define h1 skein_p[1] +#define h2 skein_p[2] +#define h3 skein_p[3] +#define h4 skein_p[4] +#define h5 skein_p[5] +#define h6 skein_p[6] +#define h7 skein_p[7] + h0 ^= h[0]; + h1 ^= h[1]; + h2 ^= h[2]; + h3 ^= h[3]; + h4 ^= h[4]; + h5 ^= h[5]; + h6 ^= h[6]; + h7 ^= h[7]; + + const uint2 skein_h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ vectorize(0x1BD11BDAA9FC1A22ULL); + + uint2 hash64[8]; + + hash64[0] = h0 + h1; + hash64[2] = h2 + h3; + hash64[5] = (h5 + vectorizelow(8ULL)); + + hash64[1] = ROL2(h1, 46) ^ hash64[0]; + hash64[3] = ROL2(h3, 36) ^ hash64[2]; + hash64[4] = h4 + hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] = (h6 + vectorizehigh(0xff000000UL)) + h7; + hash64[7] = ROL2(h7, 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h1); + hash64[1] = (hash64[1] + h2); + hash64[2] = (hash64[2] + h3); + hash64[3] = (hash64[3] + h4); + hash64[4] = (hash64[4] + h5); + hash64[5] = (hash64[5] + h6 + vectorizehigh(0xff000000UL)); + hash64[6] = 
(hash64[6] + h7 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + skein_h8 + vectorizelow(1)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h2); + hash64[1] = (hash64[1] + h3); + hash64[2] = (hash64[2] + h4); + hash64[3] = (hash64[3] + h5); + hash64[4] = (hash64[4] + h6); + hash64[5] = (hash64[5] + h7 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + skein_h8 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h0 + vectorize(2)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + 
hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h3); + hash64[1] = (hash64[1] + h4); + hash64[2] = (hash64[2] + h5); + hash64[3] = (hash64[3] + h6); + hash64[4] = (hash64[4] + h7); + hash64[5] = (hash64[5] + skein_h8 + vectorizelow(8)); + hash64[6] = (hash64[6] + h0 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h1 + vectorizelow(3)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + 
hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h4); + hash64[1] = (hash64[1] + h5); + hash64[2] = (hash64[2] + h6); + hash64[3] = (hash64[3] + h7); + hash64[4] = (hash64[4] + skein_h8); + hash64[5] = (hash64[5] + h0 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h1 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h2 + vectorizelow(4)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h5); + hash64[1] = (hash64[1] + h6); + hash64[2] = (hash64[2] + h7); + hash64[3] = (hash64[3] + skein_h8); + hash64[4] = (hash64[4] + h0); + hash64[5] = (hash64[5] + h1 + vectorize(0xff00000000000008ULL)); + 
hash64[6] = (hash64[6] + h2 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h3 + vectorizelow(5)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h6); + hash64[1] = (hash64[1] + h7); + hash64[2] = (hash64[2] + skein_h8); + hash64[3] = (hash64[3] + h0); + hash64[4] = (hash64[4] + h1); + hash64[5] = (hash64[5] + h2 + vectorizelow(8ULL)); + hash64[6] = (hash64[6] + h3 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h4 + vectorizelow(6)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = 
ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h7); + hash64[1] = (hash64[1] + skein_h8); + hash64[2] = (hash64[2] + h0); + hash64[3] = (hash64[3] + h1); + hash64[4] = (hash64[4] + h2); + hash64[5] = (hash64[5] + h3 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h4 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h5 + vectorizelow(7)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += 
hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + skein_h8); + hash64[1] = (hash64[1] + h0); + hash64[2] = (hash64[2] + h1); + hash64[3] = (hash64[3] + h2); + hash64[4] = (hash64[4] + h3); + hash64[5] = (hash64[5] + h4 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + h5 + vectorizelow(8)); + hash64[7] = (hash64[7] + h6 + vectorizelow(8)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h0); + hash64[1] = (hash64[1] + h1); + hash64[2] = (hash64[2] + h2); + hash64[3] = (hash64[3] + h3); + hash64[4] = (hash64[4] + h4); + hash64[5] = (hash64[5] + h5 + vectorizelow(8)); + hash64[6] = (hash64[6] 
+ h6 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h7 + vectorizelow(9)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + + hash64[0] = (hash64[0] + h1); + hash64[1] = (hash64[1] + h2); + hash64[2] = (hash64[2] + h3); + hash64[3] = (hash64[3] + h4); + hash64[4] = (hash64[4] + h5); + hash64[5] = (hash64[5] + h6 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h7 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + skein_h8 + (vectorizelow(10))); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = 
ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h2); + hash64[1] = (hash64[1] + h3); + hash64[2] = (hash64[2] + h4); + hash64[3] = (hash64[3] + h5); + hash64[4] = (hash64[4] + h6); + hash64[5] = (hash64[5] + h7 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + skein_h8 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h0 + vectorizelow(11)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + 
hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h3); + hash64[1] = (hash64[1] + h4); + hash64[2] = (hash64[2] + h5); + hash64[3] = (hash64[3] + h6); + hash64[4] = (hash64[4] + h7); + hash64[5] = (hash64[5] + skein_h8 + vectorizelow(8)); + hash64[6] = (hash64[6] + h0 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h1 + vectorizelow(12)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h4); + hash64[1] = (hash64[1] + h5); + hash64[2] = (hash64[2] + h6); + hash64[3] = (hash64[3] + h7); + hash64[4] = (hash64[4] + skein_h8); + hash64[5] = (hash64[5] + h0 + vectorizehigh(0xff000000UL)); + hash64[6] = 
(hash64[6] + h1 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h2 + vectorizelow(13)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h5); + hash64[1] = (hash64[1] + h6); + hash64[2] = (hash64[2] + h7); + hash64[3] = (hash64[3] + skein_h8); + hash64[4] = (hash64[4] + h0); + hash64[5] = (hash64[5] + h1 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + h2 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h3 + vectorizelow(14)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + 
hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h6); + hash64[1] = (hash64[1] + h7); + hash64[2] = (hash64[2] + skein_h8); + hash64[3] = (hash64[3] + h0); + hash64[4] = (hash64[4] + h1); + hash64[5] = (hash64[5] + h2 + vectorizelow(8ULL)); + hash64[6] = (hash64[6] + h3 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h4 + vectorizelow(15)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += 
hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h7); + hash64[1] = (hash64[1] + skein_h8); + hash64[2] = (hash64[2] + h0); + hash64[3] = (hash64[3] + h1); + hash64[4] = (hash64[4] + h2); + hash64[5] = (hash64[5] + h3 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h4 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h5 + vectorizelow(16)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + skein_h8); + hash64[1] = (hash64[1] + h0); + hash64[2] = (hash64[2] + h1); + hash64[3] = (hash64[3] + h2); + hash64[4] = (hash64[4] + h3); + hash64[5] = (hash64[5] + h4 + 
vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + h5 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h6 + vectorizelow(17)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + + //#pragma unroll + // for (int i = 0; i<8; i++) + // inpHash[i] = s[i]; + uint64_t *const outHash = (uint64_t *)&g_hash[8 * hashPosition]; + + outHash[0] = devectorize(hash64[0] + h0); + outHash[1] = devectorize(hash64[1] + h1); + outHash[2] = devectorize(hash64[2] + h2); + outHash[3] = devectorize(hash64[3] + h3); + outHash[4] = devectorize(hash64[4] + h4); + outHash[5] = devectorize(hash64[5] + h5) + 8; + outHash[6] = devectorize(hash64[6] + h6) + 0xff00000000000000ULL; + outHash[7] = devectorize(hash64[7] + h7) + 18; + } + +#undef h0 +#undef h1 +#undef h2 +#undef h3 +#undef h4 +#undef h5 +#undef h6 +#undef h7 +} + +__global__ __launch_bounds__(192, 4) +void 
quark_keccak512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint2 *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector, uint32_t *const __restrict__ d_found, uint32_t target) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if(thread < threads) { uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); int hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[8 * hashPosition]; + const uint2 *inpHash = &g_hash[8 * hashPosition]; - uint2 keccak_gpu_state[25]; -#pragma unroll - for (int i = 0; i<8; i++) + uint2 msg[8]; + + uint28 *phash = (uint28*)inpHash; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + + uint2 s[25]; + uint2 bc[5], tmpxor[5], tmp1, tmp2; + + tmpxor[0] = msg[0] ^ msg[5]; + tmpxor[1] = msg[1] ^ msg[6]; + tmpxor[2] = msg[2] ^ msg[7]; + tmpxor[3] = msg[3] ^ make_uint2(0x1, 0x80000000); + tmpxor[4] = msg[4]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + s[0] = inpHash[0] ^ bc[4]; + s[1] = ROL2(inpHash[6] ^ bc[0], 44); + s[6] = ROL2(bc[3], 20); + s[9] = ROL2(bc[1], 61); + s[22] = ROL2(bc[3], 39); + s[14] = ROL2(bc[4], 18); + s[20] = ROL2(inpHash[2] ^ bc[1], 62); + s[2] = ROL2(bc[1], 43); + s[12] = ROL2(bc[2], 25); + s[13] = ROL8(bc[3]); + s[19] = ROR8(bc[2]); + s[23] = ROL2(bc[4], 41); + s[15] = ROL2(inpHash[4] ^ bc[3], 27); + s[4] = ROL2(bc[3], 14); + s[24] = ROL2(bc[0], 2); + s[21] = ROL2(make_uint2(0x1, 0x80000000) ^ bc[2], 55); + s[8] = ROL2(bc[0], 45); + s[16] = ROL2(inpHash[5] ^ bc[4], 36); + s[5] = ROL2(inpHash[3] ^ bc[2], 28); + s[3] = ROL2(bc[2], 21); + s[18] = ROL2(bc[1], 15); + s[17] = ROL2(bc[0], 10); + s[11] = ROL2(inpHash[7] ^ bc[1], 6); + s[7] = ROL2(bc[4], 3); + s[10] = ROL2(inpHash[1] ^ bc[0], 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = 
bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0].x ^= 1; + +#pragma nounroll + for(int i = 1; i < 23; i++) { - keccak_gpu_state[i] = vectorize(inpHash[i]); - } - keccak_gpu_state[8] = make_uint2(0x00000001UL, 0x80000000); //vectorize(0x8000000000000001ULL); #pragma unroll - for (int i = 9; i<25; i++) - { - keccak_gpu_state[i] = make_uint2(0, 0); + for(int x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ 
bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); + s[0] ^= c_keccak_round_constants35[i]; + s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); 
s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); } - keccak_block_35_final(keccak_gpu_state); + uint2 t[5]; + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - inpHash[3] = devectorize(keccak_gpu_state[3]); + s[0] ^= t[4] ^ ROL2(t[1], 1); + s[18] ^= t[2] ^ ROL2(t[4], 1); + s[24] ^= t[3] ^ ROL2(t[0], 1); + + s[3] = ROL2(s[18], 21) ^ ((~ROL2(s[24], 14)) & s[0]); + + if(s[3].y <= target) + { + uint32_t tmp = atomicCAS(d_found, 0xffffffff, nounce); + if(tmp != 0xffffffff) + d_found[1] = nounce; + } } } -__host__ void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) { - const uint32_t threadsperblock = 32; + const uint32_t threadsperblock = 128; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - quark_keccak512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + quark_keccak512_gpu_hash_64<<<grid, block, 0, gpustream[thr_id]>>>(threads, startNounce, (uint2 *)d_hash, d_nonceVector); +} + +__host__ void quark_keccak512_cpu_init(int thr_id) +{ + CUDA_SAFE_CALL(cudaMalloc(&(d_found[thr_id]), 2 * sizeof(uint32_t))); } -__host__ void quark_keccak512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void quark_keccak512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t target, uint32_t *h_found) { const uint32_t threadsperblock = 32; + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 
block(threadsperblock); + cudaMemsetAsync(d_found[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); + quark_keccak512_gpu_hash_64_final <<<grid, block, 0, gpustream[thr_id]>>>(threads, startNounce, (uint2 *)d_hash, d_nonceVector, d_found[thr_id], target); + CUDA_SAFE_CALL(cudaMemcpyAsync(h_found, d_found[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); +} + +__host__ void quark_keccakskein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 64; + // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - quark_keccak512_gpu_hash_64_final <<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + quark_keccakskein512_gpu_hash_64 <<<grid, block, 0, gpustream[thr_id]>>>(threads, startNounce, (uint2 *)d_hash, d_nonceVector); } + diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu index ac0be31667..1d9e5ff69d 100644 --- a/quark/cuda_skein512.cu +++ b/quark/cuda_skein512.cu @@ -1,18 +1,25 @@ -#include -#include +#include +#include +using namespace std; #include +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_vector.h" + -#include "cuda_helper.h" -#define TPB 128 #define TPBf 128 +#define TPB52 1024 +#define TPB50 256 + +static __constant__ uint64_t c_PaddedMessage80[2]; // padded message (80 bytes + padding) +__constant__ uint2 precalcvalues[9]; +static uint32_t *d_nonce[MAX_GPUS]; // Take a look at: https://www.schneier.com/skein1.3.pdf #define SHL(x, n) ((x) << (n)) #define SHR(x, n) ((x) >> (n)) -static uint32_t *d_nonce[MAX_GPUS]; - /* * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). 
*/ @@ -238,24 +245,23 @@ static uint32_t *d_nonce[MAX_GPUS]; #define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) #define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \ - k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ - ^ make_uint2( 0xA9FC1A22UL,0x1BD11BDA); \ + k8 = k0 ^ k1 ^ k2 ^ k3 ^ k4 ^ k5 ^ k6 ^ k7 ^ make_uint2(0xA9FC1A22UL, 0x1BD11BDA); \ t2 = t0 ^ t1; \ } //vectorize(0x1BD11BDAA9FC1A22ULL); #define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ - w0 = (w0 + SKBI(k, s, 0)); \ - w1 = (w1 + SKBI(k, s, 1)); \ - w2 = (w2 + SKBI(k, s, 2)); \ - w3 = (w3 + SKBI(k, s, 3)); \ - w4 = (w4 + SKBI(k, s, 4)); \ - w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ - w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ - w7 = (w7 + SKBI(k, s, 7) + vectorizelow(s)); \ + w0 += SKBI(k, s, 0); \ + w1 += SKBI(k, s, 1); \ + w2 += SKBI(k, s, 2); \ + w3 += SKBI(k, s, 3); \ + w4 += SKBI(k, s, 4); \ + w5 += SKBI(k, s, 5) + SKBT(t, s, 0); \ + w6 += SKBI(k, s, 6) + SKBT(t, s, 1); \ + w7 += SKBI(k, s, 7) + vectorizelow(s); \ } #define TFBIG_MIX(x0, x1, rc) { \ - x0 = x0 + x1; \ + x0 += x1; \ x1 = ROL2(x1, rc) ^ x0; \ } @@ -282,119 +288,1628 @@ static uint32_t *d_nonce[MAX_GPUS]; TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ } -__global__ -#if __CUDA_ARCH__ > 500 -__launch_bounds__(TPB, 2) -#else -__launch_bounds__(TPB, 1) -#endif -void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // Skein - uint2 p[8]; - uint2 h0, h1, h2, h3, h4, h5, h6, h7, h8; - uint2 t0, t1, t2; - - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[8 * hashPosition]; +/* uint2 variant for SM3.2+ */ - h0 = make_uint2(0x749C51CEull, 0x4903ADFF); - h1 = make_uint2(0x9746DF03ull, 0x0D95DE39); - h2 = make_uint2(0x27C79BCEull, 0x8FD19341); - h3 = make_uint2(0xFF352CB1ull, 0x9A255629); - h4 = make_uint2(0xDF6CA7B0ull, 0x5DB62599); - h5 = make_uint2(0xA9D5C3F4ull, 0xEABE394C); - h6 = make_uint2(0x1A75B523ull, 0x991112C7); - h7 = make_uint2(0x660FCC33ull, 0xAE18A40B); +#define TFBIG_KINIT_UI2(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \ + k8 = k0 ^ k1 ^ k2 ^ k3 ^ k4 ^ k5 ^ k6 ^ k7 ^ vectorize(SPH_C64(0x1BD11BDAA9FC1A22)); \ + t2 = t0 ^ t1; \ + } - // 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg -#pragma unroll 8 - for(int i=0;i<8;i++) - p[i] = vectorize(inpHash[i]); +#define TFBIG_ADDKEY_UI2(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ + w0 += SKBI(k, s, 0); \ + w1 += SKBI(k, s, 1); \ + w2 += SKBI(k, s, 2); \ + w3 += SKBI(k, s, 3); \ + w4 += SKBI(k, s, 4); \ + w5 += SKBI(k, s, 5) + SKBT(t, s, 0); \ + w6 += SKBI(k, s, 6) + SKBT(t, s, 1); \ + w7 += SKBI(k, s, 7) + vectorize(s); \ + } - t0 = vectorizelow(64); // ptr - t1 = vectorize(480ull << 55); // etype - TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - TFBIG_4e(0); - TFBIG_4o(1); - TFBIG_4e(2); - TFBIG_4o(3); - TFBIG_4e(4); - TFBIG_4o(5); - TFBIG_4e(6); - TFBIG_4o(7); - TFBIG_4e(8); - TFBIG_4o(9); - TFBIG_4e(10); - TFBIG_4o(11); - TFBIG_4e(12); - TFBIG_4o(13); - TFBIG_4e(14); - TFBIG_4o(15); - TFBIG_4e(16); - TFBIG_4o(17); - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); +#define TFBIG_ADDKEY_PRE(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ + w0 += SKBI(k, s, 0); \ + w1 += SKBI(k, s, 1); \ + w2 += SKBI(k, s, 2); \ + w3 += SKBI(k, s, 3); \ + w4 += SKBI(k, s, 4); \ + w5 += SKBI(k, s, 5) + SKBT(t, s, 0); \ + w6 += SKBI(k, s, 6) + SKBT(t, s, 1); \ + w7 += SKBI(k, s, 7) + (s); \ + } + +#define 
TFBIG_MIX_UI2(x0, x1, rc) { \ + x0 += x1; \ + x1 = ROL2(x1, rc) ^ x0; \ + } - h0 = vectorize(inpHash[0]) ^ p[0]; - h1 = vectorize(inpHash[1]) ^ p[1]; - h2 = vectorize(inpHash[2]) ^ p[2]; - h3 = vectorize(inpHash[3]) ^ p[3]; - h4 = vectorize(inpHash[4]) ^ p[4]; - h5 = vectorize(inpHash[5]) ^ p[5]; - h6 = vectorize(inpHash[6]) ^ p[6]; - h7 = vectorize(inpHash[7]) ^ p[7]; +#define TFBIG_MIX_PRE(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROTL64(x1, rc) ^ x0; \ + } + +#define TFBIG_MIX8_UI2(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_UI2(w0, w1, rc0); \ + TFBIG_MIX_UI2(w2, w3, rc1); \ + TFBIG_MIX_UI2(w4, w5, rc2); \ + TFBIG_MIX_UI2(w6, w7, rc3); \ + } - // 2. Runde -> etype = 510, ptr = 8, bcount = 0, data = 0 -#pragma unroll 8 - for(int i=0;i<8;i++) - p[i] = make_uint2(0,0); +#define TFBIG_MIX8_PRE(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_PRE(w0, w1, rc0); \ + TFBIG_MIX_PRE(w2, w3, rc1); \ + TFBIG_MIX_PRE(w4, w5, rc2); \ + TFBIG_MIX_PRE(w6, w7, rc3); \ + } + +#define TFBIG_4e_UI2(s) { \ + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ + } - t0 = vectorizelow(8); // ptr - t1 = vectorize(510ull << 55); // etype - TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - TFBIG_4e(0); - TFBIG_4o(1); - TFBIG_4e(2); - TFBIG_4o(3); - TFBIG_4e(4); - TFBIG_4o(5); - TFBIG_4e(6); - TFBIG_4o(7); - TFBIG_4e(8); - TFBIG_4o(9); - TFBIG_4e(10); - TFBIG_4o(11); - TFBIG_4e(12); - TFBIG_4o(13); - TFBIG_4e(14); - TFBIG_4o(15); - TFBIG_4e(16); - TFBIG_4o(17); - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); +#define TFBIG_4e_PRE(s) { \ + 
TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ + } + +#define TFBIG_4o_UI2(s) { \ + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ + } - // fertig - uint64_t *outpHash = &g_hash[8 * hashPosition]; +#define TFBIG_4o_PRE(s) { \ + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ + } -#pragma unroll 8 - for(int i=0;i<8;i++) - outpHash[i] = devectorize(p[i]); +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(448, 2) +#endif +void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // Skein + uint2 skein_p[8], h[8]; + + const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + + const uint32_t hashPosition = nounce - startNounce; + uint64_t *Hash = &g_hash[8 * hashPosition]; + + + uint2 msg[8]; + + uint28 *phash = (uint28*)Hash; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + + h[0] = skein_p[0] = (msg[0]); + h[1] = skein_p[1] = (msg[1]); + h[2] = skein_p[2] = (msg[2]); + h[3] = skein_p[3] = (msg[3]); + h[4] = skein_p[4] = (msg[4]); + h[5] = skein_p[5] = (msg[5]); + h[6] = skein_p[6] = (msg[6]); + h[7] = skein_p[7] = (msg[7]); + + skein_p[0] += vectorize(0x4903ADFF749C51CEULL); + skein_p[1] += vectorize(0x0D95DE399746DF03ULL); + skein_p[2] += vectorize(0x8FD1934127C79BCEULL); + skein_p[3] += vectorize(0x9A255629FF352CB1ULL); + skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[5] += vectorize(0xEABE394CA9D5C434ULL); + skein_p[6] += vectorize(0x891112C71A75B523ULL); + skein_p[7] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = 
ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x0D95DE399746DF03ULL); + skein_p[1] += vectorize(0x8FD1934127C79BCEULL); + skein_p[2] += vectorize(0x9A255629FF352CB1ULL); + skein_p[3] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[4] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[5] += vectorize(0x891112C71A75B523ULL); + skein_p[6] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[7] += vectorize(0xcab2076d98173ec4ULL+1); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x8FD1934127C79BCEULL); + skein_p[1] += vectorize(0x9A255629FF352CB1ULL); + skein_p[2] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[3] += 
vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[4] += vectorize(0x991112C71A75B523ULL); + skein_p[5] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[6] += vectorize(0xCAB2076D98173F04ULL); + skein_p[7] += vectorize(0x4903ADFF749C51D0ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x9A255629FF352CB1ULL); + skein_p[1] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[2] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[3] += vectorize(0x991112C71A75B523ULL); + skein_p[4] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[5] += vectorize(0xcab2076d98173f04ULL); + skein_p[6] += vectorize(0x3903ADFF749C51CEULL); + skein_p[7] += vectorize(0x0D95DE399746DF03ULL+3); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = 
ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[1] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[2] += vectorize(0x991112C71A75B523ULL); + skein_p[3] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[4] += vectorize(0xcab2076d98173ec4ULL); + skein_p[5] += vectorize(0x3903ADFF749C51CEULL); + skein_p[6] += vectorize(0xFD95DE399746DF43ULL); + skein_p[7] += vectorize(0x8FD1934127C79BD2ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += 
skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[1] += vectorize(0x991112C71A75B523ULL); + skein_p[2] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[3] += vectorize(0xcab2076d98173ec4ULL); + skein_p[4] += vectorize(0x4903ADFF749C51CEULL); + skein_p[5] += vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); + skein_p[6] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x9A255629FF352CB1ULL + 5); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + 
skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x991112C71A75B523ULL); + skein_p[1] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[2] += vectorize(0xcab2076d98173ec4ULL); + skein_p[3] += vectorize(0x4903ADFF749C51CEULL); + skein_p[4] += vectorize(0x0D95DE399746DF03ULL); + skein_p[5] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[6] += vectorize(0x8A255629FF352CB1ULL); + skein_p[7] += vectorize(0x5DB62599DF6CA7B0ULL + 6); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += 
skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[1] += vectorize(0xcab2076d98173ec4ULL); + skein_p[2] += vectorize(0x4903ADFF749C51CEULL); + skein_p[3] += vectorize(0x0D95DE399746DF03ULL); + skein_p[4] += vectorize(0x8FD1934127C79BCEULL); + skein_p[5] += vectorize(0x8A255629FF352CB1ULL); + skein_p[6] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[7] += vectorize(0xEABE394CA9D5C3F4ULL + 7); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0xcab2076d98173ec4ULL); + skein_p[1] += vectorize(0x4903ADFF749C51CEULL); + skein_p[2] += vectorize(0x0D95DE399746DF03ULL); + skein_p[3] += vectorize(0x8FD1934127C79BCEULL); + skein_p[4] += 
vectorize(0x9A255629FF352CB1ULL); + skein_p[5] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[6] += vectorize(0xEABE394CA9D5C434ULL); + skein_p[7] += vectorize(0x991112C71A75B52BULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x4903ADFF749C51CEULL); + skein_p[1] += vectorize(0x0D95DE399746DF03ULL); + skein_p[2] += vectorize(0x8FD1934127C79BCEULL); + skein_p[3] += vectorize(0x9A255629FF352CB1ULL); + skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[5] += vectorize(0xEABE394CA9D5C434ULL); + skein_p[6] += vectorize(0x891112C71A75B523ULL); + skein_p[7] += vectorize(0xAE18A40B660FCC33ULL + 9); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] 
+= skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x0D95DE399746DF03ULL); + skein_p[1] += vectorize(0x8FD1934127C79BCEULL); + skein_p[2] += vectorize(0x9A255629FF352CB1ULL); + skein_p[3] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[4] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[5] += vectorize(0x891112C71A75B523ULL); + skein_p[6] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[7] += vectorize(0xcab2076d98173eceULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) 
^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x8FD1934127C79BCEULL); + skein_p[1] += vectorize(0x9A255629FF352CB1ULL); + skein_p[2] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[3] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[4] += vectorize(0x991112C71A75B523ULL); + skein_p[5] += vectorize(0x9E18A40B660FCC73ULL); + skein_p[6] += vectorize(0xcab2076d98173ec4ULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x4903ADFF749C51CEULL + 11); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + 
skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x9A255629FF352CB1ULL); + skein_p[1] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[2] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[3] += vectorize(0x991112C71A75B523ULL); + skein_p[4] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[5] += vectorize(0xcab2076d98173ec4ULL + 0x0000000000000040ULL); + skein_p[6] += vectorize(0x3903ADFF749C51CEULL); + skein_p[7] += vectorize(0x0D95DE399746DF03ULL + 12); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += 
skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[1] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[2] += vectorize(0x991112C71A75B523ULL); + skein_p[3] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[4] += vectorize(0xcab2076d98173ec4ULL); + skein_p[5] += vectorize(0x3903ADFF749C51CEULL); + skein_p[6] += vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); + skein_p[7] += vectorize(0x8FD1934127C79BCEULL + 13); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0xEABE394CA9D5C3F4ULL); + skein_p[1] += vectorize(0x991112C71A75B523ULL); + skein_p[2] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[3] += vectorize(0xcab2076d98173ec4ULL); + skein_p[4] += vectorize(0x4903ADFF749C51CEULL); + skein_p[5] += 
vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); + skein_p[6] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x9A255629FF352CB1ULL + 14); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0x991112C71A75B523ULL); + skein_p[1] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[2] += vectorize(0xcab2076d98173ec4ULL); + skein_p[3] += vectorize(0x4903ADFF749C51CEULL); + skein_p[4] += vectorize(0x0D95DE399746DF03ULL); + skein_p[5] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); + skein_p[6] += vectorize(0x8A255629FF352CB1ULL); + skein_p[7] += vectorize(0x5DB62599DF6CA7B0ULL + 15); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 
30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0xAE18A40B660FCC33ULL); + skein_p[1] += vectorize(0xcab2076d98173ec4ULL); + skein_p[2] += vectorize(0x4903ADFF749C51CEULL); + skein_p[3] += vectorize(0x0D95DE399746DF03ULL); + skein_p[4] += vectorize(0x8FD1934127C79BCEULL); + skein_p[5] += vectorize(0x8A255629FF352CB1ULL); + skein_p[6] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[7] += vectorize(0xEABE394CA9D5C3F4ULL +16ULL); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; + skein_p[6] += 
skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; + skein_p[0] += vectorize(0xcab2076d98173ec4ULL); + skein_p[1] += vectorize(0x4903ADFF749C51CEULL); + skein_p[2] += vectorize(0x0D95DE399746DF03ULL); + skein_p[3] += vectorize(0x8FD1934127C79BCEULL); + skein_p[4] += vectorize(0x9A255629FF352CB1ULL); + skein_p[5] += vectorize(0x4DB62599DF6CA7F0ULL); + skein_p[6] += vectorize(0xEABE394CA9D5C3F4ULL + 0x0000000000000040ULL); + skein_p[7] += vectorize(0x991112C71A75B523ULL + 17); + skein_p[0] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; + skein_p[2] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; + skein_p[4] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; + skein_p[6] += skein_p[7]; + skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; + skein_p[2] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; + skein_p[4] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; + skein_p[6] += skein_p[5]; + skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; + skein_p[0] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; + skein_p[4] += skein_p[1]; + skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; + skein_p[6] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; + skein_p[0] += skein_p[5]; + 
skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; + skein_p[2] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; + skein_p[6] += skein_p[1]; + skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; + skein_p[0] += skein_p[7]; + skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; + skein_p[2] += skein_p[5]; + skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; + skein_p[4] += skein_p[3]; + skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; + skein_p[0] += vectorize(0x4903ADFF749C51CEULL); + skein_p[1] += vectorize(0x0D95DE399746DF03ULL); + skein_p[2] += vectorize(0x8FD1934127C79BCEULL); + skein_p[3] += vectorize(0x9A255629FF352CB1ULL); + skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); + skein_p[5] += vectorize(0xEABE394CA9D5C3F4ULL + 0x0000000000000040ULL); + skein_p[6] += vectorize(0x891112C71A75B523ULL); + skein_p[7] += vectorize(0xAE18A40B660FCC33ULL + 18); + +#define h0 skein_p[0] +#define h1 skein_p[1] +#define h2 skein_p[2] +#define h3 skein_p[3] +#define h4 skein_p[4] +#define h5 skein_p[5] +#define h6 skein_p[6] +#define h7 skein_p[7] + h0 ^= h[0]; + h1 ^= h[1]; + h2 ^= h[2]; + h3 ^= h[3]; + h4 ^= h[4]; + h5 ^= h[5]; + h6 ^= h[6]; + h7 ^= h[7]; + + uint2 skein_h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ vectorize(0x1BD11BDAA9FC1A22ULL); + + uint2 hash64[8]; + + hash64[0] = (h0); +// hash64[1] = (h1); + hash64[2] = (h2); +// hash64[3] = (h3); + hash64[4] = (h4); + hash64[5] = (h5 + vectorizelow(8ULL)); + hash64[6] = (h6 + vectorizehigh(0xff000000UL)); +// hash64[7] = (h7); + + hash64[0] += h1; + hash64[1] = ROL2(h1, 46) ^ hash64[0]; + hash64[2] += h3; + hash64[3] = ROL2(h3, 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += h7; + hash64[7] = ROL2(h7, 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + 
hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h1); + hash64[1] = (hash64[1] + h2); + hash64[2] = (hash64[2] + h3); + hash64[3] = (hash64[3] + h4); + hash64[4] = (hash64[4] + h5); + hash64[5] = (hash64[5] + h6 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h7 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + skein_h8 + vectorizelow(1)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += 
hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h2); + hash64[1] = (hash64[1] + h3); + hash64[2] = (hash64[2] + h4); + hash64[3] = (hash64[3] + h5); + hash64[4] = (hash64[4] + h6); + hash64[5] = (hash64[5] + h7 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + skein_h8 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h0 + vectorize(2)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h3); + hash64[1] = (hash64[1] + h4); + hash64[2] = (hash64[2] + h5); + hash64[3] = (hash64[3] + h6); + hash64[4] = (hash64[4] + h7); + hash64[5] = (hash64[5] + skein_h8 + vectorizelow(8)); + hash64[6] = (hash64[6] + h0 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h1 + vectorizelow(3)); + hash64[0] += hash64[1]; + hash64[1] = 
ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h4); + hash64[1] = (hash64[1] + h5); + hash64[2] = (hash64[2] + h6); + hash64[3] = (hash64[3] + h7); + hash64[4] = (hash64[4] + skein_h8); + hash64[5] = (hash64[5] + h0 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h1 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h2 + vectorizelow(4)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; 
+ hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h5); + hash64[1] = (hash64[1] + h6); + hash64[2] = (hash64[2] + h7); + hash64[3] = (hash64[3] + skein_h8); + hash64[4] = (hash64[4] + h0); + hash64[5] = (hash64[5] + h1 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + h2 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h3 + vectorizelow(5)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += 
hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h6); + hash64[1] = (hash64[1] + h7); + hash64[2] = (hash64[2] + skein_h8); + hash64[3] = (hash64[3] + h0); + hash64[4] = (hash64[4] + h1); + hash64[5] = (hash64[5] + h2 + vectorizelow(8ULL)); + hash64[6] = (hash64[6] + h3 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h4 + vectorizelow(6)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h7); + hash64[1] = (hash64[1] + skein_h8); + hash64[2] = (hash64[2] + h0); + hash64[3] = (hash64[3] + h1); + hash64[4] = (hash64[4] + h2); + hash64[5] = (hash64[5] + h3 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h4 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h5 + vectorizelow(7)); + hash64[0] += hash64[1]; + 
hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + skein_h8); + hash64[1] = (hash64[1] + h0); + hash64[2] = (hash64[2] + h1); + hash64[3] = (hash64[3] + h2); + hash64[4] = (hash64[4] + h3); + hash64[5] = (hash64[5] + h4 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + h5 + vectorizelow(8)); + hash64[7] = (hash64[7] + h6 + vectorizelow(8)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; 
+ hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h0); + hash64[1] = (hash64[1] + h1); + hash64[2] = (hash64[2] + h2); + hash64[3] = (hash64[3] + h3); + hash64[4] = (hash64[4] + h4); + hash64[5] = (hash64[5] + h5 + vectorizelow(8)); + hash64[6] = (hash64[6] + h6 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h7 + vectorizelow(9)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + 
hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + + hash64[0] = (hash64[0] + h1); + hash64[1] = (hash64[1] + h2); + hash64[2] = (hash64[2] + h3); + hash64[3] = (hash64[3] + h4); + hash64[4] = (hash64[4] + h5); + hash64[5] = (hash64[5] + h6 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h7 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + skein_h8 + (vectorizelow(10))); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h2); + hash64[1] = (hash64[1] + h3); + hash64[2] = (hash64[2] + h4); + hash64[3] = (hash64[3] + h5); + hash64[4] = (hash64[4] + h6); + hash64[5] = (hash64[5] + h7 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + skein_h8 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h0 + vectorizelow(11)); + hash64[0] += hash64[1]; + 
hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h3); + hash64[1] = (hash64[1] + h4); + hash64[2] = (hash64[2] + h5); + hash64[3] = (hash64[3] + h6); + hash64[4] = (hash64[4] + h7); + hash64[5] = (hash64[5] + skein_h8 + vectorizelow(8)); + hash64[6] = (hash64[6] + h0 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h1 + vectorizelow(12)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + 
hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h4); + hash64[1] = (hash64[1] + h5); + hash64[2] = (hash64[2] + h6); + hash64[3] = (hash64[3] + h7); + hash64[4] = (hash64[4] + skein_h8); + hash64[5] = (hash64[5] + h0 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h1 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h2 + vectorizelow(13)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += 
hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h5); + hash64[1] = (hash64[1] + h6); + hash64[2] = (hash64[2] + h7); + hash64[3] = (hash64[3] + skein_h8); + hash64[4] = (hash64[4] + h0); + hash64[5] = (hash64[5] + h1 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + h2 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h3 + vectorizelow(14)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + h6); + hash64[1] = (hash64[1] + h7); + hash64[2] = (hash64[2] + skein_h8); + hash64[3] = (hash64[3] + h0); + hash64[4] = (hash64[4] + h1); + hash64[5] = (hash64[5] + h2 + vectorizelow(8ULL)); + hash64[6] = (hash64[6] + h3 + vectorizehigh(0xff000000UL)); + hash64[7] = (hash64[7] + h4 + vectorizelow(15)); + hash64[0] += hash64[1]; + hash64[1] = 
ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + hash64[0] = (hash64[0] + h7); + hash64[1] = (hash64[1] + skein_h8); + hash64[2] = (hash64[2] + h0); + hash64[3] = (hash64[3] + h1); + hash64[4] = (hash64[4] + h2); + hash64[5] = (hash64[5] + h3 + vectorizehigh(0xff000000UL)); + hash64[6] = (hash64[6] + h4 + vectorize(0xff00000000000008ULL)); + hash64[7] = (hash64[7] + h5 + vectorizelow(16)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; 
+ hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + hash64[0] = (hash64[0] + skein_h8); + hash64[1] = (hash64[1] + h0); + hash64[2] = (hash64[2] + h1); + hash64[3] = (hash64[3] + h2); + hash64[4] = (hash64[4] + h3); + hash64[5] = (hash64[5] + h4 + vectorize(0xff00000000000008ULL)); + hash64[6] = (hash64[6] + h5 + vectorizelow(8ULL)); + hash64[7] = (hash64[7] + h6 + vectorizelow(17)); + hash64[0] += hash64[1]; + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; + hash64[2] += hash64[3]; + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; + hash64[4] += hash64[5]; + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; + hash64[6] += hash64[7]; + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; + hash64[2] += 
hash64[5]; + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; + + Hash[0] = devectorize(hash64[0] + h0); + Hash[1] = devectorize(hash64[1] + h1); + Hash[2] = devectorize(hash64[2] + h2); + Hash[3] = devectorize(hash64[3] + h3); + Hash[4] = devectorize(hash64[4] + h4); + Hash[5] = devectorize(hash64[5] + h5)+ 8; + Hash[6] = devectorize(hash64[6] + h6)+ 0xff00000000000000ULL; + Hash[7] = devectorize(hash64[7] + h7)+ 18; + +#undef h0 +#undef h1 +#undef h2 +#undef h3 +#undef h4 +#undef h5 +#undef h6 +#undef h7 } } +//#else +//__launch_bounds__(128, 10) +//#endif -__global__ +__global__ #if __CUDA_ARCH__ > 500 -__launch_bounds__(TPBf, 2) -#else -__launch_bounds__(TPBf, 1) +__launch_bounds__(448, 2) #endif void quark_skein512_gpu_hash_64_final(const uint32_t threads, const uint32_t startNounce, uint64_t * const __restrict__ g_hash, const uint32_t *g_nonceVector, uint32_t *d_nonce, uint32_t target) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { // Skein @@ -402,10 +1917,10 @@ void quark_skein512_gpu_hash_64_final(const uint32_t threads, const uint32_t sta uint2 h0, h1, h2, h3, h4, h5, h6, h7, h8; uint2 t0, t1, t2; - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[8 * hashPosition]; + const uint32_t hashPosition = nounce - startNounce; + const uint64_t *const inpHash = &g_hash[8 * hashPosition]; h0 = make_uint2(0x749C51CEull, 0x4903ADFF); h1 = make_uint2(0x9746DF03ull, 0x0D95DE39); @@ -416,7 +1931,7 @@ void quark_skein512_gpu_hash_64_final(const uint32_t threads, const uint32_t sta h6 = make_uint2(0x1A75B523ull, 0x991112C7); h7 = make_uint2(0x660FCC33ull, 0xAE18A40B); - // 1. 
Runde -> etype = 480, ptr = 64, bcount = 0, data = msg + // 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg #pragma unroll 8 for (int i = 0; i<8; i++) p[i] = vectorize(inpHash[i]); @@ -507,7 +2022,7 @@ __host__ void quark_skein512_cpu_init(int thr_id) cudaMalloc(&d_nonce[thr_id], 2*sizeof(uint32_t)); } -__host__ void quark_skein512_setTarget(const void *ptarget) +__host__ void quark_skein512_setTarget(int thr_id, const void *ptarget) { } __host__ void quark_skein512_cpu_free(int32_t thr_id) @@ -515,27 +2030,784 @@ __host__ void quark_skein512_cpu_free(int32_t thr_id) cudaFreeHost(&d_nonce[thr_id]); } -__host__ -void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) + +/* Elementary functions used by SHA256 */ +#define SWAB32(x) cuda_swab32(x) + +#define R(x, n) ((x) >> (n)) +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3)) +#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10)) + +__constant__ uint32_t sha256_endingTable[] = { + 0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200, + 0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020, + 0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549, + 0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91, + 0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7, + 0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d, + 0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 
0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484 + }; + +__constant__ uint32_t sha256_constantTable[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +__global__ +__launch_bounds__(TPB52) +void skein512_gpu_hash_80_52(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ d_nonce, uint64_t target, int thr_id) { - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + TPB-1)/TPB); - dim3 block(TPB); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) + { + uint2 h8; + uint2 p[8]; + + uint2 h0 = precalcvalues[0]; + uint2 h1 = precalcvalues[1]; + uint2 h2 = precalcvalues[2]; + uint2 h3 = precalcvalues[3]; + uint2 h4 = precalcvalues[4]; + uint2 h5 = precalcvalues[5]; + uint2 h6 = precalcvalues[6]; + uint2 h7 = precalcvalues[7]; + uint2 t2 = precalcvalues[8]; + + const uint2 nounce2 = make_uint2(_LOWORD(c_PaddedMessage80[1]), cuda_swab32(startNounce + thread)); + + uint2 t0 = vectorizelow(0x50ull); // SPH_T64(bcount << 6) + (sph_u64)(extra); + uint2 t1 = vectorizehigh(0xB0000000ul); // (bcount >> 58) + ((sph_u64)(etype) << 55); + h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ make_uint2(0xA9FC1A22UL, 0x1BD11BDA); + t2 = t0 ^ t1; + + p[0] = h0 + 
vectorize(c_PaddedMessage80[0]); + p[1] = h1 + nounce2; + p[2] = h2; + p[3] = h3; + p[4] = h4; + p[5] = h5 + t0; + p[6] = h6 + t1; + p[7] = h7; + + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); + + TFBIG_4o_UI2(1); + TFBIG_4e_UI2(2); + TFBIG_4o_UI2(3); + TFBIG_4e_UI2(4); + TFBIG_4o_UI2(5); + TFBIG_4e_UI2(6); + TFBIG_4o_UI2(7); + TFBIG_4e_UI2(8); + TFBIG_4o_UI2(9); + TFBIG_4e_UI2(10); + TFBIG_4o_UI2(11); + TFBIG_4e_UI2(12); + TFBIG_4o_UI2(13); + TFBIG_4e_UI2(14); + TFBIG_4o_UI2(15); + TFBIG_4e_UI2(16); + TFBIG_4o_UI2(17); + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + + t0 = vectorizelow(8); // extra + t1 = vectorizehigh(0xFF000000ul); // etype + + h0 = vectorize(c_PaddedMessage80[0]) ^ p[0]; + h1 = nounce2 ^ p[1]; + h2 = p[2]; + h3 = p[3]; + h4 = p[4]; + h5 = p[5]; + h6 = p[6]; + h7 = p[7]; + + h8 = h0 ^ h1 ^ p[2] ^ p[3] ^ p[4] ^ p[5] ^ p[6] ^ p[7] ^ vectorize(0x1BD11BDAA9FC1A22); + t2 = vectorize(0xFF00000000000008ull); + + // p[8] = { 0 }; + #pragma unroll 8 + for (int i = 0; i<8; i++) + p[i] = make_uint2(0, 0); - quark_skein512_gpu_hash_64 << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); -// MyStreamSynchronize(NULL, order, thr_id); + TFBIG_4e_UI2(0); + TFBIG_4o_UI2(1); + TFBIG_4e_UI2(2); + TFBIG_4o_UI2(3); + TFBIG_4e_UI2(4); + TFBIG_4o_UI2(5); + TFBIG_4e_UI2(6); + TFBIG_4o_UI2(7); + TFBIG_4e_UI2(8); + TFBIG_4o_UI2(9); + TFBIG_4e_UI2(10); + TFBIG_4o_UI2(11); + TFBIG_4e_UI2(12); + TFBIG_4o_UI2(13); + TFBIG_4e_UI2(14); + TFBIG_4o_UI2(15); + TFBIG_4e_UI2(16); + TFBIG_4o_UI2(17); + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + uint32_t *message = (uint32_t *)p; + + uint32_t W1[16]; + uint32_t W2[16]; + + 
uint32_t regs[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + }; + uint32_t hash[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + }; + +#pragma unroll 16 + for (int k = 0; k<16; k++) + W1[k] = SWAB32(message[k]); + + // Progress W1 +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + // Progress W2...W3 + + ////// PART 1 +#pragma unroll 2 + for (int j = 0; j<2; j++) + W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; +#pragma unroll 5 + for (int j = 2; j<7; j++) + W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + +#pragma unroll 8 + for (int j = 7; j<15; j++) + W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Round function +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 16] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + ////// PART 2 +#pragma unroll 2 + for (int j = 0; j<2; j++) + W1[j] = s1(W2[14 + j]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; + +#pragma unroll 5 + for (int j = 2; j<7; j++) + W1[j] = s1(W1[j - 2]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; + +#pragma unroll 8 + for (int j = 7; j<15; j++) + W1[j] = s1(W1[j - 2]) + W1[j - 7] + s0(W2[1 + j]) + W2[j]; + + W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15]; + + // 
Round function +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 32] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + ////// PART 3 +#pragma unroll 2 + for (int j = 0; j<2; j++) + W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + +#pragma unroll 5 + for (int j = 2; j<7; j++) + W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + +#pragma unroll 8 + for (int j = 7; j<15; j++) + W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Round function +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 48] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + +#pragma unroll 8 + for (int k = 0; k<8; k++) + hash[k] += regs[k]; + + ///// + ///// Second Pass (ending) + ///// +#pragma unroll 8 + for (int k = 0; k<8; k++) + regs[k] = hash[k]; + + // Progress W1 + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[0] + sha256_endingTable[0]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; +#pragma unroll + for(int j = 1; j<15; j++) + { + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + 
regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } +#pragma unroll + for (int j = 15; j<56; j++) + { + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + sha256_endingTable[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[56] + sha256_endingTable[56]; + regs[7] = T1 + S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + regs[3] += T1; + + T1 = regs[6] + S1(regs[3]) + Ch(regs[3], regs[4], regs[5]) + sha256_constantTable[57] + sha256_endingTable[57]; + regs[6] = T1 + S0(regs[7]) + Maj(regs[7], regs[0], regs[1]); + regs[2] += T1; + //************ + regs[1] += regs[5] + S1(regs[2]) + Ch(regs[2], regs[3], regs[4]) + sha256_constantTable[58] + sha256_endingTable[58]; + regs[0] += regs[4] + S1(regs[1]) + Ch(regs[1], regs[2], regs[3]) + sha256_constantTable[59] + sha256_endingTable[59]; + regs[7] += regs[3] + S1(regs[0]) + Ch(regs[0], regs[1], regs[2]) + sha256_constantTable[60] + sha256_endingTable[60]; + regs[6] += regs[2] + S1(regs[7]) + Ch(regs[7], regs[0], regs[1]) + sha256_constantTable[61] + sha256_endingTable[61]; + + uint64_t test = SWAB32(hash[7] + regs[7]); + test <<= 32; + test |= SWAB32(hash[6] + regs[6]); + if (test <= target) + { + uint32_t tmp = atomicExch(&(d_nonce[0]), startNounce + thread); + if (tmp != 0xffffffff) + d_nonce[1] = tmp; + } + } +} + +__global__ +__launch_bounds__(TPB50) +void skein512_gpu_hash_80_50(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ d_nonce, uint64_t target, int thr_id) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + // if (thread < threads) + { + uint2 h8; + 
uint2 p[8]; + + uint2 h0 = precalcvalues[0]; + uint2 h1 = precalcvalues[1]; + uint2 h2 = precalcvalues[2]; + uint2 h3 = precalcvalues[3]; + uint2 h4 = precalcvalues[4]; + uint2 h5 = precalcvalues[5]; + uint2 h6 = precalcvalues[6]; + uint2 h7 = precalcvalues[7]; + uint2 t2 = precalcvalues[8]; + + const uint2 nounce2 = make_uint2(_LOWORD(c_PaddedMessage80[1]), cuda_swab32(startNounce + thread)); + + uint2 t0 = vectorizelow(0x50ull); // SPH_T64(bcount << 6) + (sph_u64)(extra); + uint2 t1 = vectorizehigh(0xB0000000ul); // (bcount >> 58) + ((sph_u64)(etype) << 55); + h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ make_uint2(0xA9FC1A22UL, 0x1BD11BDA); + t2 = t0 ^ t1; + + p[0] = h0 + vectorize(c_PaddedMessage80[0]); + p[1] = h1 + nounce2; + p[2] = h2; + p[3] = h3; + p[4] = h4; + p[5] = h5 + t0; + p[6] = h6 + t1; + p[7] = h7; + + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); + + TFBIG_4o_UI2(1); + TFBIG_4e_UI2(2); + TFBIG_4o_UI2(3); + TFBIG_4e_UI2(4); + TFBIG_4o_UI2(5); + TFBIG_4e_UI2(6); + TFBIG_4o_UI2(7); + TFBIG_4e_UI2(8); + TFBIG_4o_UI2(9); + TFBIG_4e_UI2(10); + TFBIG_4o_UI2(11); + TFBIG_4e_UI2(12); + TFBIG_4o_UI2(13); + TFBIG_4e_UI2(14); + TFBIG_4o_UI2(15); + TFBIG_4e_UI2(16); + TFBIG_4o_UI2(17); + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + + t0 = vectorizelow(8); // extra + t1 = vectorizehigh(0xFF000000ul); // etype + + h0 = vectorize(c_PaddedMessage80[0]) ^ p[0]; + h1 = nounce2 ^ p[1]; + h2 = p[2]; + h3 = p[3]; + h4 = p[4]; + h5 = p[5]; + h6 = p[6]; + h7 = p[7]; + + h8 = h0 ^ h1 ^ p[2] ^ p[3] ^ p[4] ^ p[5] ^ p[6] ^ p[7] ^ vectorize(0x1BD11BDAA9FC1A22); + t2 = vectorize(0xFF00000000000008ull); + + // p[8] = { 0 }; +#pragma unroll 8 + for (int i = 0; i<8; i++) 
+ p[i] = make_uint2(0, 0); + + TFBIG_4e_UI2(0); + TFBIG_4o_UI2(1); + TFBIG_4e_UI2(2); + TFBIG_4o_UI2(3); + TFBIG_4e_UI2(4); + TFBIG_4o_UI2(5); + TFBIG_4e_UI2(6); + TFBIG_4o_UI2(7); + TFBIG_4e_UI2(8); + TFBIG_4o_UI2(9); + TFBIG_4e_UI2(10); + TFBIG_4o_UI2(11); + TFBIG_4e_UI2(12); + TFBIG_4o_UI2(13); + TFBIG_4e_UI2(14); + TFBIG_4o_UI2(15); + TFBIG_4e_UI2(16); + TFBIG_4o_UI2(17); + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + uint32_t *message = (uint32_t *)p; + + uint32_t W1[16]; + uint32_t W2[16]; + + uint32_t regs[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + };; + uint32_t hash[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + }; + +#pragma unroll 16 + for (int k = 0; k<16; k++) + W1[k] = SWAB32(message[k]); + + // Progress W1 +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + // Progress W2...W3 + + ////// PART 1 +#pragma unroll 2 + for (int j = 0; j<2; j++) + W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; +#pragma unroll 5 + for (int j = 2; j<7; j++) + W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + +#pragma unroll 8 + for (int j = 7; j<15; j++) + W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Round function +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 16] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; 
+ regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + ////// PART 2 +#pragma unroll 2 + for (int j = 0; j<2; j++) + W1[j] = s1(W2[14 + j]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; + +#pragma unroll 5 + for (int j = 2; j<7; j++) + W1[j] = s1(W1[j - 2]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; + +#pragma unroll 8 + for (int j = 7; j<15; j++) + W1[j] = s1(W1[j - 2]) + W1[j - 7] + s0(W2[1 + j]) + W2[j]; + + W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15]; + + // Round function +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 32] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + ////// PART 3 +#pragma unroll 2 + for (int j = 0; j<2; j++) + W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + +#pragma unroll 5 + for (int j = 2; j<7; j++) + W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + +#pragma unroll 8 + for (int j = 7; j<15; j++) + W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Round function +#pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 48] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + +#pragma unroll 8 + for (int k = 0; k<8; k++) + hash[k] += regs[k]; + + ///// + ///// Second Pass (ending) + ///// +#pragma unroll 8 + for (int k = 0; k<8; k++) + regs[k] = hash[k]; + + // Progress W1 + uint32_t 
T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[0] + sha256_endingTable[0]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; +#pragma unroll + for(int j = 1; j<15; j++) + { + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } +#pragma unroll + for(int j = 15; j<56; j++) + { + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + sha256_endingTable[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] + T1; + regs[3] = regs[2]; + regs[2] = regs[1]; + regs[1] = regs[0]; + regs[0] = T1 + T2; + } + + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[56] + sha256_endingTable[56]; + regs[7] = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]) + T1; + regs[3] += T1; + + T1 = regs[6] + S1(regs[3]) + Ch(regs[3], regs[4], regs[5]) + sha256_constantTable[57] + sha256_endingTable[57]; + T2 = S0(regs[7]) + Maj(regs[7], regs[0], regs[1]); + regs[6] = S0(regs[7]) + Maj(regs[7], regs[0], regs[1]) + T1; + regs[2] += T1; + //************ + regs[1] += regs[5] + S1(regs[2]) + Ch(regs[2], regs[3], regs[4]) + sha256_constantTable[58] + sha256_endingTable[58]; + regs[0] += regs[4] + S1(regs[1]) + Ch(regs[1], regs[2], regs[3]) + sha256_constantTable[59] + sha256_endingTable[59]; + regs[7] += regs[3] + S1(regs[0]) + Ch(regs[0], regs[1], regs[2]) + sha256_constantTable[60] + sha256_endingTable[60]; + regs[6] += regs[2] + S1(regs[7]) + Ch(regs[7], regs[0], regs[1]) + 
sha256_constantTable[61] + sha256_endingTable[61]; + + uint64_t test = SWAB32(hash[7] + regs[7]); + test <<= 32; + test|= SWAB32(hash[6] + regs[6]); + if (test <= target) + { + uint32_t tmp = atomicExch(&(d_nonce[0]), startNounce + thread); + if (tmp != 0xffffffff) + d_nonce[1] = tmp; + } + } } +__host__ +void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) +{ + int t = 128; + if (device_sm[device_map[thr_id]] > 500) + { + if (cuda_arch[thr_id]>500) + t = 448; + } + dim3 grid((threads + t - 1) / t); + dim3 block(t); + quark_skein512_gpu_hash_64 << >>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + CUDA_SAFE_CALL(cudaGetLastError()); +} __host__ -void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t *h_nonce, uint32_t target, int order) +void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t *h_nonce, uint32_t target) { - dim3 grid((threads + TPBf - 1) / TPBf); - dim3 block(TPBf); + const int tp = 128; + dim3 grid((threads + tp - 1) / tp); + dim3 block(tp); + + CUDA_SAFE_CALL(cudaMemsetAsync(d_nonce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id])); - cudaMemset(d_nonce[thr_id], 0xff, 2*sizeof(uint32_t)); + quark_skein512_gpu_hash_64_final <<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_nonce[thr_id], target); - quark_skein512_gpu_hash_64_final<< > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_nonce[thr_id], target); - cudaMemcpy(h_nonce, d_nonce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + CUDA_SAFE_CALL(cudaMemcpy(h_nonce, d_nonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); } +static void precalc(int thr_id, uint64_t *PaddedMessage) +{ + uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8; + uint64_t t0, t1, t2; + + 
h0 = 0x4903ADFF749C51CEull; + h1 = 0x0D95DE399746DF03ull; + h2 = 0x8FD1934127C79BCEull; + h3 = 0x9A255629FF352CB1ull; + h4 = 0x5DB62599DF6CA7B0ull; + h5 = 0xEABE394CA9D5C3F4ull; + h6 = 0x991112C71A75B523ull; + h7 = 0xAE18A40B660FCC33ull; + h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ SPH_C64(0x1BD11BDAA9FC1A22); + + t0 = 64; // ptr + t1 = 0x7000000000000000ull; + t2 = 0x7000000000000040ull; + + uint64_t p[8]; + for (int i = 0; i<8; i++) + p[i] = PaddedMessage[i]; + + TFBIG_4e_PRE(0); + TFBIG_4o_PRE(1); + TFBIG_4e_PRE(2); + TFBIG_4o_PRE(3); + TFBIG_4e_PRE(4); + TFBIG_4o_PRE(5); + TFBIG_4e_PRE(6); + TFBIG_4o_PRE(7); + TFBIG_4e_PRE(8); + TFBIG_4o_PRE(9); + TFBIG_4e_PRE(10); + TFBIG_4o_PRE(11); + TFBIG_4e_PRE(12); + TFBIG_4o_PRE(13); + TFBIG_4e_PRE(14); + TFBIG_4o_PRE(15); + TFBIG_4e_PRE(16); + TFBIG_4o_PRE(17); + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + uint64_t buffer[9]; + + buffer[0] = PaddedMessage[0] ^ p[0]; + buffer[1] = PaddedMessage[1] ^ p[1]; + buffer[2] = PaddedMessage[2] ^ p[2]; + buffer[3] = PaddedMessage[3] ^ p[3]; + buffer[4] = PaddedMessage[4] ^ p[4]; + buffer[5] = PaddedMessage[5] ^ p[5]; + buffer[6] = PaddedMessage[6] ^ p[6]; + buffer[7] = PaddedMessage[7] ^ p[7]; + buffer[8] = t2; + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(precalcvalues, buffer, sizeof(buffer), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); +} + +__host__ +void skein512_cpu_setBlock_80(int thr_id, void *pdata) +{ + uint64_t *PaddedMessage = (uint64_t*)pdata; + CUDA_SAFE_CALL(cudaMalloc(&(d_nonce[thr_id]), 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_PaddedMessage80, &PaddedMessage[8], 8 * 2, 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMemsetAsync(d_nonce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id])); + precalc(thr_id, PaddedMessage); +} + +__host__ +void skein512_cpu_hash_80_52(int thr_id, uint32_t threads, uint32_t startNounce, int swapu,uint64_t target, uint32_t *h_found) +{ + dim3 
grid((threads + TPB52 - 1) / TPB52); + dim3 block(TPB52); + skein512_gpu_hash_80_52 << < grid, block, 0, gpustream[thr_id]>>> (threads, startNounce, d_nonce[thr_id], target, thr_id); + CUDA_SAFE_CALL(cudaMemcpyAsync(h_found, d_nonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaStreamSynchronize(gpustream[thr_id])); +} + +__host__ +void skein512_cpu_hash_80_50(int thr_id, uint32_t threads, uint32_t startNounce, int swapu, uint64_t target, uint32_t *h_found) +{ + dim3 grid((threads + TPB50 - 1) / TPB50); + dim3 block(TPB50); + skein512_gpu_hash_80_50 << < grid, block, 0, gpustream[thr_id]>>> (threads, startNounce, d_nonce[thr_id], target, thr_id); + CUDA_SAFE_CALL(cudaMemcpyAsync(h_found, d_nonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaStreamSynchronize(gpustream[thr_id])); +} diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index 17e86d88be..c63d11fb70 100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -9,52 +9,45 @@ extern "C" } #include "miner.h" - #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - -// Speicher zur Generierung der Noncevektoren für die bedingten Hashes -static uint32_t *d_quarkNonces[MAX_GPUS]; -static uint32_t *d_branch1Nonces[MAX_GPUS]; -static uint32_t *d_branch2Nonces[MAX_GPUS]; -static uint32_t *d_branch3Nonces[MAX_GPUS]; - -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t 
threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_bmw512_cpu_hash_64_quark(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_keccak512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, 
uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_jh512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); +extern void quark_keccak512_cpu_init(int thr_id); +extern void quark_keccakskein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_keccak512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t target, uint32_t *h_found); +extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_jh512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t target, uint32_t *h_found); +extern void quark_jh512_cpu_init(int thr_id); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, const uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, - uint32_t *d_nonces2, uint32_t *nrm2, - int order); + uint32_t *d_nonces2, uint32_t *nrm2); extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, uint32_t *nrm1, - int order); + uint32_t *d_nonces1, uint32_t *nrm1); -extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); -extern void cuda_check_quarkcoin(int thr_id, uint32_t threads, uint32_t 
startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order, uint32_t *foundnonces); +extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash); +extern void cuda_check_quarkcoin(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, uint32_t *foundnonces); // Original Quarkhash Funktion aus einem miner Quelltext -extern "C" void quarkhash(void *state, const void *input) +void quarkhash(void *state, const void *input) { sph_blake512_context ctx_blake; sph_bmw512_context ctx_bmw; @@ -131,138 +124,172 @@ extern "C" void quarkhash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_quark(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << 20); // 256*4096 - throughput = min(throughput, max_nonce - first_nonce); + uint32_t intensity = 1 << 22; + intensity = intensity + ((1 << 22)*9/10); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); // 256*4096 + uint32_t throughput = min(throughputmax, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0xf; + ptarget[7] = 0x0000003f; + + static THREAD uint32_t *foundnonces = nullptr; + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *d_branch1Nonces = nullptr; + static THREAD uint32_t *d_branch2Nonces = nullptr; + static THREAD uint32_t *d_branch3Nonces = nullptr; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); 
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); +// } - // Konstanten kopieren, Speicher belegen - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); - - quark_groestl512_cpu_init(thr_id, throughput); - quark_skein512_cpu_init(thr_id); - quark_bmw512_cpu_init(thr_id, throughput); - cuda_check_cpu_init(thr_id, throughput); - quark_compactTest_cpu_init(thr_id, throughput); - - cudaMalloc(&d_quarkNonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); - - quark_jh512_cpu_init(thr_id, throughput); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (16 * sizeof(uint32_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - init[thr_id] = true; + // Konstanten kopieren, Speicher belegen + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); + CUDA_SAFE_CALL(cudaMallocHost(&foundnonces, 4 * 4)); +// CUDA_SAFE_CALL(cudaMalloc(&d_branch1Nonces, sizeof(uint32_t)*throughput)); +// CUDA_SAFE_CALL(cudaMalloc(&d_branch2Nonces, sizeof(uint32_t)*throughput)); + uint32_t noncebuffersize = throughputmax * 7 / 10; + uint32_t noncebuffersize2 = (throughputmax * 7 / 10)*7/10; + + CUDA_SAFE_CALL(cudaMalloc(&d_branch1Nonces, sizeof(uint32_t)*noncebuffersize2)); + CUDA_SAFE_CALL(cudaMalloc(&d_branch2Nonces, sizeof(uint32_t)*noncebuffersize2)); + CUDA_SAFE_CALL(cudaMalloc(&d_branch3Nonces, sizeof(uint32_t)*noncebuffersize)); + quark_blake512_cpu_init(thr_id); + quark_compactTest_cpu_init(thr_id, throughputmax); + quark_keccak512_cpu_init(thr_id); 
+ quark_jh512_cpu_init(thr_id); + CUDA_SAFE_CALL(cudaGetLastError()); + mining_has_stopped[thr_id] = false; + init = true; } uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); - cuda_check_cpu_setTarget(ptarget); - quark_blake512_cpu_setBlock_80((void*)endiandata); + be32enc(&endiandata[k], pdata[k]); + quark_blake512_cpu_setBlock_80(thr_id, (uint64_t *)endiandata); do { - int order = 0; uint32_t nrm1 = 0, nrm2 = 0, nrm3 = 0; - // erstes Blake512 Hash mit CUDA - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für BMW512 - quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + quark_bmw512_cpu_hash_64_quark(thr_id, throughput, pdata[19], NULL, d_hash); - quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, - d_branch3Nonces[thr_id], &nrm3, - order++); + quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash, NULL, + d_branch3Nonces, &nrm3); // nur den Skein Branch weiterverfolgen - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces, d_hash); // das ist der unbedingte Branch für Groestl512 - quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces, d_hash); // das ist der unbedingte Branch für JH512 - quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces, d_hash); // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) - quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], 
d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash, d_branch3Nonces, + d_branch1Nonces, &nrm1, + d_branch2Nonces, &nrm2); // das ist der bedingte Branch für Blake512 - quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces, d_hash); // das ist der bedingte Branch für Bmw512 - quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces, d_hash); - // das ist der unbedingte Branch für Keccak512 - quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Skein512 - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + quark_keccakskein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces, d_hash); // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) - quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash, d_branch3Nonces, + d_branch1Nonces, &nrm1, + d_branch3Nonces, &nrm2); + + quark_keccak512_cpu_hash_64_final(thr_id, nrm1, pdata[19], d_branch1Nonces, d_hash, ptarget[7], foundnonces); + quark_jh512_cpu_hash_64_final(thr_id, nrm2, pdata[19], d_branch3Nonces, d_hash, ptarget[7], foundnonces+2); + CUDA_SAFE_CALL(cudaStreamSynchronize(gpustream[thr_id])); + if(foundnonces[0] == 0xffffffff) + { + foundnonces[0] = foundnonces[2]; + foundnonces[1] = foundnonces[3]; + } + else + { + if(foundnonces[1] == 0xffffffff) + foundnonces[1] = foundnonces[2]; + } - // das ist der bedingte Branch für 
Keccak512 - quark_keccak512_cpu_hash_64_final(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_jh512_cpu_hash_64_final(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + if(stop_mining) + { + mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr); + } - uint32_t foundnonces[2]; - cuda_check_quarkcoin(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++, foundnonces); if (foundnonces[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundnonces[0]); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], foundnonces[0]); quarkhash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; *hashes_done = pdata[19] - first_nonce + throughput; + if(opt_benchmark) applog(LOG_INFO, "GPU #%d: Found nonce $%08X", device_map[thr_id], foundnonces[0]); // check if there was some other ones... 
if (foundnonces[1] != 0xffffffff) { - pdata[21] = foundnonces[1]; - res++; - if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found second nonce $%08X", thr_id, foundnonces[1]); + if(opt_verify){ be32enc(&endiandata[19], foundnonces[1]); + quarkhash(vhash64, endiandata); + + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = foundnonces[1]; + res++; + if(opt_benchmark) applog(LOG_INFO, "GPU #%d: Found second nonce $%08X", device_map[thr_id], foundnonces[1]); + } + else + { + if (vhash64[7] != Htarg) // don't show message if it is equal but fails fulltest + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundnonces[1]); + } } pdata[19] = foundnonces[0]; - if (opt_benchmark) applog(LOG_INFO, "GPU #%d: Found nonce $%08X", thr_id, foundnonces[0]); + return res; } else { if (vhash64[7] != Htarg) // don't show message if it is equal but fails fulltest - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundnonces[0]); + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundnonces[0]); } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/qubit/deep.cu b/qubit/deep.cu index bb0a2ad1fd..11d6ad5d03 100644 --- a/qubit/deep.cu +++ b/qubit/deep.cu @@ -14,20 +14,19 @@ extern "C" { #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); -extern void qubit_luffa512_cpu_setBlock_80(void *pdata); -extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void 
*ptarget); -extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void qubit_luffa512_cpu_setBlock_80(int thr_id, void *pdata); +extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void qubit_luffa512_cpufinal_setBlock_80(int thr_id, void *pdata, const void *ptarget); +extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *d_hash, uint32_t target, uint32_t *h_found); -extern "C" void deephash(void *state, const void *input) +void deephash(void *state, const void *input) { // luffa1-cubehash2-shavite3-simd4-echo5 sph_luffa512_context ctx_luffa; @@ -51,77 +50,112 @@ extern "C" void deephash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_deep(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_deep(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *h_found = nullptr; + const uint32_t first_nonce = pdata[19]; uint32_t 
endiandata[20]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 256*256*8 - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << 19); // 256*256*8 + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000f; + ptarget[7] = 0x00ff; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if (!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (16 * sizeof(uint32_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); - qubit_luffa512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); + qubit_luffa512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); + CUDA_SAFE_CALL(cudaMallocHost(&(h_found), 4 * sizeof(uint32_t))); - cuda_check_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; - init[thr_id] = true; + init = true; } for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - qubit_luffa512_cpufinal_setBlock_80((void*)endiandata,ptarget); - cuda_check_cpu_setTarget(ptarget); + qubit_luffa512_cpufinal_setBlock_80(thr_id, (void*)endiandata,ptarget); + cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; - 
qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (foundNonce != UINT32_MAX) + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], d_hash, ptarget[7], h_found); + cudaStreamSynchronize(gpustream[thr_id]); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_found[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_found[0]); deephash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; + if (h_found[1] != 0xffffffff) + { + if(opt_verify){ be32enc(&endiandata[19], h_found[1]); + deephash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_found[1]); + } + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } + } + } - pdata[19] = foundNonce; + pdata[19] = h_found[0]; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found nounce %08x", 
device_map[thr_id], h_found[0]); return res; } - else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); + } } } - - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/qubit/doom.cu b/qubit/doom.cu index 93f46ef0d8..479e7b36d4 100644 --- a/qubit/doom.cu +++ b/qubit/doom.cu @@ -10,13 +10,11 @@ extern "C" { #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); -extern void qubit_luffa512_cpu_setBlock_80(void *pdata); -extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget); -extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void qubit_luffa512_cpu_setBlock_80(int thr_id, void *pdata); +extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void qubit_luffa512_cpufinal_setBlock_80(int thr_id, void *pdata, const void *ptarget); +extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void doomhash(void *state, const void *input) { @@ -32,63 +30,75 @@ extern void doomhash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_doom(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - 
unsigned long *hashes_done) +extern int scanhash_doom(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + const uint32_t first_nonce = pdata[19]; uint32_t endiandata[20]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 22); // 256*256*8*8 - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << 22); // 256*256*8*8 + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000f; + ptarget[7] = 0x0000f; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (16 * sizeof(uint32_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); - qubit_luffa512_cpu_init(thr_id, (int) throughput); + qubit_luffa512_cpu_init(thr_id, (int) throughputmax); + mining_has_stopped[thr_id] = false; - init[thr_id] = true; + init = true; } for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - qubit_luffa512_cpufinal_setBlock_80((void*)endiandata,ptarget); + qubit_luffa512_cpufinal_setBlock_80(thr_id, (void*)endiandata,ptarget); do { - int order = 0; - uint32_t foundNonce = qubit_luffa512_cpu_finalhash_80(thr_id, (int) 
throughput, pdata[19], d_hash[thr_id], order++); - if (foundNonce != UINT32_MAX) + uint32_t foundNonce = qubit_luffa512_cpu_finalhash_80(thr_id, (int) throughput, pdata[19], d_hash); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce != UINT32_MAX) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], foundNonce); doomhash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { *hashes_done = min(max_nonce - first_nonce, (uint64_t) pdata[19] - first_nonce + throughput); pdata[19] = foundNonce; return 1; } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundNonce); } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/qubit/qubit.cu b/qubit/qubit.cu index 538fabe3c3..0961eac3e3 100644 --- a/qubit/qubit.cu +++ b/qubit/qubit.cu @@ -11,33 +11,28 @@ extern "C" { } #include "miner.h" - #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; -static uint32_t *h_found[MAX_GPUS]; - extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); -extern void qubit_luffa512_cpu_setBlock_80(void *pdata); -extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void qubit_luffa512_cpu_setBlock_80(int thr_id, void *pdata); +extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t 
startNounce, uint32_t *d_hash); -extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash,const uint32_t simdthreads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t target, uint32_t *h_found, int order); +//extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *d_hash, uint32_t target, uint32_t *h_found); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse, - int order); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, 
uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse); -extern "C" void qubithash(void *state, const void *input) +void qubithash(void *state, const void *input) { // luffa1-cubehash2-shavite3-simd4-echo5 @@ -72,90 +67,148 @@ extern "C" void qubithash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_qubit(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *h_found = nullptr; + uint32_t endiandata[20]; const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 256*256*8 - throughput = min(throughput, (max_nonce - first_nonce)); + + uint32_t intensity = 256 * 256 * 10; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 
256 : 32; + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device_map[thr_id]); + if(strstr(props.name, "1080")) + { + intensity = 256 * 256 * 24; + } + else if(strstr(props.name, "1070")) + { + intensity = 256 * 256 * 24; + } + else if(strstr(props.name, "970")) + { + intensity = 256 * 256 * 16; + } + else if (strstr(props.name, "980")) + { + intensity = 256 * 256 * 24; + } + else if (strstr(props.name, "750 Ti")) + { + intensity = 256 * 256 * 12; + } + else if (strstr(props.name, "750")) + { + intensity = 256 * 256 * 10; + } + else if (strstr(props.name, "960")) + { + intensity = 256 * 256 * 16; + } + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ptarget[7] = 0x0000ff; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); - qubit_luffa512_cpu_init(thr_id, throughput); - x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); - CUDA_CALL_OR_RET_X(cudaMallocHost(&(h_found[thr_id]), 4 * sizeof(uint32_t)), 0); + qubit_luffa512_cpu_init(thr_id, throughputmax); + x11_simd512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); - cuda_check_cpu_init(thr_id, throughput); + 
CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); + CUDA_SAFE_CALL(cudaMallocHost(&(h_found), 4 * sizeof(uint32_t))); + mining_has_stopped[thr_id] = false; - init[thr_id] = true; + init = true; } for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - qubit_luffa512_cpu_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + qubit_luffa512_cpu_setBlock_80(thr_id, (void*)endiandata); do { - int order = 0; // Hash with CUDA - qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], ptarget[7], h_found[thr_id], order++); - if (h_found[thr_id][0] != 0xffffffff) + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id,throughput, pdata[19], d_hash,simdthreads); + x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], d_hash, ptarget[7], h_found); + cudaStreamSynchronize(gpustream[thr_id]); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_found[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], h_found[thr_id][0]); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_found[0]); qubithash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; - // check if there was 
some other ones... *hashes_done = pdata[19] - first_nonce + throughput; - if (h_found[thr_id][1] != 0xffffffff) + if (h_found[1] != 0xffffffff) { - pdata[21] = h_found[thr_id][1]; - res++; - if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, h_found[thr_id][1], vhash64[7], Htarg); + if(opt_verify){ be32enc(&endiandata[19], h_found[1]); + qubithash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_found[1]); + } + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } + } + } - pdata[19] = h_found[thr_id][0]; + pdata[19] = h_found[0]; if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, h_found[thr_id][0], vhash64[7], Htarg); + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_found[0]); return res; } else { if (vhash64[7] != Htarg) { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, h_found[thr_id][0]); + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); } } } - - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/qubit/qubit_luffa512.cu b/qubit/qubit_luffa512.cu index 4a577a62c7..67b85c5fcd 100644 --- a/qubit/qubit_luffa512.cu +++ b/qubit/qubit_luffa512.cu @@ -17,20 +17,31 @@ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ -#include +#ifdef __cplusplus +#include +#include +using namespace std; +#else #include +#include +#endif #include - +#include "miner.h" #include "cuda_helper.h" + + #ifndef UINT32_MAX #define UINT32_MAX UINT_MAX #endif -typedef unsigned char BitSequence; - +static THREAD unsigned char PaddedMessage[128]; __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) __constant__ uint32_t c_Target[8]; +__constant__ uint32_t statebufferpre[8]; +__constant__ uint32_t statechainvpre[40]; + + static uint32_t *h_resNounce[MAX_GPUS]; static uint32_t *d_resNounce[MAX_GPUS]; @@ -40,11 +51,6 @@ static uint32_t *d_resNounce[MAX_GPUS]; static uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX }; #endif -typedef struct { - uint32_t buffer[8]; /* Buffer to be hashed */ - uint32_t chainv[40]; /* Chaining values */ -} hashState; - #define BYTES_SWAP32(x) cuda_swab32(x) #define MULT2(a,j)\ @@ -59,10 +65,10 @@ typedef struct { a[0+(8*j)] = tmp; #define TWEAK(a0,a1,a2,a3,j)\ - a0 = (a0<<(j))|(a0>>(32-j));\ - a1 = (a1<<(j))|(a1>>(32-j));\ - a2 = (a2<<(j))|(a2>>(32-j));\ - a3 = (a3<<(j))|(a3>>(32-j)); + a0 = ROTL32(a0,j);\ + a1 = ROTL32(a1,j);\ + a2 = ROTL32(a2,j);\ + a3 = ROTL32(a3,j); #define STEP(c0,c1)\ SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ @@ -107,18 +113,6 @@ typedef struct { b0 ^= c1; /* initial values of chaining variables */ -__constant__ uint32_t c_IV[40] = { - 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, - 0x6e292011,0x90152df4,0xee058139,0xdef610bb, - 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, - 0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581, - 0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05, - 0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7, - 0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67, - 0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce, - 0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, - 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; - __constant__ uint32_t c_CNS[80] = { 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, 
0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, @@ -141,251 +135,722 @@ __constant__ uint32_t c_CNS[80] = { 0x78602649,0x29131ab6,0x8edae952,0x0fc053c3, 0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31}; +static uint32_t h_CNS[80] = { + 0x303994a6, 0xe0337818, 0xc0e65299, 0x441ba90d, + 0x6cc33a12, 0x7f34d442, 0xdc56983e, 0x9389217f, + 0x1e00108f, 0xe5a8bce6, 0x7800423d, 0x5274baf4, + 0x8f5b7882, 0x26889ba7, 0x96e1db12, 0x9a226e9d, + 0xb6de10ed, 0x01685f3d, 0x70f47aae, 0x05a17cf4, + 0x0707a3d4, 0xbd09caca, 0x1c1e8f51, 0xf4272b28, + 0x707a3d45, 0x144ae5cc, 0xaeb28562, 0xfaa7ae2b, + 0xbaca1589, 0x2e48f1c1, 0x40a46f3e, 0xb923c704, + 0xfc20d9d2, 0xe25e72c1, 0x34552e25, 0xe623bb72, + 0x7ad8818f, 0x5c58a4a4, 0x8438764a, 0x1e38e2e7, + 0xbb6de032, 0x78e38b9d, 0xedb780c8, 0x27586719, + 0xd9847356, 0x36eda57f, 0xa2c78434, 0x703aace7, + 0xb213afa5, 0xe028c9bf, 0xc84ebe95, 0x44756f91, + 0x4e608a22, 0x7e8fce32, 0x56d858fe, 0x956548be, + 0x343b138f, 0xfe191be2, 0xd0ec4e3d, 0x3cb226e5, + 0x2ceb4882, 0x5944a28e, 0xb3ad2208, 0xa1c4c355, + 0xf0d2e9e3, 0x5090d577, 0xac11d7fa, 0x2d1925ab, + 0x1bcb66f2, 0xb46496ac, 0x6f2d9bc9, 0xd1925ab0, + 0x78602649, 0x29131ab6, 0x8edae952, 0x0fc053c3, + 0x3b6ba548, 0x3f014f0c, 0xedae9520, 0xfc053c31 }; + -/***************************************************/ __device__ __forceinline__ -void rnd512(hashState *state) +void rnd512(uint32_t *statebuffer, uint32_t *statechainv) { - int i,j; + int i, j; uint32_t t[40]; uint32_t chainv[8]; uint32_t tmp; #pragma unroll 8 - for(i=0;i<8;i++) { - t[i]=0; -#pragma unroll 5 - for(j=0;j<5;j++) { - t[i] ^= state->chainv[i+8*j]; + for (i = 0; i<8; i++) + { + t[i] = statechainv[i]; +#pragma unroll + for (j = 1; j<5; j++) + { + t[i] ^= statechainv[i + 8 * j]; } } MULT2(t, 0); #pragma unroll 5 - for(j=0;j<5;j++) { + for (j = 0; j<5; j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8*j] ^= t[i]; + for (i = 0; i<8; i++) { + statechainv[i + 8 * j] ^= t[i]; } } #pragma unroll 5 - for(j=0;j<5;j++) { + for (j = 0; j<5; 
j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - t[i+8*j] = state->chainv[i+8*j]; + for (i = 0; i<8; i++) { + t[i + 8 * j] = statechainv[i + 8 * j]; } } #pragma unroll 5 - for(j=0;j<5;j++) { - MULT2(state->chainv, j); + for (j = 0; j<5; j++) { + MULT2(statechainv, j); } #pragma unroll 5 - for(j=0;j<5;j++) { + for (j = 0; j<5; j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[8*j+i] ^= t[8*((j+1)%5)+i]; + for (i = 0; i<8; i++) { + statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; } } #pragma unroll 5 - for(j=0;j<5;j++) { + for (j = 0; j<5; j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - t[i+8*j] = state->chainv[i+8*j]; + for (i = 0; i<8; i++) { + t[i + 8 * j] = statechainv[i + 8 * j]; } } #pragma unroll 5 - for(j=0;j<5;j++) { - MULT2(state->chainv, j); + for (j = 0; j<5; j++) { + MULT2(statechainv, j); } #pragma unroll 5 - for(j=0;j<5;j++) { + for (j = 0; j<5; j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[8*j+i] ^= t[8*((j+4)%5)+i]; + for (i = 0; i<8; i++) { + statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; } } #pragma unroll 5 - for(j=0;j<5;j++) { + for (j = 0; j<5; j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8*j] ^= state->buffer[i]; + for (i = 0; i<8; i++) { + statechainv[i + 8 * j] ^= statebuffer[i]; } - MULT2(state->buffer, 0); + MULT2(statebuffer, 0); } #pragma unroll 8 - for(i=0;i<8;i++) { - chainv[i] = state->chainv[i]; + for (i = 0; i<8; i++) { + chainv[i] = statechainv[i]; + } + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]); + for (i = 0; i<8; i++) + { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i] = chainv[i]; - chainv[i] = state->chainv[i+8]; + for (i = 
0; i<8; i++) { + statechainv[i + 8] = chainv[i]; + chainv[i] = statechainv[i + 16]; } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1); + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]); + for (i = 0; i<8; i++) { + statechainv[i + 16] = chainv[i]; + chainv[i] = statechainv[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8] = chainv[i]; - chainv[i] = state->chainv[i+16]; + for (i = 0; i<8; i++) + { + statechainv[i + 24] = chainv[i]; + chainv[i] = statechainv[i + 32]; } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2); + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]); + for (i = 0; i<8; i++) { + statechainv[i + 32] = chainv[i]; } +} + + +__device__ __forceinline__ +void rnd512_first(uint32_t *statebuffer, uint32_t *statechainv) +{ + uint32_t chainv[8]; + uint32_t tmp; + int i; + + statechainv[0 + 8 * 0] ^= statebuffer[0]; + statechainv[1 + 8 * 0] ^= statebuffer[1]; + statechainv[2 + 8 * 0] ^= statebuffer[2]; + statechainv[3 + 8 * 0] ^= statebuffer[3]; + statechainv[4 + 8 * 0] ^= statebuffer[4]; + + + statechainv[1 + 8 * 1] ^= statebuffer[0]; + statechainv[2 + 8 * 1] ^= statebuffer[1]; + statechainv[3 + 8 * 1] ^= statebuffer[2]; + statechainv[4 + 8 * 1] ^= statebuffer[3]; + statechainv[5 + 8 * 1] ^= statebuffer[4]; + + + statechainv[2 + 8 * 2] ^= statebuffer[0]; + statechainv[3 + 8 * 2] ^= statebuffer[1]; + statechainv[4 + 8 * 2] ^= statebuffer[2]; + statechainv[5 + 8 * 2] ^= statebuffer[3]; + statechainv[6 
+ 8 * 2] ^= statebuffer[4]; + + + statechainv[3 + 8 * 3] ^= statebuffer[0]; + statechainv[4 + 8 * 3] ^= statebuffer[1]; + statechainv[5 + 8 * 3] ^= statebuffer[2]; + statechainv[6 + 8 * 3] ^= statebuffer[3]; + statechainv[7 + 8 * 3] ^= statebuffer[4]; + + statechainv[4 + 8 * 4] ^= statebuffer[0] ^ statebuffer[4]; + statechainv[5 + 8 * 4] ^= statebuffer[1]; + statechainv[6 + 8 * 4] ^= statebuffer[2]; + statechainv[7 + 8 * 4] ^= statebuffer[3]; + statechainv[0 + 8 * 4] ^= statebuffer[4]; + + statechainv[1 + 8 * 4] = (statechainv[1 + 8 * 4] ^ statebuffer[4]); + statechainv[3 + 8 * 4] = (statechainv[3 + 8 * 4] ^ statebuffer[4]); #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+16] = chainv[i]; - chainv[i] = state->chainv[i+24]; + for (i = 0; i<8; i++) { + chainv[i] = statechainv[i]; } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3); +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]); + for (i = 0; i<8; i++) + { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+24] = chainv[i]; - chainv[i] = state->chainv[i+32]; + for (i = 0; i<8; i++) { + statechainv[i + 8] = chainv[i]; + chainv[i] = statechainv[i + 16]; } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4); + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]); + for (i = 0; i<8; i++) { + statechainv[i + 16] = chainv[i]; + chainv[i] = statechainv[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + 
STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+32] = chainv[i]; + for (i = 0; i<8; i++) + { + statechainv[i + 24] = chainv[i]; + chainv[i] = statechainv[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + +#pragma unroll 1 + for (i = 0; i<8; i++) + { + STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + statechainv[i + 32] = chainv[i]; } } +void rnd512cpu(uint32_t *statebuffer, uint32_t *statechainv) +{ + int i, j; + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; + + for (i = 0; i<8; i++) + { + t[i] = statechainv[i]; + for (j = 1; j<5; j++) + { + t[i] ^= statechainv[i + 8 * j]; + } + } + + MULT2(t, 0); + + for (j = 0; j<5; j++) + { + for (i = 0; i<8; i++) + { + statechainv[i + 8 * j] ^= t[i]; + } + } + + for (j = 0; j<5; j++) + { + for (i = 0; i<8; i++) + { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } + + for (j = 0; j<5; j++) + { + MULT2(statechainv, j); + } + + for (j = 0; j<5; j++) + { + for (i = 0; i<8; i++) + { + statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; + } + } + + for (j = 0; j<5; j++) + { + for (i = 0; i<8; i++) + { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } + + for (j = 0; j<5; j++) + { + MULT2(statechainv, j); + } + + for (j = 0; j<5; j++) + { + for (i = 0; i<8; i++) + { + statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; + } + } + + for (j = 0; j<5; j++) + { + for (i = 0; i<8; i++) + { + statechainv[i + 8 * j] ^= statebuffer[i]; + } + MULT2(statebuffer, 0); + } + + for (i = 0; i<8; i++) + { + chainv[i] = statechainv[i]; + } + + for (i = 0; i<8; i++) + { + STEP(h_CNS[(2 * i)], h_CNS[(2 * i) + 1]); + } + + for (i = 0; i<8; i++) + { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + + for (i = 0; i<8; i++) + { + STEP(h_CNS[(2 * i) + 16], h_CNS[(2 * i) + 16 + 1]); + } + + for (i = 0; i<8; i++) + { + statechainv[i + 8] = 
chainv[i]; + chainv[i] = statechainv[i + 16]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + + for (i = 0; i<8; i++) + { + STEP(h_CNS[(2 * i) + 32], h_CNS[(2 * i) + 32 + 1]); + } + + for (i = 0; i<8; i++) + { + statechainv[i + 16] = chainv[i]; + chainv[i] = statechainv[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); + + for (i = 0; i<8; i++) + { + STEP(h_CNS[(2 * i) + 48], h_CNS[(2 * i) + 48 + 1]); + } + + for (i = 0; i<8; i++) + { + statechainv[i + 24] = chainv[i]; + chainv[i] = statechainv[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + + for (i = 0; i<8; i++) + { + STEP(h_CNS[(2 * i) + 64], h_CNS[(2 * i) + 64 + 1]); + } + + for (i = 0; i<8; i++) + { + statechainv[i + 32] = chainv[i]; + } +} + __device__ __forceinline__ -void Update512(hashState *const __restrict__ state, const BitSequence *const __restrict__ data) +void Update512(uint32_t *const __restrict__ statebuffer, uint32_t *const __restrict__ statechainv, const uint32_t *const __restrict__ data) { #pragma unroll 8 - for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]); - rnd512(state); + for (int i = 0; i<8; i++) + statebuffer[i] = BYTES_SWAP32((data)[i]); + rnd512(statebuffer, statechainv); #pragma unroll 8 - for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]); - rnd512(state); + for(int i=0;i<8;i++) + statebuffer[i] = BYTES_SWAP32(((data))[i+8]); + rnd512(statebuffer, statechainv); #pragma unroll 4 - for(int i=0;i<4;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+64))[i]); + for(int i=0;i<4;i++) + statebuffer[i] = BYTES_SWAP32(((data))[i+16]); } - /***************************************************/ __device__ __forceinline__ -void finalization512(hashState *const __restrict__ state, uint32_t *const __restrict__ b) +void rnd512_nullhash(uint32_t *state) +{ + int i, j; + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; + +#pragma unroll 8 + for (i = 0; i<8; i++) { + t[i] = 
state[i + 8 * 0]; +#pragma unroll 4 + for (j = 1; j<5; j++) { + t[i] ^= state[i + 8 * j]; + } + } + + MULT2(t, 0); + +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[i + 8 * j] ^= t[i]; + } + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + t[i + 8 * j] = state[i + 8 * j]; + } + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { + MULT2(state, j); + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; + } + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + t[i + 8 * j] = state[i + 8 * j]; + } + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { + MULT2(state, j); + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; + } + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + chainv[i] = state[i]; + } + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[i] = chainv[i]; + chainv[i] = state[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[i + 8] = chainv[i]; + chainv[i] = state[i + 16]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[i + 16] = chainv[i]; + chainv[i] = state[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[i + 24] = chainv[i]; + chainv[i] = 
state[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + state[i + 32] = chainv[i]; + } +} + + +__device__ __forceinline__ +void finalization512(uint32_t *const __restrict__ statebuffer, uint32_t *const __restrict__ statechainv, uint32_t *const __restrict__ b) { int i,j; - state->buffer[4] = 0x80000000; + statebuffer[4] = 0x80000000; #pragma unroll 3 - for(int i=5;i<8;i++) state->buffer[i] = 0; - rnd512(state); + for(int i=5;i<8;i++) + statebuffer[i] = 0; + rnd512(statebuffer, statechainv); - /*---- blank round with m=0 ----*/ -#pragma unroll 8 - for(i=0;i<8;i++) state->buffer[i] =0; - rnd512(state); + rnd512_nullhash(statechainv); #pragma unroll 8 for(i=0;i<8;i++) { b[i] = 0; #pragma unroll 5 for(j=0;j<5;j++) { - b[i] ^= state->chainv[i+8*j]; + b[i] ^= statechainv[i+8*j]; } b[i] = BYTES_SWAP32((b[i])); } -#pragma unroll 8 - for(i=0;i<8;i++) state->buffer[i]=0; - rnd512(state); + rnd512_nullhash(statechainv); #pragma unroll 8 - for(i=0;i<8;i++) { + for(i=0;i<8;i++) + { b[8+i] = 0; #pragma unroll 5 - for(j=0;j<5;j++) { - b[8+i] ^= state->chainv[i+8*j]; + for(j=0;j<5;j++) + { + b[8+i] ^= statechainv[i+8*j]; } b[8+i] = BYTES_SWAP32((b[8+i])); } } - /***************************************************/ // Die Hash-Funktion __global__ -void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) +#if __CUDA_ARCH__ == 500 +__launch_bounds__(256, 4) +#endif +void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *outputHash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = startNounce + thread; - union { - uint64_t buf64[16]; - uint32_t buf32[32]; - } buff; + int i, j; + const uint32_t nounce = 
startNounce + thread; + uint64_t buff[16]; -#pragma unroll 16 - for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i]; +#pragma unroll + for (int i=8; i < 16; ++i) + buff[i] = c_PaddedMessage80[i]; // die Nounce durch die thread-spezifische ersetzen - buff.buf64[9] = REPLACE_HIWORD(buff.buf64[9], cuda_swab32(nounce)); + buff[9] = REPLACE_HIWORD(buff[9], cuda_swab32(nounce)); + uint32_t statebuffer[8]; + uint32_t statechainv[40]; + +#pragma unroll 4 + for (int i = 0; i<4; i++) + statebuffer[i] = BYTES_SWAP32(((uint32_t*)buff)[i + 16]); +#pragma unroll 4 + for (int i = 4; i<8; i++) + statebuffer[i] = statebufferpre[i]; +#pragma unroll + for (int i = 0; i<40; i++) + statechainv[i] = statechainvpre[i]; + + uint32_t *outHash = outputHash + 16 * thread; + + statebuffer[4] = 0x80000000; + + rnd512_first(statebuffer, statechainv); + rnd512_nullhash(statechainv); + + + #pragma unroll + for (i = 0; i<8; i++) + { + buff[i] = statechainv[i]; + #pragma unroll + for (j = 1; j<5; j++) { + buff[i] ^= statechainv[i + 8 * j]; + } + outHash[i] = BYTES_SWAP32((buff[i])); + } + + rnd512_nullhash(statechainv); - hashState state; -#pragma unroll 40 - for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; #pragma unroll 8 - for(int i=0;i<8;i++) state.buffer[i] = 0; - Update512(&state, (BitSequence*)buff.buf32); - uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; - finalization512(&state, (uint32_t*)outHash); + for (i = 0; i<8; i++) + { + buff[8 + i] = statechainv[i]; +#pragma unroll 5 + for (j = 1; j<5; j++) + { + buff[8 + i] ^= statechainv[i + 8 * j]; + } + outHash[8 + i] = BYTES_SWAP32((buff[8 + i])); + } + + } } -__global__ -void qubit_luffa512_gpu_finalhash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) +__global__ __launch_bounds__(256,4) +void qubit_luffa512_gpu_finalhash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce, int thr_id) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if 
(thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = startNounce + thread; + const uint32_t nounce = startNounce + thread; union { uint64_t buf64[16]; uint32_t buf32[32]; @@ -393,20 +858,29 @@ void qubit_luffa512_gpu_finalhash_80(uint32_t threads, uint32_t startNounce, voi uint32_t Hash[16]; #pragma unroll 16 - for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i]; + for (int i=0; i < 16; ++i) + buff.buf64[i] = c_PaddedMessage80[i]; // Tested nonce buff.buf64[9] = REPLACE_HIWORD(buff.buf64[9], cuda_swab32(nounce)); - hashState state; - #pragma unroll 40 - for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; - - #pragma unroll 8 - for(int i=0;i<8;i++) state.buffer[i] = 0; - - Update512(&state, (BitSequence*)buff.buf32); - finalization512(&state, Hash); + uint32_t statebuffer[8]; + uint32_t statechainv[40] = + { + 0x6d251e69, 0x44b051e0, 0x4eaa6fb4, 0xdbf78465, + 0x6e292011, 0x90152df4, 0xee058139, 0xdef610bb, + 0xc3b44b95, 0xd9d2f256, 0x70eee9a0, 0xde099fa3, + 0x5d9b0557, 0x8fc944b3, 0xcf1ccf0e, 0x746cd581, + 0xf7efc89d, 0x5dba5781, 0x04016ce5, 0xad659c05, + 0x0306194f, 0x666d1836, 0x24aa230a, 0x8b264ae7, + 0x858075d5, 0x36d79cce, 0xe571f7d7, 0x204b1f67, + 0x35870c6a, 0x57e9e923, 0x14bcb808, 0x7cde72ce, + 0x6c68e9be, 0x5ec41e22, 0xc825b7c7, 0xaffb4363, + 0xf5df3999, 0x0fc688f1, 0xb07224cc, 0x03e86cea + }; + + Update512(statebuffer, statechainv, buff.buf32); + finalization512(statebuffer, statechainv, Hash); /* dont ask me why not a simple if (Hash[i] > c_Target[i]) return; * we lose 20% in perfs without the position test */ @@ -450,54 +924,136 @@ void qubit_luffa512_cpu_init(int thr_id, uint32_t threads) } __host__ -uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash,int order) +uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { uint32_t result = 
UINT32_MAX; - cudaMemset(d_resNounce[thr_id], 0xff, NBN * sizeof(uint32_t)); + cudaMemsetAsync(d_resNounce[thr_id], 0xff, NBN * sizeof(uint32_t), gpustream[thr_id]); const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - qubit_luffa512_gpu_finalhash_80 <<>> (threads, startNounce, d_outputHash, d_resNounce[thr_id]); + qubit_luffa512_gpu_finalhash_80 <<>> (threads, startNounce, d_outputHash, d_resNounce[thr_id], thr_id); //MyStreamSynchronize(NULL, order, thr_id); - if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost)) { - result = h_resNounce[thr_id][0]; + CUDA_SAFE_CALL(cudaMemcpyAsync(h_resNounce[thr_id], d_resNounce[thr_id], NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); + cudaStreamSynchronize(gpustream[thr_id]); + result = h_resNounce[thr_id][0]; #if NBN > 1 - extra_results[0] = h_resNounce[thr_id][1]; + extra_results[0] = h_resNounce[thr_id][1]; #endif - } return result; } +__host__ void qubit_cpu_precalc(int thr_id) +{ + uint32_t tmp,i,j; + uint32_t statebuffer[8]; + uint32_t t[40]; + uint32_t statechainv[40] = + { + 0x6d251e69, 0x44b051e0, 0x4eaa6fb4, 0xdbf78465, + 0x6e292011, 0x90152df4, 0xee058139, 0xdef610bb, + 0xc3b44b95, 0xd9d2f256, 0x70eee9a0, 0xde099fa3, + 0x5d9b0557, 0x8fc944b3, 0xcf1ccf0e, 0x746cd581, + 0xf7efc89d, 0x5dba5781, 0x04016ce5, 0xad659c05, + 0x0306194f, 0x666d1836, 0x24aa230a, 0x8b264ae7, + 0x858075d5, 0x36d79cce, 0xe571f7d7, 0x204b1f67, + 0x35870c6a, 0x57e9e923, 0x14bcb808, 0x7cde72ce, + 0x6c68e9be, 0x5ec41e22, 0xc825b7c7, 0xaffb4363, + 0xf5df3999, 0x0fc688f1, 0xb07224cc, 0x03e86cea + }; + + for (int i = 0; i<8; i++) + statebuffer[i] = BYTES_SWAP32(*(((uint32_t*)PaddedMessage) + i)); + rnd512cpu(statebuffer, statechainv); + + for (int i = 0; i<8; i++) + statebuffer[i] = BYTES_SWAP32(*(((uint32_t*)PaddedMessage) + i + 8)); + + rnd512cpu(statebuffer, statechainv); + + + for (int i 
= 0; i<8; i++) + { + t[i] = statechainv[i]; + for (int j = 1; j<5; j++) + { + t[i] ^= statechainv[i + 8 * j]; + } + } + + MULT2(t, 0); + + for (int j = 0; j<5; j++) { + for (int i = 0; i<8; i++) { + statechainv[i + 8 * j] ^= t[i]; + } + } + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } + + for (j = 0; j<5; j++) { + MULT2(statechainv, j); + } + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) { + statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; + } + } + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } + + for (j = 0; j<5; j++) { + MULT2(statechainv, j); + } + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) { + statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; + } + } + + + + cudaMemcpyToSymbolAsync(statebufferpre, statebuffer, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); + cudaMemcpyToSymbolAsync(statechainvpre, statechainv, 40 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); +} + __host__ -void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash,int order) +void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - qubit_luffa512_gpu_hash_80 <<>> (threads, startNounce, d_outputHash); + qubit_luffa512_gpu_hash_80 <<>> (threads, startNounce, d_outputHash); } __host__ -void qubit_luffa512_cpu_setBlock_80(void *pdata) +void qubit_luffa512_cpu_setBlock_80(int thr_id, void *pdata) { - unsigned char PaddedMessage[128]; - memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 48); + memset(PaddedMessage + 80, 0, 48); PaddedMessage[80] = 0x80; PaddedMessage[111] = 1; PaddedMessage[126] = 0x02; PaddedMessage[127] = 0x80; - CUDA_SAFE_CALL(cudaMemcpyToSymbol( c_PaddedMessage80, 
PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_PaddedMessage80, PaddedMessage, 16 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + qubit_cpu_precalc(thr_id); } __host__ -void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget) +void qubit_luffa512_cpufinal_setBlock_80(int thr_id, void *pdata, const void *ptarget) { unsigned char PaddedMessage[128]; @@ -508,6 +1064,6 @@ void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget) PaddedMessage[126] = 0x02; PaddedMessage[127] = 0x80; - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_Target, ptarget, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(c_PaddedMessage80, PaddedMessage, 16 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } diff --git a/scrypt.c b/scrypt.c deleted file mode 100644 index c20c2e47d3..0000000000 --- a/scrypt.c +++ /dev/null @@ -1,756 +0,0 @@ -/* - * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ - -#include "cpuminer-config.h" -#include "miner.h" - -#include -#include -#include - -static const uint32_t keypad[12] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 -}; -static const uint32_t innerpad[11] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 -}; -static const uint32_t outerpad[8] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 -}; -static const uint32_t finalblk[16] = { - 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[8]; - uint32_t pad[16]; - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 16, 16); - memcpy(pad + 4, keypad, 48); - sha256_transform(tstate, pad, 0); - memcpy(ihash, tstate, 32); - - sha256_init(ostate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform(ostate, pad, 0); - - sha256_init(tstate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 16; i++) - 
pad[i] = 0x36363636; - sha256_transform(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[8], ostate2[8]; - uint32_t ibuf[16], obuf[16]; - int i, j; - - memcpy(istate, tstate, 32); - sha256_transform(istate, salt, 0); - - memcpy(ibuf, salt + 16, 16); - memcpy(ibuf + 5, innerpad, 44); - memcpy(obuf + 8, outerpad, 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 32); - ibuf[4] = i + 1; - sha256_transform(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 32); - sha256_transform(ostate2, obuf, 0); - for (j = 0; j < 8; j++) - output[8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, - const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[16]; - int i; - - sha256_transform(tstate, salt, 1); - sha256_transform(tstate, salt + 16, 1); - sha256_transform(tstate, finalblk, 0); - memcpy(buf, tstate, 32); - memcpy(buf + 8, outerpad, 32); - - sha256_transform(ostate, buf, 0); - for (i = 0; i < 8; i++) - output[i] = swab32(ostate[i]); -} - - -#if HAVE_SHA256_4WAY - -static const uint32_t keypad_4way[4 * 12] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000280, 0x00000280, 0x00000280, 0x00000280 -}; -static const uint32_t innerpad_4way[4 * 11] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 
0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0 -}; -static const uint32_t outerpad_4way[4 * 8] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000300, 0x00000300, 0x00000300, 0x00000300 -}; -static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = { - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000620, 0x00000620, 0x00000620, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[4 * 8] __attribute__((aligned(16))); - uint32_t pad[4 * 16] __attribute__((aligned(16))); - int i; - - /* tstate 
is assumed to contain the midstate of key */ - memcpy(pad, key + 4 * 16, 4 * 16); - memcpy(pad + 4 * 4, keypad_4way, 4 * 48); - sha256_transform_4way(tstate, pad, 0); - memcpy(ihash, tstate, 4 * 32); - - sha256_init_4way(ostate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 4 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_4way(ostate, pad, 0); - - sha256_init_4way(tstate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 4 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_4way(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[4 * 8] __attribute__((aligned(16))); - uint32_t ostate2[4 * 8] __attribute__((aligned(16))); - uint32_t ibuf[4 * 16] __attribute__((aligned(16))); - uint32_t obuf[4 * 16] __attribute__((aligned(16))); - int i, j; - - memcpy(istate, tstate, 4 * 32); - sha256_transform_4way(istate, salt, 0); - - memcpy(ibuf, salt + 4 * 16, 4 * 16); - memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); - memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 4 * 32); - ibuf[4 * 4 + 0] = i + 1; - ibuf[4 * 4 + 1] = i + 1; - ibuf[4 * 4 + 2] = i + 1; - ibuf[4 * 4 + 3] = i + 1; - sha256_transform_4way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 4 * 32); - sha256_transform_4way(ostate2, obuf, 0); - for (j = 0; j < 4 * 8; j++) - output[4 * 8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[4 * 16] __attribute__((aligned(16))); - int i; - - sha256_transform_4way(tstate, salt, 1); - sha256_transform_4way(tstate, salt + 4 * 16, 1); - sha256_transform_4way(tstate, finalblk_4way, 0); - memcpy(buf, tstate, 4 * 32); - memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); - - sha256_transform_4way(ostate, buf, 0); - for (i = 0; 
i < 4 * 8; i++) - output[i] = swab32(ostate[i]); -} - -#endif /* HAVE_SHA256_4WAY */ - - -#if HAVE_SHA256_8WAY - -static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = { - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[8 * 8] __attribute__((aligned(32))); - uint32_t pad[8 * 16] __attribute__((aligned(32))); - int i; - - /* 
tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - pad[8 * 4 + i] = 0x80000000; - memset(pad + 8 * 5, 0x00, 8 * 40); - for (i = 0; i < 8; i++) - pad[8 * 15 + i] = 0x00000280; - sha256_transform_8way(tstate, pad, 0); - memcpy(ihash, tstate, 8 * 32); - - sha256_init_8way(ostate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 8 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_8way(ostate, pad, 0); - - sha256_init_8way(tstate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 8 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_8way(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[8 * 8] __attribute__((aligned(32))); - uint32_t ostate2[8 * 8] __attribute__((aligned(32))); - uint32_t ibuf[8 * 16] __attribute__((aligned(32))); - uint32_t obuf[8 * 16] __attribute__((aligned(32))); - int i, j; - - memcpy(istate, tstate, 8 * 32); - sha256_transform_8way(istate, salt, 0); - - memcpy(ibuf, salt + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - ibuf[8 * 5 + i] = 0x80000000; - memset(ibuf + 8 * 6, 0x00, 8 * 36); - for (i = 0; i < 8; i++) - ibuf[8 * 15 + i] = 0x000004a0; - - for (i = 0; i < 8; i++) - obuf[8 * 8 + i] = 0x80000000; - memset(obuf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - obuf[8 * 15 + i] = 0x00000300; - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 8 * 32); - ibuf[8 * 4 + 0] = i + 1; - ibuf[8 * 4 + 1] = i + 1; - ibuf[8 * 4 + 2] = i + 1; - ibuf[8 * 4 + 3] = i + 1; - ibuf[8 * 4 + 4] = i + 1; - ibuf[8 * 4 + 5] = i + 1; - ibuf[8 * 4 + 6] = i + 1; - ibuf[8 * 4 + 7] = i + 1; - sha256_transform_8way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 8 * 32); - sha256_transform_8way(ostate2, obuf, 0); - for (j = 0; j < 8 * 8; j++) - output[8 * 8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void 
PBKDF2_SHA256_128_32_8way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[8 * 16] __attribute__((aligned(32))); - int i; - - sha256_transform_8way(tstate, salt, 1); - sha256_transform_8way(tstate, salt + 8 * 16, 1); - sha256_transform_8way(tstate, finalblk_8way, 0); - - memcpy(buf, tstate, 8 * 32); - for (i = 0; i < 8; i++) - buf[8 * 8 + i] = 0x80000000; - memset(buf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - buf[8 * 15 + i] = 0x00000300; - sha256_transform_8way(ostate, buf, 0); - - for (i = 0; i < 8 * 8; i++) - output[i] = swab32(ostate[i]); -} - -#endif /* HAVE_SHA256_8WAY */ - - -#if defined(__x86_64__) - -#define SCRYPT_MAX_WAYS 1 -#define HAVE_SCRYPT_3WAY 0 -#define scrypt_best_throughput() 1 -static void scrypt_core(uint32_t *X, uint32_t *V); -void scrypt_core_3way(uint32_t *X, uint32_t *V); -#if defined(USE_AVX2) -#undef SCRYPT_MAX_WAYS -#define SCRYPT_MAX_WAYS 21 -#define HAVE_SCRYPT_6WAY 0 -void scrypt_core_6way(uint32_t *X, uint32_t *V); -#endif - -#elif defined(__i386__) - -#define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 -static void scrypt_core(uint32_t *X, uint32_t *V); - -#elif defined(__arm__) && defined(__APCS_32__) - -static void scrypt_core(uint32_t *X, uint32_t *V); -#if defined(__ARM_NEON__) -#undef HAVE_SHA256_4WAY -#define SCRYPT_MAX_WAYS 1 -#define HAVE_SCRYPT_3WAY 0 -#define scrypt_best_throughput() 1 -void scrypt_core_3way(uint32_t *X, uint32_t *V); -#endif - -#endif - -static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) -{ - uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; - int i; - - x00 = (B[ 0] ^= Bx[ 0]); - x01 = (B[ 1] ^= Bx[ 1]); - x02 = (B[ 2] ^= Bx[ 2]); - x03 = (B[ 3] ^= Bx[ 3]); - x04 = (B[ 4] ^= Bx[ 4]); - x05 = (B[ 5] ^= Bx[ 5]); - x06 = (B[ 6] ^= Bx[ 6]); - x07 = (B[ 7] ^= Bx[ 7]); - x08 = (B[ 8] ^= Bx[ 8]); - x09 = (B[ 9] ^= Bx[ 9]); - x10 = (B[10] ^= Bx[10]); - x11 = (B[11] ^= Bx[11]); - x12 = (B[12] ^= Bx[12]); 
- x13 = (B[13] ^= Bx[13]); - x14 = (B[14] ^= Bx[14]); - x15 = (B[15] ^= Bx[15]); - for (i = 0; i < 8; i += 2) { -#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) - /* Operate on columns. */ - x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); - x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); - - x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); - x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); - - x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); - x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); - - x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); - x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); - - /* Operate on rows. */ - x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); - x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); - - x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); - x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); - - x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); - x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); - - x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); - x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); -#undef R - } - B[ 0] += x00; - B[ 1] += x01; - B[ 2] += x02; - B[ 3] += x03; - B[ 4] += x04; - B[ 5] += x05; - B[ 6] += x06; - B[ 7] += x07; - B[ 8] += x08; - B[ 9] += x09; - B[10] += x10; - B[11] += x11; - B[12] += x12; - B[13] += x13; - B[14] += x14; - B[15] += x15; -} - -static inline void scrypt_core(uint32_t *X, uint32_t *V) -{ - uint32_t i, j, k; - - for (i = 0; i < 1024; i++) { - memcpy(&V[i * 32], X, 128); - xor_salsa8(&X[0], &X[16]); - xor_salsa8(&X[16], &X[0]); - } - for (i = 0; i < 1024; i++) { - j = 32 * (X[16] & 1023); - for (k = 0; k < 32; k++) - X[k] ^= V[j + k]; - xor_salsa8(&X[0], &X[16]); - xor_salsa8(&X[16], &X[0]); - } -} - -#ifndef SCRYPT_MAX_WAYS -#define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 -#endif - -#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63) - -unsigned char *scrypt_buffer_alloc() -{ - return (unsigned char *)malloc(SCRYPT_BUFFER_SIZE); -} - -static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, - uint32_t *midstate, unsigned char 
*scratchpad) -{ - uint32_t tstate[8], ostate[8]; - uint32_t X[32]; - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate, midstate, 32); - HMAC_SHA256_80_init(input, tstate, ostate); - PBKDF2_SHA256_80_128(tstate, ostate, input, X); - - scrypt_core(X, V); - - PBKDF2_SHA256_128_32(tstate, ostate, X, output); -} - -#if HAVE_SHA256_4WAY -static void scrypt_1024_1_1_256_4way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[4 * 8] __attribute__((aligned(128))); - uint32_t ostate[4 * 8] __attribute__((aligned(128))); - uint32_t W[4 * 32] __attribute__((aligned(128))); - uint32_t X[4 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = input[k * 20 + i]; - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[4 * i + k] = midstate[i]; - HMAC_SHA256_80_init_4way(W, tstate, ostate); - PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[k * 32 + i] = W[4 * i + k]; - scrypt_core(X + 0 * 32, V); - scrypt_core(X + 1 * 32, V); - scrypt_core(X + 2 * 32, V); - scrypt_core(X + 3 * 32, V); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = X[k * 32 + i]; - PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[k * 8 + i] = W[4 * i + k]; -} -#endif /* HAVE_SHA256_4WAY */ - -#if HAVE_SCRYPT_3WAY - -static void scrypt_1024_1_1_256_3way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[3 * 8], ostate[3 * 8]; - uint32_t X[3 * 32] __attribute__((aligned(64))); - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate + 0, midstate, 32); - memcpy(tstate + 8, midstate, 32); - 
memcpy(tstate + 16, midstate, 32); - HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8); - HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16); - PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0); - PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32); - PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64); - - scrypt_core_3way(X, V); - - PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0); - PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8); - PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16); -} - -#if HAVE_SHA256_4WAY -static void scrypt_1024_1_1_256_12way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[12 * 8] __attribute__((aligned(128))); - uint32_t ostate[12 * 8] __attribute__((aligned(128))); - uint32_t W[12 * 32] __attribute__((aligned(128))); - uint32_t X[12 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[32 * j + 4 * i + k] = midstate[i]; - HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); - HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); - PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; - scrypt_core_3way(X + 0 * 96, V); - scrypt_core_3way(X + 1 * 96, V); - 
scrypt_core_3way(X + 2 * 96, V); - scrypt_core_3way(X + 3 * 96, V); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; -} -#endif /* HAVE_SHA256_4WAY */ - -#endif /* HAVE_SCRYPT_3WAY */ - -#if HAVE_SCRYPT_6WAY -static void scrypt_1024_1_1_256_24way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[24 * 8] __attribute__((aligned(128))); - uint32_t ostate[24 * 8] __attribute__((aligned(128))); - uint32_t W[24 * 32] __attribute__((aligned(128))); - uint32_t X[24 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 8; k++) - W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 8; k++) - tstate[8 * 8 * j + 8 * i + k] = midstate[i]; - HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64); - HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128); - PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256); - PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 8; k++) - X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; - scrypt_core_6way(X + 0 * 32, V); - scrypt_core_6way(X + 6 * 32, V); - scrypt_core_6way(X + 12 * 
32, V); - scrypt_core_6way(X + 18 * 32, V); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 8; k++) - W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256); - PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 8; k++) - output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; -} -#endif /* HAVE_SCRYPT_6WAY */ - -int scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; - uint32_t midstate[8]; - uint32_t n = pdata[19] - 1; - const uint32_t Htarg = ptarget[7]; - uint32_t throughput = scrypt_best_throughput(); - int i; - -#if HAVE_SHA256_4WAY - if (sha256_use_4way()) - throughput *= 4; -#endif - - for (i = 0; i < throughput; i++) - memcpy(data + i * 20, pdata, 80); - - sha256_init(midstate); - sha256_transform(midstate, data, 0); - - do { - for (i = 0; i < throughput; i++) - data[i * 20 + 19] = ++n; - -#if HAVE_SHA256_4WAY - if (throughput == 4) - scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf); - else -#endif -#if HAVE_SCRYPT_3WAY && HAVE_SHA256_4WAY - if (throughput == 12) - scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf); - else -#endif -#if HAVE_SCRYPT_6WAY - if (throughput == 24) - scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf); - else -#endif -#if HAVE_SCRYPT_3WAY - if (throughput == 3) - scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf); - else -#endif - scrypt_1024_1_1_256(data, hash, midstate, scratchbuf); - - for (i = 0; i < throughput; i++) { - if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) { - *hashes_done = n - pdata[19] + 1; - pdata[19] = data[i * 20 + 19]; - return 1; - } - } - 
} while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - pdata[19] + 1; - pdata[19] = n; - return 0; -} diff --git a/skein.cu b/skein.cu new file mode 100644 index 0000000000..8336a91c14 --- /dev/null +++ b/skein.cu @@ -0,0 +1,138 @@ +/** +* SKEIN512 80 + SHA256 64 +* by tpruvot@github - 2015 +* Optimized by sp-hash@github - 2015 +*/ + +extern "C" { +#include "sph/sph_skein.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include + +extern void skein512_cpu_setBlock_80(int thr_id,void *pdata); +extern void skein512_cpu_hash_80_50(int thr_id, uint32_t threads, uint32_t startNounce, int swapu, uint64_t target, uint32_t *h_found); +extern void skein512_cpu_hash_80_52(int thr_id, uint32_t threads, uint32_t startNounce, int swapu, uint64_t target, uint32_t *h_found); + +void skeincoinhash(void *output, const void *input) +{ + sph_skein512_context ctx_skein; + SHA256_CTX sha256; + + uint32_t hash[16]; + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, input, 80); + sph_skein512_close(&ctx_skein, hash); + + SHA256_Init(&sha256); + SHA256_Update(&sha256, (unsigned char *)hash, 64); + SHA256_Final((unsigned char *)hash, &sha256); + + memcpy(output, hash, 32); +} + +static __inline uint32_t swab32_if(uint32_t val, bool iftrue) +{ + return iftrue ? swab32(val) : val; +} + +int scanhash_skeincoin(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) +{ + static THREAD uint32_t *foundnonces = nullptr; + + const uint32_t first_nonce = pdata[19]; + const int swap = 1; + + uint32_t intensity = (device_sm[device_map[thr_id]] > 500) ? 
1 << 28 : 1 << 27;; + uint32_t throughput = device_intensity(device_map[thr_id], __func__, intensity); // 256*4096 + throughput = min(throughput, max_nonce - first_nonce) & 0xfffffc00; + + if (opt_benchmark) + { + ((uint64_t*)ptarget)[3] = 0x3000f0000; + } + uint64_t target = ((uint64_t*)ptarget)[3]; + + static THREAD volatile bool init = false; + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMallocHost(&foundnonces, 2 * 4)); + mining_has_stopped[thr_id] = false; + init = true; + } + + uint32_t endiandata[20]; + for (int k = 0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + skein512_cpu_setBlock_80(thr_id, (void*)endiandata); + do + { + *hashes_done = pdata[19] - first_nonce + throughput; + + if (device_sm[device_map[thr_id]] > 500) + skein512_cpu_hash_80_52(thr_id, throughput, pdata[19], swap, target, foundnonces); + else + skein512_cpu_hash_80_50(thr_id, throughput, pdata[19], swap, target, foundnonces); + + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundnonces[0] != 0xffffffff) + { + uint32_t vhash64[8]={0}; + + endiandata[19] = swab32_if(foundnonces[0], swap); + + skeincoinhash(vhash64, endiandata); + + uint64_t test = ((uint64_t*)vhash64)[3]; + if (test <= target && fulltest(vhash64, ptarget)) + { + int res = 1; + if (opt_debug || opt_benchmark) + applog(LOG_INFO, "GPU #%d: found nonce $%08X", device_map[thr_id], foundnonces[0]); + if (foundnonces[1] != 0xffffffff) + { + endiandata[19] = swab32_if(foundnonces[1], swap); + skeincoinhash(vhash64, endiandata); + uint64_t test2 = ((uint64_t*)vhash64)[3]; + if (test2 <= target && fulltest(vhash64, ptarget)) + { + if (opt_debug || opt_benchmark) + applog(LOG_INFO, "GPU #%d: found nonce $%08X", device_map[thr_id], foundnonces[1]); + 
pdata[19 + res] = swab32_if(foundnonces[1], !swap); + res++; + } + else + { + if (test2 != target) applog(LOG_WARNING, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundnonces[1]); + } + } + pdata[19] = swab32_if(foundnonces[0], !swap); + return res; + } + else + { + if (test != target) + applog(LOG_WARNING, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundnonces[0]); + else + applog(LOG_WARNING, "Lost work: #%d", test); + + } + } + + pdata[19] += throughput; + + } while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + + *hashes_done = pdata[19] - first_nonce ; + return 0; +} diff --git a/sph/neoscrypt.cpp b/sph/neoscrypt.cpp new file mode 100644 index 0000000000..ebbb2074ed --- /dev/null +++ b/sph/neoscrypt.cpp @@ -0,0 +1,994 @@ +/* + * Copyright (c) 2009 Colin Percival, 2011 ArtForz + * Copyright (c) 2012 Andrew Moon (floodyberry) + * Copyright (c) 2012 Samuel Neves + * Copyright (c) 2014 John Doering + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#include +#include +#include + +#include "neoscrypt.h" + +extern void proper_exit(int reason); +enum +{ + LOG_ERR, + LOG_WARNING, + LOG_NOTICE, + LOG_INFO, + LOG_DEBUG, + /* custom notices */ + LOG_BLUE = 0x10, +}; +extern void applog(int prio, const char *fmt, ...); + +#ifdef _WIN32 +/* sizeof(unsigned long) = 4 for MinGW64 */ +typedef unsigned long long ulong; +#else +typedef unsigned long ulong; +#endif +typedef unsigned int uint; +typedef unsigned char uchar; + +#define MIN(a, b) ((a) < (b) ? a : b) +#define MAX(a, b) ((a) > (b) ? 
a : b) + + +/* SHA-256 */ + +static const uint32_t sha256_constants[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +#define Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) (((x | y) & z) | (x & y)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#define G0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3)) +#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10)) +#define W0(in,i) (U8TO32_BE(&in[i * 4])) +#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) +#define STEP(i) \ + t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ + t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \ + r[7] = r[6]; \ + r[6] = r[5]; \ + r[5] = r[4]; \ + r[4] = r[3] + t0; \ + r[3] = r[2]; \ + r[2] = r[1]; \ + r[1] = r[0]; \ + r[0] = t0 + t1; + + +typedef struct sha256_hash_state_t { + uint32_t H[8]; + uint64_t T; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} sha256_hash_state; + + +static void sha256_blocks(sha256_hash_state *S, const uint8_t *in, size_t blocks) { + uint32_t r[8], w[64], t0, t1; + size_t i; + + for(i = 0; i < 8; i++) + r[i] = S->H[i]; + + while(blocks--) { + for(i = 0; i < 16; i++) { + w[i] = W0(in, i); + } + 
for(i = 16; i < 64; i++) { + w[i] = W1(i); + } + for(i = 0; i < 64; i++) { + STEP(i); + } + for(i = 0; i < 8; i++) { + r[i] += S->H[i]; + S->H[i] = r[i]; + } + S->T += SCRYPT_HASH_BLOCK_SIZE * 8; + in += SCRYPT_HASH_BLOCK_SIZE; + } +} + +static void neoscrypt_hash_init_sha256(sha256_hash_state *S) { + S->H[0] = 0x6a09e667; + S->H[1] = 0xbb67ae85; + S->H[2] = 0x3c6ef372; + S->H[3] = 0xa54ff53a; + S->H[4] = 0x510e527f; + S->H[5] = 0x9b05688c; + S->H[6] = 0x1f83d9ab; + S->H[7] = 0x5be0cd19; + S->T = 0; + S->leftover = 0; +} + +static void neoscrypt_hash_update_sha256(sha256_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if(S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if(S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + sha256_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if(blocks) { + sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if(S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void neoscrypt_hash_finish_sha256(sha256_hash_state *S, uint8_t *hash) { + uint64_t t = S->T + (S->leftover * 8); + + S->buffer[S->leftover] = 0x80; + if(S->leftover <= 55) { + memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + sha256_blocks(S, S->buffer, 1); + memset(S->buffer, 0, 56); + } + + U64TO8_BE(S->buffer + 56, t); + sha256_blocks(S, S->buffer, 1); + + U32TO8_BE(&hash[ 0], S->H[0]); + U32TO8_BE(&hash[ 4], S->H[1]); + U32TO8_BE(&hash[ 8], S->H[2]); + U32TO8_BE(&hash[12], S->H[3]); + U32TO8_BE(&hash[16], S->H[4]); + U32TO8_BE(&hash[20], S->H[5]); + U32TO8_BE(&hash[24], S->H[6]); + 
U32TO8_BE(&hash[28], S->H[7]); +} + +static void neoscrypt_hash_sha256(hash_digest hash, const uint8_t *m, size_t mlen) { + sha256_hash_state st; + neoscrypt_hash_init_sha256(&st); + neoscrypt_hash_update_sha256(&st, m, mlen); + neoscrypt_hash_finish_sha256(&st, hash); +} + + +/* HMAC for SHA-256 */ + +typedef struct sha256_hmac_state_t { + sha256_hash_state inner, outer; +} sha256_hmac_state; + +static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen) { + uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; + size_t i; + + neoscrypt_hash_init_sha256(&st->inner); + neoscrypt_hash_init_sha256(&st->outer); + + if(keylen <= SCRYPT_HASH_BLOCK_SIZE) { + /* use the key directly if it's <= blocksize bytes */ + memcpy(pad, key, keylen); + } else { + /* if it's > blocksize bytes, hash it */ + neoscrypt_hash_sha256(pad, key, keylen); + } + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) */ + for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= 0x36; + neoscrypt_hash_update_sha256(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= (0x5c ^ 0x36); + neoscrypt_hash_update_sha256(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); +} + +static void neoscrypt_hmac_update_sha256(sha256_hmac_state *st, const uint8_t *m, size_t mlen) { + /* h(inner || m...) 
*/ + neoscrypt_hash_update_sha256(&st->inner, m, mlen); +} + +static void neoscrypt_hmac_finish_sha256(sha256_hmac_state *st, hash_digest mac) { + /* h(inner || m) */ + hash_digest innerhash; + neoscrypt_hash_finish_sha256(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + neoscrypt_hash_update_sha256(&st->outer, innerhash, sizeof(innerhash)); + neoscrypt_hash_finish_sha256(&st->outer, mac); +} + + +/* PBKDF2 for SHA-256 */ + +static void neoscrypt_pbkdf2_sha256(const uint8_t *password, size_t password_len, + const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *output, size_t output_len) { + sha256_hmac_state hmac_pw, hmac_pw_salt, work; + hash_digest ti, u; + uint8_t be[4]; + uint32_t i, j, k, blocks; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + neoscrypt_hmac_init_sha256(&hmac_pw, password, password_len); + + /* hmac(password, salt...) */ + hmac_pw_salt = hmac_pw; + neoscrypt_hmac_update_sha256(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)output_len + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for(i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + neoscrypt_hmac_update_sha256(&work, be, 4); + neoscrypt_hmac_finish_sha256(&work, ti); + memcpy(u, ti, sizeof(u)); + + /* T[i] = U1 ^ U2 ^ U3... */ + for(j = 0; j < N - 1; j++) { + /* UX = hmac(password, U{X-1}) */ + work = hmac_pw; + neoscrypt_hmac_update_sha256(&work, u, SCRYPT_HASH_DIGEST_SIZE); + neoscrypt_hmac_finish_sha256(&work, u); + + /* T[i] ^= UX */ + for(k = 0; k < sizeof(u); k++) + ti[k] ^= u[k]; + } + + memcpy(output, ti, (output_len > SCRYPT_HASH_DIGEST_SIZE) ? 
SCRYPT_HASH_DIGEST_SIZE : output_len); + output += SCRYPT_HASH_DIGEST_SIZE; + output_len -= SCRYPT_HASH_DIGEST_SIZE; + } +} + + +/* NeoScrypt */ + +#if defined(ASM) + +extern void neoscrypt_salsa(uint *X, uint rounds); +extern void neoscrypt_salsa_tangle(uint *X, uint count); +extern void neoscrypt_chacha(uint *X, uint rounds); + +extern void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len); +extern void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len); +extern void neoscrypt_blkxor(void *dstp, const void *srcp, uint len); + +#else + +/* Salsa20, rounds must be a multiple of 2 */ +static void neoscrypt_salsa(uint *X, uint rounds) { + uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t; + + x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; + x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; + x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; + x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15]; + +#define quarter(a, b, c, d) \ + t = a + d; t = ROTL32(t, 7); b ^= t; \ + t = b + a; t = ROTL32(t, 9); c ^= t; \ + t = c + b; t = ROTL32(t, 13); d ^= t; \ + t = d + c; t = ROTL32(t, 18); a ^= t; + + for(; rounds; rounds -= 2) { + quarter( x0, x4, x8, x12); + quarter( x5, x9, x13, x1); + quarter(x10, x14, x2, x6); + quarter(x15, x3, x7, x11); + quarter( x0, x1, x2, x3); + quarter( x5, x6, x7, x4); + quarter(x10, x11, x8, x9); + quarter(x15, x12, x13, x14); + } + + X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; + X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; + X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; + X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; + +#undef quarter +} + +/* ChaCha20, rounds must be a multiple of 2 */ +static void neoscrypt_chacha(uint *X, uint rounds) { + uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t; + + x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; + x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; + x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; + x12 = X[12]; x13 = X[13]; x14 = X[14]; 
x15 = X[15]; + +#define quarter(a,b,c,d) \ + a += b; t = d ^ a; d = ROTL32(t, 16); \ + c += d; t = b ^ c; b = ROTL32(t, 12); \ + a += b; t = d ^ a; d = ROTL32(t, 8); \ + c += d; t = b ^ c; b = ROTL32(t, 7); + + for(; rounds; rounds -= 2) { + quarter( x0, x4, x8, x12); + quarter( x1, x5, x9, x13); + quarter( x2, x6, x10, x14); + quarter( x3, x7, x11, x15); + quarter( x0, x5, x10, x15); + quarter( x1, x6, x11, x12); + quarter( x2, x7, x8, x13); + quarter( x3, x4, x9, x14); + } + + X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; + X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; + X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; + X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; + +#undef quarter +} + + +/* Fast 32-bit / 64-bit memcpy(); + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + dst[i] = src[i]; + dst[i + 1] = src[i + 1]; + dst[i + 2] = src[i + 2]; + dst[i + 3] = src[i + 3]; + } +} + +/* Fast 32-bit / 64-bit block swapper; + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len) { + ulong *blkA = (ulong *) blkAp; + ulong *blkB = (ulong *) blkBp; + register ulong t0, t1, t2, t3; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + t0 = blkA[i]; + t1 = blkA[i + 1]; + t2 = blkA[i + 2]; + t3 = blkA[i + 3]; + blkA[i] = blkB[i]; + blkA[i + 1] = blkB[i + 1]; + blkA[i + 2] = blkB[i + 2]; + blkA[i + 3] = blkB[i + 3]; + blkB[i] = t0; + blkB[i + 1] = t1; + blkB[i + 2] = t2; + blkB[i + 3] = t3; + } +} + +/* Fast 32-bit / 64-bit block XOR engine; + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkxor(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + dst[i] ^= src[i]; + dst[i + 1] ^= 
src[i + 1]; + dst[i + 2] ^= src[i + 2]; + dst[i + 3] ^= src[i + 3]; + } +} + +#endif + +/* 32-bit / 64-bit optimised memcpy() */ +static void neoscrypt_copy(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i, tail; + + for(i = 0; i < (len / sizeof(ulong)); i++) + dst[i] = src[i]; + + tail = len & (sizeof(ulong) - 1); + if(tail) { + uchar *dstb = (uchar *) dstp; + uchar *srcb = (uchar *) srcp; + + for(i = len - tail; i < len; i++) + dstb[i] = srcb[i]; + } +} + +/* 32-bit / 64-bit optimised memory erase aka memset() to zero */ +static void neoscrypt_erase(void *dstp, uint len) { + const ulong null = 0; + ulong *dst = (ulong *) dstp; + uint i, tail; + + for(i = 0; i < (len / sizeof(ulong)); i++) + dst[i] = null; + + tail = len & (sizeof(ulong) - 1); + if(tail) { + uchar *dstb = (uchar *) dstp; + + for(i = len - tail; i < len; i++) + dstb[i] = (uchar)null; + } +} + +/* 32-bit / 64-bit optimised XOR engine */ +static void neoscrypt_xor(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i, tail; + + for(i = 0; i < (len / sizeof(ulong)); i++) + dst[i] ^= src[i]; + + tail = len & (sizeof(ulong) - 1); + if(tail) { + uchar *dstb = (uchar *) dstp; + uchar *srcb = (uchar *) srcp; + + for(i = len - tail; i < len; i++) + dstb[i] ^= srcb[i]; + } +} + + +/* BLAKE2s */ + +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U +#define BLAKE2S_KEY_SIZE 32U + +/* Parameter block of 32 bytes */ +typedef struct blake2s_param_t { + uchar digest_length; + uchar key_length; + uchar fanout; + uchar depth; + uint leaf_length; + uchar node_offset[6]; + uchar node_depth; + uchar inner_length; + uchar salt[8]; + uchar personal[8]; +} blake2s_param; + +/* State block of 180 bytes */ +typedef struct blake2s_state_t { + uint h[8]; + uint t[2]; + uint f[2]; + uchar buf[2 * BLAKE2S_BLOCK_SIZE]; + uint buflen; +} blake2s_state; + +static const uint blake2s_IV[8] = { + 
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint8_t blake2s_sigma[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , +}; + +static void blake2s_compress(blake2s_state *S, const uint *buf) { + uint i; + uint m[16]; + uint v[16]; + + neoscrypt_copy(m, buf, 64); + neoscrypt_copy(v, S, 32); + + v[ 8] = blake2s_IV[0]; + v[ 9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; + v[11] = blake2s_IV[3]; + v[12] = S->t[0] ^ blake2s_IV[4]; + v[13] = S->t[1] ^ blake2s_IV[5]; + v[14] = S->f[0] ^ blake2s_IV[6]; + v[15] = S->f[1] ^ blake2s_IV[7]; +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2s_sigma[r][2*i+0]]; \ + d = ROTR32(d ^ a, 16); \ + c = c + d; \ + b = ROTR32(b ^ c, 12); \ + a = a + b + m[blake2s_sigma[r][2*i+1]]; \ + d = ROTR32(d ^ a, 8); \ + c = c + d; \ + b = ROTR32(b ^ c, 7); \ + } while(0) +#define ROUND(r) \ + do { \ + G(r, 0, v[ 0], v[ 4], v[ 8], v[12]); \ + G(r, 1, v[ 1], v[ 5], v[ 9], v[13]); \ + G(r, 2, v[ 2], v[ 6], v[10], v[14]); \ + G(r, 3, v[ 3], v[ 7], v[11], v[15]); \ + G(r, 4, v[ 0], v[ 5], v[10], v[15]); \ + G(r, 5, v[ 1], v[ 6], v[11], v[12]); \ + G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \ + G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \ + } while(0) + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + + for (i = 0; i < 8; i++) + S->h[i] = S->h[i] ^ v[i] ^ v[i + 
8]; + +#undef G +#undef ROUND +} + +static void blake2s_update(blake2s_state *S, const uchar *input, uint input_size) { + uint left, fill; + + while(input_size > 0) { + left = S->buflen; + fill = 2 * BLAKE2S_BLOCK_SIZE - left; + if(input_size > fill) { + /* Buffer fill */ + neoscrypt_copy(S->buf + left, input, fill); + S->buflen += fill; + /* Counter increment */ + S->t[0] += BLAKE2S_BLOCK_SIZE; + /* Compress */ + blake2s_compress(S, (uint *) S->buf); + /* Shift buffer left */ + neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, BLAKE2S_BLOCK_SIZE); + S->buflen -= BLAKE2S_BLOCK_SIZE; + input += fill; + input_size -= fill; + } else { + neoscrypt_copy(S->buf + left, input, input_size); + S->buflen += input_size; + /* Do not compress */ + input += input_size; + input_size = 0; + } + } +} + +static void neoscrypt_blake2s(const void *input, const uint input_size, const void *key, const uchar key_size, + void *output, const uchar output_size) { + uchar block[BLAKE2S_BLOCK_SIZE]; + blake2s_param P[1]; + blake2s_state S[1]; + + /* Initialise */ + neoscrypt_erase(P, 32); + P->digest_length = output_size; + P->key_length = key_size; + P->fanout = 1; + P->depth = 1; + + neoscrypt_erase(S, 180); + neoscrypt_copy(S, blake2s_IV, 32); + neoscrypt_xor(S, P, 32); + + neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE); + neoscrypt_copy(block, key, key_size); + blake2s_update(S, (uchar *) block, BLAKE2S_BLOCK_SIZE); + + /* Update */ + blake2s_update(S, (uchar *) input, input_size); + + /* Finish */ + if(S->buflen > BLAKE2S_BLOCK_SIZE) { + S->t[0] += BLAKE2S_BLOCK_SIZE; + blake2s_compress(S, (uint *) S->buf); + S->buflen -= BLAKE2S_BLOCK_SIZE; + neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, S->buflen); + } + S->t[0] += S->buflen; + S->f[0] = ~0U; + neoscrypt_erase(S->buf + S->buflen, 2 * BLAKE2S_BLOCK_SIZE - S->buflen); + blake2s_compress(S, (uint *) S->buf); + + /* Write back */ + neoscrypt_copy(output, S, output_size); +// for (int k = 0; k<4; k++) { printf("cpu blake %d %08x 
%08x\n", k, ((unsigned int*)output)[2 * k], ((unsigned int*)output)[2 * k + 1]); } + +} + + +#define FASTKDF_BUFFER_SIZE 256U + +/* FastKDF, a fast buffered key derivation function: + * FASTKDF_BUFFER_SIZE must be a power of 2; + * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; + * prf_output_size must be <= prf_key_size; */ +static void neoscrypt_fastkdf(const uchar *password, uint password_len, const uchar *salt, uint salt_len, + uint N, uchar *output, uint output_len) { +// for (int i = 0; i<10; i++) { printf("cpu password %d %08x %08x\n", i, ((unsigned int*)password)[2 * i], ((unsigned int*)password)[2 * i+1]); } + const uint stack_align = 0x40; + const uint kdf_buf_size = 256U; //FASTKDF_BUFFER_SIZE + const uint prf_input_size = 64U; //BLAKE2S_BLOCK_SIZE + const uint prf_key_size = 32U; //BLAKE2S_KEY_SIZE + const uint prf_output_size = 32U; //BLAKE2S_OUT_SIZE + uint bufptr, a, b, i, j; + uchar *A, *B, *prf_input, *prf_key, *prf_output; + uchar *stack; + stack = (uchar*)malloc(sizeof(uchar) * 2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align); + if(stack == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } + /* Align and set up the buffers in stack */ + //uchar stack[2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align]; + + A = &stack[stack_align & ~(stack_align - 1)]; + B = &A[kdf_buf_size + prf_input_size]; + prf_output = &A[2 * kdf_buf_size + prf_input_size + prf_key_size]; + + /* Initialise the password buffer */ + if(password_len > kdf_buf_size) + password_len = kdf_buf_size; + + a = kdf_buf_size / password_len; + for(i = 0; i < a; i++) + neoscrypt_copy(&A[i * password_len], &password[0], password_len); + b = kdf_buf_size - a * password_len; + if(b) + neoscrypt_copy(&A[a * password_len], &password[0], b); + neoscrypt_copy(&A[kdf_buf_size], &password[0], prf_input_size); + + /* Initialise the salt buffer */ + if(salt_len > kdf_buf_size) + salt_len = 
kdf_buf_size; + + a = kdf_buf_size / salt_len; + for(i = 0; i < a; i++) + neoscrypt_copy(&B[i * salt_len], &salt[0], salt_len); + b = kdf_buf_size - a * salt_len; + if(b) + neoscrypt_copy(&B[a * salt_len], &salt[0], b); + neoscrypt_copy(&B[kdf_buf_size], &salt[0], prf_key_size); + + /* The primary iteration */ + for(i = 0, bufptr = 0; i < N; i++) { + + /* Map the PRF input buffer */ + prf_input = &A[bufptr]; + + /* Map the PRF key buffer */ + prf_key = &B[bufptr]; + + /* PRF */ + +// for (int k = 0; k<(prf_input_size/4); k++) { printf("cpu bufptr %08x before blake %d %d %08x \n",bufptr, i, k, ((unsigned int*)prf_input)[k]); } + neoscrypt_blake2s(prf_input, prf_input_size, prf_key, prf_key_size, prf_output, prf_output_size); + //for (int k = 0; k<(prf_output_size/4); k++) { printf("cpu after blake %d %d %08x \n", i, k, ((unsigned int*)prf_output)[k]); } + + /* Calculate the next buffer pointer */ + for(j = 0, bufptr = 0; j < prf_output_size; j++) + bufptr += prf_output[j]; + bufptr &= (kdf_buf_size - 1); + + /* Modify the salt buffer */ + neoscrypt_xor(&B[bufptr], &prf_output[0], prf_output_size); + + /* Head modified, tail updated */ + if(bufptr < prf_key_size) + neoscrypt_copy(&B[kdf_buf_size + bufptr], &B[bufptr], MIN(prf_output_size, prf_key_size - bufptr)); + + /* Tail modified, head updated */ + if((kdf_buf_size - bufptr) < prf_output_size) + neoscrypt_copy(&B[0], &B[kdf_buf_size], prf_output_size - (kdf_buf_size - bufptr)); + + } + + /* Modify and copy into the output buffer */ + if(output_len > kdf_buf_size) + output_len = kdf_buf_size; + + a = kdf_buf_size - bufptr; + if(a >= output_len) { + neoscrypt_xor(&B[bufptr], &A[0], output_len); + neoscrypt_copy(&output[0], &B[bufptr], output_len); + } else { + neoscrypt_xor(&B[bufptr], &A[0], a); + neoscrypt_xor(&B[0], &A[a], output_len - a); + neoscrypt_copy(&output[0], &B[bufptr], a); + neoscrypt_copy(&output[a], &B[0], output_len - a); + } +// for (int i = 0; i<10; i++) { printf("cpu fastkdf %d %08x %08x\n", i, 
((unsigned int*)output)[2 * i], ((unsigned int*)output)[2 * i + 1]); } + +} + + +/* Configurable optimised block mixer */ +static void neoscrypt_blkmix(uint *X, uint *Y, uint r, uint mixmode) { + uint i, mixer, rounds; + + mixer = mixmode >> 8; + rounds = mixmode & 0xFF; + + /* NeoScrypt flow: Scrypt flow: + Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; + Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; + Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya; + Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; + Xa" = Ya; Xb" = Yc; + Xc" = Yb; Xd" = Yd; */ + + if(r == 1) { + neoscrypt_blkxor(&X[0], &X[16], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[0], rounds); + else + neoscrypt_salsa(&X[0], rounds); + neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16], rounds); + else + neoscrypt_salsa(&X[16], rounds); + return; + } + + if(r == 2) { + neoscrypt_blkxor(&X[0], &X[48], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[0], rounds); + else + neoscrypt_salsa(&X[0], rounds); + neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16], rounds); + else + neoscrypt_salsa(&X[16], rounds); + neoscrypt_blkxor(&X[32], &X[16], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[32], rounds); + else + neoscrypt_salsa(&X[32], rounds); + neoscrypt_blkxor(&X[48], &X[32], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[48], rounds); + else + neoscrypt_salsa(&X[48], rounds); + neoscrypt_blkswp(&X[16], &X[32], SCRYPT_BLOCK_SIZE); + return; + } + + /* Reference code for any reasonable r */ + for(i = 0; i < 2 * r; i++) { + if(i) neoscrypt_blkxor(&X[16 * i], &X[16 * (i - 1)], SCRYPT_BLOCK_SIZE); + else neoscrypt_blkxor(&X[0], &X[16 * (2 * r - 1)], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16 * i], rounds); + else + neoscrypt_salsa(&X[16 * i], rounds); + neoscrypt_blkcpy(&Y[16 * i], &X[16 * i], SCRYPT_BLOCK_SIZE); + } + for(i = 0; i < r; i++) + neoscrypt_blkcpy(&X[16 * i], &Y[16 * 2 * i], 
SCRYPT_BLOCK_SIZE); + for(i = 0; i < r; i++) + neoscrypt_blkcpy(&X[16 * (i + r)], &Y[16 * (2 * i + 1)], SCRYPT_BLOCK_SIZE); +} + +/* NeoScrypt core engine: + * p = 1, salt = password; + * Basic customisation (required): + * profile bit 0: + * 0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20; + * 1 = Scrypt(1024, 1, 1) with Salsa20/8; + * profile bits 4 to 1: + * 0000 = FastKDF-BLAKE2s; + * 0001 = PBKDF2-HMAC-SHA256; + * Extended customisation (optional): + * profile bit 31: + * 0 = extended customisation absent; + * 1 = extended customisation present; + * profile bits 7 to 5 (rfactor): + * 000 = r of 1; + * 001 = r of 2; + * 010 = r of 4; + * ... + * 111 = r of 128; + * profile bits 12 to 8 (Nfactor): + * 00000 = N of 2; + * 00001 = N of 4; + * 00010 = N of 8; + * ..... + * 00110 = N of 128; + * ..... + * 01001 = N of 1024; + * ..... + * 11110 = N of 2147483648; + * profile bits 30 to 13 are reserved */ +void neoscrypt(const uchar *password, uchar *output, uint profile) { + uint N = 128, r = 2, dblmix = 1, mixmode = 0x14, stack_align = 0x40; + uint kdf, i, j; + uint *X, *Y, *Z, *V; + + if(profile & 0x1) { + N = 1024; /* N = (1 << (Nfactor + 1)); */ + r = 1; /* r = (1 << rfactor); */ + dblmix = 0; /* Salsa only */ + mixmode = 0x08; /* 8 rounds */ + } + + if(profile >> 31) { + N = (1 << (((profile >> 8) & 0x1F) + 1)); + r = (1 << ((profile >> 5) & 0x7)); + } + uchar *stack; + stack = (uchar*)malloc(((N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align)*sizeof(uchar)); + if(stack == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } + /* X = r * 2 * SCRYPT_BLOCK_SIZE */ + X = (uint *) &stack[stack_align & ~(stack_align - 1)]; + /* Z is a copy of X for ChaCha */ + Z = &X[32 * r]; + /* Y is an X sized temporal space */ + Y = &X[64 * r]; + /* V = N * r * 2 * SCRYPT_BLOCK_SIZE */ + V = &X[96 * r]; + + /* X = KDF(password, salt) */ + kdf = (profile >> 1) & 0xF; + + switch(kdf) { + + default: + case(0x0): + neoscrypt_fastkdf(password, 80, password, 
80, 32, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + break; + + case(0x1): + neoscrypt_pbkdf2_sha256(password, 80, password, 80, 1, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + break; + + } + + /* Process ChaCha 1st, Salsa 2nd and XOR them into FastKDF; otherwise Salsa only */ + + if(dblmix) { + /* blkcpy(Z, X) */ + neoscrypt_blkcpy(&Z[0], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + + /* Z = SMix(Z) */ + for(i = 0; i < N; i++) { + /* blkcpy(V, Z) */ + neoscrypt_blkcpy(&V[i * (32 * r)], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(Z, Y) */ + neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); + } + for(i = 0; i < N; i++) { + /* integerify(Z) mod N */ + j = (32 * r) * (Z[16 * (2 * r - 1)] & (N - 1)); + /* blkxor(Z, V) */ + neoscrypt_blkxor(&Z[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(Z, Y) */ + neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); + } + } + +#if (ASM) + /* Must be called before and after SSE2 Salsa */ + neoscrypt_salsa_tangle(&X[0], r * 2); +#endif + + /* X = SMix(X) */ + for(i = 0; i < N; i++) { + /* blkcpy(V, X) */ + neoscrypt_blkcpy(&V[i * (32 * r)], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(X, Y) */ + neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); + } + for(i = 0; i < N; i++) { + /* integerify(X) mod N */ + j = (32 * r) * (X[16 * (2 * r - 1)] & (N - 1)); + /* blkxor(X, V) */ + neoscrypt_blkxor(&X[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(X, Y) */ + neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); + } + +#if (ASM) + neoscrypt_salsa_tangle(&X[0], r * 2); +#endif + + if(dblmix) + /* blkxor(X, Z) */ + neoscrypt_blkxor(&X[0], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + + /* output = KDF(password, X) */ + switch(kdf) { + + default: + case(0x0): + neoscrypt_fastkdf(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 32, output, 32); + break; + + case(0x1): + neoscrypt_pbkdf2_sha256(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 1, output, 32); + break; + + } + free(stack); +} + diff --git a/sph/neoscrypt.h b/sph/neoscrypt.h new file mode 
100644 index 0000000000..aec9d541df --- /dev/null +++ b/sph/neoscrypt.h @@ -0,0 +1,27 @@ +void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile); + +#define SCRYPT_BLOCK_SIZE 64 +#define SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 32 + +typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +#ifndef ROTL32 +#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) +#endif +#ifndef ROTR32 +#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) +#endif + +#define U8TO32_BE(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]))) + +#define U32TO8_BE(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +#define U64TO8_BE(p, v) \ + U32TO8_BE((p), (uint32_t)((v) >> 32)); \ + U32TO8_BE((p) + 4, (uint32_t)((v) )); + diff --git a/sph/sha2.c b/sph/sha2.c index 46f0e5bee3..5780c76934 100644 --- a/sph/sha2.c +++ b/sph/sha2.c @@ -8,7 +8,6 @@ * any later version. See COPYING for more details. 
*/ -#include "cpuminer-config.h" #include "miner.h" #include @@ -462,169 +461,3 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W, } #endif /* EXTERN_SHA256 */ - -#if HAVE_SHA256_4WAY - -void sha256d_ms_4way(uint32_t *hash, uint32_t *data, - const uint32_t *midstate, const uint32_t *prehash); - -static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[4 * 64] __attribute__((aligned(128))); - uint32_t hash[4 * 8] __attribute__((aligned(32))); - uint32_t midstate[4 * 8] __attribute__((aligned(32))); - uint32_t prehash[4 * 8] __attribute__((aligned(32))); - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - int i, j; - - memcpy(data, pdata + 16, 64); - sha256d_preextend(data); - for (i = 31; i >= 0; i--) - for (j = 0; j < 4; j++) - data[i * 4 + j] = data[i]; - - sha256_init(midstate); - sha256_transform(midstate, pdata, 0); - memcpy(prehash, midstate, 32); - sha256d_prehash(prehash, pdata + 16); - for (i = 7; i >= 0; i--) { - for (j = 0; j < 4; j++) { - midstate[i * 4 + j] = midstate[i]; - prehash[i * 4 + j] = prehash[i]; - } - } - - do { - for (i = 0; i < 4; i++) - data[4 * 3 + i] = ++n; - - sha256d_ms_4way(hash, data, midstate, prehash); - - for (i = 0; i < 4; i++) { - if (swab32(hash[4 * 7 + i]) <= Htarg) { - pdata[19] = data[4 * 3 + i]; - sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { - *hashes_done = n - first_nonce + 1; - return 1; - } - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -#endif /* HAVE_SHA256_4WAY */ - -#if HAVE_SHA256_8WAY - -void sha256d_ms_8way(uint32_t *hash, uint32_t *data, - const uint32_t *midstate, const uint32_t *prehash); - -static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long 
*hashes_done) -{ - uint32_t data[8 * 64] __attribute__((aligned(128))); - uint32_t hash[8 * 8] __attribute__((aligned(32))); - uint32_t midstate[8 * 8] __attribute__((aligned(32))); - uint32_t prehash[8 * 8] __attribute__((aligned(32))); - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - int i, j; - - memcpy(data, pdata + 16, 64); - sha256d_preextend(data); - for (i = 31; i >= 0; i--) - for (j = 0; j < 8; j++) - data[i * 8 + j] = data[i]; - - sha256_init(midstate); - sha256_transform(midstate, pdata, 0); - memcpy(prehash, midstate, 32); - sha256d_prehash(prehash, pdata + 16); - for (i = 7; i >= 0; i--) { - for (j = 0; j < 8; j++) { - midstate[i * 8 + j] = midstate[i]; - prehash[i * 8 + j] = prehash[i]; - } - } - - do { - for (i = 0; i < 8; i++) - data[8 * 3 + i] = ++n; - - sha256d_ms_8way(hash, data, midstate, prehash); - - for (i = 0; i < 8; i++) { - if (swab32(hash[8 * 7 + i]) <= Htarg) { - pdata[19] = data[8 * 3 + i]; - sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { - *hashes_done = n - first_nonce + 1; - return 1; - } - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -#endif /* HAVE_SHA256_8WAY */ - -int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[64] /* __attribute__((aligned(128))) */; - uint32_t hash[8] /* __attribute__((aligned(32))) */; - uint32_t midstate[8] /* __attribute__((aligned(32))) */; - uint32_t prehash[8] /* __attribute__((aligned(32))) */; - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - -#if HAVE_SHA256_8WAY - if (sha256_use_8way()) - return scanhash_sha256d_8way(thr_id, pdata, ptarget, - max_nonce, hashes_done); -#endif -#if HAVE_SHA256_4WAY - if (sha256_use_4way()) - return scanhash_sha256d_4way(thr_id, pdata, ptarget, - 
max_nonce, hashes_done); -#endif - - memcpy(data, pdata + 16, 64); - sha256d_preextend(data); - - sha256_init(midstate); - sha256_transform(midstate, pdata, 0); - memcpy(prehash, midstate, 32); - sha256d_prehash(prehash, pdata + 16); - - do { - data[3] = ++n; - sha256d_ms(hash, data, midstate, prehash); - if (swab32(hash[7]) <= Htarg) { - pdata[19] = data[3]; - sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { - *hashes_done = n - first_nonce + 1; - return 1; - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} diff --git a/sph/sha256_Y.c b/sph/sha256_Y.c new file mode 100644 index 0000000000..d17cbe2c7a --- /dev/null +++ b/sph/sha256_Y.c @@ -0,0 +1,418 @@ +/*- + * Copyright 2005,2007,2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "sph/sysendian.h" + +#include "sph/sha256_Y.h" + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static void +be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + be32enc(dst + i * 4, src[i]); +} + +/* + * Decode a big-endian length len vector of (unsigned char) into a length + * len/4 vector of (uint32_t). Assumes len is a multiple of 4. 
+ */ +static void +be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + dst[i] = be32dec(src + i * 4); +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, k) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + k) + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA256_Transform(uint32_t * state, const unsigned char block[64]) +{ + uint32_t W[64]; + uint32_t S[8]; + uint32_t t0, t1; + int i; + /* 1. Prepare message schedule W. */ + be32dec_vect(W, block, 64); + + for (i = 16; i < 64; i++) + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. 
*/ + RNDr(S, W, 0, 0x428a2f98); + RNDr(S, W, 1, 0x71374491); + RNDr(S, W, 2, 0xb5c0fbcf); + RNDr(S, W, 3, 0xe9b5dba5); + RNDr(S, W, 4, 0x3956c25b); + RNDr(S, W, 5, 0x59f111f1); + RNDr(S, W, 6, 0x923f82a4); + RNDr(S, W, 7, 0xab1c5ed5); + RNDr(S, W, 8, 0xd807aa98); + RNDr(S, W, 9, 0x12835b01); + RNDr(S, W, 10, 0x243185be); + RNDr(S, W, 11, 0x550c7dc3); + RNDr(S, W, 12, 0x72be5d74); + RNDr(S, W, 13, 0x80deb1fe); + RNDr(S, W, 14, 0x9bdc06a7); + RNDr(S, W, 15, 0xc19bf174); + RNDr(S, W, 16, 0xe49b69c1); + RNDr(S, W, 17, 0xefbe4786); + RNDr(S, W, 18, 0x0fc19dc6); + RNDr(S, W, 19, 0x240ca1cc); + RNDr(S, W, 20, 0x2de92c6f); + RNDr(S, W, 21, 0x4a7484aa); + RNDr(S, W, 22, 0x5cb0a9dc); + RNDr(S, W, 23, 0x76f988da); + RNDr(S, W, 24, 0x983e5152); + RNDr(S, W, 25, 0xa831c66d); + RNDr(S, W, 26, 0xb00327c8); + RNDr(S, W, 27, 0xbf597fc7); + RNDr(S, W, 28, 0xc6e00bf3); + RNDr(S, W, 29, 0xd5a79147); + RNDr(S, W, 30, 0x06ca6351); + RNDr(S, W, 31, 0x14292967); + RNDr(S, W, 32, 0x27b70a85); + RNDr(S, W, 33, 0x2e1b2138); + RNDr(S, W, 34, 0x4d2c6dfc); + RNDr(S, W, 35, 0x53380d13); + RNDr(S, W, 36, 0x650a7354); + RNDr(S, W, 37, 0x766a0abb); + RNDr(S, W, 38, 0x81c2c92e); + RNDr(S, W, 39, 0x92722c85); + RNDr(S, W, 40, 0xa2bfe8a1); + RNDr(S, W, 41, 0xa81a664b); + RNDr(S, W, 42, 0xc24b8b70); + RNDr(S, W, 43, 0xc76c51a3); + RNDr(S, W, 44, 0xd192e819); + RNDr(S, W, 45, 0xd6990624); + RNDr(S, W, 46, 0xf40e3585); + RNDr(S, W, 47, 0x106aa070); + RNDr(S, W, 48, 0x19a4c116); + RNDr(S, W, 49, 0x1e376c08); + RNDr(S, W, 50, 0x2748774c); + RNDr(S, W, 51, 0x34b0bcb5); + RNDr(S, W, 52, 0x391c0cb3); + RNDr(S, W, 53, 0x4ed8aa4a); + RNDr(S, W, 54, 0x5b9cca4f); + RNDr(S, W, 55, 0x682e6ff3); + RNDr(S, W, 56, 0x748f82ee); + RNDr(S, W, 57, 0x78a5636f); + RNDr(S, W, 58, 0x84c87814); + RNDr(S, W, 59, 0x8cc70208); + RNDr(S, W, 60, 0x90befffa); + RNDr(S, W, 61, 0xa4506ceb); + RNDr(S, W, 62, 0xbef9a3f7); + RNDr(S, W, 63, 0xc67178f2); + + /* 4. 
Mix local working variables into global state */ + for (i = 0; i < 8; i++) { + state[i] += S[i]; + +} + + /* Clean the stack. */ + memset(W, 0, 256); + memset(S, 0, 32); + t0 = t1 = 0; +} + +static unsigned char PAD[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Add padding and terminating bit-count. */ +static void +SHA256_Pad(SHA256_CTX_Y * ctx) +{ + unsigned char len[8]; + uint32_t r, plen; + + /* + * Convert length to a vector of bytes -- we do this now rather + * than later because the length will change after we pad. + */ + be32enc_vect(len, ctx->count, 8); + + /* Add 1--64 bytes so that the resulting length is 56 mod 64 */ + r = (ctx->count[1] >> 3) & 0x3f; + plen = (r < 56) ? (56 - r) : (120 - r); + SHA256_Update_Y(ctx, PAD, (size_t)plen); + + /* Add the terminating bit-count */ + SHA256_Update_Y(ctx, len, 8); +} + +/* SHA-256 initialization. Begins a SHA-256 operation. 
*/ +void +SHA256_Init_Y(SHA256_CTX_Y * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x6A09E667; + ctx->state[1] = 0xBB67AE85; + ctx->state[2] = 0x3C6EF372; + ctx->state[3] = 0xA54FF53A; + ctx->state[4] = 0x510E527F; + ctx->state[5] = 0x9B05688C; + ctx->state[6] = 0x1F83D9AB; + ctx->state[7] = 0x5BE0CD19; +} + +/* Add bytes into the hash */ +void +SHA256_Update_Y(SHA256_CTX_Y * ctx, const void *in, size_t len) +{ + uint32_t bitlen[2]; + uint32_t r; + const unsigned char *src = in; + + /* Number of bytes left in the buffer from previous updates */ + r = (ctx->count[1] >> 3) & 0x3f; + + /* Convert the length into a number of bits */ + bitlen[1] = ((uint32_t)len) << 3; + bitlen[0] = (uint32_t)(len >> 29); + + /* Update number of bits */ + if ((ctx->count[1] += bitlen[1]) < bitlen[1]) + ctx->count[0]++; + ctx->count[0] += bitlen[0]; + + /* Handle the case where we don't need to perform any transforms */ + if (len < 64 - r) { + + memcpy(&ctx->buf[r], src, len); + return; + } + + /* Finish the current block */ + memcpy(&ctx->buf[r], src, 64 - r); + + SHA256_Transform(ctx->state, ctx->buf); + src += 64 - r; + len -= 64 - r; + + /* Perform complete blocks */ + + while (len >= 64) { + SHA256_Transform(ctx->state, src); + src += 64; + len -= 64; + } + + /* Copy left over data into buffer */ + memcpy(ctx->buf, src, len); +} + +/* + * SHA-256 finalization. Pads the input data, exports the hash value, + * and clears the context state. + */ +void +SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx) +{ + /* Add padding */ + SHA256_Pad(ctx); + + /* Write the hash */ + be32enc_vect(digest, ctx->state, 32); + + /* Clear the context state */ + memset((void *)ctx, 0, sizeof(*ctx)); +} + +/* Initialize an HMAC-SHA256 operation with the given key. 
*/ +void +HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen) +{ + unsigned char pad[64]; + unsigned char khash[32]; + const unsigned char * K = _K; + size_t i; + + /* If Klen > 64, the key is really SHA256(K). */ + if (Klen > 64) { + SHA256_Init_Y(&ctx->ictx); + SHA256_Update_Y(&ctx->ictx, K, Klen); + SHA256_Final_Y(khash, &ctx->ictx); + K = khash; + Klen = 32; + } + + /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ + SHA256_Init_Y(&ctx->ictx); + memset(pad, 0x36, 64); + for (i = 0; i < Klen; i++) { + pad[i] ^= K[i]; + } + SHA256_Update_Y(&ctx->ictx, pad, 64); + + /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ + SHA256_Init_Y(&ctx->octx); + memset(pad, 0x5c, 64); + for (i = 0; i < Klen; i++) + { + pad[i] ^= K[i]; + } + SHA256_Update_Y(&ctx->octx, pad, 64); + + /* Clean the stack. */ + memset(khash, 0, 32); +} + +/* Add bytes to the HMAC-SHA256 operation. */ +void +HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len) +{ + /* Feed data to the inner SHA256 operation. */ + SHA256_Update_Y(&ctx->ictx, in, len); +} + +/* Finish an HMAC-SHA256 operation. */ +void +HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx) +{ + unsigned char ihash[32]; + + /* Finish the inner SHA256 operation. */ + SHA256_Final_Y(ihash, &ctx->ictx); + + /* Feed the inner hash to the outer SHA256 operation. */ + SHA256_Update_Y(&ctx->octx, ihash, 32); + + /* Finish the outer SHA256 operation. */ + SHA256_Final_Y(digest, &ctx->octx); + + /* Clean the stack. */ + memset(ihash, 0, 32); +} + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). 
+ */ + +void +PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, +size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) +{ + HMAC_SHA256_CTX_Y PShctx, hctx; + size_t i; + uint8_t ivec[4]; + uint8_t U[32]; + uint8_t T[32]; + uint64_t j; + int k; + size_t clen; + + /* Compute HMAC state after processing P and S. */ + HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen); + HMAC_SHA256_Update_Y(&PShctx, salt, saltlen); + + /* Iterate through the blocks. */ + for (i = 0; i * 32 < dkLen; i++) { + /* Generate INT(i + 1). */ + be32enc(ivec, (uint32_t)(i + 1)); + + /* Compute U_1 = PRF(P, S || INT(i)). */ + memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y)); + HMAC_SHA256_Update_Y(&hctx, ivec, 4); + HMAC_SHA256_Final_Y(U, &hctx); + + /* T_i = U_1 ... */ + memcpy(T, U, 32); + + for (j = 2; j <= c; j++) { + /* Compute U_j. */ + HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen); + HMAC_SHA256_Update_Y(&hctx, U, 32); + HMAC_SHA256_Final_Y(U, &hctx); + + /* ... xor U_j ... */ + for (k = 0; k < 32; k++) + T[k] ^= U[k]; + } + + /* Copy as many bytes as necessary into buf. */ + clen = dkLen - i * 32; + if (clen > 32) + clen = 32; + memcpy(&buf[i * 32], T, clen); + } + + /* Clean PShctx, since we never called _Final on it. */ + memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y)); +} diff --git a/sph/sha256_Y.h b/sph/sha256_Y.h new file mode 100644 index 0000000000..e97b81ba21 --- /dev/null +++ b/sph/sha256_Y.h @@ -0,0 +1,63 @@ +/*- + * Copyright 2005,2007,2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/lib/libmd/sha256_Y.h,v 1.2 2006/01/17 15:35:56 phk Exp $ + */ + +#ifndef _SHA256_H_ +#define _SHA256_H_ + +#include + +#include + +typedef struct SHA256Context { + uint32_t state[8]; + uint32_t count[2]; + unsigned char buf[64]; +} SHA256_CTX_Y; + +typedef struct HMAC_SHA256Context { + SHA256_CTX_Y ictx; + SHA256_CTX_Y octx; +} HMAC_SHA256_CTX_Y; + +void SHA256_Init_Y(SHA256_CTX_Y *); +void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t); +void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *); +void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t); +void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t); +void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *); + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). 
+ */ +void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t, + uint64_t, uint8_t *, size_t); + + +#endif /* !_SHA256_H_ */ diff --git a/sph/shabal.c b/sph/shabal.c index 06d368ce54..46fe962eaf 100644 --- a/sph/shabal.c +++ b/sph/shabal.c @@ -386,7 +386,7 @@ extern "C"{ if ((Wlow = T32(Wlow + 1)) == 0) \ Whigh = T32(Whigh + 1); \ } while (0) -#if 0 + static const sph_u32 A_init_192[] = { C32(0xFD749ED4), C32(0xB798E530), C32(0x33904B6F), C32(0x46BDA85E), C32(0x076934B4), C32(0x454B4058), C32(0x77F74527), C32(0xFB4CF465), @@ -466,7 +466,7 @@ static const sph_u32 C_init_384[] = { C32(0x6E6E36DC), C32(0x63130DAD), C32(0xA9C69BD6), C32(0x1E90EA0C), C32(0x7C35073B), C32(0x28D95E6D), C32(0xAA340E0D), C32(0xCB3DEE70) }; -#endif + static const sph_u32 A_init_512[] = { C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632), C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B), @@ -500,7 +500,6 @@ shabal_init(void *cc, unsigned size) sph_shabal_context *sc; switch (size) { -#if 0 case 192: A_init = A_init_192; B_init = B_init_192; @@ -521,7 +520,6 @@ shabal_init(void *cc, unsigned size) B_init = B_init_384; C_init = C_init_384; break; -#endif case 512: A_init = A_init_512; B_init = B_init_512; @@ -662,7 +660,6 @@ shabal_close(void *cc, unsigned ub, unsigned n, void *dst, unsigned size_words) memcpy(dst, u.tmp_out + (sizeof u.tmp_out) - out_len, out_len); shabal_init(sc, size_words << 5); } - #if 0 /* see sph_shabal.h */ void @@ -720,6 +717,7 @@ sph_shabal224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) shabal_close(cc, ub, n, dst, 7); } +#endif /* see sph_shabal.h */ void sph_shabal256_init(void *cc) @@ -748,6 +746,7 @@ sph_shabal256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) shabal_close(cc, ub, n, dst, 8); } +#if 0 /* see sph_shabal.h */ void sph_shabal384_init(void *cc) @@ -775,7 +774,6 @@ sph_shabal384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) { shabal_close(cc, ub, n, 
dst, 12); } - #endif /* see sph_shabal.h */ diff --git a/sph/sph_sha2.c b/sph/sph_sha2.c new file mode 100644 index 0000000000..0a7e0c3275 --- /dev/null +++ b/sph/sph_sha2.c @@ -0,0 +1,693 @@ +/* $Id: sha2.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * SHA-224 / SHA-256 implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_sha2.h" + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHA2 +#define SPH_SMALL_FOOTPRINT_SHA2 1 +#endif + +#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) +#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X))) + +#define ROTR SPH_ROTR32 + +#define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define BSG2_1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define SSG2_0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SPH_T32((x) >> 3)) +#define SSG2_1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SPH_T32((x) >> 10)) + +static const sph_u32 H224[8] = { + SPH_C32(0xC1059ED8), SPH_C32(0x367CD507), SPH_C32(0x3070DD17), + SPH_C32(0xF70E5939), SPH_C32(0xFFC00B31), SPH_C32(0x68581511), + SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4) +}; + +static const sph_u32 H256[8] = { + SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), SPH_C32(0x3C6EF372), + SPH_C32(0xA54FF53A), SPH_C32(0x510E527F), SPH_C32(0x9B05688C), + SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19) +}; + +/* + * The SHA2_ROUND_BODY defines the body for a SHA-224 / SHA-256 + * compression function implementation. The "in" parameter should + * evaluate, when applied to a numerical input parameter from 0 to 15, + * to an expression which yields the corresponding input block. The "r" + * parameter should evaluate to an array or pointer expression + * designating the array of 8 words which contains the input and output + * of the compression function. 
+ */ + +#if SPH_SMALL_FOOTPRINT_SHA2 + +static const sph_u32 K[64] = { + SPH_C32(0x428A2F98), SPH_C32(0x71374491), + SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5), + SPH_C32(0x3956C25B), SPH_C32(0x59F111F1), + SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5), + SPH_C32(0xD807AA98), SPH_C32(0x12835B01), + SPH_C32(0x243185BE), SPH_C32(0x550C7DC3), + SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE), + SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174), + SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786), + SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC), + SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA), + SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA), + SPH_C32(0x983E5152), SPH_C32(0xA831C66D), + SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7), + SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147), + SPH_C32(0x06CA6351), SPH_C32(0x14292967), + SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138), + SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13), + SPH_C32(0x650A7354), SPH_C32(0x766A0ABB), + SPH_C32(0x81C2C92E), SPH_C32(0x92722C85), + SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B), + SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3), + SPH_C32(0xD192E819), SPH_C32(0xD6990624), + SPH_C32(0xF40E3585), SPH_C32(0x106AA070), + SPH_C32(0x19A4C116), SPH_C32(0x1E376C08), + SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5), + SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A), + SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3), + SPH_C32(0x748F82EE), SPH_C32(0x78A5636F), + SPH_C32(0x84C87814), SPH_C32(0x8CC70208), + SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB), + SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2) +}; + +#define SHA2_MEXP1(in, pc) do { \ + W[pc] = in(pc); \ + } while (0) + +#define SHA2_MEXP2(in, pc) do { \ + W[(pc) & 0x0F] = SPH_T32(SSG2_1(W[((pc) - 2) & 0x0F]) \ + + W[((pc) - 7) & 0x0F] \ + + SSG2_0(W[((pc) - 15) & 0x0F]) + W[(pc) & 0x0F]); \ + } while (0) + +#define SHA2_STEPn(n, a, b, c, d, e, f, g, h, in, pc) do { \ + sph_u32 t1, t2; \ + SHA2_MEXP ## n(in, pc); \ + t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \ + + K[pcount + (pc)] + W[(pc) & 0x0F]); \ + t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \ + d = 
SPH_T32(d + t1); \ + h = SPH_T32(t1 + t2); \ + } while (0) + +#define SHA2_STEP1(a, b, c, d, e, f, g, h, in, pc) \ + SHA2_STEPn(1, a, b, c, d, e, f, g, h, in, pc) +#define SHA2_STEP2(a, b, c, d, e, f, g, h, in, pc) \ + SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc) + +#define SHA2_ROUND_BODY(in, r) do { \ + sph_u32 A, B, C, D, E, F, G, H; \ + sph_u32 W[16]; \ + unsigned pcount; \ + \ + A = (r)[0]; \ + B = (r)[1]; \ + C = (r)[2]; \ + D = (r)[3]; \ + E = (r)[4]; \ + F = (r)[5]; \ + G = (r)[6]; \ + H = (r)[7]; \ + pcount = 0; \ + SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \ + SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \ + SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \ + SHA2_STEP1(F, G, H, A, B, C, D, E, in, 3); \ + SHA2_STEP1(E, F, G, H, A, B, C, D, in, 4); \ + SHA2_STEP1(D, E, F, G, H, A, B, C, in, 5); \ + SHA2_STEP1(C, D, E, F, G, H, A, B, in, 6); \ + SHA2_STEP1(B, C, D, E, F, G, H, A, in, 7); \ + SHA2_STEP1(A, B, C, D, E, F, G, H, in, 8); \ + SHA2_STEP1(H, A, B, C, D, E, F, G, in, 9); \ + SHA2_STEP1(G, H, A, B, C, D, E, F, in, 10); \ + SHA2_STEP1(F, G, H, A, B, C, D, E, in, 11); \ + SHA2_STEP1(E, F, G, H, A, B, C, D, in, 12); \ + SHA2_STEP1(D, E, F, G, H, A, B, C, in, 13); \ + SHA2_STEP1(C, D, E, F, G, H, A, B, in, 14); \ + SHA2_STEP1(B, C, D, E, F, G, H, A, in, 15); \ + for (pcount = 16; pcount < 64; pcount += 16) { \ + SHA2_STEP2(A, B, C, D, E, F, G, H, in, 0); \ + SHA2_STEP2(H, A, B, C, D, E, F, G, in, 1); \ + SHA2_STEP2(G, H, A, B, C, D, E, F, in, 2); \ + SHA2_STEP2(F, G, H, A, B, C, D, E, in, 3); \ + SHA2_STEP2(E, F, G, H, A, B, C, D, in, 4); \ + SHA2_STEP2(D, E, F, G, H, A, B, C, in, 5); \ + SHA2_STEP2(C, D, E, F, G, H, A, B, in, 6); \ + SHA2_STEP2(B, C, D, E, F, G, H, A, in, 7); \ + SHA2_STEP2(A, B, C, D, E, F, G, H, in, 8); \ + SHA2_STEP2(H, A, B, C, D, E, F, G, in, 9); \ + SHA2_STEP2(G, H, A, B, C, D, E, F, in, 10); \ + SHA2_STEP2(F, G, H, A, B, C, D, E, in, 11); \ + SHA2_STEP2(E, F, G, H, A, B, C, D, in, 12); \ + SHA2_STEP2(D, E, F, G, H, A, B, C, in, 13); 
\ + SHA2_STEP2(C, D, E, F, G, H, A, B, in, 14); \ + SHA2_STEP2(B, C, D, E, F, G, H, A, in, 15); \ + } \ + (r)[0] = SPH_T32((r)[0] + A); \ + (r)[1] = SPH_T32((r)[1] + B); \ + (r)[2] = SPH_T32((r)[2] + C); \ + (r)[3] = SPH_T32((r)[3] + D); \ + (r)[4] = SPH_T32((r)[4] + E); \ + (r)[5] = SPH_T32((r)[5] + F); \ + (r)[6] = SPH_T32((r)[6] + G); \ + (r)[7] = SPH_T32((r)[7] + H); \ + } while (0) + +#else + +#define SHA2_ROUND_BODY(in, r) do { \ + sph_u32 A, B, C, D, E, F, G, H, T1, T2; \ + sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \ + sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \ + int i; \ + \ +/* for (i=0;i<8;i++) {printf("in[%d]=%08x in[%d]=%08x \n",2*i,in(2*i),2*i+1,in(2*i+1));} */ \ + A = (r)[0]; \ + B = (r)[1]; \ + C = (r)[2]; \ + D = (r)[3]; \ + E = (r)[4]; \ + F = (r)[5]; \ + G = (r)[6]; \ + H = (r)[7]; \ + W00 = in(0); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x428A2F98) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = in(1); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x71374491) + W01); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = in(2); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0xB5C0FBCF) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W03 = in(3); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0xE9B5DBA5) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = in(4); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x3956C25B) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = in(5); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x59F111F1) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = in(6); \ + T1 = 
SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x923F82A4) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = in(7); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0xAB1C5ED5) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = in(8); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0xD807AA98) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = in(9); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x12835B01) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = in(10); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x243185BE) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = in(11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x550C7DC3) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = in(12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x72BE5D74) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = in(13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x80DEB1FE) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = in(14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x9BDC06A7) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = in(15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0xC19BF174) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + 
SPH_C32(0xE49B69C1) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0xEFBE4786) + W01); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x0FC19DC6) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x240CA1CC) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x2DE92C6F) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x4A7484AA) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x5CB0A9DC) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x76F988DA) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x983E5152) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = 
SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0xA831C66D) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0xB00327C8) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0xBF597FC7) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0xC6E00BF3) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0xD5A79147) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x06CA6351) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x14292967) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x27B70A85) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x2E1B2138) + W01); \ + T2 = 
SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x4D2C6DFC) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x53380D13) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x650A7354) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x766A0ABB) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x81C2C92E) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x92722C85) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0xA2BFE8A1) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0xA81A664B) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); 
\ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0xC24B8B70) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0xC76C51A3) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0xD192E819) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0xD6990624) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0xF40E3585) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x106AA070) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x19A4C116) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x1E376C08) + W01); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x2748774C) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + 
T1); \ + F = SPH_T32(T1 + T2); \ + W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x34B0BCB5) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x391C0CB3) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x4ED8AA4A) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x5B9CCA4F) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x682E6FF3) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x748F82EE) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x78A5636F) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x84C87814) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + 
SPH_C32(0x8CC70208) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x90BEFFFA) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0xA4506CEB) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0xBEF9A3F7) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0xC67178F2) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + (r)[0] = SPH_T32((r)[0] + A); \ + (r)[1] = SPH_T32((r)[1] + B); \ + (r)[2] = SPH_T32((r)[2] + C); \ + (r)[3] = SPH_T32((r)[3] + D); \ + (r)[4] = SPH_T32((r)[4] + E); \ + (r)[5] = SPH_T32((r)[5] + F); \ + (r)[6] = SPH_T32((r)[6] + G); \ + (r)[7] = SPH_T32((r)[7] + H); \ +/* for (i=0;i<4;i++) {printf("r[%d]=%08x r[%d]=%08x\n",2*i,(r)[2*i],2*i+1,(r)[2*i+1]);} */ \ + } while (0) + +#endif + +/* + * One round of SHA-224 / SHA-256. The data must be aligned for 32-bit access. 
+ */ +static void +sha2_round(const unsigned char *data, sph_u32 r[8]) +{ +#define SHA2_IN(x) sph_dec32be_aligned(data + (4 * (x))) + SHA2_ROUND_BODY(SHA2_IN, r); +#undef SHA2_IN +} + +/* see sph_sha2.h */ +void +sph_sha224_init(void *cc) +{ + sph_sha224_context *sc; + + sc = cc; + memcpy(sc->val, H224, sizeof H224); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +/* see sph_sha2.h */ +void +sph_sha256_init(void *cc) +{ + sph_sha256_context *sc; + + sc = cc; + memcpy(sc->val, H256, sizeof H256); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +#define RFUN sha2_round +#define HASH sha224 +#define BE32 1 +#include "md_helper.c" + +/* see sph_sha2.h */ +void +sph_sha224_close(void *cc, void *dst) +{ + sha224_close(cc, dst, 7); + sph_sha224_init(cc); +} + +/* see sph_sha2.h */ +void +sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + sha224_addbits_and_close(cc, ub, n, dst, 7); + sph_sha224_init(cc); +} + +/* see sph_sha2.h */ +void +sph_sha256_close(void *cc, void *dst) +{ + sha224_close(cc, dst, 8); + sph_sha256_init(cc); +} + +/* see sph_sha2.h */ +void +sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + sha224_addbits_and_close(cc, ub, n, dst, 8); + sph_sha256_init(cc); +} + +/* see sph_sha2.h */ +void +sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]) +{ +#define SHA2_IN(x) msg[x] + SHA2_ROUND_BODY(SHA2_IN, val); +#undef SHA2_IN +} diff --git a/sph/sph_types.h b/sph/sph_types.h index 5ec7bbf31d..85578a4292 100644 --- a/sph/sph_types.h +++ b/sph/sph_types.h @@ -816,7 +816,7 @@ static inline void sph_enc64be_aligned(void *dst, sph_u64 val); #undef SPH_64 #undef SPH_64_TRUE -#if defined __STDC__ && __STDC_VERSION__ >= 199901L +#if 1 // defined __STDC__ && __STDC_VERSION__ >= 199901L /* * On C99 implementations, we can use to get an exact 64-bit @@ -824,7 +824,11 @@ static inline void sph_enc64be_aligned(void *dst, sph_u64 
val); * C99 conformance). */ +#ifdef __cplusplus +#include +#else #include +#endif #ifdef UINT32_MAX typedef uint32_t sph_u32; @@ -930,14 +934,25 @@ typedef long long sph_s64; */ #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) +#if defined _MSC_VER +#define SPH_ROTL32(x, n) _rotl(x, n) +#define SPH_ROTR32(x, n) _rotr(x, n) +#else +#define SPH_ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n))) +#define SPH_ROTR32(x, n) ((x) >> (n)) | ((x) << (32 - (n))) +#endif #if SPH_64 +#if defined _MSC_VER +#define SPH_ROTR64(x, n) _rotr64(x, n) +#define SPH_ROTL64(x, n) _rotl64(x, n) +#else +#define SPH_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#define SPH_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#endif + #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) -#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) -#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) #endif @@ -1001,7 +1016,7 @@ typedef long long sph_s64; #define SPH_DETECT_UNALIGNED 1 #define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u32 +#define SPH_DETECT_UPTR uintptr_t #ifdef __GNUC__ #define SPH_DETECT_I386_GCC 1 #endif @@ -1016,7 +1031,7 @@ typedef long long sph_s64; #define SPH_DETECT_UNALIGNED 1 #define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u64 +#define SPH_DETECT_UPTR uintptr_t #ifdef __GNUC__ #define SPH_DETECT_AMD64_GCC 1 #endif diff --git a/sph/sysendian.h b/sph/sysendian.h new file mode 100644 index 0000000000..31ac985fb9 --- /dev/null +++ b/sph/sysendian.h @@ -0,0 +1,140 @@ +/*- + * Copyright 2007-2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _SYSENDIAN_H_ +#define _SYSENDIAN_H_ + +/* If we don't have be64enc, the we have isn't usable. 
*/ +#if !HAVE_DECL_BE64ENC +#undef HAVE_SYS_ENDIAN_H +#endif + +#ifdef HAVE_SYS_ENDIAN_H + +#include + +#else + +#include + +#if !HAVE_DECL_LE32DEC +static uint32_t le32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} +#endif + +#if !HAVE_DECL_BE32ENC +static void be32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} +#endif + +#if !HAVE_DECL_BE32DEC +static uint32_t be32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} +#endif + +#if !HAVE_DECL_LE32ENC +static void le32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} +#endif + +static uint64_t +be64dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint64_t)(p[7]) + ((uint64_t)(p[6]) << 8) + + ((uint64_t)(p[5]) << 16) + ((uint64_t)(p[4]) << 24) + + ((uint64_t)(p[3]) << 32) + ((uint64_t)(p[2]) << 40) + + ((uint64_t)(p[1]) << 48) + ((uint64_t)(p[0]) << 56)); +} + +static void +be64enc(void *pp, uint64_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[7] = x & 0xff; + p[6] = (x >> 8) & 0xff; + p[5] = (x >> 16) & 0xff; + p[4] = (x >> 24) & 0xff; + p[3] = (x >> 32) & 0xff; + p[2] = (x >> 40) & 0xff; + p[1] = (x >> 48) & 0xff; + p[0] = (x >> 56) & 0xff; +} + + + +static uint64_t +le64dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint64_t)(p[0]) + ((uint64_t)(p[1]) << 8) + + ((uint64_t)(p[2]) << 16) + ((uint64_t)(p[3]) << 24) + + ((uint64_t)(p[4]) << 32) + ((uint64_t)(p[5]) << 40) + + ((uint64_t)(p[6]) << 48) + ((uint64_t)(p[7]) << 56)); +} + +static void +le64enc(void *pp, uint64_t x) +{ + 
uint8_t * p = (uint8_t *)pp; + + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; + p[4] = (x >> 32) & 0xff; + p[5] = (x >> 40) & 0xff; + p[6] = (x >> 48) & 0xff; + p[7] = (x >> 56) & 0xff; +} +#endif /* !HAVE_SYS_ENDIAN_H */ + +#endif /* !_SYSENDIAN_H_ */ diff --git a/sph/yescrypt-opt.c b/sph/yescrypt-opt.c new file mode 100644 index 0000000000..3ec0eb8726 --- /dev/null +++ b/sph/yescrypt-opt.c @@ -0,0 +1,1392 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013,2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. 
+ */ + +#ifdef __i386__ +#warning "This implementation does not use SIMD, and thus it runs a lot slower than the SIMD-enabled implementation. Enable at least SSE2 in the C compiler and use yescrypt-best.c instead unless you're building this SIMD-less implementation on purpose (portability to older CPUs or testing)." +#elif defined(__x86_64__) +#warning "This implementation does not use SIMD, and thus it runs a lot slower than the SIMD-enabled implementation. Use yescrypt-best.c instead unless you're building this SIMD-less implementation on purpose (for testing only)." +#endif + +#include +#include +#include +#include +#include "yescrypt.h" +#include "sha256_Y.h" +#include "sysendian.h" + +// #include "sph/yescrypt-platform.c" +#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024) + +#ifdef __x86_64__ +#define HUGEPAGE_SIZE (2 * 1024 * 1024) +#else +#undef HUGEPAGE_SIZE +#endif + + +static void * +alloc_region(yescrypt_region_t * region, size_t size) +{ + size_t base_size = size; + uint8_t * base, *aligned; +#ifdef MAP_ANON + int flags = +#ifdef MAP_NOCORE + MAP_NOCORE | +#endif + MAP_ANON | MAP_PRIVATE; +#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE) + size_t new_size = size; + const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1; + if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) { + flags |= MAP_HUGETLB; + /* + * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of + * huge page size, so let's round up to huge page size here. 
+ */ + new_size = size + hugepage_mask; + new_size &= ~hugepage_mask; + } + base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (base != MAP_FAILED) { + base_size = new_size; + } + else + if (flags & MAP_HUGETLB) { + flags &= ~MAP_HUGETLB; + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + } + +#else + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); +#endif + if (base == MAP_FAILED) + base = NULL; + aligned = base; +#elif defined(HAVE_POSIX_MEMALIGN) + if ((errno = posix_memalign((void **)&base, 64, size)) != 0) + base = NULL; + aligned = base; +#else + base = aligned = NULL; + if (size + 63 < size) { + errno = ENOMEM; + } + else if ((base = malloc(size + 63)) != NULL) { + aligned = base + 63; + aligned -= (uintptr_t)aligned & 63; + } +#endif + region->base = base; + region->aligned = aligned; + region->base_size = base ? base_size : 0; + region->aligned_size = base ? size : 0; + return aligned; +} + +static void init_region(yescrypt_region_t * region) +{ + region->base = region->aligned = NULL; + region->base_size = region->aligned_size = 0; +} + +static int +free_region(yescrypt_region_t * region) +{ + if (region->base) { +#ifdef MAP_ANON + if (munmap(region->base, region->base_size)) + return -1; +#else + free(region->base); +#endif + } + init_region(region); + return 0; +} + +int +yescrypt_init_shared(yescrypt_shared_t * shared, +const uint8_t * param, size_t paramlen, +uint64_t N, uint32_t r, uint32_t p, +yescrypt_init_shared_flags_t flags, uint32_t mask, +uint8_t * buf, size_t buflen) +{ + yescrypt_shared1_t * shared1 = &shared->shared1; + yescrypt_shared_t dummy, half1, half2; + // yescrypt_shared_t * half2; + uint8_t salt[32]; + + if (flags & YESCRYPT_SHARED_PREALLOCATED) { + if (!shared1->aligned || !shared1->aligned_size) + return -1; + } + else { + init_region(shared1); + } + shared->mask1 = 1; + if (!param && !paramlen && !N && !r && !p && !buf && !buflen) + return 0; + + init_region(&dummy.shared1); + 
dummy.mask1 = 1; + if (yescrypt_kdf(&dummy, shared1, + param, paramlen, NULL, 0, N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + salt, sizeof(salt))) + goto out; + + half1 = half2 = *shared; + half1.shared1.aligned_size /= 2; + half2.shared1.aligned_size = half1.shared1.aligned_size; + half2.shared1.aligned = (char*)half2.shared1.aligned + half1.shared1.aligned_size; + + N /= 2; + + if (p > 1 && yescrypt_kdf(&half1, &half2.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_2, + salt, sizeof(salt))) + goto out; + + if (yescrypt_kdf(&half2, &half1.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + salt, sizeof(salt))) + goto out; + + if (yescrypt_kdf(&half1, &half2.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + buf, buflen)) + goto out; + + shared->mask1 = mask; + + return 0; + +out: + if (!(flags & YESCRYPT_SHARED_PREALLOCATED)) + free_region(shared1); + return -1; +} + +int +yescrypt_free_shared(yescrypt_shared_t * shared) +{ + return free_region(&shared->shared1); +} + +int +yescrypt_init_local(yescrypt_local_t * local) +{ + init_region(local); + return 0; +} + +int +yescrypt_free_local(yescrypt_local_t * local) +{ + return free_region(local); +} + + +static void +blkcpy(uint64_t * dest, const uint64_t * src, size_t count) +{ + do { + *dest++ = *src++; *dest++ = *src++; + *dest++ = *src++; *dest++ = *src++; + } while (count -= 4); +}; + +static void +blkxor(uint64_t * dest, const uint64_t * src, size_t count) +{ + do { + *dest++ ^= *src++; *dest++ ^= *src++; + *dest++ ^= *src++; *dest++ ^= *src++; + } while (count -= 4); +}; + +typedef union { + uint32_t w[16]; + uint64_t d[8]; +} salsa20_blk_t; + +static void +salsa20_simd_shuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) +{ +#define 
COMBINE(out, in1, in2) \ + Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); + COMBINE(0, 0, 2) + COMBINE(1, 5, 7) + COMBINE(2, 2, 4) + COMBINE(3, 7, 1) + COMBINE(4, 4, 6) + COMBINE(5, 1, 3) + COMBINE(6, 6, 0) + COMBINE(7, 3, 5) +#undef COMBINE +} + +static void +salsa20_simd_unshuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) +{ +#define COMBINE(out, in1, in2) \ + Bout->w[out * 2] = Bin->d[in1]; \ + Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; + COMBINE(0, 0, 6) + COMBINE(1, 5, 3) + COMBINE(2, 2, 0) + COMBINE(3, 7, 5) + COMBINE(4, 4, 2) + COMBINE(5, 1, 7) + COMBINE(6, 6, 4) + COMBINE(7, 3, 1) +#undef COMBINE +} + +/** + * salsa20_8(B): + * Apply the salsa20/8 core to the provided block. + */ + +static void +salsa20_8(uint64_t B[8]) +{ + size_t i; + salsa20_blk_t X; + +#define x X.w + + salsa20_simd_unshuffle((const salsa20_blk_t *)B, &X); + + for (i = 0; i < 8; i += 2) { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } +#undef x + + { + 
salsa20_blk_t Y; + salsa20_simd_shuffle(&X, &Y); + for (i = 0; i < 16; i += 4) { + ((salsa20_blk_t *)B)->w[i] += Y.w[i]; + ((salsa20_blk_t *)B)->w[i + 1] += Y.w[i + 1]; + ((salsa20_blk_t *)B)->w[i + 2] += Y.w[i + 2]; + ((salsa20_blk_t *)B)->w[i + 3] += Y.w[i + 3]; + } + } +} + +/** + * blockmix_salsa8(Bin, Bout, X, r): + * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r + * bytes in length; the output Bout must also be the same size. The + * temporary space X must be 64 bytes. + */ +static void +blockmix_salsa8(const uint64_t * Bin, uint64_t * Bout, uint64_t * X, size_t r) +{ + size_t i; + + /* 1: X <-- B_{2r - 1} */ + blkcpy(X, &Bin[(2 * r - 1) * 8], 8); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < 2 * r; i += 2) { + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 8], 8); + salsa20_8(X); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 4], X, 8); + + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 8 + 8], 8); + salsa20_8(X); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 4 + r * 8], X, 8); + } + +} + +/* These are tunable */ +#define S_BITS 8 +#define S_SIMD 2 +#define S_P 4 +#define S_ROUNDS 6 + +/* Number of S-boxes. Not tunable, hard-coded in a few places. */ +#define S_N 2 + +/* Derived values. Not tunable on their own. */ +#define S_SIZE1 (1 << S_BITS) +#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8) +#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK) +#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD) +#define S_P_SIZE (S_P * S_SIMD) +#define S_MIN_R ((S_P * S_SIMD + 15) / 16) + +/** + * pwxform(B): + * Transform the provided block using the provided S-boxes. 
+ */ + +static void +block_pwxform(uint64_t * B, const uint64_t * S) +{ + uint64_t(*X)[S_SIMD] = (uint64_t(*)[S_SIMD])B; + const uint8_t *S0 = (const uint8_t *)S; + const uint8_t *S1 = (const uint8_t *)(S + S_SIZE1 * S_SIMD); + size_t i, j; + + for (j = 0; j < S_P; j++) { + + uint64_t *Xj = X[j]; + uint64_t x0 = Xj[0]; + uint64_t x1 = Xj[1]; + + for (i = 0; i < S_ROUNDS; i++) { + uint64_t x = x0 & S_MASK2; + const uint64_t *p0, *p1; + + p0 = (const uint64_t *)(S0 + (uint32_t)x); + p1 = (const uint64_t *)(S1 + (x >> 32)); + + x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0; + x0 += p0[0]; + x0 ^= p1[0]; + + x1 = (uint64_t)(x1 >> 32) * (uint32_t)x1; + x1 += p0[1]; + x1 ^= p1[1]; + } + Xj[0] = x0; + Xj[1] = x1; + } + + + +} + + +/** + * blockmix_pwxform(Bin, Bout, S, r): + * Compute Bout = BlockMix_pwxform{salsa20/8, S, r}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. + * + * S lacks const qualifier to match blockmix_salsa8()'s prototype, which we + * need to refer to both functions via the same function pointers. 
+ */ +static void +blockmix_pwxform(const uint64_t * Bin, uint64_t * Bout, uint64_t * S, size_t r) +{ + size_t r1, r2, i; + // S_P_SIZE = 8; + /* Convert 128-byte blocks to (S_P_SIZE * 64-bit) blocks */ + + r1 = r * 128 / (S_P_SIZE * 8); + /* X <-- B_{r1 - 1} */ + blkcpy(Bout, &Bin[(r1 - 1) * S_P_SIZE], S_P_SIZE); + + /* X <-- X \xor B_i */ + blkxor(Bout, Bin, S_P_SIZE); + + /* X <-- H'(X) */ + /* B'_i <-- X */ + block_pwxform(Bout, S); + + /* for i = 0 to r1 - 1 do */ + for (i = 1; i < r1; i++) { + /* X <-- X \xor B_i */ + blkcpy(&Bout[i * S_P_SIZE], &Bout[(i - 1) * S_P_SIZE],S_P_SIZE); + blkxor(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE], S_P_SIZE); + + /* X <-- H'(X) */ + /* B'_i <-- X */ + block_pwxform(&Bout[i * S_P_SIZE], S); + } + + /* Handle partial blocks */ + if (i * S_P_SIZE < r * 16) { + blkcpy(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE],r * 16 - i * S_P_SIZE); +} + + i = (r1 - 1) * S_P_SIZE / 8; + /* Convert 128-byte blocks to 64-byte blocks */ + r2 = r * 2; + + /* B'_i <-- H(B'_i) */ + salsa20_8(&Bout[i * 8]); + + + i++; +/// not used yescrypt + + for (; i < r2; i++) { + /* B'_i <-- H(B'_i \xor B'_{i-1}) */ + blkxor(&Bout[i * 8], &Bout[(i - 1) * 8], 8); + salsa20_8(&Bout[i * 8]); + } +} + + + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static uint64_t +integerify(const uint64_t * B, size_t r) +{ +/* + * Our 64-bit words are in host byte order, and word 6 holds the second 32-bit + * word of B_{2r-1} due to SIMD shuffling. The 64-bit value we return is also + * in host byte order, as it should be. + */ + const uint64_t * X = &B[(2 * r - 1) * 8]; + uint32_t lo = X[0]; + uint32_t hi = X[6] >> 32; + return ((uint64_t)hi << 32) + lo; +} + +/** + * smix1(B, r, N, flags, V, NROM, shared, XY, S): + * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. 
The value N must be even and + * no smaller than 2. + */ +static void +smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = (S ? blockmix_pwxform : blockmix_salsa8); + const uint64_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1; + size_t s = 16 * r; + uint64_t * X = V; + uint64_t * Y = &XY[s]; + uint64_t * Z = S ? S : &XY[2 * s]; + uint64_t n, i, j; + size_t k; + + /* 1: X <-- B */ + /* 3: V_i <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + + salsa20_simd_shuffle(tmp, dst); + } + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + + blockmix(X, Y, Z, r); + + blkcpy(&V[s], Y, s); + + X = XY; + + if (NROM && (VROM_mask & 1)) { + if ((1 & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } + + blockmix(Y, X, Z, r); + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if ((i & (i - 1)) == 0) + n <<= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(X, r) & (n - 1); + j += i - n; + + /* X <-- X \xor V_j */ + blkxor(X, &V[j * s], s); + + /* 4: X <-- H(X) */ + blockmix(X, Y, Z, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + j = integerify(Y, r); + if (((i + 1) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } else { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i + 1 - n; + + /* X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + } + + blockmix(Y, X, Z, r); + 
} + } else { + yescrypt_flags_t rw = flags & YESCRYPT_RW; + /* 4: X <-- H(X) */ + blockmix(Y, X, Z, r); + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if (rw) { + if ((i & (i - 1)) == 0) + n <<= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(X, r) & (n - 1); + j += i - n; + + /* X <-- X \xor V_j */ + blkxor(X, &V[j * s], s); + } + + /* 4: X <-- H(X) */ + blockmix(X, Y, Z, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + if (rw) { + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(Y, r) & (n - 1); + j += (i + 1) - n; + + + /* X <-- X \xor V_j */ + blkxor(Y, &V[j * s], s); + } + + /* 4: X <-- H(X) */ + blockmix(Y, X, Z, r); + } + } + + /* B' <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + + + +/** + * smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. The value N must be a + * power of 2 greater than 1. The value Nloop must be even. + */ +static void +smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop, + yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + + void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = + (S ? blockmix_pwxform : blockmix_salsa8); + const uint64_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1 | 1; + size_t s = 16 * r; + yescrypt_flags_t rw = flags & YESCRYPT_RW; + uint64_t * X = XY; + uint64_t * Y = &XY[s]; + uint64_t * Z = S ? 
S : &XY[2 * s]; + uint64_t i, j; + size_t k; + + if (Nloop == 0) + return; + + /* X <-- B' */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + if (NROM) { + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], X, s); + blockmix(X, Y, Z, r); + + j = integerify(Y, r); + if (((i + 1) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } else { + /* 7: j <-- Integerify(X) mod N */ + j &= N - 1; + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], Y, s); + } + + blockmix(Y, X, Z, r); + } + } else { + + /* 6: for i = 0 to N - 1 do */ + i = Nloop / 2; + do { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], X, s); + blockmix(X, Y, Z, r); + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(Y, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], Y, s); + blockmix(Y, X, Z, r); + } while (--i); + } + + /* 10: B' <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + + + + +/** + * p2floor(x): + * Largest power of 2 
not greater than argument. + */ +static uint64_t +p2floor(uint64_t x) +{ + uint64_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * smix(B, r, N, p, t, flags, V, NROM, shared, XY, S): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage + * XY must be 256r+64 or (256r+64)*p bytes in length (the larger size is + * required with OpenMP-enabled builds). The value N must be a power of 2 + * greater than 1. + */ +static void +smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, + yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + size_t s = 16 * r; + uint64_t Nchunk = N / p, Nloop_all, Nloop_rw; + uint32_t i; + + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } else { + Nloop_all *= t - 1; + } + } else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) + Nloop_rw = Nloop_all; + else if (flags & YESCRYPT_RW) + Nloop_rw = Nloop_all / p; + + Nchunk &= ~(uint64_t)1; /* round down to even */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + Nloop_rw &= ~(uint64_t)1; /* round down to even */ + + + for (i = 0; i < p; i++) { + uint64_t Vchunk = i * Nchunk; + uint64_t * Bp = &B[i * s]; + uint64_t * Vp = &V[Vchunk * s]; + uint64_t * XYp = XY; + + uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + uint64_t * Sp = S ? 
&S[i * S_SIZE_ALL] : S; + + if (Sp) + smix1(Bp, 1, S_SIZE_ALL / 16, flags & ~YESCRYPT_PWXFORM,Sp, NROM, shared, XYp, NULL); + + + + if (!(flags & __YESCRYPT_INIT_SHARED_2)) + smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); + + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, NROM, shared, XYp, Sp); + } + if (Nloop_all > Nloop_rw) { + + for (i = 0; i < p; i++) { + uint64_t * Bp = &B[i * s]; + + uint64_t * XYp = XY; + + uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; + smix2(Bp, r, N, Nloop_all - Nloop_rw,flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); + } + } + +} + +static void +smix_old(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, +yescrypt_flags_t flags, +uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, +uint64_t * XY, uint64_t * S) +{ + size_t s = 16 * r; + uint64_t Nchunk = N / p, Nloop_all, Nloop_rw; + uint32_t i; + + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } + else { + Nloop_all *= t - 1; + } + } + else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) + Nloop_rw = Nloop_all; + else if (flags & YESCRYPT_RW) + Nloop_rw = Nloop_all / p; + + Nchunk &= ~(uint64_t)1; /* round down to even */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + Nloop_rw &= ~(uint64_t)1; /* round down to even */ + + + for (i = 0; i < p; i++) { + printf("smix first loop p=%d s=%d Nchunk=%d\n",p,s,(uint32_t)Nchunk); + uint64_t Vchunk = i * Nchunk; + uint64_t * Bp = &B[i * s]; + uint64_t * Vp = &V[Vchunk * s]; + uint64_t * XYp = XY; + printf("beofre XYp[0] %08x %08x XYp[1] %08x %08x\n", ((uint32_t*)XYp)[0], ((uint32_t*)XYp)[1], ((uint32_t*)XYp)[2], ((uint32_t*)XYp)[3]); + + uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + uint64_t * Sp = S ? 
&S[i * S_SIZE_ALL] : S; + printf("Np %d beofre Sp[0] %08x %08x Sp[1] %08x %08x\n",(uint32_t)Np, ((uint32_t*)Sp)[0], ((uint32_t*)Sp)[1], ((uint32_t*)Sp)[2], ((uint32_t*)Sp)[3]); + + if (Sp) { + printf("sp condition s_size_all %d\n", S_SIZE_ALL); + smix1(Bp, 1, S_SIZE_ALL / 16, flags & ~YESCRYPT_PWXFORM, Sp, NROM, shared, XYp, NULL); + printf("after XYp[0] %08x %08x XYp[1] %08x %08x\n", ((uint32_t*)XYp)[0], ((uint32_t*)XYp)[1], ((uint32_t*)XYp)[2], ((uint32_t*)XYp)[3]); + printf("after Sp[0] %08x %08x Sp[1] %08x %08x\n", ((uint32_t*)Sp)[0], ((uint32_t*)Sp)[1], ((uint32_t*)Sp)[2], ((uint32_t*)Sp)[3]); + + + } + + + if (!(flags & __YESCRYPT_INIT_SHARED_2)) { + printf("flag condition Np smix1 and smix2 again %d \n", Np); + smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); + } + + + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, NROM, shared, XYp, Sp); + } + + printf("Nloop_all %d Nloop_rw %d\n", Nloop_all, Nloop_rw); + if (Nloop_all > Nloop_rw) { + + for (i = 0; i < p; i++) { + printf("smix second loop p=%d s=%d\n",p,s); + uint64_t * Bp = &B[i * s]; + + uint64_t * XYp = XY; + + uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; + smix2(Bp, r, N, Nloop_all - Nloop_rw, flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); + } + } +} + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters r, p, and buflen must satisfy + * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power + * of 2 greater than 1. + * + * t controls computation time while not affecting peak memory usage. shared + * and flags may request special modes as described in yescrypt.h. local is + * the thread-local data structure, allowing to preserve and reuse a memory + * allocation across calls, thereby reducing its overhead. 
+ * + * Return 0 on success; or -1 on error. + */ +int yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + size_t B_size, V_size, XY_size, need; + uint64_t * B, * V, * XY, * S; + uint64_t sha256[4]; + + /* + * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, + * so don't let it have side-effects. Without this adjustment, it'd + * enable the SHA-256 password pre-hashing and output post-hashing, + * because any deviation from classic scrypt implies those. + */ + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) { + errno = EINVAL; + return -1; + } +#if S_MIN_R > 1 + if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) { + errno = EINVAL; + return -1; + } +#endif + if ((p > SIZE_MAX / ((size_t)256 * r + 64)) || +#if SIZE_MAX / 256 <= UINT32_MAX + (r > SIZE_MAX / 256) || +#endif + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } + if (N > UINT64_MAX / ((uint64_t)t + 1)) { + errno = EFBIG; + return -1; + } + + if ((flags & YESCRYPT_PWXFORM) && + p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) { + errno = ENOMEM; + return -1; + } + + NROM = 0; + if (shared->shared1.aligned) { + NROM = shared->shared1.aligned_size / ((size_t)128 * r); + if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + 
return -1; + } + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; + + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (uint64_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r + 64; + + need += XY_size; + if (need < XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_PWXFORM) { + size_t S_size = S_SIZE_ALL * sizeof(*S); + + if (flags & YESCRYPT_PARALLEL_SMIX) + S_size *= p; + + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint64_t *)tmp.aligned; + XY = (uint64_t *)((uint8_t *)B + B_size); + } else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint64_t *)local->aligned; + V = (uint64_t *)((uint8_t *)B + B_size); + XY = (uint64_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_PWXFORM) + S = (uint64_t *)((uint8_t *)XY + XY_size); + + + if (t || flags) { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, passwd, passwdlen); + SHA256_Final_Y((uint8_t *)sha256, &ctx); + passwd = (uint8_t *)sha256; + passwdlen = sizeof(sha256); + } + + /* 1: (B_0 ... 
B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1,(uint8_t *)B, B_size); + + if (t || flags) + { + blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + } + if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) { + smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); + } else { + uint32_t i; + /* 2: for i = 0 to p - 1 do */ + for (i = 0; i < p; i++) { + /* 3: B_i <-- MF(B_i, N) */ + smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, V, NROM, shared, XY, S); + } + } + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. + */ + if ((t || flags) && buflen == sizeof(sha256)) { + /* Compute ClientKey */ + + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, buf, buflen); + HMAC_SHA256_Update_Y(&ctx, salt, saltlen); + HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); + SHA256_Final_Y(buf, &ctx); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} + +int +yescrypt_kdf_old(const yescrypt_shared_t * shared, yescrypt_local_t * local, +const uint8_t * passwd, size_t passwdlen, +const uint8_t * salt, size_t saltlen, +uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, +uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + size_t B_size, V_size, XY_size, need; + uint64_t * B, *V, *XY, *S; + uint64_t sha256[4]; + + /* + * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, + * so don't let it have side-effects. 
Without this adjustment, it'd + * enable the SHA-256 password pre-hashing and output post-hashing, + * because any deviation from classic scrypt implies those. + */ + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r)* (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) { + errno = EINVAL; + return -1; + } +#if S_MIN_R > 1 + if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) { + errno = EINVAL; + return -1; + } +#endif + if ((p > SIZE_MAX / ((size_t)256 * r + 64)) || +#if SIZE_MAX / 256 <= UINT32_MAX + (r > SIZE_MAX / 256) || +#endif + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } + if (N > UINT64_MAX / ((uint64_t)t + 1)) { + errno = EFBIG; + return -1; + } + + if ((flags & YESCRYPT_PWXFORM) && + p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) { + errno = ENOMEM; + return -1; + } + + NROM = 0; + if (shared->shared1.aligned) { + NROM = shared->shared1.aligned_size / ((size_t)128 * r); + if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; + + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (uint64_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r + 64; + + need += XY_size; + if (need < 
XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_PWXFORM) { + size_t S_size = S_SIZE_ALL * sizeof(*S); + + if (flags & YESCRYPT_PARALLEL_SMIX) + S_size *= p; + + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint64_t *)tmp.aligned; + XY = (uint64_t *)((uint8_t *)B + B_size); + printf("yescrypt_init_shared and flag"); + } + else { + printf("NOT yescrypt_init_shared and flag"); + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint64_t *)local->aligned; + V = (uint64_t *)((uint8_t *)B + B_size); + XY = (uint64_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_PWXFORM) + S = (uint64_t *)((uint8_t *)XY + XY_size); + + printf("XY_size %d S_size %d B_size %d V_size %d\n", XY_size, S_SIZE_ALL*sizeof(S), B_size, V_size); + + if (t || flags) { + printf(" first sha t %d flag %d t or flag %d\n", t, flags, (t || flags)); + for (int i = 0; i<10; i++) { printf("i=%d passwd %08x %08x\n",i, ((uint32_t*)passwd)[2 * i], ((uint32_t*)passwd)[2 * i+1]); } + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, passwd, passwdlen); + SHA256_Final_Y((uint8_t *)sha256, &ctx); + passwd = (uint8_t *)sha256; + passwdlen = sizeof(sha256); + } + + /* 1: (B_0 ... 
B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + printf("passwdlen=%d saltlen=%d before 1st pbkdf2 B_size %d\n",passwdlen,saltlen, B_size); + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, (uint8_t *)B, B_size); + //for (int k = 0; k<32; k++) + // printf("k=%d the buf %08x %08x %08x %08x %08x %08x %08x %08x\n", k, ((uint32_t*)B)[8 * k], ((uint32_t*)B)[8 * k + 1], + // ((uint32_t*)B)[8 * k+2], ((uint32_t*)B)[8 * k + 3], + // ((uint32_t*)B)[8 * k+4], ((uint32_t*)B)[8 * k + 5], + // ((uint32_t*)B)[8 * k+6], ((uint32_t*)B)[8 * k + 7]); + + + if (t || flags) + { + printf("before blkcpy count %d\n", sizeof(sha256) / sizeof(sha256[0])); + blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + } + printf("after pbkdf2 B0= %08x %08x %08x %08x %08x %08x %08x %08x\n", B[0], B[1], B[2], B[3]); + + printf("before smix p %d flag %d\n", p, (flags & YESCRYPT_PARALLEL_SMIX)); + printf("coef smix r %d N %d p %d t %d flags %d NROM %d\n", r, N, p, t, flags, NROM); + smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); + + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + printf("before 2nd pbkdf2 B_size %d buflen %d\n", B_size); + printf("paswd = %08x %08x %08x %08x\n", ((uint32_t*)passwd)[0], ((uint32_t*)passwd)[1],((uint32_t*)passwd)[2],((uint32_t*)passwd)[3]); + PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); + printf("after 2nd pbkdf2 B_size %d buflen %d\n", B_size, buflen); + printf("buf = %08x %08x %08x %08x %08x %08x %08x %08x", + ((uint64_t*)buf)[0], ((uint64_t*)buf)[1], ((uint64_t*)buf)[2], ((uint64_t*)buf)[3]); + + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. 
+ */ + if ((t || flags) && buflen == sizeof(sha256)) { + /* Compute ClientKey */ + printf("compute keys before end the flag %d\n", (t || flags)); + + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, buf, buflen); + HMAC_SHA256_Update_Y(&ctx, salt, saltlen); + HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); + SHA256_Final_Y(buf, &ctx); + } + } + printf("buf = %08x %08x %08x %08x %08x %08x %08x %08x", + ((uint64_t*)buf)[0], ((uint64_t*)buf)[1], ((uint64_t*)buf)[2], ((uint64_t*)buf)[3]); + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} + diff --git a/sph/yescrypt-simd.c b/sph/yescrypt-simd.c new file mode 100644 index 0000000000..adc054d27d --- /dev/null +++ b/sph/yescrypt-simd.c @@ -0,0 +1,1380 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2012-2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +/* + * On 64-bit, enabling SSE4.1 helps our pwxform code indirectly, via avoiding + * gcc bug 54349 (fixed for gcc 4.9+). On 32-bit, it's of direct help. AVX + * and XOP are of further help either way. + */ +#ifndef __SSE4_1__ +#warning "Consider enabling SSE4.1, AVX, or XOP in the C compiler for significantly better performance" +#endif + +#include <emmintrin.h> +#ifdef __XOP__ +#include <x86intrin.h> +#endif + +#include <errno.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +#include "sha256_Y.h" +#include "sysendian.h" + +#include "sph/yescrypt.h" + +#include "sph/yescrypt-platform.c" + +#if __STDC_VERSION__ >= 199901L +/* have restrict */ +#elif defined(__GNUC__) +#define restrict __restrict +#else +#define restrict +#endif + +#define PREFETCH(x, hint) _mm_prefetch((const char *)(x), (hint)); +#define PREFETCH_OUT(x, hint) /* disabled */ + +#ifdef __XOP__ +#define ARX(out, in1, in2, s) \ + out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s)); +#else +#define ARX(out, in1, in2, s) \ + { \ + __m128i T = _mm_add_epi32(in1, in2); \ + out = _mm_xor_si128(out, _mm_slli_epi32(T, s)); \ + out = _mm_xor_si128(out, _mm_srli_epi32(T, 32-s)); \ + } +#endif + +#define SALSA20_2ROUNDS \ + /* Operate on "columns" */ \ + ARX(X1, X0, X3, 7) \ + ARX(X2, X1, X0, 9) \ + ARX(X3, X2, X1, 13) \ + ARX(X0, X3, X2, 18) \ +\ + /* Rearrange data */ \ + X1 = _mm_shuffle_epi32(X1, 0x93); 
\ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x39); \ +\ + /* Operate on "rows" */ \ + ARX(X3, X0, X1, 7) \ + ARX(X2, X3, X0, 9) \ + ARX(X1, X2, X3, 13) \ + ARX(X0, X1, X2, 18) \ +\ + /* Rearrange data */ \ + X1 = _mm_shuffle_epi32(X1, 0x39); \ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x93); + +/** + * Apply the salsa20/8 core to the block provided in (X0 ... X3). + */ +#define SALSA20_8_BASE(maybe_decl, out) \ + { \ + maybe_decl Y0 = X0; \ + maybe_decl Y1 = X1; \ + maybe_decl Y2 = X2; \ + maybe_decl Y3 = X3; \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + SALSA20_2ROUNDS \ + (out)[0] = X0 = _mm_add_epi32(X0, Y0); \ + (out)[1] = X1 = _mm_add_epi32(X1, Y1); \ + (out)[2] = X2 = _mm_add_epi32(X2, Y2); \ + (out)[3] = X3 = _mm_add_epi32(X3, Y3); \ + } +#define SALSA20_8(out) \ + SALSA20_8_BASE(__m128i, out) + +/** + * Apply the salsa20/8 core to the block provided in (X0 ... X3) ^ (Z0 ... Z3). + */ +#define SALSA20_8_XOR_ANY(maybe_decl, Z0, Z1, Z2, Z3, out) \ + X0 = _mm_xor_si128(X0, Z0); \ + X1 = _mm_xor_si128(X1, Z1); \ + X2 = _mm_xor_si128(X2, Z2); \ + X3 = _mm_xor_si128(X3, Z3); \ + SALSA20_8_BASE(maybe_decl, out) + +#define SALSA20_8_XOR_MEM(in, out) \ + SALSA20_8_XOR_ANY(__m128i, (in)[0], (in)[1], (in)[2], (in)[3], out) + +#define SALSA20_8_XOR_REG(out) \ + SALSA20_8_XOR_ANY(/* empty */, Y0, Y1, Y2, Y3, out) + +typedef union { + uint32_t w[16]; + __m128i q[4]; +} salsa20_blk_t; + +/** + * blockmix_salsa8(Bin, Bout, r): + * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r + * bytes in length; the output Bout must also be the same size. 
+ */ +static inline void +blockmix_salsa8(const salsa20_blk_t *restrict Bin, + salsa20_blk_t *restrict Bout, size_t r) +{ + __m128i X0, X1, X2, X3; + size_t i; + + r--; + PREFETCH(&Bin[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin[i * 2], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + PREFETCH(&Bin[i * 2 + 1], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) + } + PREFETCH(&Bin[r * 2], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) + + /* 1: X <-- B_{2r - 1} */ + X0 = Bin[r * 2 + 1].q[0]; + X1 = Bin[r * 2 + 1].q[1]; + X2 = Bin[r * 2 + 1].q[2]; + X3 = Bin[r * 2 + 1].q[3]; + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[0].q, Bout[0].q) + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < r;) { + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[i * 2 + 1].q, Bout[r + 1 + i].q) + + i++; + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[i * 2].q, Bout[i].q) + } + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + SALSA20_8_XOR_MEM(Bin[r * 2 + 1].q, Bout[r * 2 + 1].q) +} + +/* + * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs + * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and + * destination registers, whereas the shifts would require an extra move + * instruction for our code when building without AVX. Unfortunately, PSHUFD + * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ) + * and somewhat slower on some non-Intel CPUs (luckily not including AMD + * Bulldozer and Piledriver). 
Since for many other CPUs using (V)PSHUFD is a + * win in terms of throughput or/and not needing a move instruction, we + * currently use it despite of the higher latency on some older CPUs. As an + * alternative, the #if below may be patched to only enable use of (V)PSHUFD + * when building with SSE4.1 or newer, which is not available on older CPUs + * where this instruction has higher latency. + */ +#if 1 +#define HI32(X) \ + _mm_shuffle_epi32((X), _MM_SHUFFLE(2,3,0,1)) +#elif 0 +#define HI32(X) \ + _mm_srli_si128((X), 4) +#else +#define HI32(X) \ + _mm_srli_epi64((X), 32) +#endif + +#if defined(__x86_64__) && (defined(__ICC) || defined(__llvm__)) +/* Intel's name, also supported by recent gcc */ +#define EXTRACT64(X) _mm_cvtsi128_si64(X) +#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__) +/* gcc got the 'x' name earlier than non-'x', MSVC and Open64 had bugs */ +#define EXTRACT64(X) _mm_cvtsi128_si64x(X) +#elif defined(__x86_64__) && defined(__SSE4_1__) +/* No known bugs for this intrinsic */ +#include +#define EXTRACT64(X) _mm_extract_epi64((X), 0) +#elif defined(__SSE4_1__) +/* 32-bit */ +#include +#if 0 +/* This is currently unused by the code below, which instead uses these two + * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */ +#define EXTRACT64(X) \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ + ((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32)) +#endif +#else +/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64*() */ +#define EXTRACT64(X) \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32)) +#endif + +/* This is tunable */ +#define S_BITS 8 + +/* Not tunable in this implementation, hard-coded in a few places */ +#define S_SIMD 2 +#define S_P 4 + +/* Number of S-boxes. Not tunable by design, hard-coded in a few places. */ +#define S_N 2 + +/* Derived values. Not tunable except via S_BITS above. 
*/ +#define S_SIZE1 (1 << S_BITS) +#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8) +#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK) +#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD * 8) + +#if !defined(__x86_64__) && defined(__SSE4_1__) +/* 32-bit with SSE4.1 */ +#define PWXFORM_X_T __m128i +#define PWXFORM_SIMD(X, x, s0, s1) \ + x = _mm_and_si128(X, _mm_set1_epi64x(S_MASK2)); \ + s0 = *(const __m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \ + s1 = *(const __m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1)); \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, s0); \ + X = _mm_xor_si128(X, s1); +#else +/* 64-bit, or 32-bit without SSE4.1 */ +#define PWXFORM_X_T uint64_t +#define PWXFORM_SIMD(X, x, s0, s1) \ + x = EXTRACT64(X) & S_MASK2; \ + s0 = *(const __m128i *)(S0 + (uint32_t)x); \ + s1 = *(const __m128i *)(S1 + (x >> 32)); \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, s0); \ + X = _mm_xor_si128(X, s1); +#endif + +#define PWXFORM_ROUND \ + PWXFORM_SIMD(X0, x0, s00, s01) \ + PWXFORM_SIMD(X1, x1, s10, s11) \ + PWXFORM_SIMD(X2, x2, s20, s21) \ + PWXFORM_SIMD(X3, x3, s30, s31) + +#define PWXFORM \ + { \ + PWXFORM_X_T x0, x1, x2, x3; \ + __m128i s00, s01, s10, s11, s20, s21, s30, s31; \ + PWXFORM_ROUND PWXFORM_ROUND \ + PWXFORM_ROUND PWXFORM_ROUND \ + PWXFORM_ROUND PWXFORM_ROUND \ + } + +#define XOR4(in) \ + X0 = _mm_xor_si128(X0, (in)[0]); \ + X1 = _mm_xor_si128(X1, (in)[1]); \ + X2 = _mm_xor_si128(X2, (in)[2]); \ + X3 = _mm_xor_si128(X3, (in)[3]); + +#define OUT(out) \ + (out)[0] = X0; \ + (out)[1] = X1; \ + (out)[2] = X2; \ + (out)[3] = X3; + +/** + * blockmix_pwxform(Bin, Bout, r, S): + * Compute Bout = BlockMix_pwxform{salsa20/8, r, S}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. 
+ */ +static void +blockmix(const salsa20_blk_t *restrict Bin, salsa20_blk_t *restrict Bout, + size_t r, const __m128i *restrict S) +{ + const uint8_t * S0, * S1; + __m128i X0, X1, X2, X3; + size_t i; + + if (!S) { + blockmix_salsa8(Bin, Bout, r); + return; + } + + S0 = (const uint8_t *)S; + S1 = (const uint8_t *)S + S_SIZE_ALL / 2; + + /* Convert 128-byte blocks to 64-byte blocks */ + r *= 2; + + r--; + PREFETCH(&Bin[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + } + PREFETCH_OUT(&Bout[r], _MM_HINT_T0) + + /* X <-- B_{r1 - 1} */ + X0 = Bin[r].q[0]; + X1 = Bin[r].q[1]; + X2 = Bin[r].q[2]; + X3 = Bin[r].q[3]; + + /* for i = 0 to r1 - 1 do */ + for (i = 0; i < r; i++) { + /* X <-- H'(X \xor B_i) */ + XOR4(Bin[i].q) + PWXFORM + /* B'_i <-- X */ + OUT(Bout[i].q) + } + + /* Last iteration of the loop above */ + XOR4(Bin[i].q) + PWXFORM + + /* B'_i <-- H(B'_i) */ + SALSA20_8(Bout[i].q) +} + +#define XOR4_2(in1, in2) \ + X0 = _mm_xor_si128((in1)[0], (in2)[0]); \ + X1 = _mm_xor_si128((in1)[1], (in2)[1]); \ + X2 = _mm_xor_si128((in1)[2], (in2)[2]); \ + X3 = _mm_xor_si128((in1)[3], (in2)[3]); + +static inline uint32_t +blockmix_salsa8_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, int Bin2_in_ROM) +{ + __m128i X0, X1, X2, X3; + size_t i; + + r--; + if (Bin2_in_ROM) { + PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_NTA) + PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i * 2], _MM_HINT_NTA) + PREFETCH(&Bin1[i * 2], _MM_HINT_T0) + PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_NTA) + PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) + } + PREFETCH(&Bin2[r * 2], _MM_HINT_T0) + } else { + PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i * 2], _MM_HINT_T0) + 
PREFETCH(&Bin1[i * 2], _MM_HINT_T0) + PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) + } + PREFETCH(&Bin2[r * 2], _MM_HINT_T0) + } + PREFETCH(&Bin1[r * 2], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) + + /* 1: X <-- B_{2r - 1} */ + XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[0].q) + SALSA20_8_XOR_MEM(Bin2[0].q, Bout[0].q) + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < r;) { + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2 + 1].q) + SALSA20_8_XOR_MEM(Bin2[i * 2 + 1].q, Bout[r + 1 + i].q) + + i++; + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2].q) + SALSA20_8_XOR_MEM(Bin2[i * 2].q, Bout[i].q) + } + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ + XOR4(Bin1[r * 2 + 1].q) + SALSA20_8_XOR_MEM(Bin2[r * 2 + 1].q, Bout[r * 2 + 1].q) + + return _mm_cvtsi128_si32(X0); +} + +static uint32_t +blockmix_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, int Bin2_in_ROM, const __m128i *restrict S) +{ + const uint8_t * S0, * S1; + __m128i X0, X1, X2, X3; + size_t i; + + if (!S) + return blockmix_salsa8_xor(Bin1, Bin2, Bout, r, Bin2_in_ROM); + + S0 = (const uint8_t *)S; + S1 = (const uint8_t *)S + S_SIZE_ALL / 2; + + /* Convert 128-byte blocks to 64-byte blocks */ + r *= 2; + + r--; + if (Bin2_in_ROM) { + PREFETCH(&Bin2[r], _MM_HINT_NTA) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_NTA) + PREFETCH(&Bin1[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + } + } else { + PREFETCH(&Bin2[r], _MM_HINT_T0) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + PREFETCH(&Bin1[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + } + } + PREFETCH_OUT(&Bout[r], _MM_HINT_T0); + + /* X <-- B_{r1 - 1} */ + XOR4_2(Bin1[r].q, Bin2[r].q) + + /* for i = 0 to r1 - 1 do */ + for (i = 0; i < r; i++) { + /* X <-- H'(X \xor B_i) */ + XOR4(Bin1[i].q) + XOR4(Bin2[i].q) + PWXFORM + /* B'_i <-- X */ + OUT(Bout[i].q) + } + + /* Last iteration of the loop above */ + XOR4(Bin1[i].q) + XOR4(Bin2[i].q) + PWXFORM + + /* B'_i <-- H(B'_i) */ + SALSA20_8(Bout[i].q) + + return _mm_cvtsi128_si32(X0); +} + +#undef XOR4 +#define XOR4(in, out) \ + (out)[0] = Y0 = _mm_xor_si128((in)[0], (out)[0]); \ + (out)[1] = Y1 = _mm_xor_si128((in)[1], (out)[1]); \ + (out)[2] = Y2 = _mm_xor_si128((in)[2], (out)[2]); \ + (out)[3] = Y3 = _mm_xor_si128((in)[3], (out)[3]); + +static inline uint32_t +blockmix_salsa8_xor_save(const salsa20_blk_t *restrict Bin1, + salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r) +{ + __m128i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + size_t i; + + r--; + 
PREFETCH(&Bin2[r * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[r * 2 + 1], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i * 2], _MM_HINT_T0) + PREFETCH(&Bin1[i * 2], _MM_HINT_T0) + PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0) + PREFETCH(&Bin1[i * 2 + 1], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r + 1 + i], _MM_HINT_T0) + } + PREFETCH(&Bin2[r * 2], _MM_HINT_T0) + PREFETCH(&Bin1[r * 2], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r], _MM_HINT_T0) + PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0) + + /* 1: X <-- B_{2r - 1} */ + XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[0].q, Bin2[0].q) + SALSA20_8_XOR_REG(Bout[0].q) + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < r;) { + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2 + 1].q, Bin2[i * 2 + 1].q) + SALSA20_8_XOR_REG(Bout[r + 1 + i].q) + + i++; + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + XOR4(Bin1[i * 2].q, Bin2[i * 2].q) + SALSA20_8_XOR_REG(Bout[i].q) + } + + /* 3: X <-- H(X \xor B_i) */ + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ + XOR4(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q) + SALSA20_8_XOR_REG(Bout[r * 2 + 1].q) + + return _mm_cvtsi128_si32(X0); +} + +#define XOR4_Y \ + X0 = _mm_xor_si128(X0, Y0); \ + X1 = _mm_xor_si128(X1, Y1); \ + X2 = _mm_xor_si128(X2, Y2); \ + X3 = _mm_xor_si128(X3, Y3); + +static uint32_t +blockmix_xor_save(const salsa20_blk_t *restrict Bin1, + salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, const __m128i *restrict S) +{ + const uint8_t * S0, * S1; + __m128i X0, X1, X2, X3, Y0, Y1, Y2, Y3; + size_t i; + + if (!S) + return blockmix_salsa8_xor_save(Bin1, Bin2, Bout, r); + + S0 = (const uint8_t *)S; + S1 = (const uint8_t *)S + S_SIZE_ALL / 2; + + /* Convert 128-byte blocks to 64-byte blocks */ + r *= 2; + + r--; + PREFETCH(&Bin2[r], _MM_HINT_T0) + PREFETCH(&Bin1[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + PREFETCH(&Bin1[i], _MM_HINT_T0) + PREFETCH_OUT(&Bout[i], _MM_HINT_T0) + } + PREFETCH_OUT(&Bout[r], _MM_HINT_T0); + + /* X <-- B_{r1 - 1} */ + XOR4_2(Bin1[r].q, Bin2[r].q) + + /* for i = 0 to r1 - 1 do */ + for (i = 0; i < r; i++) { + XOR4(Bin1[i].q, Bin2[i].q) + /* X <-- H'(X \xor B_i) */ + XOR4_Y + PWXFORM + /* B'_i <-- X */ + OUT(Bout[i].q) + } + + /* Last iteration of the loop above */ + XOR4(Bin1[i].q, Bin2[i].q) + XOR4_Y + PWXFORM + + /* B'_i <-- H(B'_i) */ + SALSA20_8(Bout[i].q) + + return _mm_cvtsi128_si32(X0); +} + +#undef ARX +#undef SALSA20_2ROUNDS +#undef SALSA20_8 +#undef SALSA20_8_XOR_ANY +#undef SALSA20_8_XOR_MEM +#undef SALSA20_8_XOR_REG +#undef PWXFORM_SIMD_1 +#undef PWXFORM_SIMD_2 +#undef PWXFORM_ROUND +#undef PWXFORM +#undef OUT +#undef XOR4 +#undef XOR4_2 +#undef XOR4_Y + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static inline uint32_t +integerify(const salsa20_blk_t * B, size_t r) +{ + return B[2 * r - 1].w[0]; +} + +/** + * smix1(B, r, N, flags, V, NROM, shared, XY, S): + * Compute first loop of B = SMix_r(B, N). 
The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 128r bytes in length. The value N must be even and no + * smaller than 2. The array V must be aligned to a multiple of 64 bytes, and + * arrays B and XY to a multiple of at least 16 bytes (aligning them to 64 + * bytes as well saves cache lines, but might result in cache bank conflicts). + */ +static void +smix1(uint8_t * B, size_t r, uint32_t N, yescrypt_flags_t flags, + salsa20_blk_t * V, uint32_t NROM, const yescrypt_shared_t * shared, + salsa20_blk_t * XY, void * S) +{ + const salsa20_blk_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1; + size_t s = 2 * r; + salsa20_blk_t * X = V, * Y; + uint32_t i, j; + size_t k; + + /* 1: X <-- B */ + /* 3: V_i <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); + } + } + + if (NROM && (VROM_mask & 1)) { + uint32_t n; + salsa20_blk_t * V_n; + const salsa20_blk_t * V_j; + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[s]; + blockmix(X, Y, r, S); + + X = &V[2 * s]; + if ((1 & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + V_j = &VROM[j * s]; + + /* X <-- H(X \xor VROM_j) */ + j = blockmix_xor(Y, V_j, X, r, 1, S); + } else { + /* X <-- H(X) */ + blockmix(Y, X, r, S); + j = integerify(X, r); + } + + for (n = 2; n < N; n <<= 1) { + uint32_t m = (n < N / 2) ? 
n : (N - 1 - n); + + V_n = &V[n * s]; + + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < m; i += 2) { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i - 1; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V_n[i * s]; + j = blockmix_xor(X, V_j, Y, r, 0, S); + + if (((n + i) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + V_j = &VROM[j * s]; + } else { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i; + V_j = &V[j * s]; + } + + /* X <-- H(X \xor VROM_j) */ + X = &V_n[(i + 1) * s]; + j = blockmix_xor(Y, V_j, X, r, 1, S); + } + } + + n >>= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 2 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[(N - 1) * s]; + j = blockmix_xor(X, V_j, Y, r, 0, S); + + if (((N - 1) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + V_j = &VROM[j * s]; + } else { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 1 - n; + V_j = &V[j * s]; + } + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + X = XY; + blockmix_xor(Y, V_j, X, r, 1, S); + } else if (flags & YESCRYPT_RW) { + uint32_t n; + salsa20_blk_t * V_n, * V_j; + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[s]; + blockmix(X, Y, r, S); + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V[2 * s]; + blockmix(Y, X, r, S); + j = integerify(X, r); + + for (n = 2; n < N; n <<= 1) { + uint32_t m = (n < N / 2) ? 
n : (N - 1 - n); + + V_n = &V[n * s]; + + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < m; i += 2) { + Y = &V_n[i * s]; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i - 1; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + j = blockmix_xor(X, V_j, Y, r, 0, S); + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V_n[(i + 1) * s]; + j = blockmix_xor(Y, V_j, X, r, 0, S); + } + } + + n >>= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 2 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[(N - 1) * s]; + j = blockmix_xor(X, V_j, Y, r, 0, S); + + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += N - 1 - n; + V_j = &V[j * s]; + + /* X <-- X \xor V_j */ + /* 4: X <-- H(X) */ + X = XY; + blockmix_xor(Y, V_j, X, r, 0, S); + } else { + /* 2: for i = 0 to N - 1 do */ + for (i = 1; i < N - 1; i += 2) { + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[i * s]; + blockmix(X, Y, r, S); + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + X = &V[(i + 1) * s]; + blockmix(Y, X, r, S); + } + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + Y = &V[i * s]; + blockmix(X, Y, r, S); + + /* 4: X <-- H(X) */ + X = XY; + blockmix(Y, X, r, S); + } + + /* B' <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); + } + } +} + +/** + * smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r bytes in length. The value N must be a power of 2 + * greater than 1. The value Nloop must be even. 
The array V must be aligned + * to a multiple of 64 bytes, and arrays B and XY to a multiple of at least 16 + * bytes (aligning them to 64 bytes as well saves cache lines, but might result + * in cache bank conflicts). + */ +static void +smix2(uint8_t * B, size_t r, uint32_t N, uint64_t Nloop, + yescrypt_flags_t flags, salsa20_blk_t * V, uint32_t NROM, + const yescrypt_shared_t * shared, salsa20_blk_t * XY, void * S) +{ + const salsa20_blk_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1; + size_t s = 2 * r; + salsa20_blk_t * X = XY, * Y = &XY[s]; + uint64_t i; + uint32_t j; + size_t k; + + if (Nloop == 0) + return; + + /* X <-- B' */ + /* 3: V_i <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + X[k].w[i] = le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]); + } + } + + i = Nloop / 2; + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + +/* + * Normally, NROM implies YESCRYPT_RW, but we check for these separately + * because YESCRYPT_PARALLEL_SMIX resets YESCRYPT_RW for the smix2() calls + * operating on the entire V. 
+ */ + if (NROM && (flags & YESCRYPT_RW)) { + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor_save(X, V_j, Y, r, S); + + if (((i + 1) & VROM_mask) == 1) { + const salsa20_blk_t * VROM_j; + + j &= NROM - 1; + VROM_j = &VROM[j * s]; + + /* X <-- H(X \xor VROM_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, VROM_j, X, r, 1, S); + } else { + j &= N - 1; + V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor_save(Y, V_j, X, r, S); + } + j &= N - 1; + V_j = &V[j * s]; + } + } else if (NROM) { + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + const salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* j <-- Integerify(X) mod NROM */ + j = blockmix_xor(X, V_j, Y, r, 0, S); + + if (((i + 1) & VROM_mask) == 1) { + j &= NROM - 1; + V_j = &VROM[j * s]; + } else { + j &= N - 1; + V_j = &V[j * s]; + } + + /* X <-- H(X \xor VROM_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, V_j, X, r, 1, S); + j &= N - 1; + V_j = &V[j * s]; + } + } else if (flags & YESCRYPT_RW) { + /* 6: for i = 0 to N - 1 do */ + do { + salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor_save(X, V_j, Y, r, S); + j &= N - 1; + V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* V_j <-- Xprev \xor V_j */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor_save(Y, V_j, X, r, S); + j &= N - 1; + } while (--i); + } else { + /* 6: for i = 0 to N - 1 do */ + do { + const salsa20_blk_t * V_j = &V[j * s]; + + /* 8: X <-- H(X \xor V_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(X, V_j, Y, r, 0, S); + j &= N - 1; + V_j = &V[j * s]; + + /* 8: X <-- 
H(X \xor V_j) */ + /* 7: j <-- Integerify(X) mod N */ + j = blockmix_xor(Y, V_j, X, r, 0, S); + j &= N - 1; + } while (--i); + } + + /* 10: B' <-- X */ + for (k = 0; k < 2 * r; k++) { + for (i = 0; i < 16; i++) { + le32enc(&B[(k * 16 + (i * 5 % 16)) * 4], X[k].w[i]); + } + } +} + +/** + * p2floor(x): + * Largest power of 2 not greater than argument. + */ +static uint64_t +p2floor(uint64_t x) +{ + uint64_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * smix(B, r, N, p, t, flags, V, NROM, shared, XY, S): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage XY + * must be 256r or 256rp bytes in length (the larger size is required with + * OpenMP-enabled builds). The value N must be a power of 2 greater than 1. + * The array V must be aligned to a multiple of 64 bytes, and arrays B and + * XY to a multiple of at least 16 bytes (aligning them to 64 bytes as well + * saves cache lines and helps avoid false sharing in OpenMP-enabled builds + * when p > 1, but it might also result in cache bank conflicts). 
+ */ +static void +smix(uint8_t * B, size_t r, uint32_t N, uint32_t p, uint32_t t, + yescrypt_flags_t flags, + salsa20_blk_t * V, uint32_t NROM, const yescrypt_shared_t * shared, + salsa20_blk_t * XY, void * S) +{ + size_t s = 2 * r; + uint32_t Nchunk = N / p; + uint64_t Nloop_all, Nloop_rw; + uint32_t i; + + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } else { + Nloop_all *= t - 1; + } + } else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) + Nloop_rw = Nloop_all; + else if (flags & YESCRYPT_RW) + Nloop_rw = Nloop_all / p; + + Nchunk &= ~(uint32_t)1; /* round down to even */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + Nloop_rw &= ~(uint64_t)1; /* round down to even */ + +#ifdef _OPENMP +#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, shared, XY, S, s, Nchunk, Nloop_all, Nloop_rw) + { +#pragma omp for +#endif + for (i = 0; i < p; i++) { + uint32_t Vchunk = i * Nchunk; + uint8_t * Bp = &B[128 * r * i]; + salsa20_blk_t * Vp = &V[Vchunk * s]; +#ifdef _OPENMP + salsa20_blk_t * XYp = &XY[i * (2 * s)]; +#else + salsa20_blk_t * XYp = XY; +#endif + uint32_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + void * Sp = S ? ((uint8_t *)S + i * S_SIZE_ALL) : S; + if (Sp) + smix1(Bp, 1, S_SIZE_ALL / 128, + flags & ~YESCRYPT_PWXFORM, + Sp, NROM, shared, XYp, NULL); + if (!(flags & __YESCRYPT_INIT_SHARED_2)) + smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, + NROM, shared, XYp, Sp); + } + + if (Nloop_all > Nloop_rw) { +#ifdef _OPENMP +#pragma omp for +#endif + for (i = 0; i < p; i++) { + uint8_t * Bp = &B[128 * r * i]; +#ifdef _OPENMP + salsa20_blk_t * XYp = &XY[i * (2 * s)]; +#else + salsa20_blk_t * XYp = XY; +#endif + void * Sp = S ? 
((uint8_t *)S + i * S_SIZE_ALL) : S; + smix2(Bp, r, N, Nloop_all - Nloop_rw, + flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); + } + } +#ifdef _OPENMP + } +#endif +} + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters r, p, and buflen must satisfy + * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power + * of 2 greater than 1. (This optimized implementation currently additionally + * limits N to the range from 8 to 2^31, but other implementation might not.) + * + * t controls computation time while not affecting peak memory usage. shared + * and flags may request special modes as described in yescrypt.h. local is + * the thread-local data structure, allowing to preserve and reuse a memory + * allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + */ +int +yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + size_t B_size, V_size, XY_size, need; + uint8_t * B, * S; + salsa20_blk_t * V, * XY; + uint8_t sha256[32]; + + /* + * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, + * so don't let it have side-effects. Without this adjustment, it'd + * enable the SHA-256 password pre-hashing and output post-hashing, + * because any deviation from classic scrypt implies those. 
+ */ + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (N > UINT32_MAX) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 7) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 7)) { + errno = EINVAL; + return -1; + } + if ((r > SIZE_MAX / 256 / p) || + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } +#ifdef _OPENMP + if (!(flags & YESCRYPT_PARALLEL_SMIX) && + (N > SIZE_MAX / 128 / (r * p))) { + errno = ENOMEM; + return -1; + } +#endif + if ((flags & YESCRYPT_PWXFORM) && +#ifndef _OPENMP + (flags & YESCRYPT_PARALLEL_SMIX) && +#endif + p > SIZE_MAX / S_SIZE_ALL) { + errno = ENOMEM; + return -1; + } + + NROM = 0; + if (shared->shared1.aligned) { + NROM = shared->shared1.aligned_size / ((size_t)128 * r); + if (NROM > UINT32_MAX) { + errno = EFBIG; + return -1; + } + if (((NROM & (NROM - 1)) != 0) || (NROM <= 7) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; +#ifdef _OPENMP + if (!(flags & YESCRYPT_PARALLEL_SMIX)) + V_size *= p; +#endif + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (salsa20_blk_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r; +#ifdef _OPENMP + XY_size *= p; +#endif + need += XY_size; + if (need < XY_size) { + 
errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_PWXFORM) { + size_t S_size = S_SIZE_ALL; +#ifdef _OPENMP + S_size *= p; +#else + if (flags & YESCRYPT_PARALLEL_SMIX) + S_size *= p; +#endif + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint8_t *)tmp.aligned; + XY = (salsa20_blk_t *)((uint8_t *)B + B_size); + } else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint8_t *)local->aligned; + V = (salsa20_blk_t *)((uint8_t *)B + B_size); + XY = (salsa20_blk_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_PWXFORM) + S = (uint8_t *)XY + XY_size; + + if (t || flags) { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, passwd, passwdlen); + SHA256_Final_Y(sha256, &ctx); + passwd = sha256; + passwdlen = sizeof(sha256); + } + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, B, B_size); + + if (t || flags) + memcpy(sha256, B, sizeof(sha256)); + + if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) { + smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); + } else { + uint32_t i; + + /* 2: for i = 0 to p - 1 do */ +#ifdef _OPENMP +#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, shared, XY, S) +#endif + for (i = 0; i < p; i++) { + /* 3: B_i <-- MF(B_i, N) */ +#ifdef _OPENMP + smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, + &V[(size_t)2 * r * i * N], + NROM, shared, + &XY[(size_t)4 * r * i], + S ? 
&S[S_SIZE_ALL * i] : S); +#else + smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, V, + NROM, shared, XY, S); +#endif + } + } + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, buf, buflen); + + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. + */ + if ((t || flags) && buflen == sizeof(sha256)) { + /* Compute ClientKey */ + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, buf, buflen); +#if 0 +/* Proper yescrypt */ + HMAC_SHA256_Update_Y(&ctx, "Client Key", 10); +#else +/* GlobalBoost-Y buggy yescrypt */ + HMAC_SHA256_Update_Y(&ctx, salt, saltlen); +#endif + HMAC_SHA256_Final_Y(sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, sha256, sizeof(sha256)); + SHA256_Final_Y(buf, &ctx); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} diff --git a/sph/yescrypt.h b/sph/yescrypt.h new file mode 100644 index 0000000000..651225833f --- /dev/null +++ b/sph/yescrypt.h @@ -0,0 +1,376 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013,2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _YESCRYPT_H_ +#define _YESCRYPT_H_ + +#include +#include /* for size_t */ +#include + +//#ifdef __cplusplus +//extern "C" { +//#endif + + +//extern void yescrypt_hash_sp(const unsigned char *input, unsigned char *output); +extern void yescrypt_hash(const unsigned char *input, unsigned char *output); + + + +/** + * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen) and write the result into buf. The parameters r, p, and buflen + * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N + * must be a power of 2 greater than 1. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as buf is local to the thread. + */ +extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __salt, size_t __saltlen, + uint64_t __N, uint32_t __r, uint32_t __p, + uint8_t * __buf, size_t __buflen); + +/** + * Internal type used by the memory allocator. Please do not use it directly. 
+ * Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since + * they might differ from each other in a future version. + */ +typedef struct { + void * base, * aligned; + size_t base_size, aligned_size; +} yescrypt_region_t; + +/** + * Types for shared (ROM) and thread-local (RAM) data structures. + */ +typedef yescrypt_region_t yescrypt_shared1_t; +typedef struct { + yescrypt_shared1_t shared1; + uint32_t mask1; +} yescrypt_shared_t; +typedef yescrypt_region_t yescrypt_local_t; + +/** + * Possible values for yescrypt_init_shared()'s flags argument. + */ +typedef enum { + YESCRYPT_SHARED_DEFAULTS = 0, + YESCRYPT_SHARED_PREALLOCATED = 0x100 +} yescrypt_init_shared_flags_t; + +/** + * Possible values for the flags argument of yescrypt_kdf(), + * yescrypt_gensalt_r(), yescrypt_gensalt(). These may be OR'ed together, + * except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive. + * Please refer to the description of yescrypt_kdf() below for the meaning of + * these flags. + */ +typedef enum { +/* public */ + YESCRYPT_WORM = 0, + YESCRYPT_RW = 1, + YESCRYPT_PARALLEL_SMIX = 2, + YESCRYPT_PWXFORM = 4, +/* private */ + __YESCRYPT_INIT_SHARED_1 = 0x10000, + __YESCRYPT_INIT_SHARED_2 = 0x20000, + __YESCRYPT_INIT_SHARED = 0x30000 +} yescrypt_flags_t; + +#define YESCRYPT_KNOWN_FLAGS \ + (YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM | \ + __YESCRYPT_INIT_SHARED) + +/** + * yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, mask, + * buf, buflen): + * Optionally allocate memory for and initialize the shared (ROM) data + * structure. The parameters N, r, and p must satisfy the same conditions as + * with crypto_scrypt(). param and paramlen specify a local parameter with + * which the ROM is seeded. If buf is not NULL, then it is used to return + * buflen bytes of message digest for the initialized ROM (the caller may use + * this to verify that the ROM has been computed in the same way that it was on + * a previous run). 
+ * + * Return 0 on success; or -1 on error. + * + * If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the + * ROM is assumed to have been preallocated by the caller, with + * shared->shared1.aligned being the start address of the ROM and + * shared->shared1.aligned_size being its size (which must be consistent with + * N, r, and p). This may be used e.g. when the ROM is to be placed in a SysV + * shared memory segment allocated by the caller. + * + * mask controls the frequency of ROM accesses by yescrypt_kdf(). Normally it + * should be set to 1, to interleave RAM and ROM accesses, which works well + * when both regions reside in the machine's RAM anyway. Other values may be + * used e.g. when the ROM is memory-mapped from a disk file. Recommended mask + * values are powers of 2 minus 1 or minus 2. Here's the effect of some mask + * values: + * mask value ROM accesses in SMix 1st loop ROM accesses in SMix 2nd loop + * 0 0 1/2 + * 1 1/2 1/2 + * 2 0 1/4 + * 3 1/4 1/4 + * 6 0 1/8 + * 7 1/8 1/8 + * 14 0 1/16 + * 15 1/16 1/16 + * 1022 0 1/1024 + * 1023 1/1024 1/1024 + * + * Actual computation of the ROM contents may be avoided, if you don't intend + * to use a ROM but need a dummy shared structure, by calling this function + * with NULL, 0, 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 for the + * arguments starting with param and on. + * + * MT-safe as long as shared is local to the thread. + */ +extern int yescrypt_init_shared(yescrypt_shared_t * __shared, + const uint8_t * __param, size_t __paramlen, + uint64_t __N, uint32_t __r, uint32_t __p, + yescrypt_init_shared_flags_t __flags, uint32_t __mask, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_free_shared(shared): + * Free memory that had been allocated with yescrypt_init_shared(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as shared is local to the thread. 
+ */ +extern int yescrypt_free_shared(yescrypt_shared_t * __shared); + +/** + * yescrypt_init_local(local): + * Initialize the thread-local (RAM) data structure. Actual memory allocation + * is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yescrypt_init_local(yescrypt_local_t * __local); + +/** + * yescrypt_free_local(local): + * Free memory that may have been allocated for an initialized thread-local + * (RAM) data structure. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yescrypt_free_local(yescrypt_local_t * __local); + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters N, r, p, and buflen must satisfy + * the same conditions as with crypto_scrypt(). t controls computation time + * while not affecting peak memory usage. shared and flags may request + * special modes as described below. local is the thread-local data + * structure, allowing to preserve and reuse a memory allocation across calls, + * thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + * + * t controls computation time. t = 0 is optimal in terms of achieving the + * highest area-time for ASIC attackers. Thus, higher computation time, if + * affordable, is best achieved by increasing N rather than by increasing t. + * However, if the higher memory usage (which goes along with higher N) is not + * affordable, or if fine-tuning of the time is needed (recall that N must be a + * power of 2), then t = 1 or above may be used to increase time while staying + * at the same peak memory usage. 
t = 1 increases the time by 25% and + * decreases the normalized area-time to 96% of optimal. (Of course, in + * absolute terms the area-time increases with higher t. It's just that it + * would increase slightly more with higher N*r rather than with higher t.) + * t = 2 increases the time by another 20% and decreases the normalized + * area-time to 89% of optimal. Thus, these two values are reasonable to use + * for fine-tuning. Values of t higher than 2 result in further increase in + * time while reducing the efficiency much further (e.g., down to around 50% of + * optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact + * numbers varying by the flags settings). + * + * Classic scrypt is available by setting t = 0 and flags to YESCRYPT_WORM and + * passing a dummy shared structure (see the description of + * yescrypt_init_shared() above for how to produce one). In this mode, the + * thread-local memory region (RAM) is first sequentially written to and then + * randomly read from. This algorithm is friendly towards time-memory + * tradeoffs (TMTO), available both to defenders (albeit not in this + * implementation) and to attackers. + * + * Setting YESCRYPT_RW adds extra random reads and writes to the thread-local + * memory region (RAM), which makes TMTO a lot less efficient. This may be + * used to slow down the kinds of attackers who would otherwise benefit from + * classic scrypt's efficient TMTO. Since classic scrypt's TMTO allows not + * only for the tradeoff, but also for a decrease of attacker's area-time (by + * up to a constant factor), setting YESCRYPT_RW substantially increases the + * cost of attacks in area-time terms as well. Yet another benefit of it is + * that optimal area-time is reached at an earlier time than with classic + * scrypt, and t = 0 actually corresponds to this earlier completion time, + * resulting in quicker hash computations (and thus in higher request rate + * capacity). 
Due to these properties, YESCRYPT_RW should almost always be + * set, except when compatibility with classic scrypt or TMTO-friendliness are + * desired. + * + * YESCRYPT_PARALLEL_SMIX moves parallelism that is present with p > 1 to a + * lower level as compared to where it is in classic scrypt. This reduces + * flexibility for efficient computation (for both attackers and defenders) by + * requiring that, short of resorting to TMTO, the full amount of memory be + * allocated as needed for the specified p, regardless of whether that + * parallelism is actually being fully made use of or not. (For comparison, a + * single instance of classic scrypt may be computed in less memory without any + * CPU time overhead, but in more real time, by not making full use of the + * parallelism.) This may be desirable when the defender has enough memory + * with sufficiently low latency and high bandwidth for efficient full parallel + * execution, yet the required memory size is high enough that some likely + * attackers might end up being forced to choose between using higher latency + * memory than they could use otherwise (waiting for data longer) or using TMTO + * (waiting for data more times per one hash computation). The area-time cost + * for other kinds of attackers (who would use the same memory type and TMTO + * factor or no TMTO either way) remains roughly the same, given the same + * running time for the defender. In the TMTO-friendly YESCRYPT_WORM mode, as + * long as the defender has enough memory that is just as fast as the smaller + * per-thread regions would be, doesn't expect to ever need greater + * flexibility (except possibly via TMTO), and doesn't need backwards + * compatibility with classic scrypt, there are no other serious drawbacks to + * this setting. In the YESCRYPT_RW mode, which is meant to discourage TMTO, + * this new approach to parallelization makes TMTO less inefficient. 
(This is + * an unfortunate side-effect of avoiding some random writes, as we have to in + * order to allow for parallel threads to access a common memory region without + * synchronization overhead.) Thus, in this mode this setting poses an extra + * tradeoff of its own (higher area-time cost for a subset of attackers vs. + * better TMTO resistance). Setting YESCRYPT_PARALLEL_SMIX also changes the + * way the running time is to be controlled from N*r*p (for classic scrypt) to + * N*r (in this modification). All of this applies only when p > 1. For + * p = 1, this setting is a no-op. + * + * Passing a real shared structure, with ROM contents previously computed by + * yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for + * the thread-local RAM region. In order to allow for initialization of the + * ROM to be split into a separate program, the shared->shared1.aligned and + * shared->shared1.aligned_size fields may be set by the caller of + * yescrypt_kdf() manually rather than with yescrypt_init_shared(). + * + * local must be initialized with yescrypt_init_local(). + * + * MT-safe as long as local and buf are local to the thread. + */ +extern int yescrypt_kdf(const yescrypt_shared_t * __shared, + yescrypt_local_t * __local, + const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __salt, size_t __saltlen, + uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t, + yescrypt_flags_t __flags, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen): + * Compute and encode an scrypt or enhanced scrypt hash of passwd given the + * parameters and salt value encoded in setting. If the shared structure is + * not dummy, a ROM is used and YESCRYPT_RW is required. Otherwise, whether to + * use the YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff + * discouraging modification) is determined by the setting string. 
shared and + * local must be initialized as described above for yescrypt_kdf(). buf must + * be large enough (as indicated by buflen) to hold the encoded hash string. + * + * Return the encoded hash string on success; or NULL on error. + * + * MT-safe as long as local and buf are local to the thread. + */ +extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared, + yescrypt_local_t * __local, + const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __setting, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt(passwd, setting): + * Compute and encode an scrypt or enhanced scrypt hash of passwd given the + * parameters and salt value encoded in setting. Whether to use the + * YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff + * discouraging modification) is determined by the setting string. + * + * Return the encoded hash string on success; or NULL on error. + * + * This is a crypt(3)-like interface, which is simpler to use than + * yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM, + * and it is slower than yescrypt_r() for repeated calls because it allocates + * and frees memory on each call. + * + * MT-unsafe. + */ +extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting); + +/** + * yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen): + * Generate a setting string for use with yescrypt_r() and yescrypt() by + * encoding into it the parameters N_log2 (which is to be set to base 2 + * logarithm of the desired value for N), r, p, flags, and a salt given by src + * (of srclen bytes). buf must be large enough (as indicated by buflen) to + * hold the setting string. + * + * Return the setting string on success; or NULL on error. + * + * MT-safe as long as buf is local to the thread. 
+ */ +extern uint8_t * yescrypt_gensalt_r( + uint32_t __N_log2, uint32_t __r, uint32_t __p, + yescrypt_flags_t __flags, + const uint8_t * __src, size_t __srclen, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_gensalt(N_log2, r, p, flags, src, srclen): + * Generate a setting string for use with yescrypt_r() and yescrypt(). This + * function is the same as yescrypt_gensalt_r() except that it uses a static + * buffer and thus is not MT-safe. + * + * Return the setting string on success; or NULL on error. + * + * MT-unsafe. + */ +extern uint8_t * yescrypt_gensalt( + uint32_t __N_log2, uint32_t __r, uint32_t __p, + yescrypt_flags_t __flags, + const uint8_t * __src, size_t __srclen); + +//#ifdef __cplusplus +//} +//#endif + +#endif /* !_YESCRYPT_H_ */ diff --git a/sph/yescryptcommon.c b/sph/yescryptcommon.c new file mode 100644 index 0000000000..e5d76eb436 --- /dev/null +++ b/sph/yescryptcommon.c @@ -0,0 +1,365 @@ +/*- + * Copyright 2013,2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include "sph/yescrypt.h" +#include +//#include + +#define BYTES2CHARS(bytes) \ + ((((bytes) * 8) + 5) / 6) + +#define HASH_SIZE 32 /* bytes */ +#define HASH_LEN BYTES2CHARS(HASH_SIZE) /* base-64 chars */ +#define YESCRYPT_FLAGS (YESCRYPT_RW | YESCRYPT_PWXFORM) +static const char * const itoa64 = + "./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + +static uint8_t * encode64_uint32(uint8_t * dst, size_t dstlen, + uint32_t src, uint32_t srcbits) +{ + uint32_t bit; + + for (bit = 0; bit < srcbits; bit += 6) { + if (dstlen < 1) + return NULL; + *dst++ = itoa64[src & 0x3f]; + dstlen--; + src >>= 6; + } + + return dst; +} + +static uint8_t * encode64(uint8_t * dst, size_t dstlen, + const uint8_t * src, size_t srclen) +{ + size_t i; + + for (i = 0; i < srclen; ) { + uint8_t * dnext; + uint32_t value = 0, bits = 0; + do { + value |= (uint32_t)src[i++] << bits; + bits += 8; + } while (bits < 24 && i < srclen); + dnext = encode64_uint32(dst, dstlen, value, bits); + if (!dnext) + return NULL; + dstlen -= dnext - dst; + dst = dnext; + } + + return dst; +} + +static int decode64_one(uint32_t * dst, uint8_t src) +{ + const char * ptr = strchr(itoa64, src); + if (ptr) { + *dst = ptr - itoa64; + return 0; + } + *dst = 0; + return -1; +} + +static const uint8_t * decode64_uint32(uint32_t * dst, uint32_t dstbits, + const uint8_t * src) +{ + uint32_t bit; + uint32_t value; + + value = 0; + for (bit = 0; bit < dstbits; bit += 6) { + uint32_t one; + if (decode64_one(&one, *src)) { + *dst = 0; + return NULL; + } + src++; + value |= one << bit; + } + + *dst = value; + return src; +} + +uint8_t * +yescrypt_r(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * setting, + uint8_t * buf, size_t buflen) +{ + uint8_t hash[HASH_SIZE]; + const uint8_t * src, * salt; + uint8_t * dst; + size_t prefixlen, saltlen, need; + uint8_t version; + uint64_t N; + uint32_t r, p; + 
yescrypt_flags_t flags = YESCRYPT_WORM; + fflush(stdout); + if (setting[0] != '$' || setting[1] != '7') + { + fflush(stdout); + return NULL; + } + fflush(stdout); + src = setting + 2; + fflush(stdout); + switch ((version = *src)) { + case '$': + fflush(stdout); + break; + case 'X': + src++; + flags = YESCRYPT_RW; + fflush(stdout); + break; + default: + { + fflush(stdout); + return NULL; + } + } + + fflush(stdout); + if (*src != '$') { + uint32_t decoded_flags; + if (decode64_one(&decoded_flags, *src)) + + { + fflush(stdout); + return NULL; + } + flags = decoded_flags; + if (*++src != '$') + { + fflush(stdout); + return NULL; + } + } + src++; + + { + uint32_t N_log2; + if (decode64_one(&N_log2, *src)) + { + return NULL; + } + src++; + N = (uint64_t)1 << N_log2; + } + + src = decode64_uint32(&r, 30, src); + if (!src) + { + return NULL; + } + + src = decode64_uint32(&p, 30, src); + if (!src) + { + return NULL; + } + + prefixlen = src - setting; + + salt = src; + src = (uint8_t *)strrchr((char *)salt, '$'); + if (src) + saltlen = src - salt; + else + saltlen = strlen((char *)salt); + + need = prefixlen + saltlen + 1 + HASH_LEN + 1; + if (need > buflen || need < saltlen) + + { + fflush(stdout); + return NULL; + } + +fflush(stdout); + if (yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + N, r, p, 0, flags, hash, sizeof(hash))) + { + fflush(stdout); + return NULL; + } + + dst = buf; + memcpy(dst, setting, prefixlen + saltlen); + dst += prefixlen + saltlen; + *dst++ = '$'; + + dst = encode64(dst, buflen - (dst - buf), hash, sizeof(hash)); + /* Could zeroize hash[] here, but yescrypt_kdf() doesn't zeroize its + * memory allocations yet anyway. 
*/ + if (!dst || dst >= buf + buflen) /* Can't happen */ + { + return NULL; + } + + *dst = 0; /* NUL termination */ + fflush(stdout); + return buf; +} + +uint8_t * +yescrypt(const uint8_t * passwd, const uint8_t * setting) +{ + static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1 + HASH_LEN + 1]; + yescrypt_shared_t shared; + yescrypt_local_t local; + uint8_t * retval; + if (yescrypt_init_shared(&shared, NULL, 0, + 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0)) + return NULL; + if (yescrypt_init_local(&local)) { + yescrypt_free_shared(&shared); + return NULL; + } + retval = yescrypt_r(&shared, &local, + passwd, 80, setting, buf, sizeof(buf)); + // printf("hashse='%s'\n", (char *)retval); + if (yescrypt_free_local(&local)) { + yescrypt_free_shared(&shared); + return NULL; + } + if (yescrypt_free_shared(&shared)) + return NULL; + return retval; + +} + +uint8_t * +yescrypt_gensalt_r(uint32_t N_log2, uint32_t r, uint32_t p, + yescrypt_flags_t flags, + const uint8_t * src, size_t srclen, + uint8_t * buf, size_t buflen) +{ + uint8_t * dst; + size_t prefixlen = 3 + 1 + 5 + 5; + size_t saltlen = BYTES2CHARS(srclen); + size_t need; + + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + if (flags) { + if (flags & ~0x3f) + return NULL; + + prefixlen++; + if (flags != YESCRYPT_RW) + prefixlen++; + } + + need = prefixlen + saltlen + 1; + if (need > buflen || need < saltlen || saltlen < srclen) + return NULL; + + if (N_log2 > 63 || ((uint64_t)r * (uint64_t)p >= (1U << 30))) + return NULL; + + dst = buf; + *dst++ = '$'; + *dst++ = '7'; + if (flags) { + *dst++ = 'X'; /* eXperimental, subject to change */ + if (flags != YESCRYPT_RW) + *dst++ = itoa64[flags]; + } + *dst++ = '$'; + + *dst++ = itoa64[N_log2]; + + dst = encode64_uint32(dst, buflen - (dst - buf), r, 30); + if (!dst) /* Can't happen */ + return NULL; + + dst = encode64_uint32(dst, buflen - (dst - buf), p, 30); + if (!dst) /* Can't happen */ + return NULL; + + dst = encode64(dst, buflen - (dst - buf), src, srclen); + 
if (!dst || dst >= buf + buflen) /* Can't happen */ + return NULL; + + *dst = 0; /* NUL termination */ + + return buf; +} + +uint8_t * +yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p, + yescrypt_flags_t flags, + const uint8_t * src, size_t srclen) +{ + static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1]; + return yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, + buf, sizeof(buf)); +} + +static int +yescrypt_bsty(const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p, + uint8_t * buf, size_t buflen) +{ + +#ifdef WIN32 + static __declspec(thread) int initialized = 0; + static __declspec(thread) yescrypt_shared_t shared; + static __declspec(thread) yescrypt_local_t local; +#else + static __thread int initialized = 0; + static __thread yescrypt_shared_t shared; + static __thread yescrypt_local_t local; +#endif + + int retval; + if (!initialized) { +/* "shared" could in fact be shared, but it's simpler to keep it private + * along with "local". It's dummy and tiny anyway. 
*/ + if (yescrypt_init_shared(&shared, NULL, 0, + 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0)) + return -1; + if (yescrypt_init_local(&local)) { + yescrypt_free_shared(&shared); + return -1; + } + initialized = 1; + } + retval = yescrypt_kdf(&shared, &local, + passwd, passwdlen, salt, saltlen, N, r, p, 0, YESCRYPT_FLAGS, + buf, buflen); + + return retval; +} + +void yescrypt_hash(const unsigned char *input, unsigned char *output) +{ + + yescrypt_bsty((const uint8_t *)input, 80, (const uint8_t *) input, 80, 2048, 8, 1, (uint8_t *)output, 32); +} diff --git a/stats.cpp b/stats.cpp index f5a2d8280e..e30b4015ce 100644 --- a/stats.cpp +++ b/stats.cpp @@ -25,7 +25,7 @@ extern int opt_statsavg; */ void stats_remember_speed(int thr_id, uint32_t hashcount, double hashrate, uint8_t found, uint32_t height) { - const uint8_t gpu = (uint8_t) device_map[thr_id]; + uint8_t gpu = (uint8_t) device_map[thr_id]; const uint64_t key = ((uid++ % UINT32_MAX) << 32) + gpu; stats_data data; // to enough hashes to give right stats @@ -61,7 +61,8 @@ void stats_remember_speed(int thr_id, uint32_t hashcount, double hashrate, uint8 */ double stats_get_speed(int thr_id, double def_speed) { - const uint64_t gpu = device_map[thr_id]; + uint64_t gpu = device_map[thr_id]; + const uint64_t keymsk = 0xffULL; // last u8 is the gpu double speed = 0.0; int records = 0; diff --git a/sysinfos.cpp b/sysinfos.cpp index 4515830be2..6639411f36 100644 --- a/sysinfos.cpp +++ b/sysinfos.cpp @@ -6,11 +6,11 @@ * tpruvot 2014 */ -#include +#include #include -#include -#include - +#include +#include +using namespace std; #include "miner.h" #ifndef WIN32 @@ -46,12 +46,11 @@ static uint32_t linux_cpufreq(int core) FILE *fd = fopen(CPUFREQ_PATH, "r"); uint32_t freq = 0; - if (!fd) - return freq; - - if (!fscanf(fd, "%d", &freq)) - return freq; - + if(!fd) + { + fscanf(fd, "%d", &freq); + fclose(fd); + } return freq; } diff --git a/uint256.h b/uint256.h deleted file mode 100644 index 2a252c94f3..0000000000 --- 
a/uint256.h +++ /dev/null @@ -1,784 +0,0 @@ -// Copyright (c) 2009-2010 Satoshi Nakamoto -// Copyright (c) 2009-2012 The Bitcoin developers -// Distributed under the MIT/X11 software license, see the accompanying -// file COPYING or http://www.opensource.org/licenses/mit-license.php. -#ifndef BITCOIN_UINT256_H -#define BITCOIN_UINT256_H - -#include -#include -#include -#include -#include -#include - -typedef long long int64; -typedef unsigned long long uint64; - - -inline int Testuint256AdHoc(std::vector vArg); - - - -/** Base class without constructors for uint256 and uint160. - * This makes the compiler let you use it in a union. - */ -template -class base_uint -{ -protected: - enum { WIDTH=BITS/32 }; - uint32_t pn[WIDTH]; -public: - - bool operator!() const - { - for (int i = 0; i < WIDTH; i++) - if (pn[i] != 0) - return false; - return true; - } - - const base_uint operator~() const - { - base_uint ret; - for (int i = 0; i < WIDTH; i++) - ret.pn[i] = ~pn[i]; - return ret; - } - - const base_uint operator-() const - { - base_uint ret; - for (int i = 0; i < WIDTH; i++) - ret.pn[i] = ~pn[i]; - ret++; - return ret; - } - - double getdouble() const - { - double ret = 0.0; - double fact = 1.0; - for (int i = 0; i < WIDTH; i++) { - ret += fact * pn[i]; - fact *= 4294967296.0; - } - return ret; - } - - base_uint& operator=(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - return *this; - } - - base_uint& operator^=(const base_uint& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] ^= b.pn[i]; - return *this; - } - - base_uint& operator&=(const base_uint& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] &= b.pn[i]; - return *this; - } - - base_uint& operator|=(const base_uint& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] |= b.pn[i]; - return *this; - } - - base_uint& operator^=(uint64 b) - { - pn[0] ^= (unsigned int)b; - pn[1] ^= (unsigned int)(b >> 32); - return *this; - } - - base_uint& 
operator|=(uint64 b) - { - pn[0] |= (unsigned int)b; - pn[1] |= (unsigned int)(b >> 32); - return *this; - } - - base_uint& operator<<=(unsigned int shift) - { - base_uint a(*this); - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - int k = shift / 32; - shift = shift % 32; - for (int i = 0; i < WIDTH; i++) - { - if (i+k+1 < WIDTH && shift != 0) - pn[i+k+1] |= (a.pn[i] >> (32-shift)); - if (i+k < WIDTH) - pn[i+k] |= (a.pn[i] << shift); - } - return *this; - } - - base_uint& operator>>=(unsigned int shift) - { - base_uint a(*this); - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - int k = shift / 32; - shift = shift % 32; - for (int i = 0; i < WIDTH; i++) - { - if (i-k-1 >= 0 && shift != 0) - pn[i-k-1] |= (a.pn[i] << (32-shift)); - if (i-k >= 0) - pn[i-k] |= (a.pn[i] >> shift); - } - return *this; - } - - base_uint& operator+=(const base_uint& b) - { - uint64 carry = 0; - for (int i = 0; i < WIDTH; i++) - { - uint64 n = carry + pn[i] + b.pn[i]; - pn[i] = n & 0xffffffff; - carry = n >> 32; - } - return *this; - } - - base_uint& operator-=(const base_uint& b) - { - *this += -b; - return *this; - } - - base_uint& operator+=(uint64 b64) - { - base_uint b; - b = b64; - *this += b; - return *this; - } - - base_uint& operator-=(uint64 b64) - { - base_uint b; - b = b64; - *this += -b; - return *this; - } - - - base_uint& operator++() - { - // prefix operator - int i = 0; - while (++pn[i] == 0 && i < WIDTH-1) - i++; - return *this; - } - - const base_uint operator++(int) - { - // postfix operator - const base_uint ret = *this; - ++(*this); - return ret; - } - - base_uint& operator--() - { - // prefix operator - int i = 0; - while (--pn[i] == -1 && i < WIDTH-1) - i++; - return *this; - } - - const base_uint operator--(int) - { - // postfix operator - const base_uint ret = *this; - --(*this); - return ret; - } - - - friend inline bool operator<(const base_uint& a, const base_uint& b) - { - for (int i = base_uint::WIDTH-1; i >= 0; i--) - { - if (a.pn[i] < b.pn[i]) - return true; - 
else if (a.pn[i] > b.pn[i]) - return false; - } - return false; - } - - friend inline bool operator<=(const base_uint& a, const base_uint& b) - { - for (int i = base_uint::WIDTH-1; i >= 0; i--) - { - if (a.pn[i] < b.pn[i]) - return true; - else if (a.pn[i] > b.pn[i]) - return false; - } - return true; - } - - friend inline bool operator>(const base_uint& a, const base_uint& b) - { - for (int i = base_uint::WIDTH-1; i >= 0; i--) - { - if (a.pn[i] > b.pn[i]) - return true; - else if (a.pn[i] < b.pn[i]) - return false; - } - return false; - } - - friend inline bool operator>=(const base_uint& a, const base_uint& b) - { - for (int i = base_uint::WIDTH-1; i >= 0; i--) - { - if (a.pn[i] > b.pn[i]) - return true; - else if (a.pn[i] < b.pn[i]) - return false; - } - return true; - } - - friend inline bool operator==(const base_uint& a, const base_uint& b) - { - for (int i = 0; i < base_uint::WIDTH; i++) - if (a.pn[i] != b.pn[i]) - return false; - return true; - } - - friend inline bool operator==(const base_uint& a, uint64 b) - { - if (a.pn[0] != (unsigned int)b) - return false; - if (a.pn[1] != (unsigned int)(b >> 32)) - return false; - for (int i = 2; i < base_uint::WIDTH; i++) - if (a.pn[i] != 0) - return false; - return true; - } - - friend inline bool operator!=(const base_uint& a, const base_uint& b) - { - return (!(a == b)); - } - - friend inline bool operator!=(const base_uint& a, uint64 b) - { - return (!(a == b)); - } - - - - std::string GetHex() const - { - char psz[sizeof(pn)*2 + 1]; - for (unsigned int i = 0; i < sizeof(pn); i++) - sprintf(psz + i*2, "%02x", ((unsigned char*)pn)[sizeof(pn) - i - 1]); - return std::string(psz, psz + sizeof(pn)*2); - } - - void SetHex(const char* psz) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - - // skip leading spaces - while (isspace(*psz)) - psz++; - - // skip 0x - if (psz[0] == '0' && tolower(psz[1]) == 'x') - psz += 2; - - // hex string to uint - static const unsigned char phexdigit[256] = { 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,1,2,3,4,5,6,7,8,9,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0 }; - const char* pbegin = psz; - while (phexdigit[(unsigned char)*psz] || *psz == '0') - psz++; - psz--; - unsigned char* p1 = (unsigned char*)pn; - unsigned char* pend = p1 + WIDTH * 4; - while (psz >= pbegin && p1 < pend) - { - *p1 = phexdigit[(unsigned char)*psz--]; - if (psz >= pbegin) - { - *p1 |= (phexdigit[(unsigned char)*psz--] << 4); - p1++; - } - } - } - - void SetHex(const std::string& str) - { - SetHex(str.c_str()); - } - - std::string ToString() const - { - return (GetHex()); - } - - unsigned char* begin() - { - return (unsigned char*)&pn[0]; - } - - unsigned char* end() - { - return (unsigned char*)&pn[WIDTH]; - } - - const unsigned char* begin() const - { - return (unsigned char*)&pn[0]; - } - - const unsigned char* end() const - { - return (unsigned char*)&pn[WIDTH]; - } - - unsigned int size() const - { - return sizeof(pn); - } - - uint64 Get64(int n=0) const - { - return pn[2*n] | (uint64)pn[2*n+1] << 32; - } - -// unsigned int GetSerializeSize(int nType=0, int nVersion=PROTOCOL_VERSION) const - unsigned int GetSerializeSize(int nType, int nVersion) const - { - return sizeof(pn); - } - - template -// void Serialize(Stream& s, int nType=0, int nVersion=PROTOCOL_VERSION) const - void Serialize(Stream& s, int nType, int nVersion) const - { - s.write((char*)pn, sizeof(pn)); - } - - template -// void Unserialize(Stream& s, int nType=0, int nVersion=PROTOCOL_VERSION) - void Unserialize(Stream& s, int nType, int nVersion) - { - s.read((char*)pn, sizeof(pn)); - } - - - friend class uint160; - friend class uint256; - friend inline int Testuint256AdHoc(std::vector vArg); -}; - -typedef base_uint<160> base_uint160; -typedef base_uint<256> base_uint256; - - - -// -// uint160 and uint256 could be implemented as 
templates, but to keep -// compile errors and debugging cleaner, they're copy and pasted. -// - - - -////////////////////////////////////////////////////////////////////////////// -// -// uint160 -// - -/** 160-bit unsigned integer */ -class uint160 : public base_uint160 -{ -public: - typedef base_uint160 basetype; - - uint160() - { - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - } - - uint160(const basetype& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = b.pn[i]; - } - - uint160& operator=(const basetype& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = b.pn[i]; - return *this; - } - - uint160(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - } - - uint160& operator=(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - return *this; - } - - explicit uint160(const std::string& str) - { - SetHex(str); - } - - explicit uint160(const std::vector& vch) - { - if (vch.size() == sizeof(pn)) - memcpy(pn, &vch[0], sizeof(pn)); - else - *this = 0; - } -}; - -inline bool operator==(const uint160& a, uint64 b) { return (base_uint160)a == b; } -inline bool operator!=(const uint160& a, uint64 b) { return (base_uint160)a != b; } -inline const uint160 operator<<(const base_uint160& a, unsigned int shift) { return uint160(a) <<= shift; } -inline const uint160 operator>>(const base_uint160& a, unsigned int shift) { return uint160(a) >>= shift; } -inline const uint160 operator<<(const uint160& a, unsigned int shift) { return uint160(a) <<= shift; } -inline const uint160 operator>>(const uint160& a, unsigned int shift) { return uint160(a) >>= shift; } - -inline const uint160 operator^(const base_uint160& a, const base_uint160& b) { return uint160(a) ^= b; } -inline const uint160 operator&(const base_uint160& a, const base_uint160& b) { return uint160(a) &= b; } -inline const uint160 operator|(const base_uint160& a, const 
base_uint160& b) { return uint160(a) |= b; } -inline const uint160 operator+(const base_uint160& a, const base_uint160& b) { return uint160(a) += b; } -inline const uint160 operator-(const base_uint160& a, const base_uint160& b) { return uint160(a) -= b; } - -inline bool operator<(const base_uint160& a, const uint160& b) { return (base_uint160)a < (base_uint160)b; } -inline bool operator<=(const base_uint160& a, const uint160& b) { return (base_uint160)a <= (base_uint160)b; } -inline bool operator>(const base_uint160& a, const uint160& b) { return (base_uint160)a > (base_uint160)b; } -inline bool operator>=(const base_uint160& a, const uint160& b) { return (base_uint160)a >= (base_uint160)b; } -inline bool operator==(const base_uint160& a, const uint160& b) { return (base_uint160)a == (base_uint160)b; } -inline bool operator!=(const base_uint160& a, const uint160& b) { return (base_uint160)a != (base_uint160)b; } -inline const uint160 operator^(const base_uint160& a, const uint160& b) { return (base_uint160)a ^ (base_uint160)b; } -inline const uint160 operator&(const base_uint160& a, const uint160& b) { return (base_uint160)a & (base_uint160)b; } -inline const uint160 operator|(const base_uint160& a, const uint160& b) { return (base_uint160)a | (base_uint160)b; } -inline const uint160 operator+(const base_uint160& a, const uint160& b) { return (base_uint160)a + (base_uint160)b; } -inline const uint160 operator-(const base_uint160& a, const uint160& b) { return (base_uint160)a - (base_uint160)b; } - -inline bool operator<(const uint160& a, const base_uint160& b) { return (base_uint160)a < (base_uint160)b; } -inline bool operator<=(const uint160& a, const base_uint160& b) { return (base_uint160)a <= (base_uint160)b; } -inline bool operator>(const uint160& a, const base_uint160& b) { return (base_uint160)a > (base_uint160)b; } -inline bool operator>=(const uint160& a, const base_uint160& b) { return (base_uint160)a >= (base_uint160)b; } -inline bool operator==(const 
uint160& a, const base_uint160& b) { return (base_uint160)a == (base_uint160)b; } -inline bool operator!=(const uint160& a, const base_uint160& b) { return (base_uint160)a != (base_uint160)b; } -inline const uint160 operator^(const uint160& a, const base_uint160& b) { return (base_uint160)a ^ (base_uint160)b; } -inline const uint160 operator&(const uint160& a, const base_uint160& b) { return (base_uint160)a & (base_uint160)b; } -inline const uint160 operator|(const uint160& a, const base_uint160& b) { return (base_uint160)a | (base_uint160)b; } -inline const uint160 operator+(const uint160& a, const base_uint160& b) { return (base_uint160)a + (base_uint160)b; } -inline const uint160 operator-(const uint160& a, const base_uint160& b) { return (base_uint160)a - (base_uint160)b; } - -inline bool operator<(const uint160& a, const uint160& b) { return (base_uint160)a < (base_uint160)b; } -inline bool operator<=(const uint160& a, const uint160& b) { return (base_uint160)a <= (base_uint160)b; } -inline bool operator>(const uint160& a, const uint160& b) { return (base_uint160)a > (base_uint160)b; } -inline bool operator>=(const uint160& a, const uint160& b) { return (base_uint160)a >= (base_uint160)b; } -inline bool operator==(const uint160& a, const uint160& b) { return (base_uint160)a == (base_uint160)b; } -inline bool operator!=(const uint160& a, const uint160& b) { return (base_uint160)a != (base_uint160)b; } -inline const uint160 operator^(const uint160& a, const uint160& b) { return (base_uint160)a ^ (base_uint160)b; } -inline const uint160 operator&(const uint160& a, const uint160& b) { return (base_uint160)a & (base_uint160)b; } -inline const uint160 operator|(const uint160& a, const uint160& b) { return (base_uint160)a | (base_uint160)b; } -inline const uint160 operator+(const uint160& a, const uint160& b) { return (base_uint160)a + (base_uint160)b; } -inline const uint160 operator-(const uint160& a, const uint160& b) { return (base_uint160)a - (base_uint160)b; } 
- - - - - - -////////////////////////////////////////////////////////////////////////////// -// -// uint256 -// - -/** 256-bit unsigned integer */ -class uint256 : public base_uint256 -{ -public: - typedef base_uint256 basetype; - - uint256() - { - for (int i = 0; i < WIDTH; i++) - pn[i] = 0; - } - - uint256(const basetype& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = b.pn[i]; - } - - uint256& operator=(const basetype& b) - { - for (int i = 0; i < WIDTH; i++) - pn[i] = b.pn[i]; - return *this; - } - - uint256(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - } - - uint256& operator=(uint64 b) - { - pn[0] = (unsigned int)b; - pn[1] = (unsigned int)(b >> 32); - for (int i = 2; i < WIDTH; i++) - pn[i] = 0; - return *this; - } - - explicit uint256(const std::string& str) - { - SetHex(str); - } - - explicit uint256(const std::vector& vch) - { - if (vch.size() == sizeof(pn)) - memcpy(pn, &vch[0], sizeof(pn)); - else - *this = 0; - } -}; - -inline bool operator==(const uint256& a, uint64 b) { return (base_uint256)a == b; } -inline bool operator!=(const uint256& a, uint64 b) { return (base_uint256)a != b; } -inline const uint256 operator<<(const base_uint256& a, unsigned int shift) { return uint256(a) <<= shift; } -inline const uint256 operator>>(const base_uint256& a, unsigned int shift) { return uint256(a) >>= shift; } -inline const uint256 operator<<(const uint256& a, unsigned int shift) { return uint256(a) <<= shift; } -inline const uint256 operator>>(const uint256& a, unsigned int shift) { return uint256(a) >>= shift; } - -inline const uint256 operator^(const base_uint256& a, const base_uint256& b) { return uint256(a) ^= b; } -inline const uint256 operator&(const base_uint256& a, const base_uint256& b) { return uint256(a) &= b; } -inline const uint256 operator|(const base_uint256& a, const base_uint256& b) { return uint256(a) |= b; } -inline const uint256 operator+(const base_uint256& a, 
const base_uint256& b) { return uint256(a) += b; } -inline const uint256 operator-(const base_uint256& a, const base_uint256& b) { return uint256(a) -= b; } - -inline bool operator<(const base_uint256& a, const uint256& b) { return (base_uint256)a < (base_uint256)b; } -inline bool operator<=(const base_uint256& a, const uint256& b) { return (base_uint256)a <= (base_uint256)b; } -inline bool operator>(const base_uint256& a, const uint256& b) { return (base_uint256)a > (base_uint256)b; } -inline bool operator>=(const base_uint256& a, const uint256& b) { return (base_uint256)a >= (base_uint256)b; } -inline bool operator==(const base_uint256& a, const uint256& b) { return (base_uint256)a == (base_uint256)b; } -inline bool operator!=(const base_uint256& a, const uint256& b) { return (base_uint256)a != (base_uint256)b; } -inline const uint256 operator^(const base_uint256& a, const uint256& b) { return (base_uint256)a ^ (base_uint256)b; } -inline const uint256 operator&(const base_uint256& a, const uint256& b) { return (base_uint256)a & (base_uint256)b; } -inline const uint256 operator|(const base_uint256& a, const uint256& b) { return (base_uint256)a | (base_uint256)b; } -inline const uint256 operator+(const base_uint256& a, const uint256& b) { return (base_uint256)a + (base_uint256)b; } -inline const uint256 operator-(const base_uint256& a, const uint256& b) { return (base_uint256)a - (base_uint256)b; } - -inline bool operator<(const uint256& a, const base_uint256& b) { return (base_uint256)a < (base_uint256)b; } -inline bool operator<=(const uint256& a, const base_uint256& b) { return (base_uint256)a <= (base_uint256)b; } -inline bool operator>(const uint256& a, const base_uint256& b) { return (base_uint256)a > (base_uint256)b; } -inline bool operator>=(const uint256& a, const base_uint256& b) { return (base_uint256)a >= (base_uint256)b; } -inline bool operator==(const uint256& a, const base_uint256& b) { return (base_uint256)a == (base_uint256)b; } -inline bool 
operator!=(const uint256& a, const base_uint256& b) { return (base_uint256)a != (base_uint256)b; } -inline const uint256 operator^(const uint256& a, const base_uint256& b) { return (base_uint256)a ^ (base_uint256)b; } -inline const uint256 operator&(const uint256& a, const base_uint256& b) { return (base_uint256)a & (base_uint256)b; } -inline const uint256 operator|(const uint256& a, const base_uint256& b) { return (base_uint256)a | (base_uint256)b; } -inline const uint256 operator+(const uint256& a, const base_uint256& b) { return (base_uint256)a + (base_uint256)b; } -inline const uint256 operator-(const uint256& a, const base_uint256& b) { return (base_uint256)a - (base_uint256)b; } - -inline bool operator<(const uint256& a, const uint256& b) { return (base_uint256)a < (base_uint256)b; } -inline bool operator<=(const uint256& a, const uint256& b) { return (base_uint256)a <= (base_uint256)b; } -inline bool operator>(const uint256& a, const uint256& b) { return (base_uint256)a > (base_uint256)b; } -inline bool operator>=(const uint256& a, const uint256& b) { return (base_uint256)a >= (base_uint256)b; } -inline bool operator==(const uint256& a, const uint256& b) { return (base_uint256)a == (base_uint256)b; } -inline bool operator!=(const uint256& a, const uint256& b) { return (base_uint256)a != (base_uint256)b; } -inline const uint256 operator^(const uint256& a, const uint256& b) { return (base_uint256)a ^ (base_uint256)b; } -inline const uint256 operator&(const uint256& a, const uint256& b) { return (base_uint256)a & (base_uint256)b; } -inline const uint256 operator|(const uint256& a, const uint256& b) { return (base_uint256)a | (base_uint256)b; } -inline const uint256 operator+(const uint256& a, const uint256& b) { return (base_uint256)a + (base_uint256)b; } -inline const uint256 operator-(const uint256& a, const uint256& b) { return (base_uint256)a - (base_uint256)b; } - - - - - - - - - - -#ifdef TEST_UINT256 - -inline int Testuint256AdHoc(std::vector vArg) -{ - 
uint256 g(0); - - - printf("%s\n", g.ToString().c_str()); - g--; printf("g--\n"); - printf("%s\n", g.ToString().c_str()); - g--; printf("g--\n"); - printf("%s\n", g.ToString().c_str()); - g++; printf("g++\n"); - printf("%s\n", g.ToString().c_str()); - g++; printf("g++\n"); - printf("%s\n", g.ToString().c_str()); - g++; printf("g++\n"); - printf("%s\n", g.ToString().c_str()); - g++; printf("g++\n"); - printf("%s\n", g.ToString().c_str()); - - - - uint256 a(7); - printf("a=7\n"); - printf("%s\n", a.ToString().c_str()); - - uint256 b; - printf("b undefined\n"); - printf("%s\n", b.ToString().c_str()); - int c = 3; - - a = c; - a.pn[3] = 15; - printf("%s\n", a.ToString().c_str()); - uint256 k(c); - - a = 5; - a.pn[3] = 15; - printf("%s\n", a.ToString().c_str()); - b = 1; - b <<= 52; - - a |= b; - - a ^= 0x500; - - printf("a %s\n", a.ToString().c_str()); - - a = a | b | (uint256)0x1000; - - - printf("a %s\n", a.ToString().c_str()); - printf("b %s\n", b.ToString().c_str()); - - a = 0xfffffffe; - a.pn[4] = 9; - - printf("%s\n", a.ToString().c_str()); - a++; - printf("%s\n", a.ToString().c_str()); - a++; - printf("%s\n", a.ToString().c_str()); - a++; - printf("%s\n", a.ToString().c_str()); - a++; - printf("%s\n", a.ToString().c_str()); - - a--; - printf("%s\n", a.ToString().c_str()); - a--; - printf("%s\n", a.ToString().c_str()); - a--; - printf("%s\n", a.ToString().c_str()); - uint256 d = a--; - printf("%s\n", d.ToString().c_str()); - printf("%s\n", a.ToString().c_str()); - a--; - printf("%s\n", a.ToString().c_str()); - a--; - printf("%s\n", a.ToString().c_str()); - - d = a; - - printf("%s\n", d.ToString().c_str()); - for (int i = uint256::WIDTH-1; i >= 0; i--) printf("%08x", d.pn[i]); printf("\n"); - - uint256 neg = d; - neg = ~neg; - printf("%s\n", neg.ToString().c_str()); - - - uint256 e = uint256("0xABCDEF123abcdef12345678909832180000011111111"); - printf("\n"); - printf("%s\n", e.ToString().c_str()); - - - printf("\n"); - uint256 x1 = 
uint256("0xABCDEF123abcdef12345678909832180000011111111"); - uint256 x2; - printf("%s\n", x1.ToString().c_str()); - for (int i = 0; i < 270; i += 4) - { - x2 = x1 << i; - printf("%s\n", x2.ToString().c_str()); - } - - printf("\n"); - printf("%s\n", x1.ToString().c_str()); - for (int i = 0; i < 270; i += 4) - { - x2 = x1; - x2 >>= i; - printf("%s\n", x2.ToString().c_str()); - } - - - for (int i = 0; i < 100; i++) - { - uint256 k = (~uint256(0) >> i); - printf("%s\n", k.ToString().c_str()); - } - - for (int i = 0; i < 100; i++) - { - uint256 k = (~uint256(0) << i); - printf("%s\n", k.ToString().c_str()); - } - - return (0); -} - -#endif - -#endif diff --git a/util.c b/util.c deleted file mode 100644 index 350665ff2d..0000000000 --- a/util.c +++ /dev/null @@ -1,1617 +0,0 @@ -/* - * Copyright 2010 Jeff Garzik - * Copyright 2012-2014 pooler - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. - */ - -#define _GNU_SOURCE -#include "cpuminer-config.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef WIN32 -#include "compat/winansi.h" -#include -#include -#else -#include -#include -#include -#include -#endif -#include "compat.h" -#include "miner.h" -#include "elist.h" - -struct data_buffer { - void *buf; - size_t len; -}; - -struct upload_buffer { - const void *buf; - size_t len; - size_t pos; -}; - -struct header_info { - char *lp_path; - char *reason; - char *stratum_url; -}; - -struct tq_ent { - void *data; - struct list_head q_node; -}; - -struct thread_q { - struct list_head q; - - bool frozen; - - pthread_mutex_t mutex; - pthread_cond_t cond; -}; - -void applog(int prio, const char *fmt, ...) 
-{ - va_list ap; - - va_start(ap, fmt); - -#ifdef HAVE_SYSLOG_H - if (use_syslog) { - va_list ap2; - char *buf; - int len; - - /* custom colors to syslog prio */ - if (prio > LOG_DEBUG) { - switch (prio) { - case LOG_BLUE: prio = LOG_NOTICE; break; - } - } - - va_copy(ap2, ap); - len = vsnprintf(NULL, 0, fmt, ap2) + 1; - va_end(ap2); - buf = alloca(len); - if (vsnprintf(buf, len, fmt, ap) >= 0) - syslog(prio, "%s", buf); - } -#else - if (0) {} -#endif - else { - const char* color = ""; - char *f; - int len; - time_t now; - struct tm tm, *tm_p; - - time(&now); - - pthread_mutex_lock(&applog_lock); - tm_p = localtime(&now); - memcpy(&tm, tm_p, sizeof(tm)); - pthread_mutex_unlock(&applog_lock); - - switch (prio) { - case LOG_ERR: color = CL_RED; break; - case LOG_WARNING: color = CL_YLW; break; - case LOG_NOTICE: color = CL_WHT; break; - case LOG_INFO: color = ""; break; - case LOG_DEBUG: color = CL_GRY; break; - - case LOG_BLUE: - prio = LOG_NOTICE; - color = CL_CYN; - break; - } - if (!use_colors) - color = ""; - - len = 40 + (int) strlen(fmt) + 2; - f = (char*) alloca(len); - sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d]%s %s%s\n", - tm.tm_year + 1900, - tm.tm_mon + 1, - tm.tm_mday, - tm.tm_hour, - tm.tm_min, - tm.tm_sec, - color, - fmt, - use_colors ? 
CL_N : "" - ); - pthread_mutex_lock(&applog_lock); - vfprintf(stderr, f, ap); /* atomic write to stderr */ - fflush(stderr); - pthread_mutex_unlock(&applog_lock); - } - va_end(ap); -} - -static void databuf_free(struct data_buffer *db) -{ - if (!db) - return; - - free(db->buf); - - memset(db, 0, sizeof(*db)); -} - -static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, - void *user_data) -{ - struct data_buffer *db = (struct data_buffer *)user_data; - size_t len = size * nmemb; - size_t oldlen, newlen; - void *newmem; - static const unsigned char zero = 0; - - oldlen = db->len; - newlen = oldlen + len; - - newmem = realloc(db->buf, newlen + 1); - if (!newmem) - return 0; - - db->buf = newmem; - db->len = newlen; - memcpy((char*)db->buf + oldlen, ptr, len); - memcpy((char*)db->buf + newlen, &zero, 1); /* null terminate */ - - return len; -} - -static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb, - void *user_data) -{ - struct upload_buffer *ub = (struct upload_buffer *)user_data; - unsigned int len = (unsigned int)(size * nmemb); - - if (len > ub->len - ub->pos) - len = (unsigned int)(ub->len - ub->pos); - - if (len) { - memcpy(ptr, (char*)ub->buf + ub->pos, len); - ub->pos += len; - } - - return len; -} - -#if LIBCURL_VERSION_NUM >= 0x071200 -static int seek_data_cb(void *user_data, curl_off_t offset, int origin) -{ - struct upload_buffer *ub = (struct upload_buffer *)user_data; - - switch (origin) { - case SEEK_SET: - ub->pos = (size_t)offset; - break; - case SEEK_CUR: - ub->pos += (size_t)offset; - break; - case SEEK_END: - ub->pos = ub->len + (size_t)offset; - break; - default: - return 1; /* CURL_SEEKFUNC_FAIL */ - } - - return 0; /* CURL_SEEKFUNC_OK */ -} -#endif - -static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data) -{ - struct header_info *hi = (struct header_info *)user_data; - size_t remlen, slen, ptrlen = size * nmemb; - char *rem, *val = NULL, *key = NULL; - void *tmp; - - val = (char*)calloc(1, 
ptrlen); - key = (char*)calloc(1, ptrlen); - if (!key || !val) - goto out; - - tmp = memchr(ptr, ':', ptrlen); - if (!tmp || (tmp == ptr)) /* skip empty keys / blanks */ - goto out; - slen = (size_t)((char*)tmp - (char*)ptr); - if ((slen + 1) == ptrlen) /* skip key w/ no value */ - goto out; - memcpy(key, ptr, slen); /* store & nul term key */ - key[slen] = 0; - - rem = (char*)ptr + slen + 1; /* trim value's leading whitespace */ - remlen = ptrlen - slen - 1; - while ((remlen > 0) && (isspace(*rem))) { - remlen--; - rem++; - } - - memcpy(val, rem, remlen); /* store value, trim trailing ws */ - val[remlen] = 0; - while ((*val) && (isspace(val[strlen(val) - 1]))) { - val[strlen(val) - 1] = 0; - } - if (!*val) /* skip blank value */ - goto out; - - if (!strcasecmp("X-Long-Polling", key)) { - hi->lp_path = val; /* X-Mining-Extensions: longpoll */ - val = NULL; - } - - if (!strcasecmp("X-Reject-Reason", key)) { - hi->reason = val; /* X-Mining-Extensions: reject-reason */ - //applog(LOG_WARNING, "%s:%s", key, val); - val = NULL; - } - - if (!strcasecmp("X-Stratum", key)) { - hi->stratum_url = val; /* steal memory reference */ - val = NULL; - } - - if (!strcasecmp("X-Nonce-Range", key)) { - /* todo when available: X-Mining-Extensions: noncerange */ - } -out: - free(key); - free(val); - return ptrlen; -} - -#if LIBCURL_VERSION_NUM >= 0x070f06 -static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, - curlsocktype purpose) -{ - int keepalive = 1; - int tcp_keepcnt = 3; - int tcp_keepidle = 50; - int tcp_keepintvl = 50; -#ifdef WIN32 - DWORD outputBytes; -#endif - -#ifndef WIN32 - if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, - sizeof(keepalive)))) - return 1; -#ifdef __linux - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, - &tcp_keepcnt, sizeof(tcp_keepcnt)))) - return 1; - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, - &tcp_keepidle, sizeof(tcp_keepidle)))) - return 1; - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, - 
&tcp_keepintvl, sizeof(tcp_keepintvl)))) - return 1; -#endif /* __linux */ -#ifdef __APPLE_CC__ - if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, - &tcp_keepintvl, sizeof(tcp_keepintvl)))) - return 1; -#endif /* __APPLE_CC__ */ -#else /* WIN32 */ - struct tcp_keepalive vals; - vals.onoff = 1; - vals.keepalivetime = tcp_keepidle * 1000; - vals.keepaliveinterval = tcp_keepintvl * 1000; - if (unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals), - NULL, 0, &outputBytes, NULL, NULL))) - return 1; -#endif /* WIN32 */ - - return 0; -} -#endif - -json_t *json_rpc_call(CURL *curl, const char *url, - const char *userpass, const char *rpc_req, - bool longpoll_scan, bool longpoll, int *curl_err) -{ - json_t *val, *err_val, *res_val; - int rc; - struct data_buffer all_data = {0}; - struct upload_buffer upload_data; - json_error_t err; - struct curl_slist *headers = NULL; - char len_hdr[64], hashrate_hdr[64]; - char curl_err_str[CURL_ERROR_SIZE]; - long timeout = longpoll ? opt_timeout : 30; - struct header_info hi = {0}; - bool lp_scanning = longpoll_scan && !have_longpoll; - - /* it is assumed that 'curl' is freshly [re]initialized at this pt */ - - if (opt_protocol) - curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); - curl_easy_setopt(curl, CURLOPT_URL, url); - if (opt_cert) - curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert); - curl_easy_setopt(curl, CURLOPT_ENCODING, ""); - curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); - curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); - curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); - curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb); - curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data); -#if LIBCURL_VERSION_NUM >= 0x071200 - curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb); - curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data); -#endif - curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, 
curl_err_str); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb); - curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi); - if (opt_proxy) { - curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); - } - if (userpass) { - curl_easy_setopt(curl, CURLOPT_USERPWD, userpass); - curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC); - } -#if LIBCURL_VERSION_NUM >= 0x070f06 - if (longpoll) - curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); -#endif - curl_easy_setopt(curl, CURLOPT_POST, 1); - - if (opt_protocol) - applog(LOG_DEBUG, "JSON protocol request:\n%s", rpc_req); - - upload_data.buf = rpc_req; - upload_data.len = strlen(rpc_req); - upload_data.pos = 0; - sprintf(len_hdr, "Content-Length: %lu", (unsigned long) upload_data.len); - sprintf(hashrate_hdr, "X-Mining-Hashrate: %llu", (unsigned long long) global_hashrate); - - headers = curl_slist_append(headers, "Content-Type: application/json"); - headers = curl_slist_append(headers, len_hdr); - headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); - headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll noncerange reject-reason"); - headers = curl_slist_append(headers, hashrate_hdr); - headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/ - headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/ - - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); - - rc = curl_easy_perform(curl); - if (curl_err != NULL) - *curl_err = rc; - if (rc) { - if (!(longpoll && rc == CURLE_OPERATION_TIMEDOUT)) - applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); - goto err_out; - } - - /* If X-Stratum was found, activate Stratum */ - if (want_stratum && hi.stratum_url && - !strncasecmp(hi.stratum_url, "stratum+tcp://", 14) && - !(opt_proxy && opt_proxy_type == CURLPROXY_HTTP)) { - 
have_stratum = true; - tq_push(thr_info[stratum_thr_id].q, hi.stratum_url); - hi.stratum_url = NULL; - } - - /* If X-Long-Polling was found, activate long polling */ - if (lp_scanning && hi.lp_path && !have_stratum) { - have_longpoll = true; - tq_push(thr_info[longpoll_thr_id].q, hi.lp_path); - hi.lp_path = NULL; - } - - if (!all_data.buf) { - applog(LOG_ERR, "Empty data received in json_rpc_call."); - goto err_out; - } - - val = JSON_LOADS((const char*)all_data.buf, &err); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto err_out; - } - - if (opt_protocol) { - char *s = json_dumps(val, JSON_INDENT(3)); - applog(LOG_DEBUG, "JSON protocol response:\n%s\n", s); - free(s); - } - - /* JSON-RPC valid response returns a non-null 'result', - * and a null 'error'. */ - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || json_is_null(res_val) || - (err_val && !json_is_null(err_val))) { - char *s; - - if (err_val) - s = json_dumps(err_val, JSON_INDENT(3)); - else - s = strdup("(unknown reason)"); - - applog(LOG_ERR, "JSON-RPC call failed: %s", s); - - free(s); - - goto err_out; - } - - if (hi.reason) - json_object_set_new(val, "reject-reason", json_string(hi.reason)); - - databuf_free(&all_data); - curl_slist_free_all(headers); - curl_easy_reset(curl); - return val; - -err_out: - free(hi.lp_path); - free(hi.reason); - free(hi.stratum_url); - databuf_free(&all_data); - curl_slist_free_all(headers); - curl_easy_reset(curl); - return NULL; -} - -/** - * Unlike malloc, calloc set the memory to zero - */ -void *aligned_calloc(int size) -{ - const int ALIGN = 64; // cache line -#ifdef _MSC_VER - void* res = _aligned_malloc(size, ALIGN); - memset(res, 0, size); - return res; -#else - void *mem = calloc(1, size+ALIGN+sizeof(void*)); - void **ptr = (void**)((size_t)(mem+ALIGN+sizeof(void*)) & ~(ALIGN-1)); - ptr[-1] = mem; - return ptr; -#endif -} - -void aligned_free(void *ptr) -{ -#ifdef 
_MSC_VER - return _aligned_free(ptr); -#else - free(((void**)ptr)[-1]); -#endif -} - -void cbin2hex(char *out, const char *in, size_t len) -{ - if (out) { - unsigned int i; - for (i = 0; i < len; i++) - sprintf(out + (i * 2), "%02x", (uint8_t)in[i]); - } -} - -char *bin2hex(const unsigned char *in, size_t len) -{ - char *s = (char*)malloc((len * 2) + 1); - if (!s) - return NULL; - - cbin2hex(s, (const char *) in, len); - - return s; -} - -bool hex2bin(unsigned char *p, const char *hexstr, size_t len) -{ - char hex_byte[3]; - char *ep; - - hex_byte[2] = '\0'; - - while (*hexstr && len) { - if (!hexstr[1]) { - applog(LOG_ERR, "hex2bin str truncated"); - return false; - } - hex_byte[0] = hexstr[0]; - hex_byte[1] = hexstr[1]; - *p = (unsigned char) strtol(hex_byte, &ep, 16); - if (*ep) { - applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte); - return false; - } - p++; - hexstr += 2; - len--; - } - - return (len == 0 && *hexstr == 0) ? true : false; -} - -/* Subtract the `struct timeval' values X and Y, - storing the result in RESULT. - Return 1 if the difference is negative, otherwise 0. */ -int timeval_subtract(struct timeval *result, struct timeval *x, - struct timeval *y) -{ - /* Perform the carry for the later subtraction by updating Y. */ - if (x->tv_usec < y->tv_usec) { - int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; - y->tv_usec -= 1000000 * nsec; - y->tv_sec += nsec; - } - if (x->tv_usec - y->tv_usec > 1000000) { - int nsec = (x->tv_usec - y->tv_usec) / 1000000; - y->tv_usec += 1000000 * nsec; - y->tv_sec -= nsec; - } - - /* Compute the time remaining to wait. - * `tv_usec' is certainly positive. */ - result->tv_sec = x->tv_sec - y->tv_sec; - result->tv_usec = x->tv_usec - y->tv_usec; - - /* Return 1 if result is negative. 
*/ - return x->tv_sec < y->tv_sec; -} - -bool fulltest(const uint32_t *hash, const uint32_t *target) -{ - int i; - bool rc = true; - - for (i = 7; i >= 0; i--) { - if (hash[i] > target[i]) { - rc = false; - break; - } - if (hash[i] < target[i]) { - rc = true; - break; - } - if (hash[1] == target[1]) { - applog(LOG_NOTICE, "We found a close match!"); - } - } - - if (!rc && opt_debug) { - uint32_t hash_be[8], target_be[8]; - char *hash_str, *target_str; - - for (i = 0; i < 8; i++) { - be32enc(hash_be + i, hash[7 - i]); - be32enc(target_be + i, target[7 - i]); - } - hash_str = bin2hex((unsigned char *)hash_be, 32); - target_str = bin2hex((unsigned char *)target_be, 32); - - applog(LOG_DEBUG, "DEBUG: %s\nHash: %s\nTarget: %s", - rc ? "hash <= target" - : CL_YLW "hash > target (false positive)" CL_N, - hash_str, - target_str); - - free(hash_str); - free(target_str); - } - - return rc; -} - -void diff_to_target(uint32_t *target, double diff) -{ - uint64_t m; - int k; - - for (k = 6; k > 0 && diff > 1.0; k--) - diff /= 4294967296.0; - m = (uint64_t)(4294901760.0 / diff); - if (m == 0 && k == 6) - memset(target, 0xff, 32); - else { - memset(target, 0, 32); - target[k] = (uint32_t)m; - target[k + 1] = (uint32_t)(m >> 32); - } -} - -#ifdef WIN32 -#define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK) -#else -#define socket_blocks() (errno == EAGAIN || errno == EWOULDBLOCK) -#endif - -static bool send_line(curl_socket_t sock, char *s) -{ - ssize_t len, sent = 0; - - len = (ssize_t)strlen(s); - s[len++] = '\n'; - - while (len > 0) { - struct timeval timeout = {0, 0}; - ssize_t n; - fd_set wd; - - FD_ZERO(&wd); - FD_SET(sock, &wd); - if (select((int)sock + 1, NULL, &wd, NULL, &timeout) < 1) - return false; - n = send(sock, s + sent, len, 0); - if (n < 0) { - if (!socket_blocks()) - return false; - n = 0; - } - sent += n; - len -= n; - } - - return true; -} - -bool stratum_send_line(struct stratum_ctx *sctx, char *s) -{ - bool ret = false; - - if (opt_protocol) - 
applog(LOG_DEBUG, "> %s", s); - - pthread_mutex_lock(&sctx->sock_lock); - ret = send_line(sctx->sock, s); - pthread_mutex_unlock(&sctx->sock_lock); - - return ret; -} - -static bool socket_full(curl_socket_t sock, int timeout) -{ - struct timeval tv; - fd_set rd; - - FD_ZERO(&rd); - FD_SET(sock, &rd); - tv.tv_sec = timeout; - tv.tv_usec = 0; - if (select((int)sock + 1, &rd, NULL, NULL, &tv) > 0) - return true; - return false; -} - -bool stratum_socket_full(struct stratum_ctx *sctx, int timeout) -{ - return strlen(sctx->sockbuf) || socket_full(sctx->sock, timeout); -} - -#define RBUFSIZE 2048 -#define RECVSIZE (RBUFSIZE - 4) - -static void stratum_buffer_append(struct stratum_ctx *sctx, const char *s) -{ - size_t old, snew; - - old = strlen(sctx->sockbuf); - snew = old + strlen(s) + 1; - if (snew >= sctx->sockbuf_size) { - sctx->sockbuf_size = snew + (RBUFSIZE - (snew % RBUFSIZE)); - sctx->sockbuf = (char*)realloc(sctx->sockbuf, sctx->sockbuf_size); - } - strcpy(sctx->sockbuf + old, s); -} - -char *stratum_recv_line(struct stratum_ctx *sctx) -{ - ssize_t len, buflen; - char *tok, *sret = NULL; - - if (!strstr(sctx->sockbuf, "\n")) { - bool ret = true; - time_t rstart; - - time(&rstart); - if (!socket_full(sctx->sock, 60)) { - applog(LOG_ERR, "stratum_recv_line timed out"); - goto out; - } - do { - char s[RBUFSIZE]; - ssize_t n; - - memset(s, 0, RBUFSIZE); - n = recv(sctx->sock, s, RECVSIZE, 0); - if (!n) { - ret = false; - break; - } - if (n < 0) { - if (!socket_blocks() || !socket_full(sctx->sock, 1)) { - ret = false; - break; - } - } else - stratum_buffer_append(sctx, s); - } while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n")); - - if (!ret) { - applog(LOG_ERR, "stratum_recv_line failed"); - goto out; - } - } - - buflen = (ssize_t)strlen(sctx->sockbuf); - tok = strtok(sctx->sockbuf, "\n"); - if (!tok) { - applog(LOG_ERR, "stratum_recv_line failed to parse a newline-terminated string"); - goto out; - } - sret = strdup(tok); - len = 
(ssize_t)strlen(sret); - - if (buflen > len + 1) - memmove(sctx->sockbuf, sctx->sockbuf + len + 1, buflen - len + 1); - else - sctx->sockbuf[0] = '\0'; - -out: - if (sret && opt_protocol) - applog(LOG_DEBUG, "< %s", sret); - return sret; -} - -#if LIBCURL_VERSION_NUM >= 0x071101 -static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, - struct curl_sockaddr *addr) -{ - curl_socket_t *sock = (curl_socket_t *)clientp; - *sock = socket(addr->family, addr->socktype, addr->protocol); - return *sock; -} -#endif - -bool stratum_connect(struct stratum_ctx *sctx, const char *url) -{ - CURL *curl; - int rc; - - pthread_mutex_lock(&sctx->sock_lock); - if (sctx->curl) - curl_easy_cleanup(sctx->curl); - sctx->curl = curl_easy_init(); - if (!sctx->curl) { - applog(LOG_ERR, "CURL initialization failed"); - pthread_mutex_unlock(&sctx->sock_lock); - return false; - } - curl = sctx->curl; - if (!sctx->sockbuf) { - sctx->sockbuf = (char*)calloc(RBUFSIZE, 1); - sctx->sockbuf_size = RBUFSIZE; - } - sctx->sockbuf[0] = '\0'; - pthread_mutex_unlock(&sctx->sock_lock); - - if (url != sctx->url) { - free(sctx->url); - sctx->url = strdup(url); - } - free(sctx->curl_url); - sctx->curl_url = (char*)malloc(strlen(url)); - sprintf(sctx->curl_url, "http%s", strstr(url, "://")); - - if (opt_protocol) - curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); - curl_easy_setopt(curl, CURLOPT_URL, sctx->curl_url); - curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); - curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30); - curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str); - curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); - curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); - if (opt_proxy && opt_proxy_type != CURLPROXY_HTTP) { - curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); - } else if (getenv("http_proxy")) { - if (getenv("all_proxy")) - curl_easy_setopt(curl, CURLOPT_PROXY, getenv("all_proxy")); - else if 
(getenv("ALL_PROXY")) - curl_easy_setopt(curl, CURLOPT_PROXY, getenv("ALL_PROXY")); - else - curl_easy_setopt(curl, CURLOPT_PROXY, ""); - } -#if LIBCURL_VERSION_NUM >= 0x070f06 - curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); -#endif -#if LIBCURL_VERSION_NUM >= 0x071101 - curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb); - curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock); -#endif - curl_easy_setopt(curl, CURLOPT_CONNECT_ONLY, 1); - - rc = curl_easy_perform(curl); - if (rc) { - applog(LOG_ERR, "Stratum connection failed: %s", sctx->curl_err_str); - curl_easy_cleanup(curl); - sctx->curl = NULL; - return false; - } - -#if LIBCURL_VERSION_NUM < 0x071101 - /* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */ - curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock); -#endif - - return true; -} - -void stratum_disconnect(struct stratum_ctx *sctx) -{ - pthread_mutex_lock(&sctx->sock_lock); - if (sctx->curl) { - curl_easy_cleanup(sctx->curl); - sctx->curl = NULL; - sctx->sockbuf[0] = '\0'; - } - pthread_mutex_unlock(&sctx->sock_lock); -} - -static const char *get_stratum_session_id(json_t *val) -{ - json_t *arr_val; - int i, n; - - arr_val = json_array_get(val, 0); - if (!arr_val || !json_is_array(arr_val)) - return NULL; - n = json_array_size(arr_val); - for (i = 0; i < n; i++) { - const char *notify; - json_t *arr = json_array_get(arr_val, i); - - if (!arr || !json_is_array(arr)) - break; - notify = json_string_value(json_array_get(arr, 0)); - if (!notify) - continue; - if (!strcasecmp(notify, "mining.notify")) - return json_string_value(json_array_get(arr, 1)); - } - return NULL; -} - -bool stratum_subscribe(struct stratum_ctx *sctx) -{ - char *s, *sret = NULL; - const char *sid, *xnonce1; - int xn2_size; - json_t *val = NULL, *res_val, *err_val; - json_error_t err; - bool ret = false, retry = false; - -start: - s = (char*)malloc(128 + (sctx->session_id ? 
strlen(sctx->session_id) : 0)); - if (retry) - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}"); - else if (sctx->session_id) - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\", \"%s\"]}", sctx->session_id); - else - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\"]}"); - - if (!stratum_send_line(sctx, s)) - goto out; - - if (!socket_full(sctx->sock, 30)) { - applog(LOG_ERR, "stratum_subscribe timed out"); - goto out; - } - - sret = stratum_recv_line(sctx); - if (!sret) - goto out; - - val = JSON_LOADS(sret, &err); - free(sret); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || json_is_null(res_val) || - (err_val && !json_is_null(err_val))) { - if (opt_debug || retry) { - free(s); - if (err_val) - s = json_dumps(err_val, JSON_INDENT(3)); - else - s = strdup("(unknown reason)"); - applog(LOG_ERR, "JSON-RPC call failed: %s", s); - } - goto out; - } - - sid = get_stratum_session_id(res_val); - if (opt_debug && !sid) - applog(LOG_DEBUG, "Failed to get Stratum session id"); - xnonce1 = json_string_value(json_array_get(res_val, 1)); - if (!xnonce1) { - applog(LOG_ERR, "Failed to get extranonce1"); - goto out; - } - xn2_size = json_integer_value(json_array_get(res_val, 2)); - if (!xn2_size) { - applog(LOG_ERR, "Failed to get extranonce2_size"); - goto out; - } - - pthread_mutex_lock(&sctx->work_lock); - free(sctx->session_id); - free(sctx->xnonce1); - sctx->session_id = sid ? 
strdup(sid) : NULL; - sctx->xnonce1_size = strlen(xnonce1) / 2; - sctx->xnonce1 = (unsigned char*)malloc(sctx->xnonce1_size); - hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size); - sctx->xnonce2_size = xn2_size; - sctx->next_diff = 1.0; - pthread_mutex_unlock(&sctx->work_lock); - - if (opt_debug && sid) - applog(LOG_DEBUG, "Stratum session id: %s", sctx->session_id); - - ret = true; - -out: - free(s); - if (val) - json_decref(val); - - if (!ret) { - if (sret && !retry) { - retry = true; - goto start; - } - } - - return ret; -} - -bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass) -{ - json_t *val = NULL, *res_val, *err_val; - char *s, *sret; - json_error_t err; - bool ret = false; - - s = (char*)malloc(80 + strlen(user) + strlen(pass)); - sprintf(s, "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}", - user, pass); - - if (!stratum_send_line(sctx, s)) - goto out; - - while (1) { - sret = stratum_recv_line(sctx); - if (!sret) - goto out; - if (!stratum_handle_method(sctx, sret)) - break; - free(sret); - } - - val = JSON_LOADS(sret, &err); - free(sret); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || json_is_false(res_val) || - (err_val && !json_is_null(err_val))) { - applog(LOG_ERR, "Stratum authentication failed"); - goto out; - } - - ret = true; - -out: - free(s); - if (val) - json_decref(val); - - return ret; -} - -/** - * Extract bloc height L H... 
here len=3, height=0x1333e8 - * "...0000000000ffffffff2703e83313062f503253482f043d61105408" - */ -static uint32_t getblocheight(struct stratum_ctx *sctx) -{ - uint32_t height = 0; - uint8_t hlen = 0, *p, *m; - - // find 0xffff tag - p = (uint8_t*) sctx->job.coinbase + 32; - m = p + 128; - while (*p != 0xff && p < m) p++; - while (*p == 0xff && p < m) p++; - if (*(p-1) == 0xff && *(p-2) == 0xff) { - p++; hlen = *p; - p++; height = le16dec(p); - p += 2; - switch (hlen) { - case 4: - height += 0x10000UL * le16dec(p); - break; - case 3: - height += 0x10000UL * (*p); - break; - } - } - return height; -} - -static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) -{ - const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime, *nreward; - size_t coinb1_size, coinb2_size; - bool clean, ret = false; - int merkle_count, i; - json_t *merkle_arr; - unsigned char **merkle; - int ntime; - - job_id = json_string_value(json_array_get(params, 0)); - prevhash = json_string_value(json_array_get(params, 1)); - coinb1 = json_string_value(json_array_get(params, 2)); - coinb2 = json_string_value(json_array_get(params, 3)); - merkle_arr = json_array_get(params, 4); - if (!merkle_arr || !json_is_array(merkle_arr)) - goto out; - merkle_count = json_array_size(merkle_arr); - version = json_string_value(json_array_get(params, 5)); - nbits = json_string_value(json_array_get(params, 6)); - stime = json_string_value(json_array_get(params, 7)); - clean = json_is_true(json_array_get(params, 8)); - nreward = json_string_value(json_array_get(params, 9)); - - if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime || - strlen(prevhash) != 64 || strlen(version) != 8 || - strlen(nbits) != 8 || strlen(stime) != 8) { - applog(LOG_ERR, "Stratum notify: invalid parameters"); - goto out; - } - - /* store stratum server time diff */ - hex2bin((unsigned char *)&ntime, stime, 4); - ntime = swab32(ntime) - (uint32_t) time(0); - if (ntime > sctx->srvtime_diff) { 
- sctx->srvtime_diff = ntime; - if (!opt_quiet) - applog(LOG_DEBUG, "stratum time is at least %ds in the future", ntime); - } - - merkle = (unsigned char**)malloc(merkle_count * sizeof(char *)); - for (i = 0; i < merkle_count; i++) { - const char *s = json_string_value(json_array_get(merkle_arr, i)); - if (!s || strlen(s) != 64) { - while (i--) - free(merkle[i]); - free(merkle); - applog(LOG_ERR, "Stratum notify: invalid Merkle branch"); - goto out; - } - merkle[i] = (unsigned char*)malloc(32); - hex2bin(merkle[i], s, 32); - } - - pthread_mutex_lock(&sctx->work_lock); - - coinb1_size = strlen(coinb1) / 2; - coinb2_size = strlen(coinb2) / 2; - sctx->job.coinbase_size = coinb1_size + sctx->xnonce1_size + - sctx->xnonce2_size + coinb2_size; - - sctx->job.coinbase = (unsigned char*)realloc(sctx->job.coinbase, sctx->job.coinbase_size); - sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size; - hex2bin(sctx->job.coinbase, coinb1, coinb1_size); - memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size); - - if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) - memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); - hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size); - - free(sctx->job.job_id); - sctx->job.job_id = strdup(job_id); - hex2bin(sctx->job.prevhash, prevhash, 32); - - sctx->bloc_height = getblocheight(sctx); - - for (i = 0; i < sctx->job.merkle_count; i++) - free(sctx->job.merkle[i]); - free(sctx->job.merkle); - sctx->job.merkle = merkle; - sctx->job.merkle_count = merkle_count; - - hex2bin(sctx->job.version, version, 4); - hex2bin(sctx->job.nbits, nbits, 4); - hex2bin(sctx->job.ntime, stime, 4); - if(nreward != NULL) - { - if(strlen(nreward) == 4) - hex2bin(sctx->job.nreward, nreward, 2); - } - sctx->job.clean = clean; - - sctx->job.diff = sctx->next_diff; - - pthread_mutex_unlock(&sctx->work_lock); - - ret = true; - -out: - return ret; -} - -static bool stratum_set_difficulty(struct stratum_ctx *sctx, 
json_t *params) -{ - double diff; - - diff = json_number_value(json_array_get(params, 0)); - if (diff == 0) - return false; - - pthread_mutex_lock(&sctx->work_lock); - sctx->next_diff = diff; - pthread_mutex_unlock(&sctx->work_lock); - - applog(LOG_WARNING, "Stratum difficulty set to %g", diff); - - return true; -} - -static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params) -{ - json_t *port_val; - const char *host; - int port; - - host = json_string_value(json_array_get(params, 0)); - port_val = json_array_get(params, 1); - if (json_is_string(port_val)) - port = atoi(json_string_value(port_val)); - else - port = json_integer_value(port_val); - if (!host || !port) - return false; - - free(sctx->url); - sctx->url = (char*)malloc(32 + strlen(host)); - sprintf(sctx->url, "stratum+tcp://%s:%d", host, port); - - applog(LOG_NOTICE, "Server requested reconnection to %s", sctx->url); - - stratum_disconnect(sctx); - - return true; -} - -static bool stratum_get_version(struct stratum_ctx *sctx, json_t *id) -{ - char *s; - json_t *val; - bool ret; - - if (!id || json_is_null(id)) - return false; - - val = json_object(); - json_object_set(val, "id", id); - json_object_set_new(val, "error", json_null()); - json_object_set_new(val, "result", json_string(USER_AGENT)); - s = json_dumps(val, 0); - ret = stratum_send_line(sctx, s); - json_decref(val); - free(s); - - return ret; -} - -static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *params) -{ - char *s; - json_t *val; - bool ret; - - val = json_array_get(params, 0); - if (val) - applog(LOG_NOTICE, "MESSAGE FROM SERVER: %s", json_string_value(val)); - - if (!id || json_is_null(id)) - return true; - - val = json_object(); - json_object_set(val, "id", id); - json_object_set_new(val, "error", json_null()); - json_object_set_new(val, "result", json_true()); - s = json_dumps(val, 0); - ret = stratum_send_line(sctx, s); - json_decref(val); - free(s); - - return ret; -} - -bool 
stratum_handle_method(struct stratum_ctx *sctx, const char *s) -{ - json_t *val, *id, *params; - json_error_t err; - const char *method; - bool ret = false; - - val = JSON_LOADS(s, &err); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - method = json_string_value(json_object_get(val, "method")); - if (!method) - goto out; - id = json_object_get(val, "id"); - params = json_object_get(val, "params"); - - if (!strcasecmp(method, "mining.notify")) { - ret = stratum_notify(sctx, params); - goto out; - } - if (!strcasecmp(method, "mining.set_difficulty")) { - ret = stratum_set_difficulty(sctx, params); - goto out; - } - if (!strcasecmp(method, "client.reconnect")) { - ret = stratum_reconnect(sctx, params); - goto out; - } - if (!strcasecmp(method, "client.get_version")) { - ret = stratum_get_version(sctx, id); - goto out; - } - if (!strcasecmp(method, "client.show_message")) { - ret = stratum_show_message(sctx, id, params); - goto out; - } - -out: - if (val) - json_decref(val); - - return ret; -} - -struct thread_q *tq_new(void) -{ - struct thread_q *tq; - - tq = (struct thread_q *)calloc(1, sizeof(*tq)); - if (!tq) - return NULL; - - INIT_LIST_HEAD(&tq->q); - pthread_mutex_init(&tq->mutex, NULL); - pthread_cond_init(&tq->cond, NULL); - - return tq; -} - -void tq_free(struct thread_q *tq) -{ - struct tq_ent *ent, *iter; - - if (!tq) - return; - - list_for_each_entry_safe(ent, iter, &tq->q, q_node, struct tq_ent, struct tq_ent) { - list_del(&ent->q_node); - free(ent); - } - - pthread_cond_destroy(&tq->cond); - pthread_mutex_destroy(&tq->mutex); - - memset(tq, 0, sizeof(*tq)); /* poison */ - free(tq); -} - -static void tq_freezethaw(struct thread_q *tq, bool frozen) -{ - pthread_mutex_lock(&tq->mutex); - - tq->frozen = frozen; - - pthread_cond_signal(&tq->cond); - pthread_mutex_unlock(&tq->mutex); -} - -void tq_freeze(struct thread_q *tq) -{ - tq_freezethaw(tq, true); -} - -void tq_thaw(struct thread_q *tq) -{ - 
tq_freezethaw(tq, false); -} - -bool tq_push(struct thread_q *tq, void *data) -{ - struct tq_ent *ent; - bool rc = true; - - ent = (struct tq_ent *)calloc(1, sizeof(*ent)); - if (!ent) - return false; - - ent->data = data; - INIT_LIST_HEAD(&ent->q_node); - - pthread_mutex_lock(&tq->mutex); - - if (!tq->frozen) { - list_add_tail(&ent->q_node, &tq->q); - } else { - free(ent); - rc = false; - } - - pthread_cond_signal(&tq->cond); - pthread_mutex_unlock(&tq->mutex); - - return rc; -} - -void *tq_pop(struct thread_q *tq, const struct timespec *abstime) -{ - struct tq_ent *ent; - void *rval = NULL; - int rc; - - pthread_mutex_lock(&tq->mutex); - - if (!list_empty(&tq->q)) - goto pop; - - if (abstime) - rc = pthread_cond_timedwait(&tq->cond, &tq->mutex, abstime); - else - rc = pthread_cond_wait(&tq->cond, &tq->mutex); - if (rc) - goto out; - if (list_empty(&tq->q)) - goto out; - -pop: - ent = list_entry(tq->q.next, struct tq_ent, q_node); - rval = ent->data; - - list_del(&ent->q_node); - free(ent); - -out: - pthread_mutex_unlock(&tq->mutex); - return rval; -} - -/** - * @param buf char[9] mini - * @param time_t timer to convert - */ -size_t time2str(char* buf, time_t timer) -{ - struct tm* tm_info; - tm_info = localtime(&timer); - return strftime(buf, 19, "%H:%M:%S", tm_info); -} - -/** - * Alloc and returns time string (to be freed) - * @param time_t timer to convert - */ -char* atime2str(time_t timer) -{ - char* buf = (char*) malloc(16); - memset(buf, 0, 16); - time2str(buf, timer); - return buf; -} - -/* sprintf can be used in applog */ -static char* format_hash(char* buf, unsigned char *hash) -{ - int len = 0; - for (int i=0; i < 32; i += 4) { - len += sprintf(buf+len, "%02x%02x%02x%02x ", - hash[i], hash[i+1], hash[i+2], hash[i+3]); - } - return buf; -} - -/* to debug diff in data */ -extern void applog_compare_hash(unsigned char *hash, unsigned char *hash2) -{ - char s[256] = ""; - int len = 0; - for (int i=0; i < 32; i += 4) { - char *color = memcmp(hash+i, 
hash2+i, 4) ? CL_WHT : CL_GRY; - len += sprintf(s+len, "%s%02x%02x%02x%02x " CL_GRY, color, - hash[i], hash[i+1], hash[i+2], hash[i+3]); - s[len] = '\0'; - } - applog(LOG_DEBUG, "%s", s); -} - -extern void applog_hash(unsigned char *hash) -{ - char s[128] = {'\0'}; - applog(LOG_DEBUG, "%s", format_hash(s, hash)); -} - -#define printpfx(n,h) \ - printf("%s%12s%s: %s\n", CL_BLU, n, CL_N, format_hash(s, h)) - -extern bool opt_tracegpu; -void do_gpu_tests(void) -{ -#ifdef _DEBUG - unsigned long done; - char s[128] = { '\0' }; - unsigned char buf[128], hash[128]; - uint32_t tgt[8] = { 0 }; - memset(buf, 0, sizeof buf); - buf[0] = 1; buf[64] = 2; - opt_tracegpu = true; - work_restart = (struct work_restart*) malloc(sizeof(struct work_restart)); - work_restart[0].restart = 1; - tgt[6] = 0xffff; - scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14); - free(work_restart); - work_restart = NULL; - opt_tracegpu = false; -#endif -} - -void print_hash_tests(void) -{ - char s[128] = {'\0'}; - unsigned char buf[128], hash[128]; - memset(buf, 0, sizeof buf); - // buf[0] = 1; buf[64] = 2; - - printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n"); - - memset(hash, 0, sizeof hash); - animehash(&hash[0], &buf[0]); - printpfx("anime", hash); - - memset(hash, 0, sizeof hash); - blake256hash(&hash[0], &buf[0], 8); - printpfx("blakecoin", hash); - - memset(hash, 0, sizeof hash); - blake256hash(&hash[0], &buf[0], 14); - printpfx("blake", hash); - - do_gpu_tests(); - - memset(hash, 0, sizeof hash); - deephash(&hash[0], &buf[0]); - printpfx("deep", hash); - - memset(hash, 0, sizeof hash); - fresh_hash(&hash[0], &buf[0]); - printpfx("fresh", hash); - - memset(hash, 0, sizeof hash); - fugue256_hash(&hash[0], &buf[0], 32); - printpfx("fugue256", hash); - - memset(hash, 0, sizeof hash); - groestlhash(&hash[0], &buf[0]); - printpfx("groestl", hash); - - memset(hash, 0, sizeof hash); - heavycoin_hash(&hash[0], &buf[0], 32); - printpfx("heavy", hash); - - memset(hash, 0, sizeof hash); - 
keccak256_hash(&hash[0], &buf[0]); - printpfx("keccak", hash); - - memset(hash, 0, sizeof hash); - jackpothash(&hash[0], &buf[0]); - printpfx("jackpot", hash); - - memset(hash, 0, sizeof hash); - doomhash(&hash[0], &buf[0]); - printpfx("luffa", hash); - - memset(hash, 0, sizeof hash); - myriadhash(&hash[0], &buf[0]); - printpfx("myriad", hash); - - memset(hash, 0, sizeof hash); - nist5hash(&hash[0], &buf[0]); - printpfx("nist5", hash); - - memset(hash, 0, sizeof hash); - pentablakehash(&hash[0], &buf[0]); - printpfx("pentablake", hash); - - memset(hash, 0, sizeof hash); - quarkhash(&hash[0], &buf[0]); - printpfx("quark", hash); - - memset(hash, 0, sizeof hash); - qubithash(&hash[0], &buf[0]); - printpfx("qubit", hash); - - memset(hash, 0, sizeof hash); - s3hash(&hash[0], &buf[0]); - printpfx("S3", hash); - - memset(hash, 0, sizeof hash); - wcoinhash(&hash[0], &buf[0]); - printpfx("whirl", hash); - - memset(hash, 0, sizeof hash); - x11hash(&hash[0], &buf[0]); - printpfx("X11", hash); - - memset(hash, 0, sizeof hash); - x13hash(&hash[0], &buf[0]); - printpfx("X13", hash); - - memset(hash, 0, sizeof hash); - x14hash(&hash[0], &buf[0]); - printpfx("X14", hash); - - memset(hash, 0, sizeof hash); - x15hash(&hash[0], &buf[0]); - printpfx("X15", hash); - - memset(hash, 0, sizeof hash); - x17hash(&hash[0], &buf[0]); - printpfx("X17", hash); - - printf("\n"); -} diff --git a/util.cpp b/util.cpp index 42cee696e5..496f799064 100644 --- a/util.cpp +++ b/util.cpp @@ -1,27 +1,24 @@ /* - * Copyright 2010 Jeff Garzik - * Copyright 2012-2014 pooler - * Copyright 2014 ccminer team - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. 
- */ - -//#define _GNU_SOURCE -#include "cpuminer-config.h" - -#include -#include +* Copyright 2010 Jeff Garzik +* Copyright 2012-2014 pooler +* Copyright 2014 ccminer team +* +* This program is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License as published by the Free +* Software Foundation; either version 2 of the License, or (at your option) +* any later version. See COPYING for more details. +*/ + +#include +#include #include -#include -#include -#include +#include +#include +#include #include #include #include -#include +#include #ifdef WIN32 #include "compat/winansi.h" #include @@ -34,32 +31,41 @@ #endif #include "miner.h" #include "elist.h" +using namespace std; + +extern enum sha_algos opt_algo; +extern char curl_err_str[]; bool opt_tracegpu = false; -struct data_buffer { +struct data_buffer +{ void *buf; size_t len; }; -struct upload_buffer { +struct upload_buffer +{ const void *buf; size_t len; size_t pos; }; -struct header_info { +struct header_info +{ char *lp_path; char *reason; char *stratum_url; }; -struct tq_ent { +struct tq_ent +{ void *data; struct list_head q_node; }; -struct thread_q { +struct thread_q +{ struct list_head q; bool frozen; @@ -68,6 +74,20 @@ struct thread_q { pthread_cond_t cond; }; +// input and output may point to the same location +void hexstringreverse(void *output, const void *input, size_t length) +{ + uint16_t tmp1; + uint16_t tmp2; + for(size_t i = 0; i < length / 4; i++) + { + tmp1 = *(((uint16_t*)input) + i); + tmp2 = *(((uint16_t*)output) + (length / 2 - i)); + *(((uint16_t*)input) + i) = tmp2; + *(((uint16_t*)output) + (length / 2 - i)) = tmp1; + } +} + void applog(int prio, const char *fmt, ...) { va_list ap; @@ -75,29 +95,35 @@ void applog(int prio, const char *fmt, ...) 
va_start(ap, fmt); #ifdef HAVE_SYSLOG_H - if (use_syslog) { + if(use_syslog) + { va_list ap2; char *buf; int len; /* custom colors to syslog prio */ - if (prio > LOG_DEBUG) { - switch (prio) { - case LOG_BLUE: prio = LOG_NOTICE; break; + if(prio > LOG_DEBUG) + { + switch(prio) + { + case LOG_BLUE: prio = LOG_NOTICE; break; } } va_copy(ap2, ap); len = vsnprintf(NULL, 0, fmt, ap2) + 1; va_end(ap2); - buf = (char*) alloca(len); - if (vsnprintf(buf, len, fmt, ap) >= 0) + buf = (char*)alloca(len); + if(vsnprintf(buf, len, fmt, ap) >= 0) syslog(prio, "%s", buf); } #else - if (0) {} + if(0) + { + } #endif - else { + else + { const char* color = ""; char *f; int len; @@ -109,34 +135,35 @@ void applog(int prio, const char *fmt, ...) memcpy(&tm, tm_p, sizeof(tm)); pthread_mutex_unlock(&applog_lock); - switch (prio) { - case LOG_ERR: color = CL_RED; break; - case LOG_WARNING: color = CL_YLW; break; - case LOG_NOTICE: color = CL_WHT; break; - case LOG_INFO: color = ""; break; - case LOG_DEBUG: color = CL_GRY; break; - - case LOG_BLUE: - prio = LOG_NOTICE; - color = CL_CYN; - break; + switch(prio) + { + case LOG_ERR: color = CL_RED; break; + case LOG_WARNING: color = CL_YLW; break; + case LOG_NOTICE: color = CL_WHT; break; + case LOG_INFO: color = ""; break; + case LOG_DEBUG: color = CL_GRY; break; + + case LOG_BLUE: + prio = LOG_NOTICE; + color = CL_CYN; + break; } - if (!use_colors) + if(!use_colors) color = ""; - len = 40 + (int) strlen(fmt) + 2; - f = (char*) alloca(len); + len = 40 + (int)strlen(fmt) + 2; + f = (char*)alloca(len); sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d]%s %s%s\n", - tm.tm_year + 1900, - tm.tm_mon + 1, - tm.tm_mday, - tm.tm_hour, - tm.tm_min, - tm.tm_sec, - color, - fmt, - use_colors ? CL_N : "" - ); + tm.tm_year + 1900, + tm.tm_mon + 1, + tm.tm_mday, + tm.tm_hour, + tm.tm_min, + tm.tm_sec, + color, + fmt, + use_colors ? 
CL_N : "" + ); pthread_mutex_lock(&applog_lock); vfprintf(stderr, f, ap); /* atomic write to stderr */ fflush(stderr); @@ -145,9 +172,45 @@ void applog(int prio, const char *fmt, ...) va_end(ap); } +void format_hashrate(double hashrate, char *output) +{ + char prefix = '\0'; + + if(hashrate < 10000) + { + // nop + } + else if(hashrate < 1e7) + { + prefix = 'k'; + hashrate *= 1e-3; + } + else if(hashrate < 1e10) + { + prefix = 'M'; + hashrate *= 1e-6; + } + else if(hashrate < 1e13) + { + prefix = 'G'; + hashrate *= 1e-9; + } + else + { + prefix = 'T'; + hashrate *= 1e-12; + } + + sprintf( + output, + prefix ? "%.2f %cH/s" : "%.2f H/s%c", + hashrate, prefix + ); +} + static void databuf_free(struct data_buffer *db) { - if (!db) + if(!db) return; free(db->buf); @@ -156,7 +219,7 @@ static void databuf_free(struct data_buffer *db) } static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, - void *user_data) + void *user_data) { struct data_buffer *db = (struct data_buffer *)user_data; size_t len = size * nmemb; @@ -168,8 +231,11 @@ static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, newlen = oldlen + len; newmem = realloc(db->buf, newlen + 1); - if (!newmem) - return 0; + if(newmem == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } db->buf = newmem; db->len = newlen; @@ -180,15 +246,16 @@ static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, } static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb, - void *user_data) + void *user_data) { struct upload_buffer *ub = (struct upload_buffer *)user_data; unsigned int len = (unsigned int)(size * nmemb); - if (len > ub->len - ub->pos) + if(len > ub->len - ub->pos) len = (unsigned int)(ub->len - ub->pos); - if (len) { + if(len) + { memcpy(ptr, (char*)ub->buf + ub->pos, len); ub->pos += len; } @@ -200,8 +267,9 @@ static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb, static int seek_data_cb(void *user_data, curl_off_t offset, int origin) { 
struct upload_buffer *ub = (struct upload_buffer *)user_data; - - switch (origin) { + + switch(origin) + { case SEEK_SET: ub->pos = (size_t)offset; break; @@ -227,51 +295,65 @@ static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data) void *tmp; val = (char*)calloc(1, ptrlen); + if(val == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } key = (char*)calloc(1, ptrlen); - if (!key || !val) - goto out; + if(key == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } tmp = memchr(ptr, ':', ptrlen); - if (!tmp || (tmp == ptr)) /* skip empty keys / blanks */ + if(!tmp || (tmp == ptr)) /* skip empty keys / blanks */ goto out; slen = (size_t)((char*)tmp - (char*)ptr); - if ((slen + 1) == ptrlen) /* skip key w/ no value */ + if((slen + 1) == ptrlen) /* skip key w/ no value */ goto out; memcpy(key, ptr, slen); /* store & nul term key */ key[slen] = 0; rem = (char*)ptr + slen + 1; /* trim value's leading whitespace */ remlen = ptrlen - slen - 1; - while ((remlen > 0) && (isspace(*rem))) { + while((remlen > 0) && (isspace(*rem))) + { remlen--; rem++; } memcpy(val, rem, remlen); /* store value, trim trailing ws */ val[remlen] = 0; - while ((*val) && (isspace(val[strlen(val) - 1]))) { + while((*val) && (isspace(val[strlen(val) - 1]))) + { val[strlen(val) - 1] = 0; } - if (!*val) /* skip blank value */ + if(!*val) /* skip blank value */ goto out; - if (!strcasecmp("X-Long-Polling", key)) { + if(!strcasecmp("X-Long-Polling", key)) + { hi->lp_path = val; /* X-Mining-Extensions: longpoll */ val = NULL; } - if (!strcasecmp("X-Reject-Reason", key)) { + if(!strcasecmp("X-Reject-Reason", key)) + { hi->reason = val; /* X-Mining-Extensions: reject-reason */ //applog(LOG_WARNING, "%s:%s", key, val); val = NULL; } - if (!strcasecmp("X-Stratum", key)) { + if(!strcasecmp("X-Stratum", key)) + { hi->stratum_url = val; /* steal memory reference */ val = NULL; } - if (!strcasecmp("X-Nonce-Range", key)) { + if(!strcasecmp("X-Nonce-Range", 
key)) + { /* todo when available: X-Mining-Extensions: noncerange */ } out: @@ -282,7 +364,7 @@ static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data) #if LIBCURL_VERSION_NUM >= 0x070f06 static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, - curlsocktype purpose) + curlsocktype purpose) { int keepalive = 1; int tcp_keepcnt = 3; @@ -293,22 +375,22 @@ static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, #endif #ifndef WIN32 - if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, + if(unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, sizeof(keepalive)))) return 1; #ifdef __linux - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, + if(unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, &tcp_keepcnt, sizeof(tcp_keepcnt)))) return 1; - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, + if(unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, &tcp_keepidle, sizeof(tcp_keepidle)))) return 1; - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, + if(unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, &tcp_keepintvl, sizeof(tcp_keepintvl)))) return 1; #endif /* __linux */ #ifdef __APPLE_CC__ - if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, + if(unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, &tcp_keepintvl, sizeof(tcp_keepintvl)))) return 1; #endif /* __APPLE_CC__ */ @@ -316,8 +398,8 @@ static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, struct tcp_keepalive vals; vals.onoff = 1; vals.keepalivetime = tcp_keepidle * 1000; - vals.keepaliveinterval = tcp_keepintvl * 1000; - if (unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals), + vals.keepaliveinterval = tcp_keepintvl * 1000; + if(unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals), NULL, 0, &outputBytes, NULL, NULL))) return 1; #endif /* WIN32 */ @@ -327,28 +409,27 @@ static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, #endif json_t *json_rpc_call(CURL *curl, const char *url, - const char *userpass, 
const char *rpc_req, - bool longpoll_scan, bool longpoll, int *curl_err) + const char *userpass, const char *rpc_req, + bool longpoll_scan, bool longpoll, int *curl_err) { json_t *val, *err_val, *res_val; - int rc; + CURLcode rc; struct data_buffer all_data = { 0 }; struct upload_buffer upload_data; json_error_t err; struct curl_slist *headers = NULL; char* httpdata; char len_hdr[64], hashrate_hdr[64]; - char curl_err_str[CURL_ERROR_SIZE] = { 0 }; - long timeout = longpoll ? opt_timeout : 30; + long timeout = opt_timeout; struct header_info hi = { 0 }; bool lp_scanning = longpoll_scan && !have_longpoll; /* it is assumed that 'curl' is freshly [re]initialized at this pt */ - if (opt_protocol) + if(opt_protocol) curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); curl_easy_setopt(curl, CURLOPT_URL, url); - if (opt_cert) + if(opt_cert) curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert); curl_easy_setopt(curl, CURLOPT_ENCODING, ""); curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0); @@ -367,27 +448,29 @@ json_t *json_rpc_call(CURL *curl, const char *url, curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb); curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi); - if (opt_proxy) { + if(opt_proxy) + { curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); } - if (userpass) { + if(userpass) + { curl_easy_setopt(curl, CURLOPT_USERPWD, userpass); curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC); } #if LIBCURL_VERSION_NUM >= 0x070f06 - if (longpoll) + if(longpoll) curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); #endif curl_easy_setopt(curl, CURLOPT_POST, 1); - if (opt_protocol) + if(opt_protocol) applog(LOG_DEBUG, "JSON protocol request:\n%s", rpc_req); upload_data.buf = rpc_req; upload_data.len = strlen(rpc_req); upload_data.pos = 0; - sprintf(len_hdr, "Content-Length: %lu", (unsigned long) upload_data.len); + sprintf(len_hdr, "Content-Length: %lu", 
(unsigned long)upload_data.len); sprintf(hashrate_hdr, "X-Mining-Hashrate: %llu", (unsigned long long) global_hashrate); headers = curl_slist_append(headers, "Content-Type: application/json"); @@ -399,91 +482,118 @@ json_t *json_rpc_call(CURL *curl, const char *url, headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/ curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); - + curl_err_str[0] = 0; rc = curl_easy_perform(curl); - if (curl_err != NULL) + if(curl_err != NULL) *curl_err = rc; - if (rc) { - if (!(longpoll && rc == CURLE_OPERATION_TIMEDOUT)) { - applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); + if(rc != CURLE_OK) + { + if(!(longpoll && rc == CURLE_OPERATION_TIMEDOUT)) + { + if(strlen(curl_err_str)>0) + applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); + else + applog(LOG_ERR, "HTTP request failed: %s", curl_easy_strerror(rc)); goto err_out; } } /* If X-Stratum was found, activate Stratum */ - if (want_stratum && hi.stratum_url && - !strncasecmp(hi.stratum_url, "stratum+tcp://", 14) && - !(opt_proxy && opt_proxy_type == CURLPROXY_HTTP)) { + if(want_stratum && hi.stratum_url && + !strncasecmp(hi.stratum_url, "stratum+tcp://", 14) && + !(opt_proxy && opt_proxy_type == CURLPROXY_HTTP)) + { have_stratum = true; tq_push(thr_info[stratum_thr_id].q, hi.stratum_url); hi.stratum_url = NULL; } /* If X-Long-Polling was found, activate long polling */ - if (lp_scanning && hi.lp_path && !have_stratum) { + if(lp_scanning && hi.lp_path && !have_stratum) + { have_longpoll = true; tq_push(thr_info[longpoll_thr_id].q, hi.lp_path); hi.lp_path = NULL; } - if (!all_data.buf || !all_data.len) { + if(!all_data.buf || !all_data.len) + { applog(LOG_ERR, "Empty data received in json_rpc_call."); goto err_out; } - httpdata = (char*) all_data.buf; + httpdata = (char*)all_data.buf; - if (*httpdata != '{' && *httpdata != '[') { + if(*httpdata != '{' && *httpdata != '[') + { long errcode = 0; CURLcode c = curl_easy_getinfo(curl, 
CURLINFO_RESPONSE_CODE, &errcode); - if (c == CURLE_OK && errcode == 401) { + if(c == CURLE_OK && errcode == 401) + { applog(LOG_ERR, "You are not authorized, check your login and password."); goto err_out; } } val = JSON_LOADS(httpdata, &err); - if (!val) { + if(!val) + { applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - if (opt_protocol) + if(opt_protocol) applog(LOG_DEBUG, "%s", httpdata); goto err_out; } - if (opt_protocol) { + if(opt_protocol) + { char *s = json_dumps(val, JSON_INDENT(3)); applog(LOG_DEBUG, "JSON protocol response:\n%s\n", s); free(s); } /* JSON-RPC valid response returns a non-null 'result', - * and a null 'error'. */ + * and a null 'error'. */ res_val = json_object_get(val, "result"); err_val = json_object_get(val, "error"); - if (!res_val || json_is_null(res_val) || - (err_val && !json_is_null(err_val))) { - char *s; + if(!res_val || json_is_null(res_val) || + (err_val && !json_is_null(err_val))) + { + char *s = NULL; - if (err_val) { + if(err_val) + { + s = json_dumps(err_val, 0); json_t *msg = json_object_get(err_val, "message"); - s = json_dumps(err_val, JSON_INDENT(3)); - if (json_is_string(msg)) { + json_t *err_code = json_object_get(err_val, "code"); + if(curl_err && json_integer_value(err_code)) + *curl_err = (int)json_integer_value(err_code); + + if(json_is_string(msg)) + { free(s); s = strdup(json_string_value(msg)); + if(have_longpoll && s && !strcmp(s, "method not getwork")) + { + json_decref(err_val); + free(s); + goto err_out; + } } + json_decref(err_val); } else s = strdup("(unknown reason)"); - applog(LOG_ERR, "JSON-RPC call failed: %s", s); + if(!curl_err || opt_debug) + applog(LOG_ERR, "JSON-RPC call failed: %s", s); free(s); goto err_out; } - if (hi.reason) + if(hi.reason) json_object_set_new(val, "reject-reason", json_string(hi.reason)); databuf_free(&all_data); @@ -502,18 +612,28 @@ json_t *json_rpc_call(CURL *curl, const char *url, } /** - * Unlike malloc, calloc set the memory to zero - */ +* Unlike 
malloc, calloc set the memory to zero +*/ void *aligned_calloc(int size) { const int ALIGN = 64; // cache line #ifdef _MSC_VER void* res = _aligned_malloc(size, ALIGN); + if(res == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } memset(res, 0, size); return res; #else - void *mem = calloc(1, size+ALIGN+sizeof(uintptr_t)); - void **ptr = (void**)((size_t)(((uintptr_t)(mem))+ALIGN+sizeof(uintptr_t)) & ~(ALIGN-1)); + void *mem = calloc(1, size + ALIGN + sizeof(uintptr_t)); + if(mem == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } + void **ptr = (void**)((size_t)(((uintptr_t)(mem)) + ALIGN + sizeof(uintptr_t)) & ~(ALIGN - 1)); ptr[-1] = mem; return ptr; #endif @@ -530,9 +650,10 @@ void aligned_free(void *ptr) void cbin2hex(char *out, const char *in, size_t len) { - if (out) { + if(out) + { unsigned int i; - for (i = 0; i < len; i++) + for(i = 0; i < len; i++) sprintf(out + (i * 2), "%02x", (uint8_t)in[i]); } } @@ -540,10 +661,13 @@ void cbin2hex(char *out, const char *in, size_t len) char *bin2hex(const uchar *in, size_t len) { char *s = (char*)malloc((len * 2) + 1); - if (!s) - return NULL; + if(s == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } - cbin2hex(s, (const char *) in, len); + cbin2hex(s, (const char *)in, len); return s; } @@ -555,15 +679,18 @@ bool hex2bin(uchar *p, const char *hexstr, size_t len) hex_byte[2] = '\0'; - while (*hexstr && len) { - if (!hexstr[1]) { + while(*hexstr && len) + { + if(!hexstr[1]) + { applog(LOG_ERR, "hex2bin str truncated"); return false; } hex_byte[0] = hexstr[0]; hex_byte[1] = hexstr[1]; - *p = (uchar) strtol(hex_byte, &ep, 16); - if (*ep) { + *p = (uchar)strtol(hex_byte, &ep, 16); + if(*ep) + { applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte); return false; } @@ -576,19 +703,19 @@ bool hex2bin(uchar *p, const char *hexstr, size_t len) } /* Subtract the `struct timeval' values X and Y, - storing the result in RESULT. 
- Return 1 if the difference is negative, otherwise 0. */ +storing the result in RESULT. +Return 1 if the difference is negative, otherwise 0. */ int timeval_subtract(struct timeval *result, struct timeval *x, - struct timeval *y) +struct timeval *y) { uint64_t start, end; end = x->tv_usec + 1000000 * x->tv_sec; - start = y->tv_usec + 1000000 * y->tv_sec; - if (start <= end) + start = y->tv_usec + 1000000 * y->tv_sec; + if(start <= end) { uint64_t diff = end - start; - result->tv_sec = diff / 1000000; + result->tv_sec = (long)(diff / 1000000); result->tv_usec = diff % 1000000; } else @@ -605,26 +732,32 @@ bool fulltest(const uint32_t *hash, const uint32_t *target) { int i; bool rc = true; - - for (i = 7; i >= 0; i--) { - if (hash[i] > target[i]) { + + for(i = 7; i >= 0; i--) + { + if(hash[i] > target[i]) + { rc = false; break; } - if (hash[i] < target[i]) { + if(hash[i] < target[i]) + { rc = true; break; } - if (hash[1] == target[1]) { + if(hash[1] == target[1]) + { applog(LOG_NOTICE, "We found a close match!"); } } - if (!rc && opt_debug) { + if(!rc && opt_debug) + { uint32_t hash_be[8], target_be[8]; char *hash_str, *target_str; - - for (i = 0; i < 8; i++) { + + for(i = 0; i < 8; i++) + { be32enc(hash_be + i, hash[7 - i]); be32enc(target_be + i, target[7 - i]); } @@ -632,10 +765,10 @@ bool fulltest(const uint32_t *hash, const uint32_t *target) target_str = bin2hex((uchar *)target_be, 32); applog(LOG_DEBUG, "DEBUG: %s\nHash: %s\nTarget: %s", - rc ? "hash <= target" + rc ? 
"hash <= target" : CL_YLW "hash > target (false positive)" CL_N, - hash_str, - target_str); + hash_str, + target_str); free(hash_str); free(target_str); @@ -644,17 +777,39 @@ bool fulltest(const uint32_t *hash, const uint32_t *target) return rc; } +bool fulltest_sia(const uint64_t *hash, const uint64_t *target) +{ + int i; + bool rc = true; + + for(i = 0; i < 4; i--) + { + if(swab64(hash[i]) > target[3 - i]) + { + rc = false; + break; + } + if(swab64(hash[i]) < target[3 - i]) + { + rc = true; + break; + } + } + return rc; +} + void diff_to_target(uint32_t *target, double diff) { uint64_t m; int k; - - for (k = 6; k > 0 && diff > 1.0; k--) + + for(k = 6; k > 0 && diff > 1.0; k--) diff /= 4294967296.0; m = (uint64_t)(4294901760.0 / diff); - if (m == 0 && k == 6) + if(m == 0 && k == 6) memset(target, 0xff, 32); - else { + else + { memset(target, 0, 32); target[k] = (uint32_t)m; target[k + 1] = (uint32_t)(m >> 32); @@ -670,22 +825,24 @@ void diff_to_target(uint32_t *target, double diff) static bool send_line(curl_socket_t sock, char *s) { ssize_t len, sent = 0; - + len = (ssize_t)strlen(s); s[len++] = '\n'; - while (len > 0) { - struct timeval timeout = {0, 0}; + while(len > 0) + { + struct timeval timeout = { 0, 0 }; ssize_t n; fd_set wd; FD_ZERO(&wd); FD_SET(sock, &wd); - if (select((int)sock + 1, NULL, &wd, NULL, &timeout) < 1) + if(select((int)sock + 1, NULL, &wd, NULL, &timeout) < 1) return false; - n = send(sock, s + sent, len, 0); - if (n < 0) { - if (!socket_blocks()) + n = send(sock, s + sent, (int)len, 0); + if(n < 0) + { + if(!socket_blocks()) return false; n = 0; } @@ -700,7 +857,7 @@ bool stratum_send_line(struct stratum_ctx *sctx, char *s) { bool ret = false; - if (opt_protocol) + if(opt_protocol) applog(LOG_DEBUG, "> %s", s); pthread_mutex_lock(&sctx->sock_lock); @@ -719,7 +876,7 @@ static bool socket_full(curl_socket_t sock, int timeout) FD_SET(sock, &rd); tv.tv_sec = timeout; tv.tv_usec = 0; - if (select((int)sock + 1, &rd, NULL, NULL, &tv) > 0) + 
if(select((int)sock + 1, &rd, NULL, NULL, &tv) > 0) return true; return false; } @@ -738,9 +895,15 @@ static void stratum_buffer_append(struct stratum_ctx *sctx, const char *s) old = strlen(sctx->sockbuf); snew = old + strlen(s) + 1; - if (snew >= sctx->sockbuf_size) { + if(snew >= sctx->sockbuf_size) + { sctx->sockbuf_size = snew + (RBUFSIZE - (snew % RBUFSIZE)); sctx->sockbuf = (char*)realloc(sctx->sockbuf, sctx->sockbuf_size); + if(sctx->sockbuf == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } } strcpy(sctx->sockbuf + old, s); } @@ -749,62 +912,75 @@ char *stratum_recv_line(struct stratum_ctx *sctx) { ssize_t len, buflen; char *tok, *sret = NULL; + int timeout = opt_timeout; + + if(!sctx->sockbuf) + return NULL; - if (!strstr(sctx->sockbuf, "\n")) { + if(!strstr(sctx->sockbuf, "\n")) + { bool ret = true; time_t rstart = time(NULL); - if (!socket_full(sctx->sock, 60)) { + if(!socket_full(sctx->sock, timeout)) + { applog(LOG_ERR, "stratum_recv_line timed out"); goto out; } - do { + do + { char s[RBUFSIZE]; ssize_t n; memset(s, 0, RBUFSIZE); n = recv(sctx->sock, s, RECVSIZE, 0); - if (!n) { + if(!n) + { ret = false; break; } - if (n < 0) { - if (!socket_blocks() || !socket_full(sctx->sock, 1)) { + if(n < 0) + { + if(!socket_blocks() || !socket_full(sctx->sock, 10)) + { ret = false; break; } - } else + } + else stratum_buffer_append(sctx, s); - } while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n")); + } while(time(NULL) - rstart < timeout && !strstr(sctx->sockbuf, "\n")); - if (!ret) { - applog(LOG_ERR, "stratum_recv_line failed"); + if(!ret) + { + if(opt_debug) applog(LOG_ERR, "stratum_recv_line failed"); goto out; } } buflen = (ssize_t)strlen(sctx->sockbuf); tok = strtok(sctx->sockbuf, "\n"); - if (!tok) { + if(!tok) + { applog(LOG_ERR, "stratum_recv_line failed to parse a newline-terminated string"); goto out; } sret = strdup(tok); len = (ssize_t)strlen(sret); - if (buflen > len + 1) + if(buflen > len + 1) memmove(sctx->sockbuf, 
sctx->sockbuf + len + 1, buflen - len + 1); else sctx->sockbuf[0] = '\0'; out: - if (sret && opt_protocol) + if(sret && opt_protocol) applog(LOG_DEBUG, "< %s", sret); return sret; } #if LIBCURL_VERSION_NUM >= 0x071101 static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, - struct curl_sockaddr *addr) +struct curl_sockaddr *addr) { curl_socket_t *sock = (curl_socket_t *)clientp; *sock = socket(addr->family, addr->socktype, addr->protocol); @@ -815,48 +991,64 @@ static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, bool stratum_connect(struct stratum_ctx *sctx, const char *url) { CURL *curl; - int rc; + CURLcode rc; pthread_mutex_lock(&sctx->sock_lock); - if (sctx->curl) + if(sctx->curl) curl_easy_cleanup(sctx->curl); sctx->curl = curl_easy_init(); - if (!sctx->curl) { + if(!sctx->curl) + { applog(LOG_ERR, "CURL initialization failed"); pthread_mutex_unlock(&sctx->sock_lock); return false; } curl = sctx->curl; - if (!sctx->sockbuf) { + if(!sctx->sockbuf) + { sctx->sockbuf = (char*)calloc(RBUFSIZE, 1); + if(sctx->sockbuf == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } sctx->sockbuf_size = RBUFSIZE; } sctx->sockbuf[0] = '\0'; pthread_mutex_unlock(&sctx->sock_lock); - if (url != sctx->url) { + if(url != sctx->url) + { free(sctx->url); sctx->url = strdup(url); } free(sctx->curl_url); sctx->curl_url = (char*)malloc(strlen(url)); + if(sctx->curl_url == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } sprintf(sctx->curl_url, "http%s", strstr(url, "://")); - if (opt_protocol) + if(opt_protocol) curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); curl_easy_setopt(curl, CURLOPT_URL, sctx->curl_url); curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); - curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30); - curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, opt_timeout); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); 
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); - if (opt_proxy && opt_proxy_type != CURLPROXY_HTTP) { + if(opt_proxy && opt_proxy_type != CURLPROXY_HTTP) + { curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); - } else if (getenv("http_proxy")) { - if (getenv("all_proxy")) + } + else if(getenv("http_proxy")) + { + if(getenv("all_proxy")) curl_easy_setopt(curl, CURLOPT_PROXY, getenv("all_proxy")); - else if (getenv("ALL_PROXY")) + else if(getenv("ALL_PROXY")) curl_easy_setopt(curl, CURLOPT_PROXY, getenv("ALL_PROXY")); else curl_easy_setopt(curl, CURLOPT_PROXY, ""); @@ -869,10 +1061,14 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url) curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock); #endif curl_easy_setopt(curl, CURLOPT_CONNECT_ONLY, 1); - + curl_err_str[0] = 0; rc = curl_easy_perform(curl); - if (rc) { - applog(LOG_ERR, "Stratum connection failed: %s", sctx->curl_err_str); + if(rc != CURLE_OK) + { + if(strlen(curl_err_str)>0) + applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); + else + applog(LOG_ERR, "HTTP request failed: %s", curl_easy_strerror(rc)); curl_easy_cleanup(curl); sctx->curl = NULL; return false; @@ -886,78 +1082,109 @@ bool stratum_connect(struct stratum_ctx *sctx, const char *url) return true; } +static void stratum_free_job(struct stratum_ctx *sctx) +{ + pthread_mutex_lock(&sctx->sock_lock); + if(sctx->job.job_id) + { + free(sctx->job.job_id); + } + if(sctx->job.merkle_count) + { + for(int i = 0; i < sctx->job.merkle_count; i++) + { + free(sctx->job.merkle[i]); + sctx->job.merkle[i] = NULL; + } + free(sctx->job.merkle); + } + free(sctx->job.coinbase); + // note: xnonce2 is not allocated + memset(&(sctx->job.job_id), 0, sizeof(struct stratum_job)); + pthread_mutex_unlock(&sctx->sock_lock); +} + void stratum_disconnect(struct stratum_ctx *sctx) { pthread_mutex_lock(&sctx->sock_lock); - if (sctx->curl) { + 
if(sctx->curl) + { sctx->disconnects++; curl_easy_cleanup(sctx->curl); sctx->curl = NULL; sctx->sockbuf[0] = '\0'; } + if(sctx->job.job_id) + { + stratum_free_job(sctx); + } pthread_mutex_unlock(&sctx->sock_lock); } -static const char *get_stratum_session_id(json_t *val) +static const char *get_stratum_session_id(const json_t *val) { json_t *arr_val; int i, n; arr_val = json_array_get(val, 0); - if (!arr_val || !json_is_array(arr_val)) + if(!arr_val || !json_is_array(arr_val)) return NULL; - n = json_array_size(arr_val); - for (i = 0; i < n; i++) { + n = (int)json_array_size(arr_val); + for(i = 0; i < n; i++) + { const char *notify; json_t *arr = json_array_get(arr_val, i); - if (!arr || !json_is_array(arr)) + if(!arr || !json_is_array(arr)) break; notify = json_string_value(json_array_get(arr, 0)); - if (!notify) + if(!notify) continue; - if (!strcasecmp(notify, "mining.notify")) + if(!strcasecmp(notify, "mining.notify")) return json_string_value(json_array_get(arr, 1)); } return NULL; } -static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, int pndx) +static bool stratum_parse_extranonce(struct stratum_ctx *sctx, const json_t *params, int pndx) { const char* xnonce1; int xn2_size; xnonce1 = json_string_value(json_array_get(params, pndx)); - if (!xnonce1) { + if(!xnonce1) + { applog(LOG_ERR, "Failed to get extranonce1"); goto out; } - xn2_size = (int) json_integer_value(json_array_get(params, pndx+1)); - if (!xn2_size) { + xn2_size = (int)json_integer_value(json_array_get(params, pndx + 1)); + if(!xn2_size) + { applog(LOG_ERR, "Failed to get extranonce2_size"); goto out; } - if (xn2_size < 2 || xn2_size > 16) { - applog(LOG_INFO, "Failed to get valid n2size in parse_extranonce"); + if(xn2_size < 2 || xn2_size > 16) + { + applog(LOG_ERR, "invalid n2size in parse_extranonce: size=%d", xn2_size); goto out; } pthread_mutex_lock(&sctx->work_lock); - if (sctx->xnonce1) + if(sctx->xnonce1) free(sctx->xnonce1); sctx->xnonce1_size = strlen(xnonce1) 
/ 2; - sctx->xnonce1 = (uchar*) calloc(1, sctx->xnonce1_size); - if (unlikely(!sctx->xnonce1)) { - applog(LOG_ERR, "Failed to alloc xnonce1"); - pthread_mutex_unlock(&sctx->work_lock); - goto out; + sctx->xnonce1 = (uchar*)calloc(1, sctx->xnonce1_size); + if(sctx->xnonce1 == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); } hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size); sctx->xnonce2_size = xn2_size; pthread_mutex_unlock(&sctx->work_lock); - if (pndx == 0 && opt_debug) /* pool dynamic change */ + if(pndx == 0 && opt_debug) /* pool dynamic change */ applog(LOG_DEBUG, "Stratum set nonce %s with extranonce2 size=%d", - xnonce1, xn2_size); + xnonce1, xn2_size); return true; out: @@ -966,52 +1193,63 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i bool stratum_subscribe(struct stratum_ctx *sctx) { - char *s, *sret = NULL; - const char *sid; - json_t *val = NULL, *res_val, *err_val; json_error_t err; + json_t *val; + json_t *res_val; + json_t *err_val; bool ret = false, retry = false; + char *sret; + char *sid; start: - s = (char*)malloc(128 + (sctx->session_id ? strlen(sctx->session_id) : 0)); - if (retry) + char *s = (char*)malloc(128 + (sctx->session_id ? 
strlen(sctx->session_id) : 0)); + if(s == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } + if(retry) sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}"); - else if (sctx->session_id) + else if(sctx->session_id) sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\", \"%s\"]}", sctx->session_id); else sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\"]}"); - if (!stratum_send_line(sctx, s)) + if(!stratum_send_line(sctx, s)) goto out; - if (!socket_full(sctx->sock, 10)) { + if(!socket_full(sctx->sock, 10)) + { applog(LOG_ERR, "stratum_subscribe timed out"); goto out; } sret = stratum_recv_line(sctx); - if (!sret) + if(!sret) goto out; val = JSON_LOADS(sret, &err); free(sret); - if (!val) { + if(!val) + { applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); goto out; } - if (json_integer_value(json_object_get(val, "id")) != 1) { + if(json_integer_value(json_object_get(val, "id")) != 1) + { applog(LOG_WARNING, "Stratum subscribe answer id is not correct!"); } res_val = json_object_get(val, "result"); err_val = json_object_get(val, "error"); - if (!res_val || json_is_null(res_val) || - (err_val && !json_is_null(err_val))) { - if (opt_debug || retry) { + if(!res_val || json_is_null(res_val) || (err_val && !json_is_null(err_val))) + { + if(opt_debug || retry) + { free(s); - if (err_val) + if(err_val) s = json_dumps(err_val, JSON_INDENT(3)); else s = strdup("(unknown reason)"); @@ -1021,19 +1259,20 @@ bool stratum_subscribe(struct stratum_ctx *sctx) } // sid is param 1, extranonce params are 2 and 3 - if (!stratum_parse_extranonce(sctx, res_val, 1)) { + if(!stratum_parse_extranonce(sctx, res_val, 1)) + { goto out; } ret = true; // session id (optional) - sid = get_stratum_session_id(res_val); - if (opt_debug && sid) + sid = (char*)get_stratum_session_id(res_val); + if(opt_debug && sid) applog(LOG_DEBUG, "Stratum session id: %s", sid); 
pthread_mutex_lock(&sctx->work_lock); - if (sctx->session_id) + if(sctx->session_id) free(sctx->session_id); sctx->session_id = sid ? strdup(sid) : NULL; sctx->next_diff = 1.0; @@ -1041,20 +1280,19 @@ bool stratum_subscribe(struct stratum_ctx *sctx) out: free(s); - if (val) + if(val) json_decref(val); - if (!ret) { - if (sret && !retry) { - retry = true; - goto start; - } + if(!ret && sret && !retry) + { + retry = true; + goto start; } return ret; } -bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass) +bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass, bool extranonce) { json_t *val = NULL, *res_val, *err_val; char *s, *sret; @@ -1062,109 +1300,128 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p bool ret = false; s = (char*)malloc(80 + strlen(user) + strlen(pass)); + if(s == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } sprintf(s, "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}", - user, pass); + user, pass); - if (!stratum_send_line(sctx, s)) + if(!stratum_send_line(sctx, s)) + { + applog(LOG_ERR, "Error: couldn't send stratum authorization request"); goto out; + } - while (1) { + while(1) + { sret = stratum_recv_line(sctx); - if (!sret) + if(!sret) goto out; - if (!stratum_handle_method(sctx, sret)) + if(!stratum_handle_method(sctx, sret)) break; free(sret); } val = JSON_LOADS(sret, &err); free(sret); - if (!val) { + if(!val) + { applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); goto out; } - - if (json_integer_value(json_object_get(val, "id")) != 2) { + if(json_integer_value(json_object_get(val, "id")) != 2) + { applog(LOG_WARNING, "Stratum authorize answer id is not correct!"); } res_val = json_object_get(val, "result"); err_val = json_object_get(val, "error"); - - if (!res_val || json_is_false(res_val) || - (err_val && !json_is_null(err_val))) { + if(!res_val || json_is_false(res_val) || + 
(err_val && !json_is_null(err_val))) + { applog(LOG_ERR, "Stratum authentication failed"); goto out; } - sctx->tm_connected = time(NULL); ret = true; + if(extranonce) + { + // subscribe to extranonce (optional) + sprintf(s, "{\"id\": 3, \"method\": \"mining.extranonce.subscribe\", \"params\": []}"); + if(!stratum_send_line(sctx, s)) + goto out; + // reduced timeout to handle pools ignoring this method without answer (like xpool.ca) + if(!socket_full(sctx->sock, 10)) + { + if(opt_debug) + applog(LOG_DEBUG, "stratum extranonce subscribe timed out"); + goto out; + } - // subscribe to extranonce (optional) - sprintf(s, "{\"id\": 3, \"method\": \"mining.extranonce.subscribe\", \"params\": []}"); - - if (!stratum_send_line(sctx, s)) - goto out; - - // reduced timeout to handle pools ignoring this method without answer (like xpool.ca) - if (!socket_full(sctx->sock, 1)) { - if (opt_debug) - applog(LOG_DEBUG, "stratum extranonce subscribe timed out"); - goto out; - } - - sret = stratum_recv_line(sctx); - if (sret) { - json_t *extra = JSON_LOADS(sret, &err); - if (!extra) { - applog(LOG_WARNING, "JSON decode failed(%d): %s", err.line, err.text); - } else { - if (json_integer_value(json_object_get(extra, "id")) != 3) { - // we receive a standard method if extranonce is ignored - if (!stratum_handle_method(sctx, sret)) - applog(LOG_WARNING, "Stratum extranonce answer id was not correct!"); - } else { - res_val = json_object_get(extra, "result"); - if (opt_debug && (!res_val || json_is_false(res_val))) - applog(LOG_DEBUG, "extranonce subscribe not supported"); + sret = stratum_recv_line(sctx); + if(sret) + { + json_t *extra = JSON_LOADS(sret, &err); + if(!extra) + { + applog(LOG_WARNING, "JSON decode failed(%d): %s", err.line, err.text); + } + else + { + if(json_integer_value(json_object_get(extra, "id")) != 3) + { + // we receive a standard method if extranonce is ignored + if(!stratum_handle_method(sctx, sret)) + applog(LOG_WARNING, "Stratum extranonce answer id was not 
correct!"); + } + else + { + res_val = json_object_get(extra, "result"); + if(opt_debug && (!res_val || json_is_false(res_val))) + applog(LOG_DEBUG, "extranonce subscribe not supported"); + } + json_decref(extra); } - json_decref(extra); + free(sret); } - free(sret); } out: free(s); - if (val) + if(val) json_decref(val); return ret; } /** - * Extract bloc height L H... here len=3, height=0x1333e8 - * "...0000000000ffffffff2703e83313062f503253482f043d61105408" - */ +* Extract block height L H... here len=3, height=0x1333e8 +* "...0000000000ffffffff2703e83313062f503253482f043d61105408" +*/ static uint32_t getblocheight(struct stratum_ctx *sctx) { uint32_t height = 0; uint8_t hlen = 0, *p, *m; // find 0xffff tag - p = (uint8_t*) sctx->job.coinbase + 32; + p = (uint8_t*)sctx->job.coinbase + 32; m = p + 128; - while (*p != 0xff && p < m) p++; - while (*p == 0xff && p < m) p++; - if (*(p-1) == 0xff && *(p-2) == 0xff) { + while(*p != 0xff && p < m) p++; + while(*p == 0xff && p < m) p++; + if(*(p - 1) == 0xff && *(p - 2) == 0xff) + { p++; hlen = *p; p++; height = le16dec(p); p += 2; - switch (hlen) { - case 4: - height += 0x10000UL * le16dec(p); - break; - case 3: - height += 0x10000UL * (*p); - break; + switch(hlen) + { + case 4: + height += 0x10000UL * le16dec(p); + break; + case 3: + height += 0x10000UL * (*p); + break; } } return height; @@ -1172,71 +1429,117 @@ static uint32_t getblocheight(struct stratum_ctx *sctx) static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) { - const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime, *nreward; + const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *nreward; + char *stime; size_t coinb1_size, coinb2_size; bool clean, ret = false; int merkle_count, i; json_t *merkle_arr; - uchar **merkle; - int ntime; + uchar **merkle = NULL; + int32_t ntime; job_id = json_string_value(json_array_get(params, 0)); prevhash = json_string_value(json_array_get(params, 1)); coinb1 = 
json_string_value(json_array_get(params, 2)); coinb2 = json_string_value(json_array_get(params, 3)); merkle_arr = json_array_get(params, 4); - if (!merkle_arr || !json_is_array(merkle_arr)) + if(!merkle_arr || !json_is_array(merkle_arr)) goto out; - merkle_count = json_array_size(merkle_arr); - version = json_string_value(json_array_get(params, 5)); + merkle_count = (int)json_array_size(merkle_arr); + if(opt_algo != ALGO_SIA) + version = json_string_value(json_array_get(params, 5)); + else + version = "00000001"; //unused nbits = json_string_value(json_array_get(params, 6)); - stime = json_string_value(json_array_get(params, 7)); + stime = (char *)json_string_value(json_array_get(params, 7)); clean = json_is_true(json_array_get(params, 8)); nreward = json_string_value(json_array_get(params, 9)); - if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime || - strlen(prevhash) != 64 || strlen(version) != 8 || - strlen(nbits) != 8 || strlen(stime) != 8) { + if(!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime || + strlen(prevhash) != 64 || strlen(version) != 8 || strlen(nbits) != 8) + { applog(LOG_ERR, "Stratum notify: invalid parameters"); goto out; } + if(opt_algo == ALGO_SIA) + { + if(strlen(stime) != 16) + { + applog(LOG_ERR, "Stratum notify: invalid time parameter"); + goto out; + } + } + else + { + if(strlen(stime) != 8) + { + applog(LOG_ERR, "Stratum notify: invalid time parameter"); + goto out; + } + } /* store stratum server time diff */ hex2bin((uchar *)&ntime, stime, 4); - ntime = swab32(ntime) - (uint32_t) time(0); - if (ntime > sctx->srvtime_diff) { + if(opt_algo!=ALGO_SIA) + ntime = swab32(ntime) - (uint32_t)time(0); + else + ntime = ntime - (uint32_t)time(0); + + pthread_mutex_lock(&sctx->work_lock); + + if(ntime > sctx->srvtime_diff) + { sctx->srvtime_diff = ntime; - if (!opt_quiet && ntime > 20) + if(!opt_quiet && ntime > 20) applog(LOG_DEBUG, "stratum time is at least %ds in the future", ntime); } - 
merkle = (uchar**) malloc(merkle_count * sizeof(char *)); - for (i = 0; i < merkle_count; i++) { + if(merkle_count) + { + merkle = (uchar**)malloc(merkle_count * sizeof(char *)); + if(merkle == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } + } + for(i = 0; i < merkle_count; i++) + { const char *s = json_string_value(json_array_get(merkle_arr, i)); - if (!s || strlen(s) != 64) { - while (i--) + if(!s || strlen(s) != 64) + { + while(i--) free(merkle[i]); free(merkle); applog(LOG_ERR, "Stratum notify: invalid Merkle branch"); + pthread_mutex_unlock(&sctx->work_lock); goto out; } - merkle[i] = (uchar*) malloc(32); + merkle[i] = (uchar*)malloc(32); + if(merkle[i] == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } hex2bin(merkle[i], s, 32); } - pthread_mutex_lock(&sctx->work_lock); - coinb1_size = strlen(coinb1) / 2; coinb2_size = strlen(coinb2) / 2; sctx->job.coinbase_size = coinb1_size + sctx->xnonce1_size + - sctx->xnonce2_size + coinb2_size; + sctx->xnonce2_size + coinb2_size; - sctx->job.coinbase = (uchar*) realloc(sctx->job.coinbase, sctx->job.coinbase_size); + sctx->job.coinbase = (uchar*)realloc(sctx->job.coinbase, sctx->job.coinbase_size); + if(sctx->job.coinbase == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size; hex2bin(sctx->job.coinbase, coinb1, coinb1_size); memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size); - if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) + if(!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size); @@ -1244,9 +1547,12 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) sctx->job.job_id = strdup(job_id); hex2bin(sctx->job.prevhash, prevhash, 32); - sctx->job.height = getblocheight(sctx); + if(opt_algo != ALGO_SIA) + 
sctx->job.height = getblocheight(sctx); + else + sctx->job.height = 1; - for (i = 0; i < sctx->job.merkle_count; i++) + for(i = 0; i < sctx->job.merkle_count; i++) free(sctx->job.merkle[i]); free(sctx->job.merkle); sctx->job.merkle = merkle; @@ -1272,12 +1578,13 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) return ret; } +extern time_t g_work_time; static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) { double diff; diff = json_number_value(json_array_get(params, 0)); - if (diff <= 0.0) + if(diff <= 0.0) return false; pthread_mutex_lock(&sctx->work_lock); @@ -1285,9 +1592,11 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) pthread_mutex_unlock(&sctx->work_lock); /* store for api stats */ - if (diff != global_diff) { + if(diff != global_diff) + { global_diff = diff; applog(LOG_WARNING, "Stratum difficulty set to %g", diff); + g_work_time = 0; } return true; @@ -1301,15 +1610,20 @@ static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params) host = json_string_value(json_array_get(params, 0)); port_val = json_array_get(params, 1); - if (json_is_string(port_val)) + if(json_is_string(port_val)) port = atoi(json_string_value(port_val)); else - port = (int) json_integer_value(port_val); - if (!host || !port) + port = (int)json_integer_value(port_val); + if(!host || !port) return false; - + free(sctx->url); sctx->url = (char*)malloc(32 + strlen(host)); + if(sctx->url == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } sprintf(sctx->url, "stratum+tcp://%s:%d", host, port); applog(LOG_NOTICE, "Server requested reconnection to %s", sctx->url); @@ -1318,20 +1632,165 @@ static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params) return true; } +static bool stratum_pong(struct stratum_ctx *sctx, json_t *id) +{ + char buf[64]; + bool ret = false; + + if(!id || json_is_null(id)) + return ret; + + sprintf(buf, "{\"id\":%d,\"result\":\"pong\",\"error\":null}", 
+ (int)json_integer_value(id)); + ret = stratum_send_line(sctx, buf); + + return ret; +} + +static bool stratum_get_algo(struct stratum_ctx *sctx, json_t *id, json_t *params) +{ + char algo[64] = {0}; + char *s; + json_t *val; + bool ret = true; + + if(!id || json_is_null(id)) + return false; + + get_currentalgo(algo, sizeof(algo)); + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "error", json_null()); + json_object_set_new(val, "result", json_string(algo)); + + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +#include "nvml.h" +extern char driver_version[32]; +extern int cuda_arch[MAX_GPUS]; + +static bool json_object_set_error(json_t *result, int code, const char *msg) +{ + json_t *val = json_object(); + json_object_set_new(val, "code", json_integer(code)); + json_object_set_new(val, "message", json_string(msg)); + return json_object_set_new(result, "error", val) != -1; +} + +/* allow to report algo/device perf to the pool for algo stats */ +static bool stratum_benchdata(json_t *result, json_t *params, int thr_id) +{ + char algo[64] = {0}; + char vid[32], arch[8], driver[32]; + char *card; + char os[8]; + uint32_t watts = 0; + int dev_id = device_map[thr_id]; + int cuda_ver = cuda_version(); + struct cgpu_info *cgpu = &thr_info[thr_id].gpu; + json_t *val; + + if(!cgpu || !opt_stratum_stats) return false; + +#if defined(WIN32) && (defined(_M_X64) || defined(__x86_64__)) + strcpy(os, "win64"); +#else + strcpy(os, is_windows() ? "win32" : "linux"); +#endif + +#ifdef USE_WRAPNVML + cgpu->has_monitoring = true; + cgpu->gpu_power = gpu_power(cgpu); // mWatts + watts = (cgpu->gpu_power >= 1000) ? 
cgpu->gpu_power / 1000 : 0; // ignore nvapi % + gpu_info(cgpu); +#endif + cuda_gpu_clocks(cgpu); + get_currentalgo(algo, sizeof(algo)); + + card = device_name[dev_id]; + cgpu->khashes = stats_get_speed(thr_id, 0.0) / 1000.0; + + sprintf(vid, "%04hx:%04hx", cgpu->gpu_vid, cgpu->gpu_pid); + sprintf(arch, "%d", (int)cgpu->gpu_arch); + if(cuda_arch[dev_id] > 0 && cuda_arch[dev_id] != cgpu->gpu_arch) + { + // if binary was not compiled for the highest cuda arch, add it + snprintf(arch, 8, "%d@%d", (int)cgpu->gpu_arch, cuda_arch[dev_id]); + } + snprintf(driver, 32, "CUDA %d.%d %s", cuda_ver / 1000, (cuda_ver % 1000) / 10, driver_version); + driver[31] = '\0'; + + val = json_object(); + json_object_set_new(val, "algo", json_string(algo)); + json_object_set_new(val, "type", json_string("gpu")); + json_object_set_new(val, "device", json_string(card)); + json_object_set_new(val, "vendorid", json_string(vid)); + json_object_set_new(val, "arch", json_string(arch)); + json_object_set_new(val, "freq", json_integer(cgpu->gpu_clock / 1000)); + json_object_set_new(val, "memf", json_integer(cgpu->gpu_memclock / 1000)); + json_object_set_new(val, "power", json_integer(watts)); + json_object_set_new(val, "khashes", json_real(cgpu->khashes)); + json_object_set_new(val, "intensity", json_real(cgpu->intensity)); + json_object_set_new(val, "throughput", json_integer(cgpu->throughput)); + json_object_set_new(val, "client", json_string(PACKAGE_NAME "/" PACKAGE_VERSION)); + json_object_set_new(val, "os", json_string(os)); + json_object_set_new(val, "driver", json_string(driver)); + + json_object_set_new(result, "result", val); + + return true; +} + +static bool stratum_get_stats(struct stratum_ctx *sctx, json_t *id, json_t *params) +{ + char *s; + json_t *val; + bool ret; + + if(!id || json_is_null(id)) + return false; + + val = json_object(); + json_object_set(val, "id", id); + + ret = stratum_benchdata(val, params, 0); + + if(!ret) + { + json_object_set_error(val, 1, "disabled"); //EPERM + 
} + else + { + json_object_set_new(val, "error", json_null()); + } + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} static bool stratum_get_version(struct stratum_ctx *sctx, json_t *id) { char *s; json_t *val; bool ret; - - if (!id || json_is_null(id)) + + if(!id || json_is_null(id)) return false; val = json_object(); json_object_set(val, "id", id); - json_object_set_new(val, "error", json_null()); json_object_set_new(val, "result", json_string(USER_AGENT)); + json_object_set_new(val, "error", json_null()); s = json_dumps(val, 0); ret = stratum_send_line(sctx, s); json_decref(val); @@ -1347,10 +1806,10 @@ static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *p bool ret; val = json_array_get(params, 0); - if (val) + if(val) applog(LOG_NOTICE, "MESSAGE FROM SERVER: %s", json_string_value(val)); - - if (!id || json_is_null(id)) + + if(!id || json_is_null(id)) return true; val = json_object(); @@ -1364,6 +1823,27 @@ static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *p return ret; } +static bool stratum_unknown_method(struct stratum_ctx *sctx, json_t *id) +{ + char *s; + json_t *val; + bool ret = false; + + if(!id || json_is_null(id)) + return ret; + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "result", json_false()); + json_object_set_error(val, 38, "unknown method"); // ENOSYS + + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) { @@ -1373,44 +1853,75 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) bool ret = false; val = JSON_LOADS(s, &err); - if (!val) { + if(!val) + { applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); goto out; } method = json_string_value(json_object_get(val, "method")); - if (!method) + if(!method) goto out; id = 
json_object_get(val, "id"); params = json_object_get(val, "params"); - if (!strcasecmp(method, "mining.notify")) { + if(!strcasecmp(method, "mining.notify")) + { ret = stratum_notify(sctx, params); goto out; } - if (!strcasecmp(method, "mining.set_difficulty")) { + if(!strcasecmp(method, "mining.ping")) + { // cgminer 4.7.1+ + if(opt_debug) applog(LOG_DEBUG, "Pool ping"); + ret = stratum_pong(sctx, id); + goto out; + } + if(!strcasecmp(method, "mining.set_difficulty")) + { ret = stratum_set_difficulty(sctx, params); goto out; } - if (!strcasecmp(method, "mining.set_extranonce")) { + if(!strcasecmp(method, "mining.set_extranonce")) + { ret = stratum_parse_extranonce(sctx, params, 0); goto out; } - if (!strcasecmp(method, "client.reconnect")) { + if(!strcasecmp(method, "client.reconnect")) + { ret = stratum_reconnect(sctx, params); goto out; } - if (!strcasecmp(method, "client.get_version")) { + if(!strcasecmp(method, "client.get_algo")) + { // ccminer only yet! + // will prevent wrong algo parameters on a pool, will be used as test on rejects + if(!opt_quiet) applog(LOG_NOTICE, "Pool asked your algo parameter"); + ret = stratum_get_algo(sctx, id, params); + goto out; + } + if(!strcasecmp(method, "client.get_stats")) + { // ccminer/yiimp only yet! + // optional to fill device benchmarks + ret = stratum_get_stats(sctx, id, params); + goto out; + } + if(!strcasecmp(method, "client.get_version")) + { ret = stratum_get_version(sctx, id); goto out; } - if (!strcasecmp(method, "client.show_message")) { + if(!strcasecmp(method, "client.show_message")) + { ret = stratum_show_message(sctx, id, params); goto out; } - + if(!ret) + { + // don't fail = disconnect stratum on unknown (and optional?) 
methods + if(opt_debug) applog(LOG_WARNING, "unknown stratum method %s!", method); + ret = stratum_unknown_method(sctx, id); + } out: - if (val) + if(val) json_decref(val); return ret; @@ -1421,8 +1932,11 @@ struct thread_q *tq_new(void) struct thread_q *tq; tq = (struct thread_q *)calloc(1, sizeof(*tq)); - if (!tq) - return NULL; + if(tq == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } INIT_LIST_HEAD(&tq->q); pthread_mutex_init(&tq->mutex, NULL); @@ -1435,10 +1949,11 @@ void tq_free(struct thread_q *tq) { struct tq_ent *ent, *iter; - if (!tq) + if(!tq) return; - list_for_each_entry_safe(ent, iter, &tq->q, q_node, struct tq_ent, struct tq_ent) { + list_for_each_entry_safe(ent, iter, &tq->q, q_node, struct tq_ent, struct tq_ent) + { list_del(&ent->q_node); free(ent); } @@ -1476,17 +1991,23 @@ bool tq_push(struct thread_q *tq, void *data) bool rc = true; ent = (struct tq_ent *)calloc(1, sizeof(*ent)); - if (!ent) - return false; + if(ent == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } ent->data = data; INIT_LIST_HEAD(&ent->q_node); pthread_mutex_lock(&tq->mutex); - if (!tq->frozen) { + if(!tq->frozen) + { list_add_tail(&ent->q_node, &tq->q); - } else { + } + else + { free(ent); rc = false; } @@ -1505,16 +2026,16 @@ void *tq_pop(struct thread_q *tq, const struct timespec *abstime) pthread_mutex_lock(&tq->mutex); - if (!list_empty(&tq->q)) + if(!list_empty(&tq->q)) goto pop; - if (abstime) + if(abstime) rc = pthread_cond_timedwait(&tq->cond, &tq->mutex, abstime); else rc = pthread_cond_wait(&tq->cond, &tq->mutex); - if (rc) + if(rc) goto out; - if (list_empty(&tq->q)) + if(list_empty(&tq->q)) goto out; pop: @@ -1530,9 +2051,9 @@ void *tq_pop(struct thread_q *tq, const struct timespec *abstime) } /** - * @param buf char[9] mini - * @param time_t timer to convert - */ +* @param buf char[9] mini +* @param time_t timer to convert +*/ size_t time2str(char* buf, time_t timer) { struct tm* tm_info; @@ -1541,12 +2062,17 @@ size_t 
time2str(char* buf, time_t timer) } /** - * Alloc and returns time string (to be freed) - * @param time_t timer to convert - */ +* Alloc and returns time string (to be freed) +* @param time_t timer to convert +*/ char* atime2str(time_t timer) { - char* buf = (char*) malloc(16); + char* buf = (char*)malloc(16); + if(buf == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } memset(buf, 0, 16); time2str(buf, timer); return buf; @@ -1556,9 +2082,10 @@ char* atime2str(time_t timer) static char* format_hash(char* buf, uchar *hash) { int len = 0; - for (int i=0; i < 32; i += 4) { - len += sprintf(buf+len, "%02x%02x%02x%02x ", - hash[i], hash[i+1], hash[i+2], hash[i+3]); + for(int i = 0; i < 32; i += 4) + { + len += sprintf(buf + len, "%02x%02x%02x%02x ", + hash[i], hash[i + 1], hash[i + 2], hash[i + 3]); } return buf; } @@ -1568,10 +2095,11 @@ extern void applog_compare_hash(uchar *hash, uchar *hash2) { char s[256] = ""; int len = 0; - for (int i=0; i < 32; i += 4) { - const char *color = memcmp(hash+i, hash2+i, 4) ? CL_WHT : CL_GRY; - len += sprintf(s+len, "%s%02x%02x%02x%02x " CL_GRY, color, - hash[i], hash[i+1], hash[i+2], hash[i+3]); + for(int i = 0; i < 32; i += 4) + { + const char *color = memcmp(hash + i, hash2 + i, 4) ? 
CL_WHT : CL_GRY; + len += sprintf(s + len, "%s%02x%02x%02x%02x " CL_GRY, color, + hash[i], hash[i + 1], hash[i + 2], hash[i + 3]); s[len] = '\0'; } applog(LOG_DEBUG, "%s", s); @@ -1579,7 +2107,7 @@ extern void applog_compare_hash(uchar *hash, uchar *hash2) extern void applog_hash(uchar *hash) { - char s[128] = {'\0'}; + char s[128] = { '\0' }; applog(LOG_DEBUG, "%s", format_hash(s, hash)); } @@ -1600,11 +2128,13 @@ void do_gpu_tests(void) tgt[7] = 0xffff; memset(buf, 0, sizeof buf); + scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done); + + //memset(buf, 0, sizeof buf); // buf[0] = 1; buf[64] = 2; // for endian tests scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14); memset(buf, 0, sizeof buf); - scanhash_heavy(0, (uint32_t*)buf, tgt, 1, &done, 1, 84); // HEAVYCOIN_BLKHDR_SZ=84 free(work_restart); work_restart = NULL; @@ -1614,17 +2144,13 @@ void do_gpu_tests(void) void print_hash_tests(void) { - char s[128] = {'\0'}; + char s[128] = { '\0' }; uchar buf[128], hash[128]; memset(buf, 0, sizeof buf); // buf[0] = 1; buf[64] = 2; // for endian tests printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n"); - memset(hash, 0, sizeof hash); - animehash(&hash[0], &buf[0]); - printpfx("anime", hash); - memset(hash, 0, sizeof hash); blake256hash(&hash[0], &buf[0], 8); printpfx("blakecoin", hash); @@ -1649,10 +2175,6 @@ void print_hash_tests(void) groestlhash(&hash[0], &buf[0]); printpfx("groestl", hash); - memset(hash, 0, sizeof hash); - heavycoin_hash(&hash[0], &buf[0], 32); - printpfx("heavy", hash); - memset(hash, 0, sizeof hash); jackpothash(&hash[0], &buf[0]); printpfx("jackpot", hash); @@ -1665,10 +2187,6 @@ void print_hash_tests(void) doomhash(&hash[0], &buf[0]); printpfx("luffa", hash); - memset(hash, 0, sizeof hash); - lyra2_hash(&hash[0], &buf[0]); - printpfx("lyra2", hash); - memset(hash, 0, sizeof hash); myriadhash(&hash[0], &buf[0]); printpfx("myriad", hash); @@ -1689,6 +2207,9 @@ void print_hash_tests(void) qubithash(&hash[0], &buf[0]); printpfx("qubit", 
hash); + skeincoinhash(&hash[0], &buf[0]); + printpfx("skein", hash); + memset(hash, 0, sizeof hash); s3hash(&hash[0], &buf[0]); printpfx("S3", hash); @@ -1721,3 +2242,27 @@ void print_hash_tests(void) do_gpu_tests(); } + +void bin2hex(char *s, const unsigned char *p, size_t len) +{ + for(size_t i = 0; i < len; i++) + sprintf(s + (i * 2), "%02x", (unsigned int)p[i]); +} + +char *abin2hex(const unsigned char *p, size_t len) +{ + char *s = (char*)malloc((len * 2) + 1); + if(s == NULL) + { + applog(LOG_ERR, "Out of memory!"); + proper_exit(2); + } + bin2hex(s, p, len); + return s; +} +void applog_hex(void *data, int len) +{ + char* hex = abin2hex((uchar*)data, len); + applog(LOG_INFO, "%s", hex); + free(hex); +} diff --git a/x11/c11.cu b/x11/c11.cu new file mode 100644 index 0000000000..47021b1eca --- /dev/null +++ b/x11/c11.cu @@ -0,0 +1,266 @@ +extern "C" +{ +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" + +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +} + +#include "miner.h" +//#include +//#include +#include "cuda_helper.h" + +#include +#include + +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); + +extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); + +extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); +extern void 
quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); + +extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); + +extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads); +extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); + +extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); + +extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); + +extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); + +extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); + +extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *d_hash, uint32_t target, uint32_t *h_found); +extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); + +extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse); + +extern "C" void c11hash(void *output, const void *input) +{ + // 
blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11 + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + + unsigned char hash[128]; + memset(hash, 0, sizeof hash); + + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*)hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + memcpy(output, hash, 32); 
+} + +static THREAD uint32_t *d_hash = nullptr; + +int scanhash_c11(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) +{ + uint32_t foundnonces[2]; + const uint32_t first_nonce = pdata[19]; + + cudaDeviceProp props; + CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, device_map[thr_id])); + static THREAD uint32_t throughputmax; + + if(opt_benchmark) + ptarget[7] = 0x4f; + + static THREAD bool init = false; + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + CUDA_SAFE_CALL(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); + CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1)); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); + + unsigned int intensity; +#if defined WIN32 && !defined _WIN64 + intensity = 256 * 256 * 16; +#else + if(strstr(props.name, "970")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "980")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "1070")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "1080")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "750 Ti")) intensity = (256 * 256 * 20); + else if(strstr(props.name, "750")) intensity = (256 * 256 * 19); + else if(strstr(props.name, "960")) intensity = (256 * 256 * 19); + else intensity = (256 * 256 * 19); +#endif + throughputmax = device_intensity(device_map[thr_id], __func__, intensity); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif + + quark_groestl512_cpu_init(thr_id, throughputmax); + quark_bmw512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); + x11_simd512_cpu_init(thr_id, throughputmax); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * 4 * throughputmax)); + 
mining_has_stopped[thr_id] = false; + init = true; + } + uint32_t throughput = min(throughputmax, max_nonce - first_nonce) & 0xfffffc00; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 256 : 32; + + uint32_t endiandata[20]; + for(int k = 0; k < 20; k++) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + + quark_blake512_cpu_setBlock_80(thr_id, (uint64_t *)endiandata); + + do + { + + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash, simdthreads); + x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], d_hash, ptarget[7], foundnonces); + cudaStreamSynchronize(gpustream[thr_id]); + if(stop_mining) + { + mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr); + } + if (foundnonces[0] != 0xffffffff) + { + const uint32_t Htarg = ptarget[7]; + uint32_t vhash64[8]={0}; + if(opt_verify) + { + be32enc(&endiandata[19], foundnonces[0]); + c11hash(vhash64, endiandata); + } + if(vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + int res = 1; + *hashes_done = pdata[19] - first_nonce + throughput; + if(foundnonces[1] != 0xffffffff) + { + if(opt_verify) + { + be32enc(&endiandata[19], foundnonces[1]); + c11hash(vhash64, endiandata); + } + if(vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = foundnonces[1]; + res++; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d: Found second nonce %08x", thr_id, foundnonces[1]); + } + else + { + if(vhash64[7] != Htarg) + { + applog(LOG_INFO, 
"GPU #%d: result for %08x does not validate on CPU!", thr_id, foundnonces[1]); + } + } + } + pdata[19] = foundnonces[0]; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d: Found nonce %08x", thr_id, foundnonces[0]); + return res; + } + else + { + if(vhash64[7] != Htarg) + { + applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundnonces[0]); + } + } + } + pdata[19] += throughput; + } while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + + *hashes_done = pdata[19] - first_nonce ; + return 0; +} diff --git a/x11/cuda_x11_aes.cu b/x11/cuda_x11_aes.cu index 45a7fde991..c99f1bfbc3 100644 --- a/x11/cuda_x11_aes.cu +++ b/x11/cuda_x11_aes.cu @@ -1,353 +1,154 @@ - -/* AES Helper for inline-usage from SPH */ -#define AESx(x) SPH_C32(x) - +#include "cuda_helper.h" __constant__ __align__(64) uint32_t d_AES0[256] = { - AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), - AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), - AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), - AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC), - AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA), - AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB), - AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45), - AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B), - AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C), - AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83), - AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9), - AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A), - AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D), - AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F), - AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), 
AESx(0x3DE2E2DF), - AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA), - AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34), - AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B), - AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D), - AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413), - AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1), - AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6), - AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972), - AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85), - AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED), - AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511), - AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE), - AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B), - AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05), - AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1), - AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142), - AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF), - AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3), - AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E), - AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A), - AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6), - AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3), - AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B), - AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428), - AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD), - AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), - AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), 
AESx(0xE45C5CB8), - AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), - AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), - AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), - AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), - AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), - AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), - AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), - AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), - AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), - AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), - AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), - AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), - AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), - AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), - AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), - AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), - AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), - AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), - AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), - AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), - AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), - AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) -}; - -__constant__ __align__(64) uint32_t d_AES1[256] = { - AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), - AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), - AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), - AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), 
- AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), - AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), - AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), - AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), - AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), - AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), - AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), - AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), - AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), - AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), - AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), - AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), - AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), - AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), - AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), - AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), - AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), - AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), - AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), - AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), - AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), - AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), - AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), - AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), - AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), - AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), - AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), - 
AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), - AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), - AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), - AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), - AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), - AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), - AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), - AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), - AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), - AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), - AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), - AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), - AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), - AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), - AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), - AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), - AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), - AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), - AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), - AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), - AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), - AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), - AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), - AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), - AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), - AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), - AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), - 
AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), - AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), - AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), - AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), - AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), - AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) + 0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, + 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591, + 0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, + 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC, + 0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, + 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB, + 0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, + 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B, + 0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, + 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83, + 0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, + 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A, + 0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, + 0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F, + 0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, + 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA, + 0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, + 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B, + 0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, + 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413, + 0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, + 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6, + 0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, + 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85, + 0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, + 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511, + 0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, + 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B, + 0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, + 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1, + 0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, + 
0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF, + 0x4CCDCD81, 0x140C0C18, 0x35131326, 0x2FECECC3, + 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E, + 0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, + 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6, + 0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, + 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B, + 0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, + 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD, + 0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, + 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8, + 0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, + 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2, + 0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, + 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949, + 0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, + 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810, + 0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, + 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697, + 0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, + 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F, + 0x907070E0, 0x423E3E7C, 0xC4B5B571, 0xAA6666CC, + 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C, + 0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, + 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27, + 0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, + 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433, + 0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, + 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5, + 0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, + 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0, + 0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, + 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C }; -__constant__ __align__(64) uint32_t d_AES2[256] = { - AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), - AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), - AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), - AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), 
- AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), - AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), - AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), - AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), - AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), - AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), - AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), - AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), - AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), - AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), - AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), - AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), - AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), - AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), - AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), - AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), - AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), - AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), - AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), - AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), - AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), - AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), - AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), - AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), - AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), - AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), - AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), - 
AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), - AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), - AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), - AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), - AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), - AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), - AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), - AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), - AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), - AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), - AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), - AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), - AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), - AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), - AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), - AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), - AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), - AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), - AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), - AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), - AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), - AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), - AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), - AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), - AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), - AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), - AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), - 
AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), - AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), - AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), - AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), - AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), - AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) -}; - -__constant__ __align__(64) uint32_t d_AES3[256] = { - AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), - AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), - AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), - AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), - AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), - AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), - AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), - AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), - AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), - AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), - AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), - AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), - AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), - AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), - AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2), - AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), - AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), - AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), - AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), - AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), - AESx(0xA6F55353), 
AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), - AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), - AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), - AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), - AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), - AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), - AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), - AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), - AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), - AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), - AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), - AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), - AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC), - AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), - AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), - AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), - AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), - AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), - AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), - AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB), - AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), - AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), - AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), - AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), - AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), - AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9), - AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), - AESx(0xCAAF6565), 
AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), - AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), - AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), - AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), - AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), - AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), - AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), - AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), - AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), - AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), - AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), - AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), - AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), - AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), - AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), - AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F), - AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) -}; - - __device__ __forceinline__ void aes_gpu_init(uint32_t *const sharedMemory) { /* each thread startup will fill a uint32 */ if (threadIdx.x < 256) { sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; - sharedMemory[threadIdx.x+256] = d_AES1[threadIdx.x]; - sharedMemory[threadIdx.x+512] = d_AES2[threadIdx.x]; - sharedMemory[threadIdx.x+768] = d_AES3[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = ROL8(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 512] = ROL16(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 768] = ROL24(sharedMemory[threadIdx.x]); } } -/* tried with 3 xor.b32 asm, not faster */ -#define xor4_32(a,b,c,d) ((a ^ b) ^ (c ^ d)); +__device__ __forceinline__ +uint32_t bfe(uint32_t x, uint8_t bit, uint8_t numBits) +{ + 
uint32_t ret; + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"((uint32_t)bit), "r"((uint32_t)numBits)); + return ret; +} + +__device__ __forceinline__ +uint32_t bfi(uint32_t x, uint32_t a, uint32_t bit, uint32_t numBits) +{ + uint32_t ret; + asm("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(x), "r"(a), "r"(bit), "r"(numBits)); + return ret; +} -__device__ +__device__ __forceinline__ static void aes_round( const uint32_t *const __restrict__ sharedMemory, const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, - uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) + uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3 ) { - - y0 = xor4_32( - sharedMemory[__byte_perm(x0, 0, 0x4440)], - sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); - - y1 = xor4_32( - sharedMemory[__byte_perm(x1, 0, 0x4440)], - sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); - - y2 = xor4_32( - sharedMemory[__byte_perm(x2, 0, 0x4440)], - sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 - - y0 ^= k0; - - y3 = xor4_32( - sharedMemory[__byte_perm(x3, 0, 0x4440)], - sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 + const uint32_t a0 = (uint32_t) &sharedMemory[0]; + y0 = *(uint32_t *)(bfi(x0, a0, 2, 8)) + ^ sharedMemory[bfe(x1, 8, 8) + 256] + ^ sharedMemory[bfe(x2, 16, 8) + 512] + ^ sharedMemory[bfe(x3, 24, 8) + 768] ^ k0; + + y1 = *(uint32_t *)(bfi(x1, a0, 2, 8)) + ^sharedMemory[bfe(x2, 8, 8) + 256] + ^sharedMemory[bfe(x3, 16, 8) + 512] + ^ sharedMemory[bfe(x0, 24, 8) + 768]; + + y2 = *(uint32_t *)(bfi(x2, a0, 2, 8)) + ^sharedMemory[bfe(x3, 
8, 8) + 256] + ^sharedMemory[bfe(x0, 16, 8) + 512] + ^ sharedMemory[bfe(x1, 24, 8) + 768]; + + y3 = *(uint32_t *)(bfi(x3, a0, 2, 8)) + ^ sharedMemory[bfe(x0, 8, 8) + 256] + ^ sharedMemory[bfe(x1, 16, 8) + 512] + ^ sharedMemory[bfe(x2, 24, 8) + 768]; } -__device__ +__device__ __forceinline__ static void aes_round( const uint32_t *const __restrict__ sharedMemory, const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) { - y0 = xor4_32( - sharedMemory[__byte_perm(x0, 0, 0x4440)], - sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); - - y1 = xor4_32( - sharedMemory[__byte_perm(x1, 0, 0x4440)], - sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); - - y2 = xor4_32( - sharedMemory[__byte_perm(x2, 0, 0x4440)], - sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 - y3 = xor4_32( - sharedMemory[__byte_perm(x3, 0, 0x4440)], - sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], - sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], - sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 + const uint32_t a0 = (uint32_t)&sharedMemory[0]; + y0 = *(uint32_t *)(bfi(x0, a0, 2, 8)) + ^ sharedMemory[bfe(x1, 8, 8) + 256] + ^ sharedMemory[bfe(x2, 16, 8) + 512] + ^ sharedMemory[__byte_perm(x3, 0, 0x4443)+ 768]; + + + y1 = *(uint32_t *)(bfi(x1, a0, 2, 8)) + ^ sharedMemory[bfe(x2, 8, 8) + 256] + ^ sharedMemory[bfe(x3, 16, 8) + 512] + ^ sharedMemory[bfe(x0, 24, 8) + 768]; + + y2 = *(uint32_t *)(bfi(x2, a0, 2, 8)) + ^ sharedMemory[bfe(x3, 8, 8) + 256] + ^ sharedMemory[bfe(x0, 16, 8) + 512] + ^ sharedMemory[bfe(x1, 24, 8) + 768]; + + y3 = *(uint32_t *)(bfi(x3, a0, 2, 8)) + ^ sharedMemory[bfe(x0, 8, 8) + 256] + ^ 
sharedMemory[bfe(x1, 16, 8) + 512] + ^ sharedMemory[bfe(x2, 24, 8) + 768]; } diff --git a/x11/cuda_x11_cubehash512.cu b/x11/cuda_x11_cubehash512.cu index dad5a6b511..b5e0552601 100644 --- a/x11/cuda_x11_cubehash512.cu +++ b/x11/cuda_x11_cubehash512.cu @@ -1,296 +1,307 @@ #include "cuda_helper.h" -typedef unsigned char BitSequence; - -#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ -#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ - -#if __CUDA_ARCH__ < 350 -#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) +#define ROUND_EVEN \ + xg = (x0 + xg); \ + x0 = ROTL32(x0, 7); \ + xh = (x1 + xh); \ + x1 = ROTL32(x1, 7); \ + xi = (x2 + xi); \ + x2 = ROTL32(x2, 7); \ + xj = (x3 + xj); \ + x3 = ROTL32(x3, 7); \ + xk = (x4 + xk); \ + x4 = ROTL32(x4, 7); \ + xl = (x5 + xl); \ + x5 = ROTL32(x5, 7); \ + xm = (x6 + xm); \ + x6 = ROTL32(x6, 7); \ + xn = (x7 + xn); \ + x7 = ROTL32(x7, 7); \ + xo = (x8 + xo); \ + x8 = ROTL32(x8, 7); \ + xp = (x9 + xp); \ + x9 = ROTL32(x9, 7); \ + xq = (xa + xq); \ + xa = ROTL32(xa, 7); \ + xr = (xb + xr); \ + xb = ROTL32(xb, 7); \ + xs = (xc + xs); \ + xc = ROTL32(xc, 7); \ + xt = (xd + xt); \ + xd = ROTL32(xd, 7); \ + xu = (xe + xu); \ + xe = ROTL32(xe, 7); \ + xv = (xf + xv); \ + xf = ROTL32(xf, 7); \ + x8 ^= xg; \ + x9 ^= xh; \ + xa ^= xi; \ + xb ^= xj; \ + xc ^= xk; \ + xd ^= xl; \ + xe ^= xm; \ + xf ^= xn; \ + x0 ^= xo; \ + x1 ^= xp; \ + x2 ^= xq; \ + x3 ^= xr; \ + x4 ^= xs; \ + x5 ^= xt; \ + x6 ^= xu; \ + x7 ^= xv; \ + xi = (x8 + xi); \ + x8 = ROTL32(x8, 11); \ + xj = (x9 + xj); \ + x9 = ROTL32(x9, 11); \ + xg = (xa + xg); \ + xa = ROTL32(xa, 11); \ + xh = (xb + xh); \ + xb = ROTL32(xb, 11); \ + xm = (xc + xm); \ + xc = ROTL32(xc, 11); \ + xn = (xd + xn); \ + xd = ROTL32(xd, 11); \ + xk = (xe + xk); \ + xe = ROTL32(xe, 11); \ + xl = (xf + xl); \ + xf = ROTL32(xf, 11); \ + xq = (x0 + xq); \ + x0 = ROTL32(x0, 11); \ + xr = (x1 + xr); \ + x1 = ROTL32(x1, 11); \ + xo = (x2 + xo); \ + x2 = ROTL32(x2, 11); \ + xp = (x3 
+ xp); \ + x3 = ROTL32(x3, 11); \ + xu = (x4 + xu); \ + x4 = ROTL32(x4, 11); \ + xv = (x5 + xv); \ + x5 = ROTL32(x5, 11); \ + xs = (x6 + xs); \ + x6 = ROTL32(x6, 11); \ + xt = (x7 + xt); \ + x7 = ROTL32(x7, 11); \ + xc ^= xi; \ + xd ^= xj; \ + xe ^= xg; \ + xf ^= xh; \ + x8 ^= xm; \ + x9 ^= xn; \ + xa ^= xk; \ + xb ^= xl; \ + x4 ^= xq; \ + x5 ^= xr; \ + x6 ^= xo; \ + x7 ^= xp; \ + x0 ^= xu; \ + x1 ^= xv; \ + x2 ^= xs; \ + x3 ^= xt; + +#define ROUND_ODD \ + xj = (xc + xj); \ + xc = ROTL32(xc, 7); \ + xi = (xd + xi); \ + xd = ROTL32(xd, 7); \ + xh = (xe + xh); \ + xe = ROTL32(xe, 7); \ + xg = (xf + xg); \ + xf = ROTL32(xf, 7); \ + xn = (x8 + xn); \ + x8 = ROTL32(x8, 7); \ + xm = (x9 + xm); \ + x9 = ROTL32(x9, 7); \ + xl = (xa + xl); \ + xa = ROTL32(xa, 7); \ + xk = (xb + xk); \ + xb = ROTL32(xb, 7); \ + xr = (x4 + xr); \ + x4 = ROTL32(x4, 7); \ + xq = (x5 + xq); \ + x5 = ROTL32(x5, 7); \ + xp = (x6 + xp); \ + x6 = ROTL32(x6, 7); \ + xo = (x7 + xo); \ + x7 = ROTL32(x7, 7); \ + xv = (x0 + xv); \ + x0 = ROTL32(x0, 7); \ + xu = (x1 + xu); \ + x1 = ROTL32(x1, 7); \ + xt = (x2 + xt); \ + x2 = ROTL32(x2, 7); \ + xs = (x3 + xs); \ + x3 = ROTL32(x3, 7); \ + x4 ^= xj; \ + x5 ^= xi; \ + x6 ^= xh; \ + x7 ^= xg; \ + x0 ^= xn; \ + x1 ^= xm; \ + x2 ^= xl; \ + x3 ^= xk; \ + xc ^= xr; \ + xd ^= xq; \ + xe ^= xp; \ + xf ^= xo; \ + x8 ^= xv; \ + x9 ^= xu; \ + xa ^= xt; \ + xb ^= xs; \ + xh = (x4 + xh); \ + x4 = ROTL32(x4, 11); \ + xg = (x5 + xg); \ + x5 = ROTL32(x5, 11); \ + xj = (x6 + xj); \ + x6 = ROTL32(x6, 11); \ + xi = (x7 + xi); \ + x7 = ROTL32(x7, 11); \ + xl = (x0 + xl); \ + x0 = ROTL32(x0, 11); \ + xk = (x1 + xk); \ + x1 = ROTL32(x1, 11); \ + xn = (x2 + xn); \ + x2 = ROTL32(x2, 11); \ + xm = (x3 + xm); \ + x3 = ROTL32(x3, 11); \ + xp = (xc + xp); \ + xc = ROTL32(xc, 11); \ + xo = (xd + xo); \ + xd = ROTL32(xd, 11); \ + xr = (xe + xr); \ + xe = ROTL32(xe, 11); \ + xq = (xf + xq); \ + xf = ROTL32(xf, 11); \ + xt = (x8 + xt); \ + x8 = ROTL32(x8, 11); \ + xs = (x9 + xs); \ + x9 = 
ROTL32(x9, 11); \ + xv = (xa + xv); \ + xa = ROTL32(xa, 11); \ + xu = (xb + xu); \ + xb = ROTL32(xb, 11); \ + x0 ^= xh; \ + x1 ^= xg; \ + x2 ^= xj; \ + x3 ^= xi; \ + x4 ^= xl; \ + x5 ^= xk; \ + x6 ^= xn; \ + x7 ^= xm; \ + x8 ^= xp; \ + x9 ^= xo; \ + xa ^= xr; \ + xb ^= xq; \ + xc ^= xt; \ + xd ^= xs; \ + xe ^= xv; \ + xf ^= xu; + +#define SIXTEEN_ROUNDS \ + for (int j = 0; j < 8; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD;} +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(512, 2) #else -#define LROT(x, bits) __funnelshift_l(x, x, bits) +__launch_bounds__(256, 5) #endif - -#define ROTATEUPWARDS7(a) LROT(a,7) -#define ROTATEUPWARDS11(a) LROT(a,11) - -#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } - -__device__ __constant__ -static const uint32_t c_IV_512[32] = { - 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, - 0x4167D83E, 0x3FEE2313, 0xC701CF8C, - 0xCC39968E, 0x50AC5695, 0x4D42C787, - 0xA647A8B3, 0x97CF0BEF, 0x825B4537, - 0xEEF864D2, 0xF22090C4, 0xD0E5CD33, - 0xA23911AE, 0xFCD398D9, 0x148FE485, - 0x1B017BEF, 0xB6444532, 0x6A536159, - 0x2FF5781C, 0x91FA7934, 0x0DBADEA9, - 0xD65C8A2B, 0xA5A70E75, 0xB1C62456, - 0xBC796576, 0x1921C8F7, 0xE7989AF1, - 0x7795D246, 0xD43E3B44 -}; - -static __device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2]) -{ - int r; - int j; - int k; - int l; - int m; - -//#pragma unroll 16 - for (r = 0;r < CUBEHASH_ROUNDS;++r) { - - /* "add x_0jklm into x_1jklmn modulo 2^32" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; - - /* "rotate x_0jklm upwards by 7 bits" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); - - /* "swap x_00klm with x_01klm" */ -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 
- for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - SWAP(x[0][0][k][l][m],x[0][1][k][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jk0m with x_1jk1m" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (m = 0;m < 2;++m) - SWAP(x[1][j][k][0][m],x[1][j][k][1][m]) - - /* "add x_0jklm into x_1jklm modulo 2^32" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; - - /* "rotate x_0jklm upwards by 11 bits" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); - - /* "swap x_0j0lm with x_0j1lm" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - SWAP(x[0][j][0][l][m],x[0][j][1][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jkl0 with x_1jkl1" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) - SWAP(x[1][j][k][l][0],x[1][j][k][l][1]) - - } -} - - -static __device__ __forceinline__ void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2]) -{ - int k; - int l; - int m; - uint32_t *in = block; - -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) 
-#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][0][k][l][m] ^= *in++; -} - -static __device__ __forceinline__ void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2]) -{ - int j; - int k; - int l; - int m; - uint32_t *out = hash; - -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - *out++ = x[0][j][k][l][m]; -} - -void __device__ __forceinline__ Init(uint32_t x[2][2][2][2][2]) +void x11_cubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash) { - int i,j,k,l,m; -#if 0 - /* "the first three state words x_00000, x_00001, x_00010" */ - /* "are set to the integers h/8, b, r respectively." */ - /* "the remaining state words are set to 0." */ -#pragma unroll 2 - for (i = 0;i < 2;++i) -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[i][j][k][l][m] = 0; - x[0][0][0][0][0] = 512/8; - x[0][0][0][0][1] = CUBEHASH_BLOCKBYTES; - x[0][0][0][1][0] = CUBEHASH_ROUNDS; - - /* "the state is then transformed invertibly through 10r identical rounds */ - for (i = 0;i < 10;++i) rrounds(x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if(thread < threads) + { + uint32_t *Hash = &g_hash[16 * thread]; + + uint32_t x0 = 0x2AEA2A61 ^ Hash[0]; + uint32_t x1 = 0x50F494D4 ^ Hash[1]; + uint32_t x2 = 0x2D538B8B ^ Hash[2]; + uint32_t x3 = 0x4167D83E ^ Hash[3]; + uint32_t x4 = 0x3FEE2313 ^ Hash[4]; + uint32_t x5 = 0xC701CF8C ^ Hash[5]; + uint32_t x6 = 0xCC39968E ^ Hash[6]; + uint32_t x7 = 0x50AC5695 ^ Hash[7]; + uint32_t x8 = 0x4D42C787, x9 = 0xA647A8B3, xa = 0x97CF0BEF, xb = 0x825B4537; + uint32_t xc = 0xEEF864D2, xd = 0xF22090C4, xe = 0xD0E5CD33, xf = 0xA23911AE; + uint32_t xg = 0xFCD398D9, xh = 0x148FE485, xi = 0x1B017BEF, xj = 0xB6444532; + uint32_t xk = 0x6A536159, xl = 0x2FF5781C, xm = 0x91FA7934, xn 
= 0x0DBADEA9; + uint32_t xo = 0xD65C8A2B, xp = 0xA5A70E75, xq = 0xB1C62456, xr = 0xBC796576; + uint32_t xs = 0x1921C8F7, xt = 0xE7989AF1, xu = 0x7795D246, xv = 0xD43E3B44; + +#if __CUDA_ARCH__ > 500 + #pragma unroll + for (int j = 0; j < 8; j++) #else - const uint32_t *iv = c_IV_512; - -#pragma unroll 2 - for (i = 0;i < 2;++i) -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[i][j][k][l][m] = *iv++; + #pragma unroll 1 + for (int j = 0; j < 8; j++) #endif + { + ROUND_EVEN; + ROUND_ODD; + } + + x0 ^= (Hash[8]); + x1 ^= (Hash[9]); + x2 ^= (Hash[10]); + x3 ^= (Hash[11]); + x4 ^= (Hash[12]); + x5 ^= (Hash[13]); + x6 ^= (Hash[14]); + x7 ^= (Hash[15]); +#if __CUDA_ARCH__ > 500 + #pragma unroll + for (int j = 0; j < 8; j++) +#else + for (int j = 0; j < 8; j++) +#endif + { + ROUND_EVEN; + ROUND_ODD; + } + x0 ^= 0x80; + + for (int j = 0; j < 8; j++) + { + ROUND_EVEN; + ROUND_ODD; + } + xv ^= 1; + + for(int i = 3; i < 13; i++) + { + for (int j = 0; j < 8; j++) + { + ROUND_EVEN; + ROUND_ODD; + } + } + + Hash[0] = x0; + Hash[1] = x1; + Hash[2] = x2; + Hash[3] = x3; + Hash[4] = x4; + Hash[5] = x5; + Hash[6] = x6; + Hash[7] = x7; + Hash[8] = x8; + Hash[9] = x9; + Hash[10] = xa; + Hash[11] = xb; + Hash[12] = xc; + Hash[13] = xd; + Hash[14] = xe; + Hash[15] = xf; + } } - -void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const BitSequence *data) -{ - /* "xor the block into the first b bytes of the state" */ - /* "and then transform the state invertibly through r identical rounds" */ - block_tox((uint32_t*)data, x); - rrounds(x); -} - -void __device__ __forceinline__ Final(uint32_t x[2][2][2][2][2], BitSequence *hashval) -{ - int i; - - /* "the integer 1 is xored into the last state word x_11111" */ - x[1][1][1][1][1] ^= 1; - - /* "the state is then transformed invertibly through 10r identical rounds" */ -#pragma unroll 10 - for (i = 0;i < 
10;++i) rrounds(x); - - /* "output the first h/8 bytes of the state" */ - hash_fromx((uint32_t*)hashval, x); -} - - -/***************************************************/ -// Die Hash-Funktion -__global__ __launch_bounds__(256, 4) -void x11_cubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; - - uint32_t x[2][2][2][2][2]; - Init(x); - - // erste Hälfte des Hashes (32 bytes) - Update32(x, (const BitSequence*)Hash); - - // zweite Hälfte des Hashes (32 bytes) - Update32(x, (const BitSequence*)(Hash+8)); - - // Padding Block - uint32_t last[8]; - last[0] = 0x80; -#pragma unroll 7 - for (int i=1; i < 8; i++) last[i] = 0; - Update32(x, (const BitSequence*)last); - - Final(x, (BitSequence*)Hash); - } -} - - __host__ -void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { const uint32_t threadsperblock = 256; - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); - x11_cubehash512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x11_cubehash512_gpu_hash_64 <<> >(threads, startNounce, d_hash); } - diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu index c26cffd427..ddfc91b789 100644 --- a/x11/cuda_x11_echo.cu +++ b/x11/cuda_x11_echo.cu @@ -2,105 +2,147 @@ #include #include "cuda_helper.h" 
+#include "cuda_vector.h" -#include "cuda_x11_aes.cu" -static uint2 *d_nonce[MAX_GPUS]; +// #ifdef NOASM +#include "cuda_x11_aes_noasm.cu" +// #else +// #include "cuda_x11_aes.cu" +// #endif + static uint32_t *d_found[MAX_GPUS]; +__constant__ uint32_t P[48] = { + 0xe7e9f5f5, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + + 0xa4213d7e, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + //8-12 + 0x01425eb8, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + + 0x65978b09, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + + //21-25 + 0x2cb6b661, + 0x6b23b3b3, + 0xcf93a7cf, + 0x9d9d3751, + + 0x9ac2dea3, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + + //34-38 + 0x579f9f33, + 0xfbfbfbfb, + 0xfbfbfbfb, + 0xefefd3c7, + + 0xdbfde1dd, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + + 0x34514d9e, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + + + 0xb134347e, + 0xea6f7e7e, + 0xbd7731bd, + 0x8a8a1968, + + 0x14b8a457, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af, + + 0x265f4382, + 0xf5e7e9f5, + 0xb3b36b23, + 0xb3dbe7af + //58-61 +}; + __device__ __forceinline__ void AES_2ROUND( const uint32_t*const __restrict__ sharedMemory, uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, const uint32_t k0) { - aes_round(sharedMemory, - x0, x1, x2, x3, - k0, - x0, x1, x2, x3); - - aes_round(sharedMemory, - x0, x1, x2, x3, - x0, x1, x2, x3); - - + uint32_t y0 = + sharedMemory[__byte_perm(x0, 0, 0x4440)] ^ + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768] ^ k0; + + uint32_t y1 = + sharedMemory[__byte_perm(x1, 0, 0x4440)] ^ + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]; + + uint32_t y2 = + sharedMemory[__byte_perm(x2, 0, 0x4440)] ^ + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]; + + uint32_t y3 = + 
sharedMemory[__byte_perm(x3, 0, 0x4440)] ^ + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]; + + x0 = + sharedMemory[__byte_perm(y0, 0, 0x4440)] ^ + sharedMemory[__byte_perm(y1, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(y2, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(y3, 0, 0x4443) + 768]; + + x1 = + sharedMemory[__byte_perm(y1, 0, 0x4440)] ^ + sharedMemory[__byte_perm(y2, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(y3, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(y0, 0, 0x4443) + 768]; + + x2 = + sharedMemory[__byte_perm(y2, 0, 0x4440)] ^ + sharedMemory[__byte_perm(y3, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(y0, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(y1, 0, 0x4443) + 768]; + + x3 = + sharedMemory[__byte_perm(y3, 0, 0x4440)] ^ + sharedMemory[__byte_perm(y0, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(y1, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(y2, 0, 0x4443) + 768]; } __device__ __forceinline__ void cuda_echo_round( const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash) { - uint32_t h[16]; - const uint32_t P[48] = { - 0xe7e9f5f5, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0xa4213d7e, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - //8-12 - 0x01425eb8, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0x65978b09, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - //21-25 - 0x2cb6b661, - 0x6b23b3b3, - 0xcf93a7cf, - 0x9d9d3751, - - 0x9ac2dea3, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - //34-38 - 0x579f9f33, - 0xfbfbfbfb, - 0xfbfbfbfb, - 0xefefd3c7, - - 0xdbfde1dd, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0x34514d9e, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - - 0xb134347e, - 0xea6f7e7e, - 0xbd7731bd, - 0x8a8a1968, - - 0x14b8a457, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0x265f4382, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af - //58-61 - }; uint32_t k0; -#pragma unroll - for (int i = 0; i < 16; i++) - 
{ - h[i] = hash[i]; - } + uint32_t h[16]; + uint28 *phash = (uint28*)hash; + uint28 *outpt = (uint28*)h; + outpt[0] = phash[0]; + outpt[1] = phash[1]; k0 = 512 + 8; @@ -283,9 +325,9 @@ __device__ __forceinline__ void cuda_echo_round( t2 = (bc & 0x80808080); t3 = (cd & 0x80808080); - uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); - uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); - uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + uint32_t abx = ((t >> 7) * 27 ^ ((ab^t) << 1)); + uint32_t bcx = ((t2 >> 7) * 27 ^ ((bc^t2) << 1)); + uint32_t cdx = ((t3 >> 7) * 27 ^ ((cd^t3) << 1)); W[idx + i] = abx ^ bc ^ d; W[idx + i + 4] = bcx ^ a ^ cd; @@ -309,13 +351,12 @@ __device__ __forceinline__ void cuda_echo_round( hash[i] ^= W[i]; } - - +/* __device__ __forceinline__ -void echo_gpu_init(uint32_t *const __restrict__ sharedMemory) +void echo_gpu_init_128(uint32_t *const __restrict__ sharedMemory) { - /* each thread startup will fill a uint32 */ - if (threadIdx.x < 128) { + if (threadIdx.x < 128) + { sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; @@ -327,21 +368,34 @@ void echo_gpu_init(uint32_t *const __restrict__ sharedMemory) sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; } } +*/ -__global__ __launch_bounds__(128) -void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) +__device__ __forceinline__ +void echo_gpu_init(uint32_t *const __restrict__ sharedMemory) { - __shared__ uint32_t sharedMemory[1024]; + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 256) { + sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = ROL8(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 512] = ROL16(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 768] = ROL24(sharedMemory[threadIdx.x]); + } +} - 
echo_gpu_init(sharedMemory); +__global__ __launch_bounds__(256, 3) +void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_hash) +{ + __shared__ __align__(128) uint32_t sharedMemory[1024]; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + echo_gpu_init(sharedMemory); + __syncthreads(); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; + const uint32_t nounce = (startNounce + thread); + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = &g_hash[hashPosition<<4]; cuda_echo_round(sharedMemory, Hash); } } @@ -349,114 +403,47 @@ void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *c // Setup-Funktionen __host__ void x11_echo512_cpu_init(int thr_id, uint32_t threads) { - cudaMalloc(&d_nonce[thr_id], sizeof(uint2)); - CUDA_SAFE_CALL(cudaMalloc(&(d_found[thr_id]), 4 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&(d_found[thr_id]), 2 * sizeof(uint32_t))); } -__host__ void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { - const uint32_t threadsperblock = 128; + const uint32_t threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x11_echo512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x11_echo512_gpu_hash_64<<>>(threads, startNounce, d_hash); //MyStreamSynchronize(NULL, order, thr_id); } __host__ void x11_echo512_cpu_free(int32_t thr_id) { - cudaFreeHost(&d_nonce[thr_id]); } 
-__global__ __launch_bounds__(128) -void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector, uint32_t *const __restrict__ d_found, uint32_t target) +__global__ __launch_bounds__(256, 3) +void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, const uint64_t *const __restrict__ g_hash, uint32_t *const __restrict__ d_found, uint32_t target) { - __shared__ uint32_t sharedMemory[1024]; - echo_gpu_init(sharedMemory); - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition *8]; + __shared__ __align__(128) uint32_t sharedMemory[1024]; + echo_gpu_init(sharedMemory); + __syncthreads(); + const uint32_t nounce = (startNounce + thread); + + const uint32_t hashPosition = nounce - startNounce; + const uint32_t *const Hash = (uint32_t*)&g_hash[hashPosition *8]; uint32_t h[16]; - const uint32_t P[48] = { - 0xe7e9f5f5, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0xa4213d7e, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - //8-12 - 0x01425eb8, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0x65978b09, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - //21-25 - 0x2cb6b661, - 0x6b23b3b3, - 0xcf93a7cf, - 0x9d9d3751, - - 0x9ac2dea3, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - //34-38 - 0x579f9f33, - 0xfbfbfbfb, - 0xfbfbfbfb, - 0xefefd3c7, - - 0xdbfde1dd, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0x34514d9e, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - - 0xb134347e, - 0xea6f7e7e, - 0xbd7731bd, - 0x8a8a1968, - - 0x14b8a457, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af, - - 0x265f4382, - 0xf5e7e9f5, - 0xb3b36b23, - 0xb3dbe7af - //58-61 - }; - - -#pragma unroll 16 - 
for (int i = 0; i < 16; i++) - { - h[i] = Hash[i]; - } + uint28 *phash = (uint28*)Hash; + uint28 *outpt = (uint28*)h; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + uint32_t backup = h[7]; AES_2ROUND(sharedMemory, @@ -470,101 +457,81 @@ void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 uint32_t W[64]; -//#pragma unroll 4 + #pragma unroll 4 for (int i = 0; i < 4; i++) { - uint32_t a = P[i]; - uint32_t b = P[i + 4]; - uint32_t c = h[i + 8]; - uint32_t d = P[i + 8]; - - uint32_t ab = a ^ b; - uint32_t bc = b ^ c; - uint32_t cd = c ^ d; - - - uint32_t t = (ab & 0x80808080); - uint32_t t2 = (bc & 0x80808080); - uint32_t t3 = (cd & 0x80808080); - - uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); - uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); - uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + const uint32_t a = P[i]; + const uint32_t a2 = P[12 + i]; + const uint32_t a3 = h[i]; + const uint32_t a4 = P[36 + i]; + const uint32_t b = P[i + 4]; + const uint32_t b2 = h[i + 4]; + const uint32_t b3 = P[24 + i + 0]; + const uint32_t b4 = P[36 + i + 4]; + const uint32_t c = h[i + 8]; + const uint32_t c2 = P[12 + i + 4]; + const uint32_t c3 = P[24 + i + 4]; + const uint32_t c4 = P[36 + i + 8]; + const uint32_t d = P[i + 8]; + const uint32_t d2 = P[12 + i + 8]; + const uint32_t d3 = P[24 + i + 8]; + const uint32_t d4 = h[i + 12]; + + const uint32_t ab = a ^ b; + const uint32_t ab2 = a2 ^ b2; + const uint32_t ab3 = a3 ^ b3; + const uint32_t ab4 = a4 ^ b4; + const uint32_t bc = b ^ c; + const uint32_t bc2 = b2 ^ c2; + const uint32_t bc3 = b3 ^ c3; + const uint32_t bc4 = b4 ^ c4; + const uint32_t cd = c ^ d; + const uint32_t cd2 = c2 ^ d2; + const uint32_t cd3 = c3 ^ d3; + const uint32_t cd4 = c4 ^ d4; + + const uint32_t t = (ab & 0x80808080); + const uint32_t ta2 = (ab2 & 0x80808080); + const uint32_t ta3 = (ab3 & 0x80808080); + const uint32_t t4 = (ab4 & 0x80808080); + const uint32_t t2 = (bc & 0x80808080); + const uint32_t t22 = (bc2 & 0x80808080); + 
const uint32_t t23 = (bc3 & 0x80808080); + const uint32_t t24 = (bc4 & 0x80808080); + const uint32_t t3 = (cd & 0x80808080); + const uint32_t t32 = (cd2 & 0x80808080); + const uint32_t t33 = (cd3 & 0x80808080); + const uint32_t t34 = (cd4 & 0x80808080); + + const uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); + const uint32_t abx2 = (ta2 >> 7) * 27 ^ ((ab2^ta2) << 1); + const uint32_t abx3 = (ta3 >> 7) * 27 ^ ((ab3^ta3) << 1); + const uint32_t abx4 = (t4 >> 7) * 27 ^ ((ab4^t4) << 1); + const uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + const uint32_t bcx2 = (t22 >> 7) * 27 ^ ((bc2^t22) << 1); + const uint32_t bcx3 = (t23 >> 7) * 27 ^ ((bc3^t23) << 1); + const uint32_t bcx4 = (t24 >> 7) * 27 ^ ((bc4^t24) << 1); + const uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + const uint32_t cdx2 = (t32 >> 7) * 27 ^ ((cd2^t32) << 1); + const uint32_t cdx3 = (t33 >> 7) * 27 ^ ((cd3^t33) << 1); + const uint32_t cdx4 = (t34 >> 7) * 27 ^ ((cd4^t34) << 1); W[0 + i] = abx ^ bc ^ d; W[0 + i + 4] = bcx ^ a ^ cd; W[0 + i + 8] = cdx ^ ab ^ d; W[0 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; - - a = P[12 + i]; - b = h[i + 4]; - c = P[12 + i + 4]; - d = P[12 + i + 8]; - - ab = a ^ b; - bc = b ^ c; - cd = c ^ d; - - - t = (ab & 0x80808080); - t2 = (bc & 0x80808080); - t3 = (cd & 0x80808080); - - abx = (t >> 7) * 27 ^ ((ab^t) << 1); - bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); - cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); - - W[16 + i] = abx ^ bc ^ d; - W[16 + i + 4] = bcx ^ a ^ cd; - W[16 + i + 8] = cdx ^ ab ^ d; - W[16 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; - - a = h[i]; - b = P[24 + i + 0]; - c = P[24 + i + 4]; - d = P[24 + i + 8]; - - ab = a ^ b; - bc = b ^ c; - cd = c ^ d; - - - t = (ab & 0x80808080); - t2 = (bc & 0x80808080); - t3 = (cd & 0x80808080); - - abx = (t >> 7) * 27 ^ ((ab^t) << 1); - bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); - cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); - - W[32 + i] = abx ^ bc ^ d; - W[32 + i + 4] = bcx ^ a ^ cd; - W[32 + i + 8] = cdx ^ ab ^ d; - W[32 + i + 12] = abx ^ bcx ^ 
cdx ^ ab ^ c; - - a = P[36 + i]; - b = P[36 + i + 4]; - c = P[36 + i + 8]; - d = h[i + 12]; - - ab = a ^ b; - bc = b ^ c; - cd = c ^ d; - - t = (ab & 0x80808080); - t2 = (bc & 0x80808080); - t3 = (cd & 0x80808080); - - abx = (t >> 7) * 27 ^ ((ab^t) << 1); - bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); - cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); - - W[48 + i] = abx ^ bc ^ d; - W[48 + i + 4] = bcx ^ a ^ cd; - W[48 + i + 8] = cdx ^ ab ^ d; - W[48 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; - - } + W[16 + i] = abx2 ^ bc2 ^ d2; + W[16 + i + 4] = bcx2 ^ a2 ^ cd2; + W[16 + i + 8] = cdx2 ^ ab2 ^ d2; + W[16 + i + 12] = abx2 ^ bcx2 ^ cdx2 ^ ab2 ^ c2; + W[32 + i] = abx3 ^ bc3 ^ d3; + W[32 + i + 4] = bcx3 ^ a3 ^ cd3; + W[32 + i + 8] = cdx3 ^ ab3 ^ d3; + W[32 + i + 12] = abx3 ^ bcx3 ^ cdx3 ^ ab3 ^ c3; + W[48 + i] = abx4 ^ bc4 ^ d4; + W[48 + i + 4] = bcx4 ^ a4 ^ cd4; + W[48 + i + 8] = cdx4 ^ ab4 ^ d4; + W[48 + i + 12] = abx4 ^ bcx4 ^ cdx4 ^ ab4 ^ c4;} uint32_t k0 = 512 + 16; @@ -572,7 +539,8 @@ void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 { // Big Sub Words - #pragma unroll 4 + + #pragma unroll 4 for (int idx = 0; idx < 64; idx += 16) { AES_2ROUND(sharedMemory, @@ -593,10 +561,9 @@ void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 #pragma unroll 4 for (int i = 0; i < 4; i++) { - uint32_t t; /// 1, 5, 9, 13 - t = W[4 + i]; + uint32_t t = W[4 + i]; W[4 + i] = W[20 + i]; W[20 + i] = W[36 + i]; W[36 + i] = W[52 + i]; @@ -620,34 +587,34 @@ void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 // Mix Columns #pragma unroll - for (int i = 0; i < 4; i++) // Schleife über je 2*uint32_t - { + for (int i = 0; i < 4; i++) // Schleife über je 2*uint32_t + { #pragma unroll - for (int idx = 0; idx < 64; idx += 16) // Schleife über die elemnte - { + for (int idx = 0; idx < 64; idx += 16) // Schleife über die elemnte + { - uint32_t a = W[idx + i]; - uint32_t b = W[idx + i + 4]; - uint32_t c = W[idx + i + 8]; - 
uint32_t d = W[idx + i + 12]; + const uint32_t a = W[idx + i]; + const uint32_t b = W[idx + i + 4]; + const uint32_t c = W[idx + i + 8]; + const uint32_t d = W[idx + i + 12]; - uint32_t ab = a ^ b; - uint32_t bc = b ^ c; - uint32_t cd = c ^ d; + const uint32_t ab = a ^ b; + const uint32_t bc = b ^ c; + const uint32_t cd = c ^ d; - uint32_t t, t2, t3; - t = (ab & 0x80808080); - t2 = (bc & 0x80808080); - t3 = (cd & 0x80808080); + const uint32_t t = (ab & 0x80808080); + const uint32_t t2 = (bc & 0x80808080); + const uint32_t t3 = (cd & 0x80808080); - uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); - uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); - uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + const uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); + const uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + const uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); W[idx + i] = abx ^ bc ^ d; W[idx + i + 4] = bcx ^ a ^ cd; W[idx + i + 8] = cdx ^ ab ^ d; W[idx + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; + } } } @@ -681,10 +648,8 @@ void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 512 + (9 * 16) + 15); uint32_t bc = W[23] ^ W[43]; - uint32_t cd = W[43] ^ W[63]; uint32_t t2 = (bc & 0x80808080); - - uint32_t test = (t2 >> 7) * 27 ^ ((bc^t2) << 1) ^ W[3] ^ cd; + uint32_t test = (t2 >> 7) * 27 ^ ((bc^t2) << 1) ^ W[3] ^ W[43] ^ W[63]; bc = W[55] ^ W[11]; t2 = (bc & 0x80808080); test ^= (t2 >> 7) * 27 ^ ((bc^t2) << 1) ^ W[35] ^ W[11] ^ W[31] ^ backup; @@ -696,16 +661,16 @@ void x11_echo512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint6 } } } -__host__ void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t target, uint32_t *h_found, int order) +__host__ void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *d_hash, uint32_t target, uint32_t *h_found) { - const uint32_t threadsperblock = 128; + const uint32_t 
threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cudaMemset(d_found[thr_id], 0xff, 4*sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMemsetAsync(d_found[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id])); - x11_echo512_gpu_hash_64_final << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_found[thr_id], target); - //MyStreamSynchronize(NULL, order, thr_id); - cudaMemcpy(h_found, d_found[thr_id], 4*sizeof(uint32_t), cudaMemcpyDeviceToHost); + x11_echo512_gpu_hash_64_final << >>(threads, startNounce, (uint64_t*)d_hash, d_found[thr_id], target); + CUDA_SAFE_CALL(cudaStreamSynchronize(gpustream[thr_id])); + CUDA_SAFE_CALL(cudaMemcpy(h_found, d_found[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); } diff --git a/x11/cuda_x11_luffa512.cu b/x11/cuda_x11_luffa512.cu index eef6c5db1b..edecd2df05 100644 --- a/x11/cuda_x11_luffa512.cu +++ b/x11/cuda_x11_luffa512.cu @@ -333,15 +333,15 @@ void finalization512(hashState *state, uint32_t *b) /***************************************************/ // Die Hash-Funktion -__global__ void x11_luffa512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ void x11_luffa512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + const int hashPosition = nounce - startNounce; + uint32_t *const Hash = (uint32_t*)&g_hash[8 * hashPosition]; hashState state; #pragma unroll 40 @@ -353,7 +353,7 @@ __global__ void x11_luffa512_gpu_hash_64(uint32_t threads, uint32_t startNounce, } } -__host__ void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) { const uint32_t threadsperblock = 256; diff --git a/x11/cuda_x11_luffa512_Cubehash.cu b/x11/cuda_x11_luffa512_Cubehash.cu index ef5bb7f963..aceade3f0b 100644 --- a/x11/cuda_x11_luffa512_Cubehash.cu +++ b/x11/cuda_x11_luffa512_Cubehash.cu @@ -1,30 +1,31 @@ /* - * luffa_for_32.c - * Version 2.0 (Sep 15th 2009) - * - * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. - * - * Hitachi, Ltd. is the owner of this software and hereby grant - * the U.S. Government and any interested party the right to use - * this software for the purposes of the SHA-3 evaluation process, - * notwithstanding that this software is copyrighted. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ +* luffa_for_32.c +* Version 2.0 (Sep 15th 2009) +* +* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. +* +* Hitachi, Ltd. 
is the owner of this software and hereby grant +* the U.S. Government and any interested party the right to use +* this software for the purposes of the SHA-3 evaluation process, +* notwithstanding that this software is copyrighted. +* +* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ #include "cuda_helper.h" typedef unsigned char BitSequence; -typedef struct { - uint32_t buffer[8]; /* Buffer to be hashed */ - uint32_t chainv[40]; /* Chaining values */ +typedef struct +{ + uint32_t buffer[8]; /* Buffer to be hashed */ + uint32_t chainv[40]; /* Chaining values */ } hashState; #define MULT2(a,j)\ @@ -38,11 +39,13 @@ typedef struct { a[1+(8*j)] = a[0+(8*j)] ^ tmp;\ a[0+(8*j)] = tmp; -#if __CUDA_ARCH__ < 350 -#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) -#else -#define LROT(x, bits) __funnelshift_l(x, x, bits) -#endif +#define LROT ROTL32 + +#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ +#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ + +#define ROTATEUPWARDS7(a) LROT(a,7) +#define ROTATEUPWARDS11(a) LROT(a,11) #define TWEAK(a0,a1,a2,a3,j)\ a0 = LROT(a0,j);\ @@ -50,15 +53,6 @@ typedef struct { a2 = LROT(a2,j);\ a3 = LROT(a3,j); -#define STEP(c0,c1)\ - SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ - SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\ - MIXWORD(chainv[0],chainv[4]);\ - MIXWORD(chainv[1],chainv[5]);\ - MIXWORD(chainv[2],chainv[6]);\ - MIXWORD(chainv[3],chainv[7]);\ - ADD_CONSTANT(chainv[0],chainv[4],c0,c1); - #define SUBCRUMB(a0,a1,a2,a3,a4)\ a4 = 
a0;\ a0 |= a1;\ @@ -92,6 +86,15 @@ typedef struct { a0 ^= c0;\ b0 ^= c1; +#define STEP(c0,c1)\ + SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ + SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\ + MIXWORD(chainv[0],chainv[4]);\ + MIXWORD(chainv[1],chainv[5]);\ + MIXWORD(chainv[2],chainv[6]);\ + MIXWORD(chainv[3],chainv[7]);\ + ADD_CONSTANT(chainv[0],chainv[4],c0,c1); + // Precalculated chaining values __device__ __constant__ uint32_t c_IV[40] = { 0x8bb0a761, 0xc2e4aa8b, 0x2d539bc9, 0x381408f8, @@ -103,196 +106,369 @@ __device__ __constant__ uint32_t c_IV[40] = 0x66e66a8a, 0x2303208f, 0x486dafb4, 0xc0d37dc6, 0x634d15af, 0xe5af6747, 0x10af7e38, 0xee7e6428, 0x01262e5d, 0xc92c2e64, 0x82fee966, 0xcea738d3, -0x867de2b0, 0xe0714818, 0xda6e831f, 0xa7062529}; +0x867de2b0, 0xe0714818, 0xda6e831f, 0xa7062529 }; /* old chaining values __device__ __constant__ uint32_t c_IV[40] = { - 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, - 0x6e292011,0x90152df4,0xee058139,0xdef610bb, - 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, - 0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581, - 0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05, - 0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7, - 0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67, - 0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce, - 0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, - 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; +0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, +0x6e292011,0x90152df4,0xee058139,0xdef610bb, +0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, +0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581, +0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05, +0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7, +0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67, +0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce, +0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, +0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; */ - __device__ __constant__ uint32_t c_CNS[80] = { - 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, - 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, - 
0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, - 0x8f5b7882,0x26889ba7,0x96e1db12,0x9a226e9d, - 0xb6de10ed,0x01685f3d,0x70f47aae,0x05a17cf4, - 0x0707a3d4,0xbd09caca,0x1c1e8f51,0xf4272b28, - 0x707a3d45,0x144ae5cc,0xaeb28562,0xfaa7ae2b, - 0xbaca1589,0x2e48f1c1,0x40a46f3e,0xb923c704, - 0xfc20d9d2,0xe25e72c1,0x34552e25,0xe623bb72, - 0x7ad8818f,0x5c58a4a4,0x8438764a,0x1e38e2e7, - 0xbb6de032,0x78e38b9d,0xedb780c8,0x27586719, - 0xd9847356,0x36eda57f,0xa2c78434,0x703aace7, - 0xb213afa5,0xe028c9bf,0xc84ebe95,0x44756f91, - 0x4e608a22,0x7e8fce32,0x56d858fe,0x956548be, - 0x343b138f,0xfe191be2,0xd0ec4e3d,0x3cb226e5, - 0x2ceb4882,0x5944a28e,0xb3ad2208,0xa1c4c355, - 0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab, - 0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0, - 0x78602649,0x29131ab6,0x8edae952,0x0fc053c3, - 0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31}; + 0x303994a6, 0xe0337818, 0xc0e65299, 0x441ba90d, + 0x6cc33a12, 0x7f34d442, 0xdc56983e, 0x9389217f, + 0x1e00108f, 0xe5a8bce6, 0x7800423d, 0x5274baf4, + 0x8f5b7882, 0x26889ba7, 0x96e1db12, 0x9a226e9d, + 0xb6de10ed, 0x01685f3d, 0x70f47aae, 0x05a17cf4, + 0x0707a3d4, 0xbd09caca, 0x1c1e8f51, 0xf4272b28, + 0x707a3d45, 0x144ae5cc, 0xaeb28562, 0xfaa7ae2b, + 0xbaca1589, 0x2e48f1c1, 0x40a46f3e, 0xb923c704, + 0xfc20d9d2, 0xe25e72c1, 0x34552e25, 0xe623bb72, + 0x7ad8818f, 0x5c58a4a4, 0x8438764a, 0x1e38e2e7, + 0xbb6de032, 0x78e38b9d, 0xedb780c8, 0x27586719, + 0xd9847356, 0x36eda57f, 0xa2c78434, 0x703aace7, + 0xb213afa5, 0xe028c9bf, 0xc84ebe95, 0x44756f91, + 0x4e608a22, 0x7e8fce32, 0x56d858fe, 0x956548be, + 0x343b138f, 0xfe191be2, 0xd0ec4e3d, 0x3cb226e5, + 0x2ceb4882, 0x5944a28e, 0xb3ad2208, 0xa1c4c355, + 0xf0d2e9e3, 0x5090d577, 0xac11d7fa, 0x2d1925ab, + 0x1bcb66f2, 0xb46496ac, 0x6f2d9bc9, 0xd1925ab0, + 0x78602649, 0x29131ab6, 0x8edae952, 0x0fc053c3, + 0x3b6ba548, 0x3f014f0c, 0xedae9520, 0xfc053c31 }; /***************************************************/ __device__ __forceinline__ -void rnd512(hashState *state) +void rnd512(uint32_t *const 
__restrict__ statebuffer, uint32_t *const __restrict__ statechainv) { - int i,j; - uint32_t t[40]; - uint32_t chainv[8]; - uint32_t tmp; + int i, j; + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; #pragma unroll 8 - for(i=0;i<8;i++) + for(i = 0; i<8; i++) { t[i] = 0; #pragma unroll 5 - for(j=0;j<5;j++) + for(j = 0; j<5; j++) { - t[i] ^= state->chainv[i+8*j]; - } + t[i] ^= statechainv[i + 8 * j]; + } } - MULT2(t, 0); + MULT2(t, 0); #pragma unroll 5 - for(j=0;j<5;j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8*j] ^= t[i]; - } - } + for(i = 0; i<8; i++) + { + statechainv[i + 8 * j] ^= t[i]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for(i=0;i<8;i++) { - t[i+8*j] = state->chainv[i+8*j]; - } - } + for(i = 0; i<8; i++) + { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { - MULT2(state->chainv, j); - } + for(j = 0; j<5; j++) + { + MULT2(statechainv, j); + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[8*j+i] ^= t[8*((j+1)%5)+i]; - } - } + for(i = 0; i<8; i++) + { + statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for(i=0;i<8;i++) { - t[i+8*j] = state->chainv[i+8*j]; - } - } + for(i = 0; i<8; i++) + { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { - MULT2(state->chainv, j); - } + for(j = 0; j<5; j++) + { + MULT2(statechainv, j); + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[8*j+i] ^= t[8*((j+4)%5)+i]; - } - } + for(i = 0; i<8; i++) + { + statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8*j] ^= state->buffer[i]; - } - 
MULT2(state->buffer, 0); - } + for(i = 0; i<8; i++) + { + statechainv[i + 8 * j] ^= statebuffer[i]; + } + MULT2(statebuffer, 0); + } #pragma unroll 8 - for(i=0;i<8;i++) { - chainv[i] = state->chainv[i]; - } + for(i = 0; i<8; i++) + { + chainv[i] = statechainv[i]; + } + +#pragma unroll 1 + for(i = 0; i<=14; i+=2) + { + STEP(c_CNS[i], c_CNS[i + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]); - } + for(i = 0; i<8; i++) + { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + +#pragma unroll 1 + for(i = 0; i<=14; i+=2) + { + STEP(c_CNS[i + 16], c_CNS[i + 16 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i] = chainv[i]; - chainv[i] = state->chainv[i+8]; - } + for(i = 0; i<8; i++) + { + statechainv[i + 8] = chainv[i]; + chainv[i] = statechainv[i + 16]; + } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1); + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + +#pragma unroll 1 + for(i = 0; i<=14; i+=2) + { + STEP(c_CNS[i + 32], c_CNS[i + 32 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]); - } + for(i = 0; i<8; i++) + { + statechainv[i + 16] = chainv[i]; + chainv[i] = statechainv[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); +#pragma unroll 1 + for(i = 0; i<=14; i+=2) + { + STEP(c_CNS[i + 48], c_CNS[i + 48 + 1]); + } + +#pragma unroll 8 + for(i = 0; i<8; i++) + { + statechainv[i + 24] = chainv[i]; + chainv[i] = statechainv[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + +#pragma unroll 1 + for(i = 0; i<=14; i+=2) + { + STEP(c_CNS[i + 64], c_CNS[i + 64 + 1]); + } + +#pragma unroll 8 + for(i = 0; i<8; i++) + { + statechainv[i + 32] = chainv[i]; + } +} + + +__device__ __forceinline__ +void rnd512_finalfirst(uint32_t *const statechainv) +{ + int i, j; + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; + +#pragma unroll 8 + for (i = 0; i<8; i++) + { 
+ t[i] = 0; +#pragma unroll 5 + for (j = 0; j<5; j++) + { + t[i] ^= statechainv[i + 8 * j]; + } + } + + MULT2(t, 0); + +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + statechainv[i + 8 * j] ^= t[i]; + } + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { + MULT2(statechainv, j); + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8] = chainv[i]; - chainv[i] = state->chainv[i+16]; - } + for (i = 0; i<8; i++) { + statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; + } + } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2); +#pragma unroll 5 + for (j = 0; j<5; j++) { +#pragma unroll 8 + for (i = 0; i<8; i++) { + t[i + 8 * j] = statechainv[i + 8 * j]; + } + } +#pragma unroll 5 + for (j = 0; j<5; j++) { + MULT2(statechainv, j); + } + +#pragma unroll 5 + for (j = 0; j<5; j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]); - } + for (i = 0; i<8; i++) { + statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; + } + } + + statechainv[0 + 8 * 0] ^= 0x80000000; + statechainv[1 + 8 * 1] ^= 0x80000000; + statechainv[2 + 8 * 2] ^= 0x80000000; + statechainv[3 + 8 * 3] ^= 0x80000000; + statechainv[4 + 8 * 4] ^= 0x80000000; + #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+16] = chainv[i]; - chainv[i] = state->chainv[i+24]; - } + for (i = 0; i<8; i++) { + chainv[i] = statechainv[i]; + } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3); +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); + } + +#pragma unroll 8 + for (i = 0; i<8; i++) { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); + } #pragma unroll 
8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]); - } + for (i = 0; i<8; i++) { + statechainv[i + 8] = chainv[i]; + chainv[i] = statechainv[i + 16]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+24] = chainv[i]; - chainv[i] = state->chainv[i+32]; - } + for (i = 0; i<8; i++) { + statechainv[i + 16] = chainv[i]; + chainv[i] = statechainv[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4); +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]); - } + for (i = 0; i<8; i++) { + statechainv[i + 24] = chainv[i]; + chainv[i] = statechainv[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + +#pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+32] = chainv[i]; - } + for (i = 0; i<8; i++) { + statechainv[i + 32] = chainv[i]; + } } + + __device__ __forceinline__ void rnd512_first(uint32_t state[40], uint32_t buffer[8]) { @@ -301,87 +477,100 @@ void rnd512_first(uint32_t state[40], uint32_t buffer[8]) uint32_t tmp; #pragma unroll 5 - for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { state[0 + 8 * j] ^= buffer[0]; #pragma unroll 7 - for (i = 1; i<8; i++) { + for(i = 1; i<8; i++) + { state[i + 8 * j] ^= buffer[i]; } MULT2(buffer, 0); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { chainv[i] = state[i]; } -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i] = chainv[i]; chainv[i] = state[i + 
8]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 8] = chainv[i]; chainv[i] = state[i + 16]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 16] = chainv[i]; chainv[i] = state[i + 24]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 24] = chainv[i]; chainv[i] = state[i + 32]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 32] = chainv[i]; } } /***************************************************/ __device__ __forceinline__ -void rnd512_nullhash(uint32_t *state) +void rnd512_nullhash(uint32_t *const state) { int i, j; uint32_t t[40]; @@ -389,10 +578,12 @@ void rnd512_nullhash(uint32_t *state) uint32_t tmp; #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { t[i] = state[i + 8 * 0]; #pragma unroll 4 - for (j = 1; j<5; j++) { + for(j = 1; j<5; j++) + { t[i] ^= state[i + 8 * j]; } } @@ -400,467 +591,688 @@ void rnd512_nullhash(uint32_t *state) MULT2(t, 0); #pragma unroll 5 - for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 8 * j] ^= t[i]; } } #pragma unroll 5 - 
for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { t[i + 8 * j] = state[i + 8 * j]; } } #pragma unroll 5 - for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { MULT2(state, j); } #pragma unroll 5 - for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; } } #pragma unroll 5 - for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { t[i + 8 * j] = state[i + 8 * j]; } } #pragma unroll 5 - for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { MULT2(state, j); } #pragma unroll 5 - for (j = 0; j<5; j++) { + for(j = 0; j<5; j++) + { #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; } } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { chainv[i] = state[i]; } -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i] = chainv[i]; chainv[i] = state[i + 8]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 8] = chainv[i]; chainv[i] = state[i + 16]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 16] = chainv[i]; chainv[i] = state[i + 24]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 
1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 24] = chainv[i]; chainv[i] = state[i + 32]; } TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); -#pragma unroll 8 - for (i = 0; i<8; i++) { +#pragma unroll 1 + for(i = 0; i<8; i++) + { STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); } #pragma unroll 8 - for (i = 0; i<8; i++) { + for(i = 0; i<8; i++) + { state[i + 32] = chainv[i]; } } __device__ __forceinline__ -void Update512(hashState *state, const uint32_t*data) +void Update512(uint32_t *const __restrict__ statebuffer, uint32_t *const __restrict__ statechainv, const uint32_t *const __restrict__ data) { #pragma unroll 8 - for (int i = 0; i < 8; i++) state->buffer[i] = cuda_swab32(data[i]); - rnd512_first(state->chainv, state->buffer); + for(int i = 0; i < 8; i++) + statebuffer[i] = cuda_swab32(data[i]); + rnd512_first(statechainv, statebuffer); #pragma unroll 8 - for (int i = 0; i < 8; i++) state->buffer[i] = cuda_swab32(data[i + 8]); - rnd512(state); + for(int i = 0; i < 8; i++) + statebuffer[i] = cuda_swab32(data[i + 8]); + rnd512(statebuffer, statechainv); } /***************************************************/ __device__ __forceinline__ -void finalization512(hashState *state, uint32_t *b) +void finalization512(uint32_t *const __restrict__ statechainv, uint32_t *const __restrict__ b) { - int i,j; - - state->buffer[0] = 0x80000000; - #pragma unroll 7 - for(int i=1;i<8;i++) state->buffer[i] = 0; - rnd512(state); - - /*---- blank round with m=0 ----*/ - rnd512_nullhash(state->chainv); + int i, j; + rnd512_finalfirst(statechainv); + /*---- blank round with m=0 ----*/ + rnd512_nullhash(statechainv); #pragma unroll 8 - for(i=0;i<8;i++) { - b[i] = state->chainv[i + 8 * 0]; + for(i = 0; i<8; i++) + { + b[i] = statechainv[i + 8 * 0]; #pragma unroll 4 - for(j=1;j<5;j++) { - b[i] ^= state->chainv[i+8*j]; - } - b[i] = cuda_swab32((b[i])); - } + for(j = 1; 
j<5; j++) + { + b[i] ^= statechainv[i + 8 * j]; + } + b[i] = cuda_swab32((b[i])); + } - rnd512_nullhash(state->chainv); + rnd512_nullhash(statechainv); #pragma unroll 8 - for(i=0;i<8;i++) { - b[8 + i] = state->chainv[i + 8 * 0]; + for(i = 0; i<8; i++) + { + b[8 + i] = statechainv[i + 8 * 0]; #pragma unroll 4 - for(j=1;j<5;j++) { - b[8+i] ^= state->chainv[i+8*j]; - } - b[8 + i] = cuda_swab32((b[8 + i])); - } -} - - -typedef unsigned char BitSequence; - -#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ -#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ - -#if __CUDA_ARCH__ < 350 -#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) -#else -#define LROT(x, bits) __funnelshift_l(x, x, bits) -#endif - -#define ROTATEUPWARDS7(a) LROT(a,7) -#define ROTATEUPWARDS11(a) LROT(a,11) - -#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } - -__device__ __constant__ -static const uint32_t c_IV_512[32] = { - - 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, - 0x4167D83E, 0x3FEE2313, 0xC701CF8C, - 0xCC39968E, 0x50AC5695, 0x4D42C787, - 0xA647A8B3, 0x97CF0BEF, 0x825B4537, - 0xEEF864D2, 0xF22090C4, 0xD0E5CD33, - 0xA23911AE, 0xFCD398D9, 0x148FE485, - 0x1B017BEF, 0xB6444532, 0x6A536159, - 0x2FF5781C, 0x91FA7934, 0x0DBADEA9, - 0xD65C8A2B, 0xA5A70E75, 0xB1C62456, - 0xBC796576, 0x1921C8F7, 0xE7989AF1, - 0x7795D246, 0xD43E3B44 -}; - -__device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2]) -{ - int r; - int j; - int k; - int l; - int m; - -// #pragma unroll - for (r = 0; r < CUBEHASH_ROUNDS; ++r) { - - /* "add x_0jklm into x_1jklmn modulo 2^32" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; - - /* "rotate x_0jklm upwards by 7 bits" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m 
< 2; ++m) - x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); - - /* "swap x_00klm with x_01klm" */ -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jk0m with x_1jk1m" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) - - /* "add x_0jklm into x_1jklm modulo 2^32" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; - - /* "rotate x_0jklm upwards by 11 bits" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); - - /* "swap x_0j0lm with x_0j1lm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jkl0 with x_1jkl1" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) - SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) - + for(j = 1; j<5; j++) + { + 
b[8 + i] ^= statechainv[i + 8 * j]; + } + b[8 + i] = cuda_swab32((b[8 + i])); } } +#define ROUND_EVEN \ + xg = (x0 + xg); \ + x0 = ROTL32(x0, 7); \ + xh = (x1 + xh); \ + x1 = ROTL32(x1, 7); \ + xi = (x2 + xi); \ + x2 = ROTL32(x2, 7); \ + xj = (x3 + xj); \ + x3 = ROTL32(x3, 7); \ + xk = (x4 + xk); \ + x4 = ROTL32(x4, 7); \ + xl = (x5 + xl); \ + x5 = ROTL32(x5, 7); \ + xm = (x6 + xm); \ + x6 = ROTL32(x6, 7); \ + xn = (x7 + xn); \ + x7 = ROTL32(x7, 7); \ + xo = (x8 + xo); \ + x8 = ROTL32(x8, 7); \ + xp = (x9 + xp); \ + x9 = ROTL32(x9, 7); \ + xq = (xa + xq); \ + xa = ROTL32(xa, 7); \ + xr = (xb + xr); \ + xb = ROTL32(xb, 7); \ + xs = (xc + xs); \ + xc = ROTL32(xc, 7); \ + xt = (xd + xt); \ + xd = ROTL32(xd, 7); \ + xu = (xe + xu); \ + xe = ROTL32(xe, 7); \ + xv = (xf + xv); \ + xf = ROTL32(xf, 7); \ + x8 ^= xg; \ + x9 ^= xh; \ + xa ^= xi; \ + xb ^= xj; \ + xc ^= xk; \ + xd ^= xl; \ + xe ^= xm; \ + xf ^= xn; \ + x0 ^= xo; \ + x1 ^= xp; \ + x2 ^= xq; \ + x3 ^= xr; \ + x4 ^= xs; \ + x5 ^= xt; \ + x6 ^= xu; \ + x7 ^= xv; \ + xi = (x8 + xi); \ + x8 = ROTL32(x8, 11); \ + xj = (x9 + xj); \ + x9 = ROTL32(x9, 11); \ + xg = (xa + xg); \ + xa = ROTL32(xa, 11); \ + xh = (xb + xh); \ + xb = ROTL32(xb, 11); \ + xm = (xc + xm); \ + xc = ROTL32(xc, 11); \ + xn = (xd + xn); \ + xd = ROTL32(xd, 11); \ + xk = (xe + xk); \ + xe = ROTL32(xe, 11); \ + xl = (xf + xl); \ + xf = ROTL32(xf, 11); \ + xq = (x0 + xq); \ + x0 = ROTL32(x0, 11); \ + xr = (x1 + xr); \ + x1 = ROTL32(x1, 11); \ + xo = (x2 + xo); \ + x2 = ROTL32(x2, 11); \ + xp = (x3 + xp); \ + x3 = ROTL32(x3, 11); \ + xu = (x4 + xu); \ + x4 = ROTL32(x4, 11); \ + xv = (x5 + xv); \ + x5 = ROTL32(x5, 11); \ + xs = (x6 + xs); \ + x6 = ROTL32(x6, 11); \ + xt = (x7 + xt); \ + x7 = ROTL32(x7, 11); \ + xc ^= xi; \ + xd ^= xj; \ + xe ^= xg; \ + xf ^= xh; \ + x8 ^= xm; \ + x9 ^= xn; \ + xa ^= xk; \ + xb ^= xl; \ + x4 ^= xq; \ + x5 ^= xr; \ + x6 ^= xo; \ + x7 ^= xp; \ + x0 ^= xu; \ + x1 ^= xv; \ + x2 ^= xs; \ + x3 ^= xt; + +#define ROUND_ODD \ + 
xj = (xc + xj); \ + xc = ROTL32(xc, 7); \ + xi = (xd + xi); \ + xd = ROTL32(xd, 7); \ + xh = (xe + xh); \ + xe = ROTL32(xe, 7); \ + xg = (xf + xg); \ + xf = ROTL32(xf, 7); \ + xn = (x8 + xn); \ + x8 = ROTL32(x8, 7); \ + xm = (x9 + xm); \ + x9 = ROTL32(x9, 7); \ + xl = (xa + xl); \ + xa = ROTL32(xa, 7); \ + xk = (xb + xk); \ + xb = ROTL32(xb, 7); \ + xr = (x4 + xr); \ + x4 = ROTL32(x4, 7); \ + xq = (x5 + xq); \ + x5 = ROTL32(x5, 7); \ + xp = (x6 + xp); \ + x6 = ROTL32(x6, 7); \ + xo = (x7 + xo); \ + x7 = ROTL32(x7, 7); \ + xv = (x0 + xv); \ + x0 = ROTL32(x0, 7); \ + xu = (x1 + xu); \ + x1 = ROTL32(x1, 7); \ + xt = (x2 + xt); \ + x2 = ROTL32(x2, 7); \ + xs = (x3 + xs); \ + x3 = ROTL32(x3, 7); \ + x4 ^= xj; \ + x5 ^= xi; \ + x6 ^= xh; \ + x7 ^= xg; \ + x0 ^= xn; \ + x1 ^= xm; \ + x2 ^= xl; \ + x3 ^= xk; \ + xc ^= xr; \ + xd ^= xq; \ + xe ^= xp; \ + xf ^= xo; \ + x8 ^= xv; \ + x9 ^= xu; \ + xa ^= xt; \ + xb ^= xs; \ + xh = (x4 + xh); \ + x4 = ROTL32(x4, 11); \ + xg = (x5 + xg); \ + x5 = ROTL32(x5, 11); \ + xj = (x6 + xj); \ + x6 = ROTL32(x6, 11); \ + xi = (x7 + xi); \ + x7 = ROTL32(x7, 11); \ + xl = (x0 + xl); \ + x0 = ROTL32(x0, 11); \ + xk = (x1 + xk); \ + x1 = ROTL32(x1, 11); \ + xn = (x2 + xn); \ + x2 = ROTL32(x2, 11); \ + xm = (x3 + xm); \ + x3 = ROTL32(x3, 11); \ + xp = (xc + xp); \ + xc = ROTL32(xc, 11); \ + xo = (xd + xo); \ + xd = ROTL32(xd, 11); \ + xr = (xe + xr); \ + xe = ROTL32(xe, 11); \ + xq = (xf + xq); \ + xf = ROTL32(xf, 11); \ + xt = (x8 + xt); \ + x8 = ROTL32(x8, 11); \ + xs = (x9 + xs); \ + x9 = ROTL32(x9, 11); \ + xv = (xa + xv); \ + xa = ROTL32(xa, 11); \ + xu = (xb + xu); \ + xb = ROTL32(xb, 11); \ + x0 ^= xh; \ + x1 ^= xg; \ + x2 ^= xj; \ + x3 ^= xi; \ + x4 ^= xl; \ + x5 ^= xk; \ + x6 ^= xn; \ + x7 ^= xm; \ + x8 ^= xp; \ + x9 ^= xo; \ + xa ^= xr; \ + xb ^= xq; \ + xc ^= xt; \ + xd ^= xs; \ + xe ^= xv; \ + xf ^= xu; + +#define SIXTEEN_ROUNDS \ + for (int j = 0; j < 8; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD;} -__device__ __forceinline__ void 
block_tox(uint32_t *in, uint32_t x[2][2][2][2][2]) -{ - int k; - int l; - int m; -// uint32_t *in = block; - -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][0][k][l][m] ^= *in++; -} - -__device__ __forceinline__ void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2]) -{ - int j; - int k; - int l; - int m; -// uint32_t *out = hash; - -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - *out++ = x[0][j][k][l][m]; -} - -void __device__ __forceinline__ Init(uint32_t x[2][2][2][2][2]) -{ - int i, j, k, l, m; -#if 0 - /* "the first three state words x_00000, x_00001, x_00010" */ - /* "are set to the integers h/8, b, r respectively." */ - /* "the remaining state words are set to 0." */ -#pragma unroll 2 - for (i = 0; i < 2; ++i) -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[i][j][k][l][m] = 0; - x[0][0][0][0][0] = 512 / 8; - x[0][0][0][0][1] = CUBEHASH_BLOCKBYTES; - x[0][0][0][1][0] = CUBEHASH_ROUNDS; - - /* "the state is then transformed invertibly through 10r identical rounds */ - for (i = 0; i < 10; ++i) rrounds(x); +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(256, 4) #else - const uint32_t *iv = c_IV_512; - -#pragma unroll 2 - for (i = 0; i < 2; ++i) -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[i][j][k][l][m] = *iv++; +__launch_bounds__(256, 3) #endif -} - -void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const uint32_t *data) -{ - /* "xor the block into the first b bytes of the state" */ - /* "and then transform the state invertibly through r identical rounds" 
*/ - block_tox((uint32_t*)data, x); - rrounds(x); -} - -void __device__ __forceinline__ Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) +void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *const g_hash) { - int i; - - /* "the integer 1 is xored into the last state word x_11111" */ - x[1][1][1][1][1] ^= 1; - - /* "the state is then transformed invertibly through 10r identical rounds" */ -// #pragma unroll 10 - for (i = 0; i < 10; ++i) rrounds(x); - - /* "output the first h/8 bytes of the state" */ - hash_fromx(hashval, x); -} + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if(thread < threads) + { + uint32_t *const Hash = (uint32_t*)&g_hash[8 * thread]; + uint32_t statebuffer[8]; + uint32_t statechainv[40] = + { + 0x8bb0a761, 0xc2e4aa8b, 0x2d539bc9, 0x381408f8, + 0x478f6633, 0x255a46ff, 0x581c37f7, 0x601c2e8e, + 0x266c5f9d, 0xc34715d8, 0x8900670e, 0x51a540be, + 0xe4ce69fb, 0x5089f4d4, 0x3cc0a506, 0x609bcb02, + 0xa4e3cd82, 0xd24fd6ca, 0xc0f196dc, 0xcf41eafe, + 0x0ff2e673, 0x303804f2, 0xa7b3cd48, 0x677addd4, + 0x66e66a8a, 0x2303208f, 0x486dafb4, 0xc0d37dc6, + 0x634d15af, 0xe5af6747, 0x10af7e38, 0xee7e6428, + 0x01262e5d, 0xc92c2e64, 0x82fee966, 0xcea738d3, + 0x867de2b0, 0xe0714818, 0xda6e831f, 0xa7062529 + }; + + Update512(statebuffer, statechainv, Hash); + finalization512(statechainv, Hash); + //Cubehash -/***************************************************/ -// Die Hash-Funktion -__global__ -void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + uint32_t x0 = 0x2AEA2A61 ^ Hash[0]; + uint32_t x1 = 0x50F494D4 ^ Hash[1]; + uint32_t x2 = 0x2D538B8B ^ Hash[2]; + uint32_t x3 = 0x4167D83E ^ Hash[3]; + uint32_t x4 = 0x3FEE2313 ^ Hash[4]; + uint32_t x5 = 0xC701CF8C ^ Hash[5]; + uint32_t x6 = 0xCC39968E ^ Hash[6]; + uint32_t x7 = 0x50AC5695 ^ Hash[7]; + uint32_t x8 = 0x4D42C787, x9 = 0xA647A8B3, xa = 0x97CF0BEF, xb = 0x825B4537; + uint32_t xc = 0xEEF864D2, xd = 0xF22090C4, xe = 0xD0E5CD33, xf = 0xA23911AE; + uint32_t xg = 0xFCD398D9 + x0, xh = 0x148FE485 + x1, xi = 0x1B017BEF + x2, xj = 0xB6444532 + x3; + uint32_t xk = 0x6A536159 + x4, xl = 0x2FF5781C + x5, xm = 0x91FA7934 + x6, xn = 0x0DBADEA9 + x7; + uint32_t xo = 0xD65C8A2B + x8, xp = 0xA5A70E75 + x9, xq = 0xB1C62456 + xa, xr = 0xBC796576 + xb; + uint32_t xs = 0x1921C8F7 + xc, xt = 0xE7989AF1 + xd, xu = 0x7795D246 + xe, xv = 0xD43E3B44 + xf; + + + x0 = ROTL32(x0, 7); + x1 = ROTL32(x1, 7); + x2 = ROTL32(x2, 7); + x3 = ROTL32(x3, 7); + x4 = ROTL32(x4, 7); + x5 = ROTL32(x5, 7); + x6 = ROTL32(x6, 7); + x7 = ROTL32(x7, 7); + x8 = ROTL32(x8, 7); + x9 = ROTL32(x9, 7); + xa = ROTL32(xa, 7); + xb = ROTL32(xb, 7); + xc = ROTL32(xc, 7); + xd = ROTL32(xd, 7); + xe = ROTL32(xe, 7); + xf = ROTL32(xf, 7); + x8 ^= xg; + x9 ^= xh; + xa ^= xi; + xb ^= xj; + xc ^= xk; + xd ^= xl; + xe ^= xm; + xf ^= xn; + x0 ^= xo; + x1 ^= xp; + x2 ^= xq; + x3 ^= xr; + x4 ^= xs; + x5 ^= xt; + x6 ^= xu; + x7 ^= xv; + xi = (x8 + xi); + x8 = ROTL32(x8, 11); + xj = (x9 + xj); + x9 = ROTL32(x9, 11); + xg = (xa + xg); + xa = ROTL32(xa, 11); + xh = (xb + xh); + xb = ROTL32(xb, 11); + xm = (xc + xm); + xc = ROTL32(xc, 11); + xn = (xd + xn); + xd = ROTL32(xd, 11); + xk = (xe + xk); + xe = ROTL32(xe, 11); + xl = (xf + xl); + xf = ROTL32(xf, 11); + xq = (x0 + xq); + x0 = ROTL32(x0, 11); + xr = (x1 + xr); + x1 = ROTL32(x1, 11); + xo = (x2 + xo); + x2 = ROTL32(x2, 11); + xp = (x3 + xp); + x3 = ROTL32(x3, 11); + xu = (x4 + xu); + x4 = ROTL32(x4, 11); + xv = (x5 + 
xv); + x5 = ROTL32(x5, 11); + xs = (x6 + xs); + x6 = ROTL32(x6, 11); + xt = (x7 + xt); + x7 = ROTL32(x7, 11); + xc ^= xi; + xd ^= xj; + xe ^= xg; + xf ^= xh; + x8 ^= xm; + x9 ^= xn; + xa ^= xk; + xb ^= xl; + x4 ^= xq; + x5 ^= xr; + x6 ^= xo; + x7 ^= xp; + x0 ^= xu; + x1 ^= xv; + x2 ^= xs; + x3 ^= xt; + + xj = (xc + xj); + xc = ROTL32(xc, 7); + xi = (xd + xi); + xd = ROTL32(xd, 7); + xh = (xe + xh); + xe = ROTL32(xe, 7); + xg = (xf + xg); + xf = ROTL32(xf, 7); + xn = (x8 + xn); + x8 = ROTL32(x8, 7); + xm = (x9 + xm); + x9 = ROTL32(x9, 7); + xl = (xa + xl); + xa = ROTL32(xa, 7); + xk = (xb + xk); + xb = ROTL32(xb, 7); + xr = (x4 + xr); + x4 = ROTL32(x4, 7); + xq = (x5 + xq); + x5 = ROTL32(x5, 7); + xp = (x6 + xp); + x6 = ROTL32(x6, 7); + xo = (x7 + xo); + x7 = ROTL32(x7, 7); + xv = (x0 + xv); + x0 = ROTL32(x0, 7); + xu = (x1 + xu); + x1 = ROTL32(x1, 7); + xt = (x2 + xt); + x2 = ROTL32(x2, 7); + xs = (x3 + xs); + x3 = ROTL32(x3, 7); + x4 ^= xj; + x5 ^= xi; + x6 ^= xh; + x7 ^= xg; + x0 ^= xn; + x1 ^= xm; + x2 ^= xl; + x3 ^= xk; + xc ^= xr; + xd ^= xq; + xe ^= xp; + xf ^= xo; + x8 ^= xv; + x9 ^= xu; + xa ^= xt; + xb ^= xs; + xh = (x4 + xh); + x4 = ROTL32(x4, 11); + xg = (x5 + xg); + x5 = ROTL32(x5, 11); + xj = (x6 + xj); + x6 = ROTL32(x6, 11); + xi = (x7 + xi); + x7 = ROTL32(x7, 11); + xl = (x0 + xl); + x0 = ROTL32(x0, 11); + xk = (x1 + xk); + x1 = ROTL32(x1, 11); + xn = (x2 + xn); + x2 = ROTL32(x2, 11); + xm = (x3 + xm); + x3 = ROTL32(x3, 11); + xp = (xc + xp); + xc = ROTL32(xc, 11); + xo = (xd + xo); + xd = ROTL32(xd, 11); + xr = (xe + xr); + xe = ROTL32(xe, 11); + xq = (xf + xq); + xf = ROTL32(xf, 11); + xt = (x8 + xt); + x8 = ROTL32(x8, 11); + xs = (x9 + xs); + x9 = ROTL32(x9, 11); + xv = (xa + xv); + xa = ROTL32(xa, 11); + xu = (xb + xu); + xb = ROTL32(xb, 11); + x0 ^= xh; + x1 ^= xg; + x2 ^= xj; + x3 ^= xi; + x4 ^= xl; + x5 ^= xk; + x6 ^= xn; + x7 ^= xm; + x8 ^= xp; + x9 ^= xo; + xa ^= xr; + xb ^= xq; + xc ^= xt; + xd ^= xs; + xe ^= xv; + xf ^= xu; + + for (int j 
= 1; j < 8; j++) + { + ROUND_EVEN; + ROUND_ODD; + } + x0 ^= (Hash[8]); + x1 ^= (Hash[9]); + x2 ^= (Hash[10]); + x3 ^= (Hash[11]); + x4 ^= (Hash[12]); + x5 ^= (Hash[13]); + x6 ^= (Hash[14]); + x7 ^= (Hash[15]); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; - hashState state; -#pragma unroll 40 - for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; + for (int j = 0; j < 8; j++) + { + ROUND_EVEN; + ROUND_ODD; + } + x0 ^= 0x80; + + for (int j = 0; j < 8; j++) + { + ROUND_EVEN; + ROUND_ODD; + } + xv ^= 1; - Update512(&state, Hash); - finalization512(&state, Hash); - //Cubehash + for(int i = 3; i < 13; i++) + { + for (int j = 0; j < 8; j++) + { + ROUND_EVEN; + ROUND_ODD; + } + } - uint32_t x[2][2][2][2][2]; - Init(x); - // erste Hälfte des Hashes (32 bytes) - Update32(x, Hash); - // zweite Hälfte des Hashes (32 bytes) - Update32(x, &Hash[8]); - // Padding Block - uint32_t last[8]; - last[0] = 0x80; -#pragma unroll 7 - for (int i = 1; i < 8; i++) last[i] = 0; - Update32(x, last); - Final(x, Hash); + Hash[0] = x0; + Hash[1] = x1; + Hash[2] = x2; + Hash[3] = x3; + Hash[4] = x4; + Hash[5] = x5; + Hash[6] = x6; + Hash[7] = x7; + Hash[8] = x8; + Hash[9] = x9; + Hash[10] = xa; + Hash[11] = xb; + Hash[12] = xc; + Hash[13] = xd; + Hash[14] = xe; + Hash[15] = xf; } } -__host__ void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { - const uint32_t threadsperblock = 256; + const uint32_t threadsperblock = 256; - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); - x11_luffaCubehash512_gpu_hash_64 
<< > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x11_luffaCubehash512_gpu_hash_64 << > >(threads, startNounce, (uint64_t*)d_hash); + CUDA_SAFE_CALL(cudaGetLastError()); } - diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu index d531da9321..12b85d617c 100644 --- a/x11/cuda_x11_shavite512.cu +++ b/x11/cuda_x11_shavite512.cu @@ -1,37 +1,37 @@ #include "cuda_helper.h" #include // memcpy() +#include "cuda_vector.h" -#define TPB 128 + +#define TPB 320 __constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding) -#include "cuda_x11_aes.cu" +// #ifdef NOASM +#include "cuda_x11_aes_noasm.cu" +// #else +// #include "cuda_x11_aes.cu" +// #endif __device__ __forceinline__ -static void AES_ROUND_NOKEY( - const uint32_t* __restrict__ sharedMemory, +void AES_ROUND_NOKEY( + const uint32_t*const __restrict__ sharedMemory, uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3) { - uint32_t y0, y1, y2, y3; aes_round(sharedMemory, - x0, x1, x2, x3, - y0, y1, y2, y3); - - x0 = y0; - x1 = y1; - x2 = y2; - x3 = y3; + x0, x1, x2, x3, + x0, x1, x2, x3); } __device__ __forceinline__ -static void KEY_EXPAND_ELT( - const uint32_t* __restrict__ sharedMemory, +void KEY_EXPAND_ELT( + const uint32_t*const __restrict__ sharedMemory, uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3) { uint32_t y0, y1, y2, y3; aes_round(sharedMemory, - k0, k1, k2, k3, - y0, y1, y2, y3); + k0, k1, k2, k3, + y0, y1, y2, y3); k0 = y1; k1 = y2; @@ -40,94 +40,59 @@ static void KEY_EXPAND_ELT( } __device__ __forceinline__ -static void c512(const uint32_t*const __restrict__ sharedMemory, uint32_t *const __restrict__ state, uint32_t *const __restrict__ msg, const uint32_t count) +void shavite_gpu_init(uint32_t *sharedMemory) { - uint32_t p0, p1, p2, p3, p4, p5, p6, p7; - uint32_t p8, p9, pA, pB, pC, pD, pE, pF; - uint32_t x0, x1, x2, x3; - uint32_t rk[32]; - uint32_t i; - const uint32_t counter = count; - - p0 = state[0x0]; - p1 = state[0x1]; - p2 = 
state[0x2]; - p3 = state[0x3]; - p4 = state[0x4]; - p5 = state[0x5]; - p6 = state[0x6]; - p7 = state[0x7]; - p8 = state[0x8]; - p9 = state[0x9]; - pA = state[0xA]; - pB = state[0xB]; - pC = state[0xC]; - pD = state[0xD]; - pE = state[0xE]; - pF = state[0xF]; - - x0 = p4; - x1 = p5; - x2 = p6; - x3 = p7; -#pragma unroll - for (i = 0; i<16; i += 4) - { - rk[i] = msg[i]; - x0 ^= msg[i]; - rk[i + 1] = msg[i + 1]; - x1 ^= msg[i + 1]; - rk[i + 2] = msg[i + 2]; - x2 ^= msg[i + 2]; - rk[i + 3] = msg[i + 3]; - x3 ^= msg[i + 3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 256) { + /* each thread startup will fill a uint32 */ + sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = ROL8(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 512] = ROL16(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 768] = ROL24(sharedMemory[threadIdx.x]); +// sharedMemory[threadIdx.x + 64 * 2 ] = d_AES0[threadIdx.x + 64 * 2]; +// sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2]; +// sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2]; +// sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; } +} +__global__ __launch_bounds__(TPB, 3) +void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t *const __restrict__ g_hash) +{ + __shared__ __align__(128) uint32_t sharedMemory[1024]; - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - if (count == 512) - { - rk[16] = 0x80U; - x0 = pC ^ 0x80U; - rk[17] = 0; - x1 = pD; - rk[18] = 0; - x2 = pE; - rk[19] = 0; - x3 = pF; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] = 0; - rk[21] = 0; - rk[22] = 0; - rk[23] = 0; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] = 0; - rk[25] = 0; - rk[26] = 0; - rk[27] = 0x02000000U; - x3 ^= 0x02000000U; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] = 0; - rk[29] = 0; - rk[30] = 0; - rk[31] = 0x02000000; - x3 ^= 
0x02000000; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - } - else + shavite_gpu_init(sharedMemory); + __syncthreads(); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if(thread < threads) { - x0 = pC; - x1 = pD; - x2 = pE; - x3 = pF; + uint32_t *Hash = &g_hash[thread * 16]; + + uint32_t rk[32]; + uint32_t msg[16]; - for (i = 16; i<32; i += 4) + uint28 *phash = (uint28*)Hash; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + + uint32_t state[16] = + { + 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, + 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC, + 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, + 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A + }; + + uint32_t x0 = 0xD1901A06; + uint32_t x1 = 0x430AE307; + uint32_t x2 = 0xB29F5CD1; + uint32_t x3 = 0xDF07FBFC; + + for(int i = 0; i < 16; i += 4) { - rk[i] = msg[i]; - x0 ^= msg[i]; + + rk[i + 0] = msg[i + 0]; + x0 ^= msg[i + 0]; rk[i + 1] = msg[i + 1]; x1 ^= msg[i + 1]; rk[i + 2] = msg[i + 2]; @@ -136,1285 +101,211 @@ static void c512(const uint32_t*const __restrict__ sharedMemory, uint32_t *const x3 ^= msg[i + 3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); } - } - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + state[0] ^= x0; + state[1] ^= x1; + state[2] ^= x2; + state[3] ^= x3; - // 1 - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + // 1 + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - rk[0] ^= counter; - rk[3] ^= 0xFFFFFFFF; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - 
rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; + rk[3] ^= (0x02000000UL ^ 0xFFFFFFFFUL); //rk[31]; + rk[0] ^= 512; + // rk[3] ^= 0xFFFFFFFF; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; + x0 = state[0] ^ rk[0]; + x1 = state[1] ^ rk[1]; + x2 = state[2] ^ rk[2]; + x3 = state[3] ^ rk[3]; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21]; - rk[26] ^= rk[22]; - rk[27] ^= rk[23]; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + 
x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; - rk[0] ^= rk[25]; - x0 = pC ^ rk[0]; - rk[1] ^= rk[26]; - x1 = pD ^ rk[1]; - rk[2] ^= rk[27]; - x2 = pE ^ rk[2]; - rk[3] ^= rk[28]; - x3 = pF ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - rk[16] ^= rk[9]; - x0 = p4 ^ rk[16]; - rk[17] ^= rk[10]; - x1 = p5 ^ rk[17]; - rk[18] ^= rk[11]; - x2 = p6 ^ rk[18]; - rk[19] ^= rk[12]; - x3 = p7 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; - x1 ^= rk[29]; - rk[30] ^= 
rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - /* round 3, 7, 11 */ - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p8 ^ rk[0]; - x1 = p9 ^ rk[1]; - x2 = pA ^ rk[2]; - x3 = pB ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p0 ^ rk[16]; - x1 = p1 ^ rk[17]; - x2 = p2 ^ rk[18]; - x3 = p3 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21]; - rk[26] ^= rk[22]; - rk[27] ^= rk[23]; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - /* round 4, 8, 12 */ - rk[0] ^= rk[25]; - x0 = p4 ^ rk[0]; - rk[1] ^= rk[26]; - x1 = p5 ^ rk[1]; - rk[2] ^= rk[27]; - x2 = p6 ^ rk[2]; - rk[3] ^= rk[28]; - x3 = p7 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - rk[16] ^= rk[9]; - x0 = pC ^ rk[16]; - rk[17] ^= rk[10]; - x1 = pD ^ rk[17]; - rk[18] ^= rk[11]; - x2 = pE ^ rk[18]; - rk[19] ^= rk[12]; - x3 = pF ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; - x1 ^= rk[29]; - rk[30] ^= rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= rk[31]; - 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + state[8] ^= 0x32be246fUL; + state[9] ^= 0xe33ad1e5UL; + state[10] ^= 0xd659b13eUL; + state[11] ^= 0xb6a1a92cUL; - // 2 - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - rk[7] ^= ~counter; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21]; - rk[26] ^= rk[22]; - rk[27] ^= rk[23]; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 
^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + state[12] ^= x0; + state[13] ^= x1; + state[14] ^= x2; + state[15] ^= x3; - rk[0] ^= rk[25]; - x0 = pC ^ rk[0]; - rk[1] ^= rk[26]; - x1 = pD ^ rk[1]; - rk[2] ^= rk[27]; - x2 = pE ^ rk[2]; - rk[3] ^= rk[28]; - x3 = pF ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - rk[16] ^= rk[9]; - x0 = p4 ^ rk[16]; - rk[17] ^= rk[10]; - x1 = p5 ^ rk[17]; - rk[18] ^= rk[11]; - x2 = p6 ^ rk[18]; - rk[19] ^= rk[12]; - x3 = p7 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; - x1 ^= rk[29]; - rk[30] ^= rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= 
rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - /* round 3, 7, 11 */ - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p8 ^ rk[0]; - x1 = p9 ^ rk[1]; - x2 = pA ^ rk[2]; - x3 = pB ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p0 ^ rk[16]; - x1 = p1 ^ rk[17]; - x2 = p2 ^ rk[18]; - x3 = p3 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21]; - rk[26] ^= rk[22]; - rk[27] ^= rk[23]; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, 
rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - /* round 4, 8, 12 */ - rk[0] ^= rk[25]; - x0 = p4 ^ rk[0]; - rk[1] ^= rk[26]; - x1 = p5 ^ rk[1]; - rk[2] ^= rk[27]; - x2 = p6 ^ rk[2]; - rk[3] ^= rk[28]; - x3 = p7 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - rk[16] ^= rk[9]; - x0 = pC ^ rk[16]; - rk[17] ^= rk[10]; - x1 = pD ^ rk[17]; - rk[18] ^= rk[11]; - x2 = pE ^ rk[18]; - rk[19] ^= rk[12]; - x3 = pF ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; - x1 ^= rk[29]; - rk[30] ^= rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + rk[16] = rk[12] ^ 0x63636363UL; + rk[17] = rk[13] ^ 
0x63636363UL; + rk[18] = rk[14] ^ 0x63636363UL; + rk[19] = rk[15] ^ 0x8acdcd24UL; + x0 = state[8] ^ rk[16]; + x1 = state[9] ^ rk[17]; + x2 = state[10] ^ rk[18]; + x3 = state[11] ^ rk[19]; + rk[20] = 0x63636363UL ^ rk[16]; + rk[21] = 0x63636363UL ^ rk[17]; + rk[22] = 0x63636363UL ^ rk[18]; + rk[23] = 0x63636363UL ^ rk[19]; + rk[24] = 0x63636363UL ^ rk[20]; + rk[25] = 0x63636363UL ^ rk[21]; + rk[26] = 0x63636363UL ^ rk[22]; + rk[27] = 0x4b5f7777UL ^ rk[23]; - // 3 - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; 
- AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21]; - rk[26] ^= rk[22]; - rk[27] ^= rk[23]; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - rk[30] ^= counter; - rk[31] ^= 0xFFFFFFFF; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + rk[28] = 0x63636363UL ^ rk[24]; + rk[29] = 0x63636363UL ^ rk[25]; + rk[30] = 0x63636363UL ^ rk[26]; + rk[31] = 0x4b5f7777UL ^ rk[27]; - rk[0] ^= rk[25]; - x0 = pC ^ rk[0]; - rk[1] ^= rk[26]; - x1 = pD ^ rk[1]; - rk[2] ^= rk[27]; - x2 = pE ^ rk[2]; - rk[3] ^= rk[28]; - x3 = pF ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - rk[16] ^= rk[9]; - x0 = p4 ^ rk[16]; - rk[17] ^= rk[10]; - x1 = p5 ^ rk[17]; - rk[18] ^= rk[11]; - x2 = p6 ^ rk[18]; - rk[19] ^= rk[12]; - x3 = p7 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); 
- rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; - x1 ^= rk[29]; - rk[30] ^= rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - /* round 3, 7, 11 */ - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p8 ^ rk[0]; - x1 = p9 ^ rk[1]; - x2 = pA ^ rk[2]; - x3 = pB ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p0 ^ rk[16]; - x1 = p1 ^ rk[17]; - x2 = p2 ^ rk[18]; - x3 = p3 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21]; - rk[26] ^= rk[22]; - rk[27] ^= rk[23]; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - /* round 4, 8, 12 */ - rk[0] ^= rk[25]; - x0 = p4 ^ rk[0]; - rk[1] ^= rk[26]; - x1 = p5 ^ rk[1]; - rk[2] ^= rk[27]; - x2 = p6 ^ rk[2]; - rk[3] ^= rk[28]; - x3 = p7 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - rk[16] ^= rk[9]; - x0 = pC ^ rk[16]; - rk[17] ^= rk[10]; - x1 = pD ^ rk[17]; - rk[18] ^= rk[11]; - x2 = pE ^ rk[18]; - rk[19] ^= rk[12]; - x3 = pF ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; - x1 ^= rk[29]; - rk[30] ^= rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - - /* round 13 */ - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21] ^ 
counter; - rk[26] ^= rk[22]; - rk[27] ^= rk[23] ^ 0xFFFFFFFF; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - state[0x0] ^= p8; - state[0x1] ^= p9; - state[0x2] ^= pA; - state[0x3] ^= pB; - state[0x4] ^= pC; - state[0x5] ^= pD; - state[0x6] ^= pE; - state[0x7] ^= pF; - state[0x8] ^= p0; - state[0x9] ^= p1; - state[0xA] ^= p2; - state[0xB] ^= p3; - state[0xC] ^= p4; - state[0xD] ^= p5; - state[0xE] ^= p6; - state[0xF] ^= p7; -} - -__device__ __forceinline__ -void shavite_gpu_init(uint32_t *sharedMemory) -{ - /* each thread startup will fill a uint32 */ - if (threadIdx.x < 128) { - sharedMemory[threadIdx.x ] = d_AES0[threadIdx.x]; - sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; - sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; - sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; - - sharedMemory[threadIdx.x + 64 * 2 ] = d_AES0[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; - } -} -__global__ __launch_bounds__(TPB, 8) -void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, uint32_t *const __restrict__ g_nonceVector) -{ - __shared__ uint32_t sharedMemory[1024]; - - shavite_gpu_init(sharedMemory); - - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition*8]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - // kopiere init-state + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - uint32_t p0, p1, p2, p3, p4, p5, p6, p7; - uint32_t p8, p9, pA, pB, pC, pD, pE, pF; - uint32_t x0, x1, x2, x3; - uint32_t rk[32]; - const uint32_t msg[16] = - { - Hash[0], Hash[1], Hash[2], Hash[3], Hash[4], Hash[5], Hash[6], Hash[7], Hash[8], Hash[9], Hash[10], Hash[11], Hash[12], Hash[13], Hash[14], Hash[15] - }; - const uint32_t state[16] = - { - SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), - SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), - SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), - SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) - }; - p0 = state[0x0]; - p1 = state[0x1]; - p2 = state[0x2]; - p3 = state[0x3]; - p4 = state[0x4]; - p5 = state[0x5]; - p6 = state[0x6]; - p7 = state[0x7]; - p8 = state[0x8]; - p9 = state[0x9]; - pA = state[0xA]; - pB = state[0xB]; - pC = state[0xC]; - pD = state[0xD]; - pE = state[0xE]; - pF = state[0xF]; - - x0 = p4; - x1 = p5; - x2 = p6; - x3 = p7; - - - rk[0] = msg[0]; - x0 ^= msg[0]; - rk[1] = msg[1]; - x1 ^= msg[1]; - rk[2] = msg[2]; - x2 ^= msg[2]; - rk[3] = msg[3]; - x3 ^= msg[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - - rk[4] = msg[4]; - x0 ^= msg[4]; - rk[5] = msg[5]; - x1 ^= msg[5]; - rk[6] = msg[6]; - x2 ^= msg[6]; - rk[7] = msg[7]; - x3 ^= msg[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - - rk[8] = msg[8]; - x0 ^= msg[8]; - rk[9] = msg[9]; - x1 ^= msg[9]; - rk[10] = msg[10]; - x2 ^= msg[10]; - rk[11] = msg[11]; - x3 ^= msg[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] = msg[12]; - x0 ^= msg[12]; - rk[13] 
= msg[13]; - x1 ^= msg[13]; - rk[14] = msg[14]; - x2 ^= msg[14]; - rk[15] = msg[15]; - x3 ^= msg[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - // 1 - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[3] ^= (0x02000000UL ^ 0xFFFFFFFFUL); //rk[31]; - rk[0] ^= 512; - // rk[3] ^= 0xFFFFFFFF; + state[4] ^= x0; + state[5] ^= x1; + state[6] ^= x2; + state[7] ^= x3; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; + rk[0] ^= rk[25]; + x0 = state[12] ^ rk[0]; + rk[1] ^= rk[26]; + x1 = state[13] ^ rk[1]; + rk[2] ^= rk[27]; + x2 = state[14] ^ rk[2]; + rk[3] ^= rk[28]; + x3 = state[15] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + state[8] ^= x0; + state[9] ^= x1; + state[10] ^= x2; + state[11] ^= x3; + rk[16] ^= rk[9]; + x0 = state[4] ^ rk[16]; + rk[17] ^= rk[10]; + x1 = state[5] ^ rk[17]; + rk[18] ^= rk[11]; + x2 = state[6] ^ rk[18]; + rk[19] ^= rk[12]; + x3 = state[7] ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + 
rk[22] ^= rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; + rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + state[0] ^= x0; + state[1] ^= x1; + state[2] ^= x2; + state[3] ^= x3; + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = state[8] ^ rk[0]; + x1 = state[9] ^ rk[1]; + x2 = state[10] ^ rk[2]; + x3 = state[11] ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; rk[6] ^= rk[2]; rk[7] ^= rk[3]; x0 ^= rk[4]; @@ -1441,33 +332,22 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x1 ^= rk[13]; x2 ^= rk[14]; x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - - p8 ^= 0x32be246fUL; - p9 ^= 0xe33ad1e5UL; - pA ^= 0xd659b13eUL; - pB ^= 0xb6a1a92cUL; - - rk[16] = rk[12] ^ 0x63636363UL; - rk[17] = rk[13] ^ 0x63636363UL; - rk[18] = rk[14] ^ 0x63636363UL; - rk[19] = rk[15] ^ 0x8acdcd24UL; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; + state[4] ^= x0; + state[5] ^= x1; + state[6] ^= x2; + state[7] ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = state[0] ^ rk[16]; + x1 = state[1] ^ rk[17]; + x2 = state[2] ^ rk[18]; + x3 = state[3] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - - rk[20] = 0x63636363UL; - rk[21] = 0x63636363UL; - 
rk[22] = 0x63636363UL; - rk[23] = 0x63636363UL; - + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); rk[20] ^= rk[16]; rk[21] ^= rk[17]; rk[22] ^= rk[18]; @@ -1477,12 +357,7 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[22]; x3 ^= rk[23]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - - rk[24] = 0x63636363UL; - rk[25] = 0x63636363UL; - rk[26] = 0x63636363UL; - rk[27] = 0x4b5f7777UL; - + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); rk[24] ^= rk[20]; rk[25] ^= rk[21]; rk[26] ^= rk[22]; @@ -1492,12 +367,180 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[26]; x3 ^= rk[27]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= rk[27]; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + state[12] ^= x0; + state[13] ^= x1; + state[14] ^= x2; + state[15] ^= x3; + /* round 4, 8, 12 */ + rk[0] ^= rk[25]; + x0 = state[4] ^ rk[0]; + rk[1] ^= rk[26]; + x1 = state[5] ^ rk[1]; + rk[2] ^= rk[27]; + x2 = state[6] ^ rk[2]; + rk[3] ^= rk[28]; + x3 = state[7] ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] = 0x63636363UL; - rk[29] = 0x63636363UL; - rk[30] = 0x63636363UL; - rk[31] = 0x4b5f7777UL; + state[0] 
^= x0; + state[1] ^= x1; + state[2] ^= x2; + state[3] ^= x3; + rk[16] ^= rk[9]; + x0 = state[12] ^ rk[16]; + rk[17] ^= rk[10]; + x1 = state[13] ^ rk[17]; + rk[18] ^= rk[11]; + x2 = state[14] ^ rk[18]; + rk[19] ^= rk[12]; + x3 = state[15] ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + rk[22] ^= rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; + rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + state[8] ^= x0; + state[9] ^= x1; + state[10] ^= x2; + state[11] ^= x3; + // 2 + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = state[0] ^ rk[0]; + x1 = state[1] ^ rk[1]; + x2 = state[2] ^ rk[2]; + x3 = state[3] ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + rk[7] ^= ~512; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, 
x3); + state[12] ^= x0; + state[13] ^= x1; + state[14] ^= x2; + state[15] ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = state[8] ^ rk[16]; + x1 = state[9] ^ rk[17]; + x2 = state[10] ^ rk[18]; + x3 = state[11] ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21]; + rk[26] ^= rk[22]; + rk[27] ^= rk[23]; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); rk[28] ^= rk[24]; rk[29] ^= rk[25]; rk[30] ^= rk[26]; @@ -1507,19 +550,19 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[30]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + state[4] ^= x0; + state[5] ^= x1; + state[6] ^= x2; + state[7] ^= x3; rk[0] ^= rk[25]; - x0 = pC ^ rk[0]; + x0 = state[12] ^ rk[0]; rk[1] ^= rk[26]; - x1 = pD ^ rk[1]; + x1 = state[13] ^ rk[1]; rk[2] ^= rk[27]; - x2 = pE ^ rk[2]; + x2 = state[14] ^ rk[2]; rk[3] ^= rk[28]; - x3 = pF ^ rk[3]; + x3 = state[15] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[4] ^= rk[29]; x0 ^= rk[4]; @@ -1548,18 +591,18 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[15] ^= rk[8]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + state[8] ^= x0; + state[9] ^= x1; + state[10] ^= x2; + state[11] ^= x3; rk[16] ^= rk[9]; - x0 = p4 ^ rk[16]; + x0 = state[4] ^ 
rk[16]; rk[17] ^= rk[10]; - x1 = p5 ^ rk[17]; + x1 = state[5] ^ rk[17]; rk[18] ^= rk[11]; - x2 = p6 ^ rk[18]; + x2 = state[6] ^ rk[18]; rk[19] ^= rk[12]; - x3 = p7 ^ rk[19]; + x3 = state[7] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[20] ^= rk[13]; x0 ^= rk[20]; @@ -1588,20 +631,20 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[31] ^= rk[24]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + state[0] ^= x0; + state[1] ^= x1; + state[2] ^= x2; + state[3] ^= x3; /* round 3, 7, 11 */ KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); rk[0] ^= rk[28]; rk[1] ^= rk[29]; rk[2] ^= rk[30]; rk[3] ^= rk[31]; - x0 = p8 ^ rk[0]; - x1 = p9 ^ rk[1]; - x2 = pA ^ rk[2]; - x3 = pB ^ rk[3]; + x0 = state[8] ^ rk[0]; + x1 = state[9] ^ rk[1]; + x2 = state[10] ^ rk[2]; + x3 = state[11] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); rk[4] ^= rk[0]; @@ -1633,19 +676,19 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[14]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + state[4] ^= x0; + state[5] ^= x1; + state[6] ^= x2; + state[7] ^= x3; KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); rk[16] ^= rk[12]; rk[17] ^= rk[13]; rk[18] ^= rk[14]; rk[19] ^= rk[15]; - x0 = p0 ^ rk[16]; - x1 = p1 ^ rk[17]; - x2 = p2 ^ rk[18]; - x3 = p3 ^ rk[19]; + x0 = state[0] ^ rk[16]; + x1 = state[1] ^ rk[17]; + x2 = state[2] ^ rk[18]; + x3 = state[3] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); rk[20] ^= rk[16]; @@ -1677,19 +720,19 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[30]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; 
+ state[12] ^= x0; + state[13] ^= x1; + state[14] ^= x2; + state[15] ^= x3; /* round 4, 8, 12 */ rk[0] ^= rk[25]; - x0 = p4 ^ rk[0]; + x0 = state[4] ^ rk[0]; rk[1] ^= rk[26]; - x1 = p5 ^ rk[1]; + x1 = state[5] ^ rk[1]; rk[2] ^= rk[27]; - x2 = p6 ^ rk[2]; + x2 = state[6] ^ rk[2]; rk[3] ^= rk[28]; - x3 = p7 ^ rk[3]; + x3 = state[7] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[4] ^= rk[29]; x0 ^= rk[4]; @@ -1718,19 +761,18 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[15] ^= rk[8]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + state[0] ^= x0; + state[1] ^= x1; + state[2] ^= x2; + state[3] ^= x3; rk[16] ^= rk[9]; - x0 = pC ^ rk[16]; + x0 = state[12] ^ rk[16]; rk[17] ^= rk[10]; - x1 = pD ^ rk[17]; + x1 = state[13] ^ rk[17]; rk[18] ^= rk[11]; - x2 = pE ^ rk[18]; + x2 = state[14] ^ rk[18]; rk[19] ^= rk[12]; - x3 = pF ^ rk[19]; + x3 = state[15] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[20] ^= rk[13]; x0 ^= rk[20]; @@ -1759,28 +801,27 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[31] ^= rk[24]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + state[8] ^= x0; + state[9] ^= x1; + state[10] ^= x2; + state[11] ^= x3; - // 2 + // 3 KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); rk[0] ^= rk[28]; rk[1] ^= rk[29]; rk[2] ^= rk[30]; rk[3] ^= rk[31]; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; + x0 = state[0] ^ rk[0]; + x1 = state[1] ^ rk[1]; + x2 = state[2] ^ rk[2]; + x3 = state[3] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); rk[4] ^= rk[0]; rk[5] ^= rk[1]; rk[6] ^= rk[2]; rk[7] ^= rk[3]; - rk[7] ^= ~512; x0 ^= rk[4]; x1 ^= rk[5]; x2 ^= rk[6]; @@ -1806,19 +847,19 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t 
startNounce, uint64_t x2 ^= rk[14]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; + state[12] ^= x0; + state[13] ^= x1; + state[14] ^= x2; + state[15] ^= x3; KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); rk[16] ^= rk[12]; rk[17] ^= rk[13]; rk[18] ^= rk[14]; rk[19] ^= rk[15]; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; + x0 = state[8] ^ rk[16]; + x1 = state[9] ^ rk[17]; + x2 = state[10] ^ rk[18]; + x3 = state[11] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); rk[20] ^= rk[16]; @@ -1844,25 +885,27 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[28] ^= rk[24]; rk[29] ^= rk[25]; rk[30] ^= rk[26]; - rk[31] ^= rk[27]; + rk[31] ^= ~rk[27]; + rk[30] ^= 512; + // rk[31] ^= 0xFFFFFFFF; x0 ^= rk[28]; x1 ^= rk[29]; x2 ^= rk[30]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + state[4] ^= x0; + state[5] ^= x1; + state[6] ^= x2; + state[7] ^= x3; rk[0] ^= rk[25]; - x0 = pC ^ rk[0]; + x0 = state[12] ^ rk[0]; rk[1] ^= rk[26]; - x1 = pD ^ rk[1]; + x1 = state[13] ^ rk[1]; rk[2] ^= rk[27]; - x2 = pE ^ rk[2]; + x2 = state[14] ^ rk[2]; rk[3] ^= rk[28]; - x3 = pF ^ rk[3]; + x3 = state[15] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[4] ^= rk[29]; x0 ^= rk[4]; @@ -1891,18 +934,18 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[15] ^= rk[8]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + state[8] ^= x0; + state[9] ^= x1; + state[10] ^= x2; + state[11] ^= x3; rk[16] ^= rk[9]; - x0 = p4 ^ rk[16]; + x0 = state[4] ^ rk[16]; rk[17] ^= rk[10]; - x1 = p5 ^ rk[17]; + x1 = state[5] ^ rk[17]; rk[18] ^= rk[11]; - x2 = p6 ^ rk[18]; + x2 = state[6] ^ rk[18]; rk[19] ^= rk[12]; - x3 = p7 ^ rk[19]; + x3 = 
state[7] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[20] ^= rk[13]; x0 ^= rk[20]; @@ -1931,20 +974,21 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[31] ^= rk[24]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + state[0] ^= x0; + state[1] ^= x1; + state[2] ^= x2; + state[3] ^= x3; + /* round 3, 7, 11 */ KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); rk[0] ^= rk[28]; rk[1] ^= rk[29]; rk[2] ^= rk[30]; rk[3] ^= rk[31]; - x0 = p8 ^ rk[0]; - x1 = p9 ^ rk[1]; - x2 = pA ^ rk[2]; - x3 = pB ^ rk[3]; + x0 = state[8] ^ rk[0]; + x1 = state[9] ^ rk[1]; + x2 = state[10] ^ rk[2]; + x3 = state[11] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); rk[4] ^= rk[0]; @@ -1976,19 +1020,19 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[14]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + state[4] ^= x0; + state[5] ^= x1; + state[6] ^= x2; + state[7] ^= x3; KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); rk[16] ^= rk[12]; rk[17] ^= rk[13]; rk[18] ^= rk[14]; rk[19] ^= rk[15]; - x0 = p0 ^ rk[16]; - x1 = p1 ^ rk[17]; - x2 = p2 ^ rk[18]; - x3 = p3 ^ rk[19]; + x0 = state[0] ^ rk[16]; + x1 = state[1] ^ rk[17]; + x2 = state[2] ^ rk[18]; + x3 = state[3] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); rk[20] ^= rk[16]; @@ -2020,19 +1064,19 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[30]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; + state[12] ^= x0; + state[13] ^= x1; + state[14] ^= x2; + state[15] ^= x3; /* round 4, 8, 12 */ rk[0] ^= rk[25]; - x0 = p4 ^ rk[0]; + x0 = state[4] ^ rk[0]; rk[1] ^= rk[26]; - 
x1 = p5 ^ rk[1]; + x1 = state[5] ^ rk[1]; rk[2] ^= rk[27]; - x2 = p6 ^ rk[2]; + x2 = state[6] ^ rk[2]; rk[3] ^= rk[28]; - x3 = p7 ^ rk[3]; + x3 = state[7] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[4] ^= rk[29]; x0 ^= rk[4]; @@ -2061,18 +1105,18 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[15] ^= rk[8]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + state[0] ^= x0; + state[1] ^= x1; + state[2] ^= x2; + state[3] ^= x3; rk[16] ^= rk[9]; - x0 = pC ^ rk[16]; + x0 = state[12] ^ rk[16]; rk[17] ^= rk[10]; - x1 = pD ^ rk[17]; + x1 = state[13] ^ rk[17]; rk[18] ^= rk[11]; - x2 = pE ^ rk[18]; + x2 = state[14] ^ rk[18]; rk[19] ^= rk[12]; - x3 = pF ^ rk[19]; + x3 = state[15] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk[20] ^= rk[13]; x0 ^= rk[20]; @@ -2101,21 +1145,21 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[31] ^= rk[24]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; + state[8] ^= x0; + state[9] ^= x1; + state[10] ^= x2; + state[11] ^= x3; - // 3 + /* round 13 */ KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); rk[0] ^= rk[28]; rk[1] ^= rk[29]; rk[2] ^= rk[30]; rk[3] ^= rk[31]; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; + x0 = state[0] ^ rk[0]; + x1 = state[1] ^ rk[1]; + x2 = state[2] ^ rk[2]; + x3 = state[3] ^ rk[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); rk[4] ^= rk[0]; @@ -2147,19 +1191,19 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t x2 ^= rk[14]; x3 ^= rk[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; + state[12] ^= x0; + state[13] ^= x1; + state[14] ^= x2; + state[15] ^= x3; KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], 
rk[19]); rk[16] ^= rk[12]; rk[17] ^= rk[13]; rk[18] ^= rk[14]; rk[19] ^= rk[15]; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; + x0 = state[8] ^ rk[16]; + x1 = state[9] ^ rk[17]; + x2 = state[10] ^ rk[18]; + x3 = state[11] ^ rk[19]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); rk[20] ^= rk[16]; @@ -2173,9 +1217,9 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); rk[24] ^= rk[20]; - rk[25] ^= rk[21]; + rk[25] ^= rk[21] ^ 512; rk[26] ^= rk[22]; - rk[27] ^= rk[23]; + rk[27] ^= ~rk[23]; //^ 0xFFFFFFFF; x0 ^= rk[24]; x1 ^= rk[25]; x2 ^= rk[26]; @@ -2186,413 +1230,1279 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t rk[29] ^= rk[25]; rk[30] ^= rk[26]; rk[31] ^= rk[27]; - rk[30] ^= 512; - rk[31] ^= 0xFFFFFFFF; x0 ^= rk[28]; x1 ^= rk[29]; x2 ^= rk[30]; x3 ^= rk[31]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; + state[4] ^= x0; + state[5] ^= x1; + state[6] ^= x2; + state[7] ^= x3; + + Hash[0] = 0x72FCCDD8 ^ state[8]; + Hash[1] = 0x79CA4727 ^ state[9]; + Hash[2] = 0x128A077B ^ state[10]; + Hash[3] = 0x40D55AEC ^ state[11]; + Hash[4] = 0xD1901A06 ^ state[12]; + Hash[5] = 0x430AE307 ^ state[13]; + Hash[6] = 0xB29F5CD1 ^ state[14]; + Hash[7] = 0xDF07FBFC ^ state[15]; + Hash[8] = 0x8E45D73D ^ state[0]; + Hash[9] = 0x681AB538 ^ state[1]; + Hash[10] = 0xBDE86578 ^ state[2]; + Hash[11] = 0xDD577E47 ^ state[3]; + Hash[12] = 0xE275EADE ^ state[4]; + Hash[13] = 0x502D9FCD ^ state[5]; + Hash[14] = 0xB9357178 ^ state[6]; + Hash[15] = 0x022A4B9A ^ state[7]; + } +} + +__device__ __forceinline__ +static void c512(const uint32_t*const __restrict__ sharedMemory, uint32_t *const __restrict__ state, uint32_t *const __restrict__ msg) +{ + uint32_t p0, p1, p2, p3, p4, p5, 
p6, p7; + uint32_t p8, p9, pA, pB, pC, pD, pE, pF; + uint32_t x0, x1, x2, x3; + uint32_t rk[32]; + uint32_t i; + const uint32_t counter = 640; + + p0 = state[0x0]; + p1 = state[0x1]; + p2 = state[0x2]; + p3 = state[0x3]; + p4 = state[0x4]; + p5 = state[0x5]; + p6 = state[0x6]; + p7 = state[0x7]; + p8 = state[0x8]; + p9 = state[0x9]; + pA = state[0xA]; + pB = state[0xB]; + pC = state[0xC]; + pD = state[0xD]; + pE = state[0xE]; + pF = state[0xF]; + + x0 = p4; + x1 = p5; + x2 = p6; + x3 = p7; +#pragma unroll + for(i = 0; i<16; i += 4) + { + rk[i] = msg[i]; + x0 ^= msg[i]; + rk[i + 1] = msg[i + 1]; + x1 ^= msg[i + 1]; + rk[i + 2] = msg[i + 2]; + x2 ^= msg[i + 2]; + rk[i + 3] = msg[i + 3]; + x3 ^= msg[i + 3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + } + + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + x0 = pC; + x1 = pD; + x2 = pE; + x3 = pF; + +#pragma unroll + for(i = 16; i<32; i += 4) + { + rk[i] = msg[i]; + x0 ^= msg[i]; + rk[i + 1] = msg[i + 1]; + x1 ^= msg[i + 1]; + rk[i + 2] = msg[i + 2]; + x2 ^= msg[i + 2]; + rk[i + 3] = msg[i + 3]; + x3 ^= msg[i + 3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + } + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + // 1 + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= ~rk[31]; + rk[0] ^= counter; + //rk[3] ^= 0xFFFFFFFF; + x0 = p0 ^ rk[0]; + x1 = p1 ^ rk[1]; + x2 = p2 ^ rk[2]; + x3 = p3 ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + 
KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; + + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = p8 ^ rk[16]; + x1 = p9 ^ rk[17]; + x2 = pA ^ rk[18]; + x3 = pB ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21]; + rk[26] ^= rk[22]; + rk[27] ^= rk[23]; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= rk[27]; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + + rk[0] ^= rk[25]; + x0 = pC ^ rk[0]; + rk[1] ^= rk[26]; + x1 = pD ^ rk[1]; + rk[2] ^= rk[27]; + x2 = pE ^ rk[2]; + rk[3] ^= rk[28]; + x3 = pF ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= 
rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk[16] ^= rk[9]; + x0 = p4 ^ rk[16]; + rk[17] ^= rk[10]; + x1 = p5 ^ rk[17]; + rk[18] ^= rk[11]; + x2 = p6 ^ rk[18]; + rk[19] ^= rk[12]; + x3 = p7 ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + rk[22] ^= rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; + rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = p8 ^ rk[0]; + x1 = p9 ^ rk[1]; + x2 = pA ^ rk[2]; + x3 = pB ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= 
rk[14]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = p0 ^ rk[16]; + x1 = p1 ^ rk[17]; + x2 = p2 ^ rk[18]; + x3 = p3 ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21]; + rk[26] ^= rk[22]; + rk[27] ^= rk[23]; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= rk[27]; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + /* round 4, 8, 12 */ + rk[0] ^= rk[25]; + x0 = p4 ^ rk[0]; + rk[1] ^= rk[26]; + x1 = p5 ^ rk[1]; + rk[2] ^= rk[27]; + x2 = p6 ^ rk[2]; + rk[3] ^= rk[28]; + x3 = p7 ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + + 
p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk[16] ^= rk[9]; + x0 = pC ^ rk[16]; + rk[17] ^= rk[10]; + x1 = pD ^ rk[17]; + rk[18] ^= rk[11]; + x2 = pE ^ rk[18]; + rk[19] ^= rk[12]; + x3 = pF ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + rk[22] ^= rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; + rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + // 2 + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = p0 ^ rk[0]; + x1 = p1 ^ rk[1]; + x2 = p2 ^ rk[2]; + x3 = p3 ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + rk[7] ^= ~counter; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(sharedMemory, 
rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = p8 ^ rk[16]; + x1 = p9 ^ rk[17]; + x2 = pA ^ rk[18]; + x3 = pB ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21]; + rk[26] ^= rk[22]; + rk[27] ^= rk[23]; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= rk[27]; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + + rk[0] ^= rk[25]; + x0 = pC ^ rk[0]; + rk[1] ^= rk[26]; + x1 = pD ^ rk[1]; + rk[2] ^= rk[27]; + x2 = pE ^ rk[2]; + rk[3] ^= rk[28]; + x3 = pF ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk[16] ^= rk[9]; + x0 = p4 ^ rk[16]; + rk[17] ^= rk[10]; + x1 = p5 ^ rk[17]; + rk[18] ^= rk[11]; + x2 = p6 ^ rk[18]; + rk[19] 
^= rk[12]; + x3 = p7 ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + rk[22] ^= rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; + rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = p8 ^ rk[0]; + x1 = p9 ^ rk[1]; + x2 = pA ^ rk[2]; + x3 = pB ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = p0 ^ rk[16]; + x1 = p1 ^ rk[17]; + x2 = p2 ^ rk[18]; + x3 = p3 ^ 
rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21]; + rk[26] ^= rk[22]; + rk[27] ^= rk[23]; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= rk[27]; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + /* round 4, 8, 12 */ + rk[0] ^= rk[25]; + x0 = p4 ^ rk[0]; + rk[1] ^= rk[26]; + x1 = p5 ^ rk[1]; + rk[2] ^= rk[27]; + x2 = p6 ^ rk[2]; + rk[3] ^= rk[28]; + x3 = p7 ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk[16] ^= rk[9]; + x0 = pC ^ rk[16]; + rk[17] ^= rk[10]; + x1 = pD ^ rk[17]; + rk[18] ^= rk[11]; + x2 = pE ^ rk[18]; + rk[19] ^= rk[12]; + x3 = pF ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + rk[22] ^= 
rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; + rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; - rk[0] ^= rk[25]; - x0 = pC ^ rk[0]; - rk[1] ^= rk[26]; - x1 = pD ^ rk[1]; - rk[2] ^= rk[27]; - x2 = pE ^ rk[2]; - rk[3] ^= rk[28]; - x3 = pF ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - rk[16] ^= rk[9]; - x0 = p4 ^ rk[16]; - rk[17] ^= rk[10]; - x1 = p5 ^ rk[17]; - rk[18] ^= rk[11]; - x2 = p6 ^ rk[18]; - rk[19] ^= rk[12]; - x3 = p7 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; 
- x1 ^= rk[29]; - rk[30] ^= rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; + // 3 + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = p0 ^ rk[0]; + x1 = p1 ^ rk[1]; + x2 = p2 ^ rk[2]; + x3 = p3 ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = p8 ^ rk[16]; + x1 = p9 ^ rk[17]; + x2 = pA ^ rk[18]; + x3 = pB ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21]; + rk[26] ^= rk[22]; + rk[27] ^= rk[23]; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= ~rk[27]; + rk[30] ^= counter; + //rk[31] ^= 0xFFFFFFFF; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + + rk[0] ^= rk[25]; + x0 = pC ^ rk[0]; + rk[1] ^= rk[26]; + x1 = pD ^ rk[1]; + rk[2] ^= rk[27]; + x2 = pE ^ rk[2]; + rk[3] ^= rk[28]; + x3 = pF ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk[16] ^= rk[9]; + x0 = p4 ^ rk[16]; + rk[17] ^= rk[10]; + x1 = p5 ^ rk[17]; + rk[18] ^= rk[11]; + x2 = p6 ^ rk[18]; + rk[19] ^= rk[12]; + x3 = p7 ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + rk[22] ^= rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; + rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + 
p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; - /* round 3, 7, 11 */ - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; - rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p8 ^ rk[0]; - x1 = p9 ^ rk[1]; - x2 = pA ^ rk[2]; - x3 = pB ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p0 ^ rk[16]; - x1 = p1 ^ rk[17]; - x2 = p2 ^ rk[18]; - x3 = p3 ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21]; - rk[26] ^= rk[22]; - rk[27] ^= rk[23]; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - 
rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - /* round 4, 8, 12 */ - rk[0] ^= rk[25]; - x0 = p4 ^ rk[0]; - rk[1] ^= rk[26]; - x1 = p5 ^ rk[1]; - rk[2] ^= rk[27]; - x2 = p6 ^ rk[2]; - rk[3] ^= rk[28]; - x3 = p7 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[4] ^= rk[29]; - x0 ^= rk[4]; - rk[5] ^= rk[30]; - x1 ^= rk[5]; - rk[6] ^= rk[31]; - x2 ^= rk[6]; - rk[7] ^= rk[0]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[8] ^= rk[1]; - x0 ^= rk[8]; - rk[9] ^= rk[2]; - x1 ^= rk[9]; - rk[10] ^= rk[3]; - x2 ^= rk[10]; - rk[11] ^= rk[4]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[12] ^= rk[5]; - x0 ^= rk[12]; - rk[13] ^= rk[6]; - x1 ^= rk[13]; - rk[14] ^= rk[7]; - x2 ^= rk[14]; - rk[15] ^= rk[8]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p0 ^= x0; - p1 ^= x1; - p2 ^= x2; - p3 ^= x3; - rk[16] ^= rk[9]; - x0 = pC ^ rk[16]; - rk[17] ^= rk[10]; - x1 = pD ^ rk[17]; - rk[18] ^= rk[11]; - x2 = pE ^ rk[18]; - rk[19] ^= rk[12]; - x3 = pF ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[20] ^= rk[13]; - x0 ^= rk[20]; - rk[21] ^= rk[14]; - x1 ^= rk[21]; - rk[22] ^= rk[15]; - x2 ^= rk[22]; - rk[23] ^= rk[16]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[24] ^= rk[17]; - x0 ^= rk[24]; - rk[25] ^= rk[18]; - x1 ^= rk[25]; - rk[26] ^= rk[19]; - x2 ^= rk[26]; - rk[27] ^= rk[20]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk[28] ^= rk[21]; - x0 ^= rk[28]; - rk[29] ^= rk[22]; - x1 ^= rk[29]; - rk[30] ^= rk[23]; - x2 ^= rk[30]; - rk[31] ^= rk[24]; - x3 ^= rk[31]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p8 ^= x0; - p9 ^= x1; - pA ^= x2; - pB ^= x3; - /* round 13 */ - KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); - rk[0] ^= rk[28]; - rk[1] ^= rk[29]; 
- rk[2] ^= rk[30]; - rk[3] ^= rk[31]; - x0 = p0 ^ rk[0]; - x1 = p1 ^ rk[1]; - x2 = p2 ^ rk[2]; - x3 = p3 ^ rk[3]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); - rk[4] ^= rk[0]; - rk[5] ^= rk[1]; - rk[6] ^= rk[2]; - rk[7] ^= rk[3]; - x0 ^= rk[4]; - x1 ^= rk[5]; - x2 ^= rk[6]; - x3 ^= rk[7]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); - rk[8] ^= rk[4]; - rk[9] ^= rk[5]; - rk[10] ^= rk[6]; - rk[11] ^= rk[7]; - x0 ^= rk[8]; - x1 ^= rk[9]; - x2 ^= rk[10]; - x3 ^= rk[11]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); - rk[12] ^= rk[8]; - rk[13] ^= rk[9]; - rk[14] ^= rk[10]; - rk[15] ^= rk[11]; - x0 ^= rk[12]; - x1 ^= rk[13]; - x2 ^= rk[14]; - x3 ^= rk[15]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - pC ^= x0; - pD ^= x1; - pE ^= x2; - pF ^= x3; - KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); - rk[16] ^= rk[12]; - rk[17] ^= rk[13]; - rk[18] ^= rk[14]; - rk[19] ^= rk[15]; - x0 = p8 ^ rk[16]; - x1 = p9 ^ rk[17]; - x2 = pA ^ rk[18]; - x3 = pB ^ rk[19]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); - rk[20] ^= rk[16]; - rk[21] ^= rk[17]; - rk[22] ^= rk[18]; - rk[23] ^= rk[19]; - x0 ^= rk[20]; - x1 ^= rk[21]; - x2 ^= rk[22]; - x3 ^= rk[23]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); - rk[24] ^= rk[20]; - rk[25] ^= rk[21] ^ 512; - rk[26] ^= rk[22]; - rk[27] ^= ~rk[23]; //^ 0xFFFFFFFF; - x0 ^= rk[24]; - x1 ^= rk[25]; - x2 ^= rk[26]; - x3 ^= rk[27]; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); - rk[28] ^= rk[24]; - rk[29] ^= rk[25]; - rk[30] ^= rk[26]; - rk[31] ^= rk[27]; - x0 ^= rk[28]; - x1 ^= rk[29]; - x2 ^= rk[30]; - x3 ^= rk[31]; - 
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - p4 ^= x0; - p5 ^= x1; - p6 ^= x2; - p7 ^= x3; - Hash[0]=state[0x0] ^ p8; - Hash[1]=state[0x1] ^ p9; - Hash[2]= state[0x2] ^ pA; - Hash[3] = state[0x3] ^ pB; - Hash[4] = state[0x4] ^ pC; - Hash[5] = state[0x5] ^ pD; - Hash[6] = state[0x6] ^ pE; - Hash[7] = state[0x7] ^ pF; - Hash[8] = state[0x8] ^ p0; - Hash[9] = state[0x9] ^ p1; - Hash[10] = state[0xA] ^ p2; - Hash[11] = state[0xB] ^ p3; - Hash[12] = state[0xC] ^ p4; - Hash[13] = state[0xD] ^ p5; - Hash[14] = state[0xE] ^ p6; - Hash[15] = state[0xF] ^ p7; - } + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = p8 ^ rk[0]; + x1 = p9 ^ rk[1]; + x2 = pA ^ rk[2]; + x3 = pB ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = p0 ^ rk[16]; + x1 = p1 ^ rk[17]; + x2 = p2 ^ rk[18]; + x3 = p3 ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + 
rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21]; + rk[26] ^= rk[22]; + rk[27] ^= rk[23]; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= rk[27]; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + /* round 4, 8, 12 */ + rk[0] ^= rk[25]; + x0 = p4 ^ rk[0]; + rk[1] ^= rk[26]; + x1 = p5 ^ rk[1]; + rk[2] ^= rk[27]; + x2 = p6 ^ rk[2]; + rk[3] ^= rk[28]; + x3 = p7 ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[4] ^= rk[29]; + x0 ^= rk[4]; + rk[5] ^= rk[30]; + x1 ^= rk[5]; + rk[6] ^= rk[31]; + x2 ^= rk[6]; + rk[7] ^= rk[0]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[8] ^= rk[1]; + x0 ^= rk[8]; + rk[9] ^= rk[2]; + x1 ^= rk[9]; + rk[10] ^= rk[3]; + x2 ^= rk[10]; + rk[11] ^= rk[4]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[12] ^= rk[5]; + x0 ^= rk[12]; + rk[13] ^= rk[6]; + x1 ^= rk[13]; + rk[14] ^= rk[7]; + x2 ^= rk[14]; + rk[15] ^= rk[8]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk[16] ^= rk[9]; + x0 = pC ^ rk[16]; + rk[17] ^= rk[10]; + x1 = pD ^ rk[17]; + rk[18] ^= rk[11]; + x2 = pE ^ rk[18]; + rk[19] ^= rk[12]; + x3 = pF ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[20] ^= rk[13]; + x0 ^= rk[20]; + rk[21] ^= rk[14]; + x1 ^= rk[21]; + rk[22] ^= rk[15]; + x2 ^= rk[22]; + rk[23] ^= rk[16]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[24] ^= rk[17]; + x0 ^= rk[24]; 
+ rk[25] ^= rk[18]; + x1 ^= rk[25]; + rk[26] ^= rk[19]; + x2 ^= rk[26]; + rk[27] ^= rk[20]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk[28] ^= rk[21]; + x0 ^= rk[28]; + rk[29] ^= rk[22]; + x1 ^= rk[29]; + rk[30] ^= rk[23]; + x2 ^= rk[30]; + rk[31] ^= rk[24]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + /* round 13 */ + KEY_EXPAND_ELT(sharedMemory, rk[0], rk[1], rk[2], rk[3]); + rk[0] ^= rk[28]; + rk[1] ^= rk[29]; + rk[2] ^= rk[30]; + rk[3] ^= rk[31]; + x0 = p0 ^ rk[0]; + x1 = p1 ^ rk[1]; + x2 = p2 ^ rk[2]; + x3 = p3 ^ rk[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[4], rk[5], rk[6], rk[7]); + rk[4] ^= rk[0]; + rk[5] ^= rk[1]; + rk[6] ^= rk[2]; + rk[7] ^= rk[3]; + x0 ^= rk[4]; + x1 ^= rk[5]; + x2 ^= rk[6]; + x3 ^= rk[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[8], rk[9], rk[10], rk[11]); + rk[8] ^= rk[4]; + rk[9] ^= rk[5]; + rk[10] ^= rk[6]; + rk[11] ^= rk[7]; + x0 ^= rk[8]; + x1 ^= rk[9]; + x2 ^= rk[10]; + x3 ^= rk[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[12], rk[13], rk[14], rk[15]); + rk[12] ^= rk[8]; + rk[13] ^= rk[9]; + rk[14] ^= rk[10]; + rk[15] ^= rk[11]; + x0 ^= rk[12]; + x1 ^= rk[13]; + x2 ^= rk[14]; + x3 ^= rk[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk[16], rk[17], rk[18], rk[19]); + rk[16] ^= rk[12]; + rk[17] ^= rk[13]; + rk[18] ^= rk[14]; + rk[19] ^= rk[15]; + x0 = p8 ^ rk[16]; + x1 = p9 ^ rk[17]; + x2 = pA ^ rk[18]; + x3 = pB ^ rk[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[20], rk[21], rk[22], rk[23]); + rk[20] ^= rk[16]; + rk[21] ^= rk[17]; + rk[22] ^= rk[18]; + rk[23] ^= rk[19]; + x0 ^= rk[20]; + x1 ^= rk[21]; + x2 ^= rk[22]; + x3 ^= rk[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); 
+ KEY_EXPAND_ELT(sharedMemory, rk[24], rk[25], rk[26], rk[27]); + rk[24] ^= rk[20]; + rk[25] ^= rk[21] ^ counter; + rk[26] ^= rk[22]; + rk[27] ^= ~rk[23]; //^ 0xFFFFFFFF; + x0 ^= rk[24]; + x1 ^= rk[25]; + x2 ^= rk[26]; + x3 ^= rk[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk[28], rk[29], rk[30], rk[31]); + rk[28] ^= rk[24]; + rk[29] ^= rk[25]; + rk[30] ^= rk[26]; + rk[31] ^= rk[27]; + x0 ^= rk[28]; + x1 ^= rk[29]; + x2 ^= rk[30]; + x3 ^= rk[31]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + state[0x0] ^= p8; + state[0x1] ^= p9; + state[0x2] ^= pA; + state[0x3] ^= pB; + state[0x4] ^= pC; + state[0x5] ^= pD; + state[0x6] ^= pE; + state[0x7] ^= pF; + state[0x8] ^= p0; + state[0x9] ^= p1; + state[0xA] ^= p2; + state[0xB] ^= p3; + state[0xC] ^= p4; + state[0xD] ^= p5; + state[0xE] ^= p6; + state[0xF] ^= p7; } - -__global__ __launch_bounds__(TPB, 8) +__global__ __launch_bounds__(TPB, 3) void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) { __shared__ uint32_t sharedMemory[1024]; - if (threadIdx.x < 128) { + if(threadIdx.x < 256) + { sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; - sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; - sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; - sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; - - sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2]; - sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 256] = ROTL32(sharedMemory[threadIdx.x], 8); + sharedMemory[threadIdx.x + 512] = ROTL32(sharedMemory[threadIdx.x], 16); + sharedMemory[threadIdx.x + 768] = ROTL32(sharedMemory[threadIdx.x], 24); } - - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < 
threads) + __syncthreads(); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if(thread < threads) { const uint32_t nounce = startNounce + thread; // kopiere init-state uint32_t state[16] = { - SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), - SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), - SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), - SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) + 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, + 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC, + 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, + 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A }; uint32_t msg[32]; - #pragma unroll 32 - for(int i=0;i<32;i++) { +#pragma unroll + for(int i = 0; i<31; i++) + { msg[i] = c_PaddedMessage80[i]; } msg[19] = cuda_swab32(nounce); @@ -2600,37 +2510,39 @@ void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou msg[27] = 0x2800000; msg[31] = 0x2000000; - c512(sharedMemory, state, msg, 640); + c512(sharedMemory, state, msg); uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; - #pragma unroll 16 - for(int i=0;i<16;i++) +#pragma unroll 16 + for(int i = 0; i<16; i++) outHash[i] = state[i]; } //thread < threads } -__host__ void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + TPB-1)/TPB); dim3 block(TPB); - x11_shavite512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x11_shavite512_gpu_hash_64<<>>(threads, d_hash); + + CUDA_SAFE_CALL(cudaGetLastError()); } -__host__ void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t 
*d_outputHash, int order) +__host__ void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + TPB-1)/TPB); dim3 block(TPB); - x11_shavite512_gpu_hash_80<<>>(threads, startNounce, d_outputHash); + x11_shavite512_gpu_hash_80<<>>(threads, startNounce, d_outputHash); } -__host__ void x11_shavite512_setBlock_80(void *pdata) +__host__ void x11_shavite512_setBlock_80(int thr_id, void *pdata) { // Message mit Padding bereitstellen // lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen. @@ -2638,6 +2550,6 @@ __host__ void x11_shavite512_setBlock_80(void *pdata) memcpy(PaddedMessage, pdata, 80); memset(PaddedMessage+80, 0, 48); - cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbolAsync(c_PaddedMessage80, PaddedMessage, 32 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); } diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu index 17a24ff616..5b85c3f514 100644 --- a/x11/cuda_x11_simd512.cu +++ b/x11/cuda_x11_simd512.cu @@ -5,17 +5,19 @@ // // STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations -#define TPB 64 - +#define TPB 256 #include "cuda_helper.h" +#include "cuda_vector.h" #include -uint32_t *d_state[MAX_GPUS]; -uint4 *d_temp4[MAX_GPUS]; +static uint32_t *d_state[MAX_GPUS]; +static uint4 *d_temp4[MAX_GPUS]; +#if __CUDA_ARCH__ < 320 // texture bound to d_temp4[thr_id], for read access in Compaction kernel texture texRef1D_128; +#endif __constant__ uint8_t c_perm0[8] = { 2, 3, 6, 7, 0, 1, 4, 5 }; __constant__ uint8_t c_perm1[8] = { 6, 7, 2, 3, 4, 5, 0, 1 }; @@ -26,14 +28,6 @@ __constant__ uint8_t c_perm5[8] = { 6, 7, 2, 3, 0, 1, 4, 5 }; __constant__ uint8_t c_perm6[8] = { 6, 7, 0, 1, 4, 5, 2, 3 }; __constant__ uint8_t c_perm7[8] = { 4, 5, 2, 3, 6, 7, 0, 1 }; - -__constant__ uint32_t c_IV_512[32] = { - 0x0ba16b95, 0x72f999ad, 
0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, - 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, - 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, - 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 -}; - __constant__ short c_FFT128_8_16_Twiddle[128] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, @@ -104,25 +98,21 @@ void FFT_8(int *y, int stripe) { * Unrolled decimation in frequency (DIF) radix-2 NTT. * Output data is in revbin_permuted order. */ - + uint32_t u, v; #define X(i) y[stripe*i] #define DO_REDUCE(i) \ X(i) = REDUCE(X(i)) #define DO_REDUCE_FULL_S(i) \ -do { \ X(i) = REDUCE(X(i)); \ - X(i) = EXTRA_REDUCE_S(X(i)); \ -} while(0) + X(i) = EXTRA_REDUCE_S(X(i)); #define BUTTERFLY(i,j,n) \ -do { \ - int u= X(i); \ - int v= X(j); \ + u= y[stripe*i]; \ + v= y[stripe*j]; \ X(i) = u+v; \ - X(j) = (u-v) << (2*n); \ -} while(0) + X(j) = (u-v) << (2*n); BUTTERFLY(0, 4, 0); BUTTERFLY(1, 5, 1); @@ -167,11 +157,8 @@ __device__ __forceinline__ void FFT_16(int *y) { * Output data is in revbin_permuted order. 
*/ #define DO_REDUCE_FULL_S(i) \ - do { \ y[i] = REDUCE(y[i]); \ - y[i] = EXTRA_REDUCE_S(y[i]); \ - } while(0) - + y[i] = EXTRA_REDUCE_S(y[i]); int u,v; // BUTTERFLY(0, 8, 0); @@ -283,7 +270,7 @@ void FFT_128_full(int *y) for (i=0; i<16; i++) /*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i*8+(threadIdx.x&7)]); -#pragma unroll 8 +//#pragma unroll 8 for (i=0; i<16; i+=2) FFT_16(y+i); // eight sequential FFT16's, each one executed in parallel by 8 threads } @@ -327,14 +314,197 @@ void Expansion(const uint32_t *const __restrict__ data, uint4 *const __restrict_ int expanded[32]; #pragma unroll 4 for (int i=0; i < 4; i++) { - expanded[ i] = __byte_perm(__shfl((int)data[0], 2*i, 8), __shfl((int)data[0], (2*i)+1, 8), threadIdx.x&7)&0xff; - expanded[4+i] = __byte_perm(__shfl((int)data[1], 2*i, 8), __shfl((int)data[1], (2*i)+1, 8), threadIdx.x&7)&0xff; + expanded[i] = __byte_perm(__shfl((int)data[0], 2 * i, 8), __shfl((int)data[0], (2 * i) + 1, 8), threadIdx.x & 7) & 0xff; + expanded[4 + i] = __byte_perm(__shfl((int)data[1], 2 * i, 8), __shfl((int)data[1], (2 * i) + 1, 8), threadIdx.x & 7) & 0xff; } -#pragma unroll 8 - for (int i=8; i < 16; i++) - expanded[i] = 0; - FFT_256_halfzero(expanded); + expanded[9] = 0; + expanded[11] = 0; + expanded[13] = 0; + expanded[15] = 0; + +// FFT_256_halfzero(expanded); + + /* + * FFT_256 using w=41 as 256th root of unity. + * Decimation in frequency (DIF) NTT. + * Output data is in revbin_permuted order. + * In place. 
+ */ +// const int tmp = expanded[15]; + + #pragma unroll 8 + for (int i = 0; i<8; i++) + expanded[16 + i] = REDUCE(expanded[i] * c_FFT256_2_128_Twiddle[8 * i + (threadIdx.x & 7)]); + + +//#pragma unroll 8 +// for (int i = 24; i<32; i++) +// expanded[i] = 0; + expanded[9+16] = 0; + expanded[11 + 16] = 0; + expanded[13 + 16] = 0; + expanded[15 + 16] = 0; + + /* handle X^255 with an additional butterfly */ + if ((threadIdx.x & 7) == 7) + { + expanded[15] = 1; + expanded[31] = 0x0100 * 94; + } + + // FFT_128_full(expanded); + + int i; + uint32_t u, v; + +#define DO_REDUCE(i) \ + expanded[2*i] = REDUCE(expanded[2*i]) + +#define DO_REDUCE_FULL_S(i) \ + expanded[2*i] = REDUCE(expanded[2*i]); \ + expanded[2*i] = EXTRA_REDUCE_S(expanded[2*i]); + +#define BUTTERFLY(i,j,n) \ + u= expanded[2*i]; \ + v= expanded[2*j]; \ + expanded[2*i] = u+v; \ + expanded[2*j] = (u-v) << (2*n); + +// BUTTERFLY(0, 4, 0); //0 8 0 + expanded[2 * 4] = expanded[2 * 0]; + +// BUTTERFLY(1, 5, 1); //2 10 2 + u = expanded[2 * 1]; + expanded[2 * 5] = (u ) << (2 * 1); + +// BUTTERFLY(2, 6, 2); //4 12 4 + u = expanded[2 * 2]; + expanded[2 * 6] = (u) << (2 * 2); + +// BUTTERFLY(3, 7, 3); //6 14 6 + u = expanded[2 * 3]; + expanded[2 * 7] = (u) << (2 * 3); + + expanded[2 * 6] = REDUCE(expanded[2 * 6]); + expanded[2 * 7] = REDUCE(expanded[2 * 7]); + + BUTTERFLY(0, 2, 0); + BUTTERFLY(4, 6, 0); + BUTTERFLY(1, 3, 2); + BUTTERFLY(5, 7, 2); + + DO_REDUCE(7); + + BUTTERFLY(0, 1, 0); + BUTTERFLY(2, 3, 0); + BUTTERFLY(4, 5, 0); + BUTTERFLY(6, 7, 0); + + DO_REDUCE_FULL_S(0); + DO_REDUCE_FULL_S(1); + DO_REDUCE_FULL_S(2); + DO_REDUCE_FULL_S(3); + DO_REDUCE_FULL_S(4); + DO_REDUCE_FULL_S(5); + DO_REDUCE_FULL_S(6); + DO_REDUCE_FULL_S(7); + +#undef X +#undef DO_REDUCE +#undef DO_REDUCE_FULL_S +#undef BUTTERFLY + +// FFT_8(expanded + 0, 2); // eight parallel FFT8's + + FFT_8(expanded + 1, 2); // eight parallel FFT8's + + expanded[0] = REDUCE(expanded[0]); + expanded[1] = REDUCE(expanded[1]); +#pragma unroll + for (i = 2; 
i<16; i++) + expanded[i] = REDUCE(expanded[i] * c_FFT128_8_16_Twiddle[i * 8 + (threadIdx.x & 7)]); + + //#pragma unroll 8 + for (i = 0; i<16; i += 2) + FFT_16(expanded + i); // eight sequential FFT16's, each one executed in parallel by 8 threads + + + +// FFT_128_full(expanded + 16); + +#define DO_REDUCE(i) \ + expanded[2*i+ 16] = REDUCE(expanded[2*i+ 16]) + +#define DO_REDUCE_FULL_S(i) \ + expanded[2*i+ 16] = REDUCE(expanded[2*i+ 16]); \ + expanded[2*i+ 16] = EXTRA_REDUCE_S(expanded[2*i+ 16]); + +#define BUTTERFLY(i,j,n) \ + u= expanded[2*i+ 16]; \ + v= expanded[2*j+ 16]; \ + expanded[2*i+ 16] = u+v; \ + expanded[2*j+ 16] = (u-v) << (2*n); + + // BUTTERFLY(0, 4, 0); //0 8 0 + expanded[2 * 4 + 16] = expanded[2 * 0 + 16]; + + // BUTTERFLY(1, 5, 1); //2 10 2 + u = expanded[2 * 1 + 16]; + expanded[2 * 5 + 16] = (u) << (2 * 1); + + // BUTTERFLY(2, 6, 2); //4 12 4 + u = expanded[2 * 2 + 16]; + expanded[2 * 6 + 16] = (u) << (2 * 2); + + // BUTTERFLY(3, 7, 3); //6 14 6 + u = expanded[2 * 3 + 16]; + expanded[2 * 7 + 16] = (u) << (2 * 3); + + expanded[2 * 6 + 16] = REDUCE(expanded[2 * 6 + 16]); + expanded[2 * 7 + 16] = REDUCE(expanded[2 * 7 + 16]); + + BUTTERFLY(0, 2, 0); + BUTTERFLY(4, 6, 0); + BUTTERFLY(1, 3, 2); + BUTTERFLY(5, 7, 2); + + DO_REDUCE(7); + + BUTTERFLY(0, 1, 0); + BUTTERFLY(2, 3, 0); + BUTTERFLY(4, 5, 0); + BUTTERFLY(6, 7, 0); + + DO_REDUCE_FULL_S(0); + DO_REDUCE_FULL_S(1); + DO_REDUCE_FULL_S(2); + DO_REDUCE_FULL_S(3); + DO_REDUCE_FULL_S(4); + DO_REDUCE_FULL_S(5); + DO_REDUCE_FULL_S(6); + DO_REDUCE_FULL_S(7); + +#undef X +#undef DO_REDUCE +#undef DO_REDUCE_FULL_S +#undef BUTTERFLY + + // FFT_8(expanded + 0, 2); // eight parallel FFT8's + + + FFT_8(expanded + 1 + 16, 2); // eight parallel FFT8's + + expanded[0 + 16] = REDUCE(expanded[0 + 16]); + expanded[1 + 16] = REDUCE(expanded[1 + 16]); +#pragma unroll + for (i = 2; i<16; i++) + expanded[i + 16] = REDUCE(expanded[i + 16] * c_FFT128_8_16_Twiddle[i * 8 + (threadIdx.x & 7)]); + + //#pragma unroll 8 + for (i = 
0; i<16; i += 2) + FFT_16(expanded + i+ 16); // eight sequential FFT16's, each one executed in parallel by 8 threads + // store w matrices in global memory @@ -541,14 +711,14 @@ void Expansion(const uint32_t *const __restrict__ data, uint4 *const __restrict_ /***************************************************/ __global__ void __launch_bounds__(TPB, 4) -x11_simd512_gpu_expand_64(uint32_t threads, uint32_t startNounce, const uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector, uint4 *const __restrict__ g_temp4) +x11_simd512_gpu_expand_64(uint32_t threads, uint32_t startNounce, const uint64_t *const __restrict__ g_hash, uint4 *const __restrict__ g_temp4) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x)/8; - if (thread < threads) +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (startNounce + thread); - int hashPosition = nounce - startNounce; + const uint32_t hashPosition = nounce - startNounce; uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; @@ -566,62 +736,65 @@ x11_simd512_gpu_expand_64(uint32_t threads, uint32_t startNounce, const uint64_t } } -__global__ void __launch_bounds__(TPB, 4) -x11_simd512_gpu_compress1_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state) +__global__ __launch_bounds__(TPB, 2) +void x11_simd512_gpu_compress_64_maxwell(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint4 *g_fft4) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (startNounce + thread); + uint4 g_state[64]; - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; - Compression1(Hash, hashPosition, g_fft4, g_state); + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = (uint32_t*)&g_hash[8 * hashPosition]; + + Compression1(Hash, hashPosition, g_fft4, (uint32_t *)g_state); + Compression2(hashPosition, g_fft4, (uint32_t *)&g_state); + Final(Hash, hashPosition, g_fft4, (uint32_t *)&g_state); } } + __global__ void __launch_bounds__(TPB, 4) -x11_simd512_gpu_compress2_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state) +x11_simd512_gpu_compress1_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint4 *g_fft4, uint32_t *g_state) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) +// if(thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + uint32_t nounce = startNounce + thread; int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; - Compression2(hashPosition, g_fft4, g_state); + Compression1(Hash, hashPosition, g_fft4, g_state); } } - -__global__ void __launch_bounds__(TPB, 4) -x11_simd512_gpu_compress_64_maxwell(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state) +__global__ void __launch_bounds__(TPB, 1) +x11_simd512_gpu_compress2_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint4 *g_fft4, uint32_t *g_state) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + const uint32_t hashPosition = nounce - startNounce; - Compression1(Hash, hashPosition, g_fft4, g_state); Compression2(hashPosition, g_fft4, g_state); } } __global__ void __launch_bounds__(TPB, 4) -x11_simd512_gpu_final_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state) +x11_simd512_gpu_final_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint4 *g_fft4, uint32_t *g_state) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = (uint32_t*)&g_hash[8 * hashPosition]; Final(Hash, hashPosition, g_fft4, g_state); } @@ -630,45 +803,42 @@ x11_simd512_gpu_final_64(uint32_t threads, uint32_t startNounce, uint64_t *g_has __host__ int x11_simd512_cpu_init(int thr_id, uint32_t threads) { - CUDA_SAFE_CALL(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads)); CUDA_SAFE_CALL(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads)); // Texture for 128-Bit Zugriffe - cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc(); - texRef1D_128.normalized = 0; - texRef1D_128.filterMode = cudaFilterModePoint; - texRef1D_128.addressMode[0] = cudaAddressModeClamp; - CUDA_SAFE_CALL(cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads)); +// cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc(); +// 
texRef1D_128.normalized = 0; +// texRef1D_128.filterMode = cudaFilterModePoint; +// texRef1D_128.addressMode[0] = cudaAddressModeClamp; +// CUDA_SAFE_CALL(cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads)); return 0; } void x11_simd512_cpu_free(int thr_id) { - cudaFree(&d_state[thr_id]); cudaFree(&d_temp4[thr_id]); } + __host__ -void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t simdthreads) { - dim3 block(TPB); - dim3 grid8(((threads + TPB-1)/TPB)*8); + dim3 grid8(((threads + simdthreads - 1) / simdthreads) * 8); - x11_simd512_gpu_expand_64 <<>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]); - //MyStreamSynchronize(NULL, order, thr_id); - - dim3 grid((threads + TPB-1)/TPB); if (device_sm[device_map[thr_id]] >= 500) { - x11_simd512_gpu_compress_64_maxwell << < grid, block >> > (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); - //MyStreamSynchronize(NULL, order, thr_id); + dim3 block(simdthreads); + dim3 grid((threads + simdthreads - 1) / simdthreads); + x11_simd512_gpu_expand_64 << > > (threads, startNounce, (uint64_t*)d_hash, d_temp4[thr_id]); + x11_simd512_gpu_compress_64_maxwell << < grid, block, 0, gpustream[thr_id] >> > (threads, startNounce, (uint64_t*)d_hash, d_temp4[thr_id]); } else { - x11_simd512_gpu_compress1_64 << < grid, block >> > (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); - x11_simd512_gpu_compress2_64 << < grid, block >> > (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); - // MyStreamSynchronize(NULL, order, thr_id); + dim3 block(TPB); + dim3 grid((threads + TPB - 1) / TPB); + x11_simd512_gpu_expand_64 << > > (threads, startNounce, (uint64_t*)d_hash, 
d_temp4[thr_id]); + x11_simd512_gpu_compress1_64 << < grid, block, 0, gpustream[thr_id] >> > (threads, startNounce, (uint64_t*)d_hash, d_temp4[thr_id], d_state[thr_id]); + x11_simd512_gpu_compress2_64 << < grid, block, 0, gpustream[thr_id]>>> (threads, startNounce, (uint64_t*)d_hash, d_temp4[thr_id], d_state[thr_id]); + x11_simd512_gpu_final_64 << > > (threads, startNounce, (uint64_t*)d_hash, d_temp4[thr_id], d_state[thr_id]); } - - x11_simd512_gpu_final_64 << > > (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); -// MyStreamSynchronize(NULL, order, thr_id); + CUDA_SAFE_CALL(cudaGetLastError()); } diff --git a/x11/fresh.cu b/x11/fresh.cu index 29758af357..959adc6e7e 100644 --- a/x11/fresh.cu +++ b/x11/fresh.cu @@ -12,25 +12,23 @@ extern "C" { // to test gpu hash on a null buffer #define NULLTEST 0 -static uint32_t *d_hash[MAX_GPUS]; - -extern void x11_shavite512_setBlock_80(void *pdata); -extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_shavite512_setBlock_80(int thr_id, void *pdata); +extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, 
uint32_t *d_nonceVector, uint32_t *d_hash, int order); +//extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *d_hash, uint32_t target, uint32_t *h_found); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse, - int order); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse); // CPU Hash -extern "C" void fresh_hash(void *state, const void *input) +void fresh_hash(void *state, const void *input) { // shavite-simd-shavite-simd-echo @@ -67,91 +65,118 @@ extern "C" void fresh_hash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_fresh(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *h_found = nullptr; + const uint32_t first_nonce = pdata[19]; uint32_t endiandata[20]; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1 << 19); + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 
256 : 32; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00ff; + ptarget[7] = 0xf; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput + 4), 0); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax + 4)); + CUDA_SAFE_CALL(cudaMallocHost(&(h_found), 4 * sizeof(uint32_t))); - cuda_check_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; - init[thr_id] = true; + init = true; } for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - x11_shavite512_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + x11_shavite512_setBlock_80(thr_id, (void*)endiandata); do { - uint32_t Htarg = ptarget[7]; - - uint32_t foundNonce; - int order = 0; // GPU Hash - x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], 
order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - -#if NULLTEST - uint32_t buf[8]; memset(buf, 0, sizeof buf); - CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost)); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); - print_hash((unsigned char*)buf); printf("\n"); -#endif - foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (foundNonce != UINT32_MAX) + x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash, simdthreads); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash, simdthreads); + x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], d_hash, ptarget[7], h_found); + cudaStreamSynchronize(gpustream[thr_id]); + + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_found[0] != 0xffffffff) { - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); + const uint32_t Htarg = ptarget[7]; + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_found[0]); fresh_hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; + if (h_found[1] != 0xffffffff) + { + if(opt_verify){ be32enc(&endiandata[19], h_found[1]); + fresh_hash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_found[1]); + } + else + { + if (vhash64[7] != Htarg) + { + 
applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } + } + } - pdata[19] = foundNonce; + pdata[19] = h_found[0]; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_found[0]); return res; } else { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); + } } } - - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/x11/s3.cu b/x11/s3.cu index e4c7749f6b..39abc03e3d 100644 --- a/x11/s3.cu +++ b/x11/s3.cu @@ -11,22 +11,24 @@ extern "C" { #include "miner.h" #include "cuda_helper.h" +#ifdef __cplusplus +#include +#else #include +#endif -static uint32_t *d_hash[MAX_GPUS]; - -extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void x11_shavite512_setBlock_80(void *pdata); +extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void x11_shavite512_setBlock_80(int thr_id, void *pdata); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int 
order); -extern void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t *h_found, uint32_t target, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t *h_found, uint32_t target); /* CPU HASH */ -extern "C" void s3hash(void *output, const void *input) +void s3hash(void *output, const void *input) { sph_shavite512_context ctx_shavite; sph_simd512_context ctx_simd; @@ -49,93 +51,118 @@ extern "C" void s3hash(void *output, const void *input) memcpy(output, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; -static uint32_t *h_found[MAX_GPUS]; - /* Main S3 entry point */ -extern "C" int scanhash_s3(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_s3(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *h_found = nullptr; + const uint32_t first_nonce = pdata[19]; unsigned int intensity = 20; // 256*256*8*2; #ifdef WIN32 // reduce by one the intensity on windows intensity--; #endif - uint32_t throughput = device_intensity(thr_id, __func__, 1 << intensity); - throughput = min(throughput, (max_nonce - first_nonce)); - + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1 << intensity); + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 
256 : 32; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000000fu; + ptarget[7] = 0x0000000fu; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - x11_simd512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughputmax); quark_skein512_cpu_init(thr_id); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); - CUDA_CALL_OR_RET_X(cudaMallocHost(&(h_found[thr_id]), 2 * sizeof(uint32_t)), 0); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); + CUDA_SAFE_CALL(cudaMallocHost(&(h_found), 2 * sizeof(uint32_t))); - cuda_check_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; - init[thr_id] = true; + init = true; } uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - x11_shavite512_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + x11_shavite512_setBlock_80(thr_id, (void*)endiandata); + cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; + x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash, simdthreads); + quark_skein512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash, h_found, ptarget[7]); - 
x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_skein512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], h_found[thr_id], ptarget[7], order++); - - if (h_found[thr_id][0] != 0xffffffff) + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_found[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], h_found[thr_id][0]); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_found[0]); s3hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; - // check if there was some other ones... *hashes_done = pdata[19] - first_nonce + throughput; - if (h_found[thr_id][1] != 0xffffffff) + if (h_found[1] != 0xffffffff) { - pdata[21] = h_found[thr_id][1]; - res++; - if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, h_found[thr_id][1]); + if(opt_verify){ be32enc(&endiandata[19], h_found[1]); + s3hash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_found[1]); + } + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } + } + } - pdata[19] = h_found[thr_id][0]; + pdata[19] = h_found[0]; if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, h_found[thr_id][0]); + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_found[0]); return res; } else { if (vhash64[7] != Htarg) { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, 
h_found[thr_id][0]); + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); } } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/x11/simd_functions.cu b/x11/simd_functions.cu index fdf00e2615..6a029ab344 100644 --- a/x11/simd_functions.cu +++ b/x11/simd_functions.cu @@ -1134,7 +1134,7 @@ __device__ __forceinline__ void Round8_3_final(uint32_t *A, int r, int s, int t, STEP8_MAJ_31(d_cw3[7], u, r, &A[8], &A[16], &A[24], A); } -#if __CUDA_ARCH__ < 350 +#if __CUDA_ARCH__ < 320 #define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) #else //#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) @@ -1278,10 +1278,18 @@ __device__ __forceinline__ void SIMD_Compress1(uint32_t *const __restrict__ A, c { int i; const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente) + + uint32_t msg[16]; + + uint28 *phash = (uint28*)M; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + #pragma unroll 8 for(i=0; i<8; i++) { - A[i] ^= M[i]; - (&A[8])[i] ^= M[8+i]; + A[i] ^= msg[i]; + (&A[8])[i] ^= msg[8 + i]; } Round8_0(A, thr_offset, 3, 23, 17, 27, g_fft4); Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4); @@ -1297,10 +1305,9 @@ __device__ __forceinline__ void Compression1(const uint32_t *const __restrict__ }; SIMD_Compress1(A, texture_id, hashval, g_fft4); - uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)]; #pragma unroll 32 for (int i=0; i < 32; i++) - state[threadIdx.x+blockDim.x*i] = A[i]; + g_state[i] = A[i]; } __device__ __forceinline__ void SIMD_Compress2(uint32_t *const __restrict__ A, const int thr_id, const uint4 *const __restrict__ g_fft4) @@ -1324,12 +1331,11 @@ __device__ __forceinline__ void 
Compression2(const int texture_id, const uint4 * { uint32_t A[32]; int i; - uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)]; #pragma unroll 32 - for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i]; + for (i = 0; i < 32; i++) A[i] = g_state[i]; SIMD_Compress2(A, texture_id, g_fft4); #pragma unroll 32 - for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i]; + for (i=0; i < 32; i++) g_state[i] = A[i]; } __device__ __forceinline__ void SIMD_Compress_Final(uint32_t *const __restrict__ A) @@ -1360,10 +1366,9 @@ __device__ __forceinline__ void Final(uint32_t *const __restrict__ hashval, cons { uint32_t A[32]; int i; - const uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)]; #pragma unroll 32 for (i=0; i < 32; i++) - A[i] = state[threadIdx.x+blockDim.x*i]; + A[i] = g_state[i]; SIMD_Compress_Final(A); #pragma unroll 16 diff --git a/x11/x11.cu b/x11/x11.cu index 99eee9e980..b5aa115943 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -1,11 +1,5 @@ extern "C" { - -#define FASTECHO 1 //Fast echo can give hardware errors on low difficulty but accepted on most pools. 
- -#ifdef _DEBUG //Visual Leak Detector for Visual C++ -// #include -#endif #include "sph/sph_blake.h" #include "sph/sph_bmw.h" #include "sph/sph_groestl.h" @@ -28,49 +22,47 @@ extern "C" #include #include - -uint32_t *d_hash[MAX_GPUS]; -uint32_t *h_found[MAX_GPUS]; - -extern void quark_blake512_cpu_init(int thr_id, uint32_t threads); -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t 
*d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads); -extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); + extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); 
-extern void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t target, uint32_t *h_found, int order); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void x11_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *d_hash, uint32_t target, uint32_t *h_found); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse, int order); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse); // X11 CPU Hash -extern "C" void x11hash(void *output, const void *input) +void x11hash(void *output, const void *input) { // blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11 @@ -136,123 +128,139 @@ extern "C" void x11hash(void *output, const void *input) memcpy(output, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_x11(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *h_found = nullptr; + const uint32_t first_nonce = pdata[19]; - unsigned int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 
20 : 19; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << intensity); // 19=256*256*8; - throughput = min(throughput, (max_nonce - first_nonce)); + + cudaDeviceProp props; + CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, device_map[thr_id])); + static THREAD uint32_t throughputmax; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0xf; + ptarget[7] = 0x4f; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); - cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); - cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); - quark_groestl512_cpu_init(thr_id, throughput); - quark_bmw512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); - if (x11_simd512_cpu_init(thr_id, throughput) != 0) { - return 0; + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + CUDA_SAFE_CALL(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); + CUDA_SAFE_CALL(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1)); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); + + unsigned int intensity; +#if defined WIN32 && !defined _WIN64 + intensity = 256 * 256 * 16; +#else + if(strstr(props.name, "970")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "980")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "1070")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "1080")) intensity = (256 * 256 * 22); + else if(strstr(props.name, "750 Ti")) intensity = (256 * 256 * 20); + else if(strstr(props.name, "750")) intensity = (256 * 256 * 19); + else if(strstr(props.name, "960")) intensity = (256 * 256 * 19); + else intensity = (256 * 256 * 19); +#endif + throughputmax = device_intensity(device_map[thr_id], __func__, intensity); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = 
true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); } - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); // why 64 ? - CUDA_CALL_OR_RET_X(cudaMallocHost(&(h_found[thr_id]), 4 * sizeof(uint32_t)), 0); - cuda_check_cpu_init(thr_id, throughput); - init[thr_id] = true; +#endif + quark_groestl512_cpu_init(thr_id, throughputmax); + quark_bmw512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); + x11_simd512_cpu_init(thr_id, throughputmax); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * 4 * throughputmax)); + CUDA_SAFE_CALL(cudaMallocHost(&h_found, 2 * sizeof(uint32_t))); + mining_has_stopped[thr_id] = false; + init = true; } + uint32_t throughput = min(throughputmax, max_nonce - first_nonce) & 0xfffffc00; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 256 : 32; + uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); - quark_blake512_cpu_setBlock_80((void*)endiandata); - - do { - int order = 0; - - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - #ifdef FASTECHO - x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], ptarget[7], h_found[thr_id], order++); - if (h_found[thr_id][0] != 0xffffffff) + be32enc(&endiandata[k], pdata[k]); + + 
quark_blake512_cpu_setBlock_80(thr_id, (uint64_t *)endiandata); + CUDA_SAFE_CALL(cudaGetLastError()); + do + { + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash, simdthreads); + x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], d_hash, ptarget[7], h_found); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(h_found[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], h_found[thr_id][0]); - x11hash(vhash64, endiandata); + uint32_t vhash64[8] = {0}; + if(opt_verify) + { + be32enc(&endiandata[19], h_found[0]); + x11hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + } + if(vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; - // check if there was some other ones... 
*hashes_done = pdata[19] - first_nonce + throughput; - if (h_found[thr_id][1] != 0xffffffff) + if(h_found[1] != 0xffffffff) { - pdata[21] = h_found[thr_id][1]; - res++; - if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, h_found[thr_id][1], vhash64[7], Htarg); - } - pdata[19] = h_found[thr_id][0]; - if (opt_benchmark) - applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, h_found[thr_id][0], vhash64[7], Htarg); - return res; - } - else - { - if (vhash64[7] != Htarg) + if(opt_verify) + { + be32enc(&endiandata[19], h_found[1]); + x11hash(vhash64, endiandata); + } if(vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, h_found[thr_id][0]); + + pdata[21] = h_found[1]; + res++; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nonce %08x", device_map[thr_id], h_found[1]); + } + else + { + if(vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } } - } - } - #else - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (foundNonce != UINT32_MAX) - { - const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); - x11hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) - { - int res = 1; - // check if there was some other ones... 
- uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); - *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; - if (opt_benchmark) applog(LOG_INFO, "Found second nounce", thr_id, foundNonce, vhash64[7], Htarg); } - pdata[19] = foundNonce; - if (opt_benchmark) applog(LOG_INFO, "Found nounce", thr_id, foundNonce, vhash64[7], Htarg); + pdata[19] = h_found[0]; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_found[0]); return res; } else { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + if(vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); + } } } - #endif - pdata[19] += throughput; - } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + } while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/x13/cuda_x13_fugue512.cu b/x13/cuda_x13_fugue512.cu index cd851726f8..f66f447574 100644 --- a/x13/cuda_x13_fugue512.cu +++ b/x13/cuda_x13_fugue512.cu @@ -11,10 +11,10 @@ #include "cuda_helper.h" + __constant__ uint32_t pTarget[8]; static uint32_t *d_nonce[MAX_GPUS]; - /* * X13 kernel implementation. 
* @@ -46,17 +46,17 @@ static uint32_t *d_nonce[MAX_GPUS]; * @author phm */ -#define mixtab0(x) (*((uint32_t*)mixtabs + ( (x)))) -#define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x)))) -#define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x)))) -#define mixtab3(x) (*((uint32_t*)mixtabs + (768+(x)))) +#define mixtab0(x) (*(mixtabs + ( (x)))) +#define mixtab1(x) (*(mixtabs + (256+(x)))) +#define mixtab2(x) (*(mixtabs + (512+(x)))) +#define mixtab3(x) (*(mixtabs + (768+(x)))) -texture mixTab0Tex; -texture mixTab1Tex; -texture mixTab2Tex; -texture mixTab3Tex; +//texture mixTab0Tex; +//texture mixTab1Tex; +//texture mixTab2Tex; +//texture mixTab3Tex; -static const uint32_t mixtab0_cpu[] = { +__constant__ uint32_t mixTab0Tex[] = { SPH_C32(0x63633297), SPH_C32(0x7c7c6feb), SPH_C32(0x77775ec7), SPH_C32(0x7b7b7af7), SPH_C32(0xf2f2e8e5), SPH_C32(0x6b6b0ab7), SPH_C32(0x6f6f16a7), SPH_C32(0xc5c56d39), SPH_C32(0x303090c0), @@ -144,8 +144,8 @@ static const uint32_t mixtab0_cpu[] = { SPH_C32(0xb0b03df6), SPH_C32(0x5454b74b), SPH_C32(0xbbbb0cda), SPH_C32(0x16166258) }; - - static const uint32_t mixtab1_cpu[] = { +/* +__constant__ uint32_t mixTab1Tex[] = { SPH_C32(0x97636332), SPH_C32(0xeb7c7c6f), SPH_C32(0xc777775e), SPH_C32(0xf77b7b7a), SPH_C32(0xe5f2f2e8), SPH_C32(0xb76b6b0a), SPH_C32(0xa76f6f16), SPH_C32(0x39c5c56d), SPH_C32(0xc0303090), @@ -234,7 +234,7 @@ static const uint32_t mixtab0_cpu[] = { SPH_C32(0x58161662) }; - static const uint32_t mixtab2_cpu[] = { +__constant__ uint32_t mixTab2Tex[] = { SPH_C32(0x32976363), SPH_C32(0x6feb7c7c), SPH_C32(0x5ec77777), SPH_C32(0x7af77b7b), SPH_C32(0xe8e5f2f2), SPH_C32(0x0ab76b6b), SPH_C32(0x16a76f6f), SPH_C32(0x6d39c5c5), SPH_C32(0x90c03030), @@ -323,7 +323,7 @@ static const uint32_t mixtab0_cpu[] = { SPH_C32(0x62581616) }; - static const uint32_t mixtab3_cpu[] = { +__constant__ uint32_t mixTab3Tex[] = { SPH_C32(0x63329763), SPH_C32(0x7c6feb7c), SPH_C32(0x775ec777), SPH_C32(0x7b7af77b), SPH_C32(0xf2e8e5f2), SPH_C32(0x6b0ab76b), 
SPH_C32(0x6f16a76f), SPH_C32(0xc56d39c5), SPH_C32(0x3090c030), @@ -411,6 +411,7 @@ static const uint32_t mixtab0_cpu[] = { SPH_C32(0xb03df6b0), SPH_C32(0x54b74b54), SPH_C32(0xbb0cdabb), SPH_C32(0x16625816) }; +*/ #define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \ x22 ^= x00; \ @@ -430,29 +431,90 @@ static const uint32_t mixtab0_cpu[] = { x20 ^= x06; \ } #define SMIX(x0, x1, x2, x3) { \ - uint32_t c0 = 0; \ - uint32_t c1 = 0; \ - uint32_t c2 = 0; \ - uint32_t c3 = 0; \ - uint32_t r0 = 0; \ - uint32_t r1 = 0; \ - uint32_t r2 = 0; \ - uint32_t r3 = 0; \ - uint32_t tmp; \ - tmp = mixtab0(__byte_perm(x0, 0, 0x4443)); \ - c0 ^= tmp; \ + uint32_t tmp = mixtab0(__byte_perm(x0, 0, 0x4443)); \ + uint32_t c0 = tmp; \ tmp = mixtab1(__byte_perm(x0, 0, 0x4442)); \ c0 ^= tmp; \ - r1 ^= tmp; \ + uint32_t r1 = tmp; \ tmp = mixtab2(__byte_perm(x0, 0, 0x4441)); \ c0 ^= tmp; \ - r2 ^= tmp; \ + uint32_t r2= tmp; \ tmp = mixtab3(__byte_perm(x0, 0, 0x4440)); \ c0 ^= tmp; \ - r3 ^= tmp; \ + uint32_t r3= tmp; \ tmp = mixtab0(__byte_perm(x1, 0, 0x4443)); \ + uint32_t c1 = tmp; \ + uint32_t r0 = tmp; \ + tmp = mixtab1(__byte_perm(x1, 0, 0x4442)); \ + c1 ^= tmp; \ + tmp = mixtab2(__byte_perm(x1, 0, 0x4441)); \ c1 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(__byte_perm(x1, 0, 0x4440)); \ + c1 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(__byte_perm(x2, 0, 0x4443)); \ + uint32_t c2 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1(__byte_perm(x2, 0, 0x4442)); \ + c2 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2(__byte_perm(x2, 0, 0x4441)); \ + c2 ^= tmp; \ + tmp = mixtab3(__byte_perm(x2, 0, 0x4440)); \ + c2 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(__byte_perm(x3, 0, 0x4443)); \ + uint32_t c3 = tmp; \ r0 ^= tmp; \ + tmp = mixtab1(__byte_perm(x3, 0, 0x4442)); \ + c3 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2(__byte_perm(x3, 0, 0x4441)); \ + c3 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(__byte_perm(x3, 0, 0x4440)); \ + c3 ^= tmp; \ + uint32_t tmp2 = __byte_perm((c0 ^ r0),(c1 ^ r1), 0x3636);\ + tmp= 
__byte_perm((c2 ^ r2),(c3 ^ r3), 0x1414); \ + x0 = __byte_perm(tmp2,tmp, 0x3254);\ + r0 = ROL8(r0); \ + r1 = ROL8(r1); \ + r2 = ROL8(r2); \ + r3 = ROL8(r3); \ + tmp2 = __byte_perm((c1 ^ r0),(c2 ^ r1), 0x3636);\ + tmp= __byte_perm((c3 ^ r2),(c0 ^ r3), 0x1414); \ + x1 = __byte_perm(tmp2,tmp, 0x3254);\ + r0 = ROL8(r0); \ + r1 = ROL8(r1); \ + r2 = ROL8(r2); \ + r3 = ROL8(r3); \ + tmp2 = __byte_perm((c2 ^ r0),(c3 ^ r1), 0x3636);\ + tmp= __byte_perm((c0 ^ r2),(c1 ^ r3), 0x1414); \ + x2 = __byte_perm(tmp2,tmp, 0x3254);\ + r0 = ROL8(r0); \ + r1 = ROL8(r1); \ + r2 = ROL8(r2); \ + r3 = ROL8(r3); \ + tmp2 = __byte_perm((c3 ^ r0),(c0 ^ r1), 0x3636);\ + tmp= __byte_perm((c1 ^ r2),(c2 ^ r3), 0x1414); \ + x3 = __byte_perm(tmp2,tmp, 0x3254);\ + } +#define SMIX0(x0, x1, x2, x3) { \ + uint32_t tmp = mixtab0(__byte_perm(x0, 0, 0x4443)); \ + uint32_t c0 = tmp; \ + tmp = mixtab1(__byte_perm(x0, 0, 0x4442)); \ + c0 ^= tmp; \ + uint32_t r1 = tmp; \ + tmp = mixtab2(__byte_perm(x0, 0, 0x4441)); \ + c0 ^= tmp; \ + uint32_t r2= tmp; \ + tmp = mixtab3(__byte_perm(x0, 0, 0x4440)); \ + c0 ^= tmp; \ + uint32_t r3= tmp; \ + tmp = mixtab0(__byte_perm(x1, 0, 0x4443)); \ + uint32_t c1 = tmp; \ + uint32_t r0 = tmp; \ tmp = mixtab1(__byte_perm(x1, 0, 0x4442)); \ c1 ^= tmp; \ tmp = mixtab2(__byte_perm(x1, 0, 0x4441)); \ @@ -462,7 +524,7 @@ static const uint32_t mixtab0_cpu[] = { c1 ^= tmp; \ r3 ^= tmp; \ tmp = mixtab0(__byte_perm(x2, 0, 0x4443)); \ - c2 ^= tmp; \ + uint32_t c2 = tmp; \ r0 ^= tmp; \ tmp = mixtab1(__byte_perm(x2, 0, 0x4442)); \ c2 ^= tmp; \ @@ -473,7 +535,7 @@ static const uint32_t mixtab0_cpu[] = { c2 ^= tmp; \ r3 ^= tmp; \ tmp = mixtab0(__byte_perm(x3, 0, 0x4443)); \ - c3 ^= tmp; \ + uint32_t c3 = tmp; \ r0 ^= tmp; \ tmp = mixtab1(__byte_perm(x3, 0, 0x4442)); \ c3 ^= tmp; \ @@ -483,23 +545,11 @@ static const uint32_t mixtab0_cpu[] = { r2 ^= tmp; \ tmp = mixtab3(__byte_perm(x3, 0, 0x4440)); \ c3 ^= tmp; \ - x0 = ((c0 ^ r0) & SPH_C32(0xFF000000)) \ - | ((c1 ^ r1) & SPH_C32(0x00FF0000)) \ 
- | ((c2 ^ r2) & SPH_C32(0x0000FF00)) \ - | ((c3 ^ r3) & SPH_C32(0x000000FF)); \ - x1 = ((c1 ^ (r0 << 8)) & SPH_C32(0xFF000000)) \ - | ((c2 ^ (r1 << 8)) & SPH_C32(0x00FF0000)) \ - | ((c3 ^ (r2 << 8)) & SPH_C32(0x0000FF00)) \ - | ((c0 ^ (r3 >> 24)) & SPH_C32(0x000000FF)); \ - x2 = ((c2 ^ (r0 << 16)) & SPH_C32(0xFF000000)) \ - | ((c3 ^ (r1 << 16)) & SPH_C32(0x00FF0000)) \ - | ((c0 ^ (r2 >> 16)) & SPH_C32(0x0000FF00)) \ - | ((c1 ^ (r3 >> 16)) & SPH_C32(0x000000FF)); \ - x3 = ((c3 ^ (r0 << 24)) & SPH_C32(0xFF000000)) \ - | ((c0 ^ (r1 >> 8)) & SPH_C32(0x00FF0000)) \ - | ((c1 ^ (r2 >> 8)) & SPH_C32(0x0000FF00)) \ - | ((c2 ^ (r3 >> 8)) & SPH_C32(0x000000FF)); \ + uint32_t tmp2 = __byte_perm((c0 ^ r0),(c1 ^ r1), 0x3636);\ + tmp= __byte_perm((c2 ^ r2),(c3 ^ r3), 0x1414); \ + x0 = __byte_perm(tmp2,tmp, 0x3254);\ } + #define ROR3 { \ B33 = S33, B34 = S34, B35 = S35; \ S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \ @@ -508,20 +558,13 @@ static const uint32_t mixtab0_cpu[] = { S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \ } -#define ROR8 { \ - B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \ - S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \ - S26 = S18; S25 = S17; S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \ - S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \ - S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \ - } - -#define ROR9 { \ - B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \ - S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \ - S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \ - S17 = 
S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = S01; S09 = S00; \ - S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \ +#define ROL1 { \ + B35 = S00; \ + S00 = S01; S01 = S02; S02 = S03; S03 = S04; S04 = S05; S05 = S06; S06 = S07; S07 = S08; S08 = S09; S09 = S10; \ + S10 = S11; S11 = S12; S12 = S13; S13 = S14; S14 = S15; S15 = S16; S16 = S17; S17 = S18; S18 = S19; S19 = S20; \ + S20 = S21; S21 = S22; S22 = S23; S23 = S24; S24 = S25; S25 = S26; S26 = S27; S27 = S28; S28 = S29; S29 = S30; \ + S30 = S31; S31 = S32; S32 = S33; S33 = S34; S34 = S35; \ + S35 = B35; \ } #define FUGUE512_3(x, y, z) { \ @@ -556,92 +599,229 @@ static const uint32_t mixtab0_cpu[] = { SMIX(S00, S01, S02, S03); \ } -//__launch_bounds__(128, 6) -__global__ __launch_bounds__(128,8) -void x13_fugue512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ __launch_bounds__(128, 8) +void x13_fugue512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash) { __shared__ uint32_t mixtabs[1024]; - - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - if (threadIdx.x < 128) + if (threadIdx.x < 128) { - *((uint32_t*)mixtabs + (threadIdx.x)) = tex1Dfetch(mixTab0Tex, threadIdx.x); - *((uint32_t*)mixtabs + (128 + threadIdx.x)) = tex1Dfetch(mixTab0Tex, threadIdx.x+128); - *((uint32_t*)mixtabs + (256 + threadIdx.x)) = tex1Dfetch(mixTab1Tex, threadIdx.x); - *((uint32_t*)mixtabs + (384 + threadIdx.x)) = tex1Dfetch(mixTab1Tex , threadIdx.x+128); - *((uint32_t*)mixtabs + (512 + threadIdx.x)) = tex1Dfetch(mixTab2Tex, threadIdx.x); - *((uint32_t*)mixtabs + (640 + threadIdx.x)) = tex1Dfetch(mixTab2Tex, threadIdx.x+128); - *((uint32_t*)mixtabs + (768 + threadIdx.x)) = tex1Dfetch(mixTab3Tex, threadIdx.x); - *((uint32_t*)mixtabs + (896 + 
threadIdx.x)) = tex1Dfetch(mixTab3Tex, threadIdx.x+128); + mixtabs[threadIdx.x] = mixTab0Tex[threadIdx.x]; + mixtabs[threadIdx.x + 128] = mixTab0Tex[threadIdx.x + 128]; + mixtabs[256 + threadIdx.x] = ROTR32(mixtabs[threadIdx.x], 8); + mixtabs[256 + threadIdx.x + 128] = ROTR32(mixtabs[threadIdx.x + 128], 8); + mixtabs[512 + threadIdx.x] = ROTR32(mixtabs[threadIdx.x], 16); + mixtabs[512 + threadIdx.x + 128] = ROTR32(mixtabs[threadIdx.x + 128], 16); + mixtabs[768 + threadIdx.x] = ROTR32(mixtabs[threadIdx.x], 24); + mixtabs[768 + threadIdx.x + 128] = ROTR32(mixtabs[threadIdx.x + 128], 24); } - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; + __syncthreads(); + const uint32_t nounce = (startNounce + thread); + + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = &g_hash[hashPosition*16]; - #pragma unroll 16 +#pragma unroll 16 for (int i = 0; i < 16; i++) Hash[i] = cuda_swab32(Hash[i]); - __syncthreads(); uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; uint32_t S30, S31, S32, S33, S34, S35; - uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35; - uint64_t bc = (uint64_t) 64 << 3; - uint32_t bclo = (uint32_t)(bc & 0xFFFFFFFFULL); - uint32_t bchi = (uint32_t)(bc >> 32); - - S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; - S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); - S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); - S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f); - S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 
= SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567); - - FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2])); - FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5])); - FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8])); - FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB])); - FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE])); - FUGUE512_3((Hash[0xF]), bchi, bclo); - - //#pragma unroll + uint32_t B33, B34, B35; + + S02 = S03 = S05 = S06 = S09 = S10 = S11 = S12 = S13 = S14 = S16 = S17 = S18 = S19 = 0; + S20 = 0x8807a57eUL; S21 = 0xe616af75UL; S22 = 0xc5d3e4dbUL; S23 = 0xac9ab027UL; + S24 = 0xd915f117UL; S25 = 0xb6eecc54UL; S26 = 0x06e8020bUL; S27 = 0x4a92efd1UL; + S28 = 0xaac6e2c9UL; S29 = 0xddb21398UL; S30 = 0xcae65838UL; S31 = 0x437f203fUL; + S32 = 0x25ea78e7UL; S33 = 0x4c0a2cc1UL; S34 = 0xda6ed11dUL; S35 = 0xe13e3567UL; + + S01 = 0xd915f117UL; + S04 = 0x4a92efd1UL; + S07 = 0xcae65838UL; + S15 = 0xd915f117UL; + S00 = Hash[0]; + S08 = Hash[0]; + + uint32_t c0 = 0x9ae23283UL; + uint32_t c1 = 0x0361b92dUL; + uint32_t c2 = 0x4c92d8edUL; + uint32_t r0, r1, r2; + uint32_t tmp, tmp2, c3; + + tmp = mixtabs[__byte_perm(S00, 0, 17475)]; c3 = tmp; r0 = 0xafaf608aUL ^ tmp; + tmp = mixtabs[256 + __byte_perm(S00, 0, 17474)]; c3 ^= tmp; r1 = 0x79d5d51dUL ^ tmp; + tmp = mixtabs[512 + __byte_perm(S00, 0, 17473)]; c3 ^= tmp; r2 = 0xf6274f4fUL ^ tmp; + tmp = mixtabs[768 + __byte_perm(S00, 0, 17472)]; c3 ^= tmp; + tmp2 = __byte_perm(c0 ^ r0, c1 ^ r1, 13878); + tmp = __byte_perm(c2 ^ r2, c3 ^ 0x59947f59UL, 5140); + S33 = __byte_perm(tmp2, tmp, 12884); + r0 = ROL8(r0); r1 = ROL8(r1); + r2 = ROL8(r2); + tmp2 = __byte_perm(c1 ^ r0, c2 ^ r1, 13878); + tmp = __byte_perm(c3 ^ r2, c0 ^ 0x947f5959UL, 5140); + S34 = __byte_perm(tmp2, tmp, 12884); + r0 = ROL8(r0); r1 = ROL8(r1); + r2 = ROL8(r2); + tmp2 = __byte_perm(c2 ^ r0, c3 ^ r1, 13878); + tmp = __byte_perm(c0 ^ r2, c1 ^ 0x7f595994UL, 5140); + S35 = __byte_perm(tmp2, tmp, 12884); + r0 = ROL8(r0); r1 = ROL8(r1); + r2 = ROL8(r2); + tmp2 = __byte_perm(c3 
^ r0, c0 ^ r1, 13878); + tmp = __byte_perm(c1 ^ r2, c2 ^ 0x5959947fUL, 5140); + S00 = __byte_perm(tmp2, tmp, 12884); + + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); + SMIX(S30, S31, S32, S33); + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); + SMIX(S27, S28, S29, S30); + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); + SMIX(S24, S25, S26, S27); + + TIX4(Hash[1], S24, S25, S28, S31, S32, S10, S12, S15, S18); + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); + SMIX(S21, S22, S23, S24); + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); + SMIX(S18, S19, S20, S21); + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); + SMIX(S15, S16, S17, S18); + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); + SMIX(S12, S13, S14, S15); + + TIX4(Hash[2], S12, S13, S16, S19, S20, S34, S00, S03, S06); + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); + SMIX(S09, S10, S11, S12); + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); + SMIX(S06, S07, S08, S09); + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); + SMIX(S03, S04, S05, S06); + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); +#pragma unroll + for (int i = 3; i < (5 * 3); i += 3) + { + FUGUE512_3((Hash[i]), (Hash[i + 1]), (Hash[i + 2])); + } + TIX4(Hash[0xF], S00, S01, S04, S07, S08, S22, S24, S27, S30); + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); + SMIX(S33, S34, S35, S00); + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); + SMIX(S30, S31, S32, S33); + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); + SMIX(S27, S28, S29, S30); + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); + SMIX(S24, S25, S26, S27); + + S10 ^= S24; + S25 ^= S12; S28 ^= S15; S31 ^= S18; + S21 ^= S25; S22 ^= S26; S23 ^= S27; S03 ^= S25; S04 ^= S26; S05 ^= S27; + tmp = (*(mixtabs + ((__byte_perm(S21, 0, 0x4443))))); c0 = tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S21, 0, 0x4442))))); c0 ^= tmp; r1 = tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S21, 0, 
0x4441))))); c0 ^= tmp; r2 = tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S21, 0, 0x4440))))); c0 ^= tmp; uint32_t r3 = tmp; tmp = (*(mixtabs + ((__byte_perm(S22, 0, 0x4443))))); c1 = tmp; r0 = tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S22, 0, 0x4442))))); c1 ^= tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S22, 0, 0x4441))))); c1 ^= tmp; r2 ^= tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S22, 0, 0x4440))))); c1 ^= tmp; r3 ^= tmp; tmp = (*(mixtabs + ((__byte_perm(S23, 0, 0x4443))))); c2 = tmp; r0 ^= tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S23, 0, 0x4442))))); c2 ^= tmp; r1 ^= tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S23, 0, 0x4441))))); c2 ^= tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S23, 0, 0x4440))))); c2 ^= tmp; r3 ^= tmp; + r0 ^= 0x63633297UL; + r1 ^= 0x97636332UL; + r2 ^= 0x32976363UL; + c3 = (0x63633297UL ^ 0x97636332UL ^ 0x32976363UL ^ 0x63329763UL); + tmp2 = __byte_perm((c0 ^ r0), (c1 ^ r1), 0x3636); tmp = __byte_perm((c2 ^ r2), (c3 ^ r3), 0x1414); + S21 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c1 ^ r0), (c2 ^ r1), 0x3636); tmp = __byte_perm((c3 ^ r2), (c0 ^ r3), 0x1414); + S22 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c2 ^ r0), (c3 ^ r1), 0x3636); tmp = __byte_perm((c0 ^ r2), (c1 ^ r3), 0x1414); + S23 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c3 ^ r0), (c0 ^ r1), 0x3636); tmp = __byte_perm((c1 ^ r2), (c2 ^ r3), 0x1414); + S24 = __byte_perm(tmp2, tmp, 0x3254); + + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); + SMIX(S18, S19, S20, S21); + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); + SMIX(S15, S16, S17, S18); + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); + SMIX(S12, S13, S14, 
S15); + + S34 ^= S12; + S12 = (64 << 3); + S20 ^= S12; S13 ^= S00; S16 ^= S03; S19 ^= S06; + S09 ^= S13; S10 ^= S14; S11 ^= S15; S27 ^= S13; S28 ^= S14; S29 ^= S15; + tmp = (*(mixtabs + ((__byte_perm(S09, 0, 0x4443))))); c0 = tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S09, 0, 0x4442))))); c0 ^= tmp; r1 = tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S09, 0, 0x4441))))); c0 ^= tmp; r2 = tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S09, 0, 0x4440))))); c0 ^= tmp; r3 = tmp; tmp = (*(mixtabs + ((__byte_perm(S10, 0, 0x4443))))); c1 = tmp; r0 = tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S10, 0, 0x4442))))); c1 ^= tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S10, 0, 0x4441))))); c1 ^= tmp; r2 ^= tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S10, 0, 0x4440))))); c1 ^= tmp; r3 ^= tmp; tmp = (*(mixtabs + ((__byte_perm(S11, 0, 0x4443))))); c2 = tmp; r0 ^= tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S11, 0, 0x4442))))); c2 ^= tmp; r1 ^= tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S11, 0, 0x4441))))); c2 ^= tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S11, 0, 0x4440))))); c2 ^= tmp; r3 ^= tmp; + r0 ^= 0x63633297UL; + r1 ^= 0x97636332UL; + r2 ^= 0x5ec77777UL; + c3 = (0x63633297UL ^ 0x97636332UL ^ 0x5ec77777UL ^ 0x63329763); + tmp2 = __byte_perm((c0 ^ r0), (c1 ^ r1), 0x3636); tmp = __byte_perm((c2 ^ r2), (c3 ^ r3), 0x1414); + S09 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c1 ^ r0), (c2 ^ r1), 0x3636); tmp = __byte_perm((c3 ^ r2), (c0 ^ r3), 0x1414); + S10 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c2 ^ r0), (c3 ^ r1), 0x3636); tmp = __byte_perm((c0 ^ r2), (c1 ^ r3), 0x1414); + S11 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c3 ^ r0), (c0 ^ 
r1), 0x3636); tmp = __byte_perm((c1 ^ r2), (c2 ^ r3), 0x1414); + S12 = __byte_perm(tmp2, tmp, 0x3254); + + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); + SMIX(S06, S07, S08, S09); + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); + SMIX(S03, S04, S05, S06); + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + + //#pragma unroll for (int i = 0; i < 32; i++) { ROR3; CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); SMIX(S00, S01, S02, S03); } - //#pragma unroll - for (int i = 0; i < 13; i++) { + //#pragma unroll + for (int i = 0; i < 13; i++) + { S04 ^= S00; S09 ^= S00; S18 ^= S00; S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S18 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S28 ^= S00; - ROR8; - SMIX(S00, S01, S02, S03); + SMIX(S27, S28, S29, S30); + S31 ^= S27; + S01 ^= S27; + S09 ^= S27; + S18 ^= S27; + SMIX(S18, S19, S20, S21); + S22 ^= S18; + S28 ^= S18; + S01 ^= S18; + S09 ^= S18; + SMIX(S09, S10, S11, S12); + S13 ^= S09; + S19 ^= S09; + S28 ^= S09; + S01 ^= S09; + SMIX(S01, S02, S03, S04); + ROL1; } S04 ^= S00; S09 ^= S00; @@ -667,29 +847,25 @@ void x13_fugue512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * } } -__global__ __launch_bounds__(128, 7) -void x13_fugue512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint64_t *const __restrict__ g_hash, uint32_t *const __restrict__ g_nonceVector, uint32_t *const __restrict__ d_nonce) +__global__ __launch_bounds__(256,3) +void x13_fugue512_gpu_hash_64_final(const uint32_t threads, const uint32_t startNounce, const uint32_t *const __restrict__ g_hash, uint32_t *const __restrict__ d_nonce) { __shared__ uint32_t mixtabs[1024]; - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread< 
threads) { - if(threadIdx.x < 128) + if (threadIdx.x < 256) { - *((uint32_t*)mixtabs + (threadIdx.x)) = tex1Dfetch(mixTab0Tex, threadIdx.x); - *((uint32_t*)mixtabs + (128 + threadIdx.x)) = tex1Dfetch(mixTab0Tex, threadIdx.x + 128); - *((uint32_t*)mixtabs + (256 + threadIdx.x)) = tex1Dfetch(mixTab1Tex, threadIdx.x); - *((uint32_t*)mixtabs + (256 + 128 + threadIdx.x)) = tex1Dfetch(mixTab1Tex, threadIdx.x + 128); - *((uint32_t*)mixtabs + (512 + threadIdx.x)) = tex1Dfetch(mixTab2Tex, threadIdx.x); - *((uint32_t*)mixtabs + (512 + 128 + threadIdx.x)) = tex1Dfetch(mixTab2Tex, threadIdx.x + 128); - *((uint32_t*)mixtabs + (768 + threadIdx.x)) = tex1Dfetch(mixTab3Tex, threadIdx.x); - *((uint32_t*)mixtabs + (768 + 128 + threadIdx.x)) = tex1Dfetch(mixTab3Tex, threadIdx.x + 128); + mixtabs[threadIdx.x] = mixTab0Tex[threadIdx.x]; + mixtabs[256 + threadIdx.x] = ROTR32(mixtabs[threadIdx.x], 8); + mixtabs[(512 + threadIdx.x)] = ROTR32(mixtabs[threadIdx.x], 16); + mixtabs[(768 + threadIdx.x)] = ROTR32(mixtabs[threadIdx.x], 24); } - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *h = (uint32_t*)&g_hash[hashPosition * 8]; + __syncthreads(); + const uint32_t nounce = (startNounce + thread); + const uint32_t hashPosition = nounce - startNounce; + const uint32_t *h = &g_hash[hashPosition * 16]; uint32_t Hash[16]; #pragma unroll 16 for (int i = 0; i < 16; i++) @@ -700,24 +876,175 @@ void x13_fugue512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; uint32_t S30, S31, S32, S33, S34, S35; - uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35; - uint64_t bc = (uint64_t)64 << 3; - uint32_t bclo = (uint32_t)(bc & 0xFFFFFFFFULL); - uint32_t bchi = (uint32_t)(bc >> 32); + uint32_t B33, B34, B35; - S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; - S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); - S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); - S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f); - S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567); - __syncthreads(); + S02 = S03 = S05 = S06 = S09 = S10 = S11 = S12 = S13 = S14 = S16 = S17 = S18 = S19 = 0; + S20 = 0x8807a57eUL; S21 = 0xe616af75UL; S22 = 0xc5d3e4dbUL; S23 = 0xac9ab027UL; + S24 = 0xd915f117UL; S25 = 0xb6eecc54UL; S26 = 0x06e8020bUL; S27 = 0x4a92efd1UL; + S28 = 0xaac6e2c9UL; S29 = 0xddb21398UL; S30 = 0xcae65838UL; S31 = 0x437f203fUL; + S32 = 0x25ea78e7UL; S33 = 0x4c0a2cc1UL; S34 = 0xda6ed11dUL; S35 = 0xe13e3567UL; + + S01 = 0xd915f117UL; + S04 = 0x4a92efd1UL; + S07 = 0xcae65838UL; + S15 = 0xd915f117UL; + S00 = Hash[0]; + S08 = Hash[0]; + + uint32_t c0 = 0x9ae23283UL; + uint32_t c1 = 0x0361b92dUL; + uint32_t c2 = 
0x4c92d8edUL; + uint32_t r0, r1, r2; + uint32_t tmp, tmp2, c3; + + tmp = mixtabs[__byte_perm(S00, 0, 17475)]; c3 = tmp; r0 = 0xafaf608aUL ^ tmp; + tmp = mixtabs[256 + __byte_perm(S00, 0, 17474)]; c3 ^= tmp; r1 = 0x79d5d51dUL ^ tmp; + tmp = mixtabs[512 + __byte_perm(S00, 0, 17473)]; c3 ^= tmp; r2 = 0xf6274f4fUL ^ tmp; + tmp = mixtabs[768 + __byte_perm(S00, 0, 17472)]; c3 ^= tmp; + tmp2 = __byte_perm(c0 ^ r0, c1 ^ r1, 13878); + tmp = __byte_perm(c2 ^ r2, c3 ^ 0x59947f59UL, 5140); + S33 = __byte_perm(tmp2, tmp, 12884); + r0 = ROL8(r0); r1 = ROL8(r1); + r2 = ROL8(r2); + tmp2 = __byte_perm(c1 ^ r0, c2 ^ r1, 13878); + tmp = __byte_perm(c3 ^ r2, c0 ^ 0x947f5959UL, 5140); + S34 = __byte_perm(tmp2, tmp, 12884); + r0 = ROL8(r0); r1 = ROL8(r1); + r2 = ROL8(r2); + tmp2 = __byte_perm(c2 ^ r0, c3 ^ r1, 13878); + tmp = __byte_perm(c0 ^ r2, c1 ^ 0x7f595994UL, 5140); + S35 = __byte_perm(tmp2, tmp, 12884); + r0 = ROL8(r0); r1 = ROL8(r1); + r2 = ROL8(r2); + tmp2 = __byte_perm(c3 ^ r0, c0 ^ r1, 13878); + tmp = __byte_perm(c1 ^ r2, c2 ^ 0x5959947fUL, 5140); + S00 = __byte_perm(tmp2, tmp, 12884); + + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); + SMIX(S30, S31, S32, S33); + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); + SMIX(S27, S28, S29, S30); + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); + SMIX(S24, S25, S26, S27); + + TIX4(Hash[1], S24, S25, S28, S31, S32, S10, S12, S15, S18); + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); + SMIX(S21, S22, S23, S24); + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); + SMIX(S18, S19, S20, S21); + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); + SMIX(S15, S16, S17, S18); + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); + SMIX(S12, S13, S14, S15); + + TIX4(Hash[2], S12, S13, S16, S19, S20, S34, S00, S03, S06); + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); + SMIX(S09, S10, S11, S12); + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); + SMIX(S06, S07, S08, S09); + CMIX36(S03, S04, S05, S07, 
S08, S09, S21, S22, S23); + SMIX(S03, S04, S05, S06); + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + +#pragma unroll + for (int i = 3; i < (5 * 3); i += 3) + { + FUGUE512_3((Hash[i]), (Hash[i + 1]), (Hash[i + 2])); + } + TIX4(Hash[0xF], S00, S01, S04, S07, S08, S22, S24, S27, S30); + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); + SMIX(S33, S34, S35, S00); + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); + SMIX(S30, S31, S32, S33); + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); + SMIX(S27, S28, S29, S30); + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); + SMIX(S24, S25, S26, S27); + + S10 ^= S24; + S25 ^= S12; S28 ^= S15; S31 ^= S18; + S21 ^= S25; S22 ^= S26; S23 ^= S27; S03 ^= S25; S04 ^= S26; S05 ^= S27; + tmp = (*(mixtabs + ((__byte_perm(S21, 0, 0x4443))))); c0 = tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S21, 0, 0x4442))))); c0 ^= tmp; r1 = tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S21, 0, 0x4441))))); c0 ^= tmp; r2 = tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S21, 0, 0x4440))))); c0 ^= tmp; uint32_t r3 = tmp; tmp = (*(mixtabs + ((__byte_perm(S22, 0, 0x4443))))); c1 = tmp; r0 = tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S22, 0, 0x4442))))); c1 ^= tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S22, 0, 0x4441))))); c1 ^= tmp; r2 ^= tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S22, 0, 0x4440))))); c1 ^= tmp; r3 ^= tmp; tmp = (*(mixtabs + ((__byte_perm(S23, 0, 0x4443))))); c2 = tmp; r0 ^= tmp; tmp = (*(mixtabs + (256 + (__byte_perm(S23, 0, 0x4442))))); c2 ^= tmp; r1 ^= tmp; tmp = (*(mixtabs + (512 + (__byte_perm(S23, 0, 0x4441))))); c2 ^= tmp; tmp = (*(mixtabs + (768 + (__byte_perm(S23, 0, 0x4440))))); c2 ^= tmp; r3 ^= tmp; + r0 ^= 0x63633297UL; + r1 ^= 0x97636332UL; + r2 ^= 0x32976363UL; + c3 = (0x63633297UL^0x97636332UL^0x32976363UL^0x63329763UL); + tmp2 = __byte_perm((c0 ^ r0), (c1 ^ r1), 0x3636); tmp = __byte_perm((c2 ^ r2), (c3 ^ r3), 0x1414); + S21 = __byte_perm(tmp2, tmp, 0x3254); + r0 = 
ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c1 ^ r0), (c2 ^ r1), 0x3636); tmp = __byte_perm((c3 ^ r2), (c0 ^ r3), 0x1414); + S22 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c2 ^ r0), (c3 ^ r1), 0x3636); tmp = __byte_perm((c0 ^ r2), (c1 ^ r3), 0x1414); + S23 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c3 ^ r0), (c0 ^ r1), 0x3636); tmp = __byte_perm((c1 ^ r2), (c2 ^ r3), 0x1414); + S24 = __byte_perm(tmp2, tmp, 0x3254); + + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); + SMIX(S18, S19, S20, S21); + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); + SMIX(S15, S16, S17, S18); + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); + SMIX(S12, S13, S14, S15); - FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2])); - FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5])); - FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8])); - FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB])); - FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE])); - FUGUE512_3((Hash[0xF]), bchi, bclo); + S34 ^= S12; + S12 = (64 << 3); + S20 ^= S12; S13 ^= S00; S16 ^= S03; S19 ^= S06; + S09 ^= S13; S10 ^= S14; S11 ^= S15; S27 ^= S13; S28 ^= S14; S29 ^= S15; + c0 = (*(mixtabs + ((__byte_perm(S09, 0, 0x4443))))); + tmp = (*(mixtabs + (256 + (__byte_perm(S09, 0, 0x4442))))); c0 ^= tmp; r1 = tmp; + tmp = (*(mixtabs + (512 + (__byte_perm(S09, 0, 0x4441))))); c0 ^= tmp; r2 = tmp; + tmp = (*(mixtabs + (768 + (__byte_perm(S09, 0, 0x4440))))); c0 ^= tmp; r3 = tmp; + tmp = (*(mixtabs + ((__byte_perm(S10, 0, 0x4443))))); c1 = tmp; r0 = tmp; + tmp = (*(mixtabs + (256 + (__byte_perm(S10, 0, 0x4442))))); c1 ^= tmp; + tmp = (*(mixtabs + (512 + (__byte_perm(S10, 0, 0x4441))))); c1 ^= tmp; r2 ^= tmp; + tmp = (*(mixtabs + 
(768 + (__byte_perm(S10, 0, 0x4440))))); c1 ^= tmp; r3 ^= tmp; + tmp = (*(mixtabs + ((__byte_perm(S11, 0, 0x4443))))); c2 = tmp; r0 ^= tmp; + tmp = (*(mixtabs + (256 + (__byte_perm(S11, 0, 0x4442))))); c2 ^= tmp; r1 ^= tmp; + tmp = (*(mixtabs + (512 + (__byte_perm(S11, 0, 0x4441))))); c2 ^= tmp; + tmp = (*(mixtabs + (768 + (__byte_perm(S11, 0, 0x4440))))); c2 ^= tmp; r3 ^= tmp; + r0 ^= 0x63633297UL; + r1 ^= 0x97636332UL; + r2 ^= 0x5ec77777UL; + c3 = (0x63633297UL^0x97636332UL^0x5ec77777UL^0x63329763); + tmp2 = __byte_perm((c0 ^ r0), (c1 ^ r1), 0x3636); tmp = __byte_perm((c2 ^ r2), (c3 ^ r3), 0x1414); + S09 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c1 ^ r0), (c2 ^ r1), 0x3636); tmp = __byte_perm((c3 ^ r2), (c0 ^ r3), 0x1414); + S10 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c2 ^ r0), (c3 ^ r1), 0x3636); tmp = __byte_perm((c0 ^ r2), (c1 ^ r3), 0x1414); + S11 = __byte_perm(tmp2, tmp, 0x3254); + r0 = ROTL32((r0), (8)); + r1 = ROTL32((r1), (8)); + r2 = ROTL32((r2), (8)); + r3 = ROTL32((r3), (8)); + tmp2 = __byte_perm((c3 ^ r0), (c0 ^ r1), 0x3636); tmp = __byte_perm((c1 ^ r2), (c2 ^ r3), 0x1414); + S12 = __byte_perm(tmp2, tmp, 0x3254); + + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); + SMIX(S06, S07, S08, S09); + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); + SMIX(S03, S04, S05, S06); + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); //#pragma unroll 32 for (int i = 0; i < 32; i++) { @@ -726,86 +1053,76 @@ void x13_fugue512_gpu_hash_64_final(uint32_t threads, uint32_t startNounce, uint SMIX(S00, S01, S02, S03); } //#pragma unroll 13 - for (int i = 0; i < 12; i++) + for (int i = 0; i < 11; i++) { S04 ^= S00; S09 ^= S00; S18 ^= S00; S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; 
- S10 ^= S00; - S18 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S28 ^= S00; - ROR8; - SMIX(S00, S01, S02, S03); + SMIX(S27, S28, S29, S30); + S31 ^= S27; + S01 ^= S27; + S09 ^= S27; + S18 ^= S27; + SMIX(S18, S19, S20, S21); + S22 ^= S18; + S28 ^= S18; + S01 ^= S18; + S09 ^= S18; + SMIX(S09, S10, S11, S12); + S13 ^= S09; + S19 ^= S09; + S28 ^= S09; + S01 ^= S09; + SMIX(S01, S02, S03, S04); + ROL1; } S04 ^= S00; S09 ^= S00; S18 ^= S00; S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S18 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - - - S04 ^= S00; - if (cuda_swab32(S04) <= pTarget[7]) + SMIX(S27, S28, S29, S30); + S31 ^= S27; + S01 ^= S27; + S09 ^= S27; + S18 ^= S27; + SMIX(S18, S19, S20, S21); + S22 ^= S18; + S28 ^= S18; + S01 ^= S18; + S09 ^= S18; + SMIX(S09, S10, S11, S12); + S13 ^= S09; + S19 ^= S09; + S28 ^= S09; + S01 ^= S09; + SMIX0(S01, S02, S03, S04); + S10 ^= S01; + S19 ^= S01; + S28 ^= S01; + SMIX0(S28, S29, S30, S31); + S10 ^= S28; + S19 ^= S28; + SMIX0(S19, S20, S21, S22); + S10 ^= S19; + SMIX0(S10, S11, S12, S13); + S14 ^= S10; + if (cuda_swab32(S14) <= pTarget[7]) { - if (d_nonce[0] != 0xffffffff) - { - if (d_nonce[0] < nounce) d_nonce[0] = nounce; - } - else d_nonce[0] = nounce; + uint32_t tmp = atomicExch(d_nonce, nounce); + if (tmp != 0xffffffff) + d_nonce[1] = tmp; } } } -#define texDef(texname, texmem, texsource, texsize) \ - unsigned int *texmem; \ - cudaMalloc(&texmem, texsize); \ - cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ - texname.normalized = 0; \ - texname.filterMode = cudaFilterModePoint; \ - texname.addressMode[0] = cudaAddressModeClamp; \ - { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, 
&texname, texmem, &channelDesc, texsize ); } - __host__ void x13_fugue512_cpu_init(int thr_id, uint32_t threads) { - texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); - texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256); - texDef(mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256); - texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); - cudaMalloc(&d_nonce[thr_id], sizeof(uint32_t)); + cudaMalloc(&d_nonce[thr_id], 2*sizeof(uint32_t)); } -__host__ void x13_fugue512_cpu_setTarget(const void *ptarget) +__host__ void x13_fugue512_cpu_setTarget(int thr_id, const void *ptarget) { - CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(pTarget, ptarget, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id])); } @@ -815,7 +1132,7 @@ __host__ void x13_fugue512_cpu_free(int32_t thr_id) cudaFreeHost(&d_nonce[thr_id]); } -__host__ void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { const uint32_t threadsperblock = 128; @@ -825,21 +1142,20 @@ __host__ void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - x13_fugue512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x13_fugue512_gpu_hash_64<<>>(threads, startNounce, d_hash); // MyStreamSynchronize(NULL, order, thr_id); } -__host__ uint32_t x13_fugue512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x13_fugue512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t *res) { - const uint32_t 
threadsperblock = 128; + const uint32_t threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cudaMemset(d_nonce[thr_id], 0xff, sizeof(uint32_t)); + cudaMemsetAsync(d_nonce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); - x13_fugue512_gpu_hash_64_final << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_nonce[thr_id]); - uint32_t res; - cudaMemcpy(&res, d_nonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - return res; + x13_fugue512_gpu_hash_64_final << >>(threads, startNounce, d_hash, d_nonce[thr_id]); + cudaMemcpyAsync(res, d_nonce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + cudaStreamSynchronize(gpustream[thr_id]); } diff --git a/x13/cuda_x13_hamsi512.cu b/x13/cuda_x13_hamsi512.cu index 47aab23c70..0070c80a52 100644 --- a/x13/cuda_x13_hamsi512.cu +++ b/x13/cuda_x13_hamsi512.cu @@ -1,463 +1,402 @@ -#include +#ifdef __cplusplus +#include +#include +using namespace std; +#else #include +#include +#endif #include +#include "cuda_helper.h" -typedef unsigned char BitSequence; - -#include "cuda_helper.h" - -#undef SPH_C32 -#define SPH_C32(x) (x) -#undef SPH_T32 -#define SPH_T32(x) (x) - -static __constant__ uint32_t d_alpha_n[32] = { - SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), - SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc), - SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0), - SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00), - SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc), - SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xff00aaaa), 
SPH_C32(0xccccf0f0) -}; -static __constant__ uint32_t d_alpha_f[32] = { - SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), - SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c), - SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9), - SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c), - SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c) +static __constant__ uint32_t d_T512[4096/4] = { + 0xef0b0270, 0x3afd0000, 0x5dae0000, + 0x69490000, 0x9b0f3c06, 0x4405b5f9, + 0x66140a51, 0x924f5d0a, 0xc96b0030, + 0xe7250000, 0x2f840000, 0x264f0000, + 0x08695bf9, 0x6dfcf137, 0x509f6984, + 0x9e69af68, + 0xc96b0030, 0xe7250000, 0x2f840000, + 0x264f0000, 0x08695bf9, 0x6dfcf137, + 0x509f6984, 0x9e69af68, 0x26600240, + 0xddd80000, 0x722a0000, 0x4f060000, + 0x936667ff, 0x29f944ce, 0x368b63d5, + 0x0c26f262, + 0x145a3c00, 0xb9e90000, 0x61270000, + 0xf1610000, 0xce613d6c, 0xb0493d78, + 0x47a96720, 0xe18e24c5, 0x23671400, + 0xc8b90000, 0xf4c70000, 0xfb750000, + 0x73cd2465, 0xf8a6a549, 0x02c40a3f, + 0xdc24e61f, + 0x23671400, 0xc8b90000, 0xf4c70000, + 0xfb750000, 0x73cd2465, 0xf8a6a549, + 0x02c40a3f, 0xdc24e61f, 0x373d2800, + 0x71500000, 0x95e00000, 0x0a140000, + 0xbdac1909, 0x48ef9831, 0x456d6d1f, + 0x3daac2da, + 0x54285c00, 0xeaed0000, 0xc5d60000, + 0xa1c50000, 0xb3a26770, 0x94a5c4e1, + 0x6bb0419d, 0x551b3782, 0x9cbb1800, + 0xb0d30000, 0x92510000, 0xed930000, + 0x593a4345, 0xe114d5f4, 0x430633da, + 0x78cace29, + 0x9cbb1800, 0xb0d30000, 0x92510000, + 0xed930000, 0x593a4345, 0xe114d5f4, + 0x430633da, 0x78cace29, 0xc8934400, + 0x5a3e0000, 0x57870000, 0x4c560000, + 0xea982435, 
0x75b11115, 0x28b67247, + 0x2dd1f9ab, + 0x29449c00, 0x64e70000, 0xf24b0000, + 0xc2f30000, 0x0ede4e8f, 0x56c23745, + 0xf3e04259, 0x8d0d9ec4, 0x466d0c00, + 0x08620000, 0xdd5d0000, 0xbadd0000, + 0x6a927942, 0x441f2b93, 0x218ace6f, + 0xbf2c0be2, + 0x466d0c00, 0x08620000, 0xdd5d0000, + 0xbadd0000, 0x6a927942, 0x441f2b93, + 0x218ace6f, 0xbf2c0be2, 0x6f299000, + 0x6c850000, 0x2f160000, 0x782e0000, + 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, + 0x32219526, + 0xf6800005, 0x3443c000, 0x24070000, + 0x8f3d0000, 0x21373bfb, 0x0ab8d5ae, + 0xcdc58b19, 0xd795ba31, 0xa67f0001, + 0x71378000, 0x19fc0000, 0x96db0000, + 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, + 0xac8e6c88, + 0xa67f0001, 0x71378000, 0x19fc0000, + 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, + 0x2c6d478f, 0xac8e6c88, 0x50ff0004, + 0x45744000, 0x3dfb0000, 0x19e60000, + 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, + 0x7b1bd6b9, + 0xf7750009, 0xcf3cc000, 0xc3d60000, + 0x04920000, 0x029519a9, 0xf8e836ba, + 0x7a87f14e, 0x9e16981a, 0xd46a0000, + 0x8dc8c000, 0xa5af0000, 0x4a290000, + 0xfc4e427a, 0xc9b4866c, 0x98369604, + 0xf746c320, + 0xd46a0000, 0x8dc8c000, 0xa5af0000, + 0x4a290000, 0xfc4e427a, 0xc9b4866c, + 0x98369604, 0xf746c320, 0x231f0009, + 0x42f40000, 0x66790000, 0x4ebb0000, + 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, + 0x69505b3a, + 0x774400f0, 0xf15a0000, 0xf5b20000, + 0x34140000, 0x89377e8c, 0x5a8bec25, + 0x0bc3cd1e, 0xcf3775cb, 0xf46c0050, + 0x96180000, 0x14a50000, 0x031f0000, + 0x42947eb8, 0x66bf7e19, 0x9ca470d2, + 0x8a341574, + 0xf46c0050, 0x96180000, 0x14a50000, + 0x031f0000, 0x42947eb8, 0x66bf7e19, + 0x9ca470d2, 0x8a341574, 0x832800a0, + 0x67420000, 0xe1170000, 0x370b0000, + 0xcba30034, 0x3c34923c, 0x9767bdcc, + 0x450360bf, + 0xe8870170, 0x9d720000, 0x12db0000, + 0xd4220000, 0xf2886b27, 0xa921e543, + 0x4ef8b518, 0x618813b1, 0xb4370060, + 0x0c4c0000, 0x56c20000, 0x5cae0000, + 0x94541f3f, 0x3b3ef825, 0x1b365f3d, + 0xf3d45758, + 0xb4370060, 0x0c4c0000, 0x56c20000, + 0x5cae0000, 0x94541f3f, 0x3b3ef825, + 0x1b365f3d, 0xf3d45758, 0x5cb00110, + 
0x913e0000, 0x44190000, 0x888c0000, + 0x66dc7418, 0x921f1d66, 0x55ceea25, + 0x925c44e9, + 0x0c720000, 0x49e50f00, 0x42790000, + 0x5cea0000, 0x33aa301a, 0x15822514, + 0x95a34b7b, 0xb44b0090, 0xfe220000, + 0xa7580500, 0x25d10000, 0xf7600000, + 0x893178da, 0x1fd4f860, 0x4ed0a315, + 0xa123ff9f, + 0xfe220000, 0xa7580500, 0x25d10000, + 0xf7600000, 0x893178da, 0x1fd4f860, + 0x4ed0a315, 0xa123ff9f, 0xf2500000, + 0xeebd0a00, 0x67a80000, 0xab8a0000, + 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, + 0x1568ff0f, + 0x45180000, 0xa5b51700, 0xf96a0000, + 0x3b480000, 0x1ecc142c, 0x231395d6, + 0x16bca6b0, 0xdf33f4df, 0xb83d0000, + 0x16710600, 0x379a0000, 0xf5b10000, + 0x228161ac, 0xae48f145, 0x66241616, + 0xc5c1eb3e, + 0xb83d0000, 0x16710600, 0x379a0000, + 0xf5b10000, 0x228161ac, 0xae48f145, + 0x66241616, 0xc5c1eb3e, 0xfd250000, + 0xb3c41100, 0xcef00000, 0xcef90000, + 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, + 0x1af21fe1, + 0x75a40000, 0xc28b2700, 0x94a40000, + 0x90f50000, 0xfb7857e0, 0x49ce0bae, + 0x1767c483, 0xaedf667e, 0xd1660000, + 0x1bbc0300, 0x9eec0000, 0xf6940000, + 0x03024527, 0xcf70fcf2, 0xb4431b17, + 0x857f3c2b, + 0xd1660000, 0x1bbc0300, 0x9eec0000, + 0xf6940000, 0x03024527, 0xcf70fcf2, + 0xb4431b17, 0x857f3c2b, 0xa4c20000, + 0xd9372400, 0x0a480000, 0x66610000, + 0xf87a12c7, 0x86bef75c, 0xa324df94, + 0x2ba05a55, + 0x75c90003, 0x0e10c000, 0xd1200000, + 0xbaea0000, 0x8bc42f3e, 0x8758b757, + 0xbb28761d, 0x00b72e2b, 0xeecf0001, + 0x6f564000, 0xf33e0000, 0xa79e0000, + 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, + 0xfeabf254, + 0xeecf0001, 0x6f564000, 0xf33e0000, + 0xa79e0000, 0xbdb57219, 0xb711ebc5, + 0x4a3b40ba, 0xfeabf254, 0x9b060002, + 0x61468000, 0x221e0000, 0x1d740000, + 0x36715d27, 0x30495c92, 0xf11336a7, + 0xfe1cdc7f, + 0x86790000, 0x3f390002, 0xe19ae000, + 0x98560000, 0x9565670e, 0x4e88c8ea, + 0xd3dd4944, 0x161ddab9, 0x30b70000, + 0xe5d00000, 0xf4f46000, 0x42c40000, + 0x63b83d6a, 0x78ba9460, 0x21afa1ea, + 0xb0a51834, + 0x30b70000, 0xe5d00000, 0xf4f46000, + 0x42c40000, 0x63b83d6a, 0x78ba9460, 
+ 0x21afa1ea, 0xb0a51834, 0xb6ce0000, + 0xdae90002, 0x156e8000, 0xda920000, + 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, + 0xa6b8c28d, + 0x14190000, 0x23ca003c, 0x50df0000, + 0x44b60000, 0x1b6c67b0, 0x3cf3ac75, + 0x61e610b0, 0xdbcadb80, 0xe3430000, + 0x3a4e0014, 0xf2c60000, 0xaa4e0000, + 0xdb1e42a6, 0x256bbe15, 0x123db156, + 0x3a4e99d7, + 0xe3430000, 0x3a4e0014, 0xf2c60000, + 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, + 0x123db156, 0x3a4e99d7, 0xf75a0000, + 0x19840028, 0xa2190000, 0xeef80000, + 0xc0722516, 0x19981260, 0x73dba1e6, + 0xe1844257, + 0x54500000, 0x0671005c, 0x25ae0000, + 0x6a1e0000, 0x2ea54edf, 0x664e8512, + 0xbfba18c3, 0x7e715d17, 0xbc8d0000, + 0xfc3b0018, 0x19830000, 0xd10b0000, + 0xae1878c4, 0x42a69856, 0x0012da37, + 0x2c3b504e, + 0xbc8d0000, 0xfc3b0018, 0x19830000, + 0xd10b0000, 0xae1878c4, 0x42a69856, + 0x0012da37, 0x2c3b504e, 0xe8dd0000, + 0xfa4a0044, 0x3c2d0000, 0xbb150000, + 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, + 0x524a0d59, + 0x69510000, 0xd4e1009c, 0xc3230000, + 0xac2f0000, 0xe4950bae, 0xcea415dc, + 0x87ec287c, 0xbce1a3ce, 0xc6730000, + 0xaf8d000c, 0xa4c10000, 0x218d0000, + 0x23111587, 0x7913512f, 0x1d28ac88, + 0x378dd173, + 0xc6730000, 0xaf8d000c, 0xa4c10000, + 0x218d0000, 0x23111587, 0x7913512f, + 0x1d28ac88, 0x378dd173, 0xaf220000, + 0x7b6c0090, 0x67e20000, 0x8da20000, + 0xc7841e29, 0xb7b744f3, 0x9ac484f4, + 0x8b6c72bd, + 0xcc140000, 0xa5630000, 0x5ab90780, + 0x3b500000, 0x4bd013ff, 0x879b3418, + 0x694348c1, 0xca5a87fe, 0x819e0000, + 0xec570000, 0x66320280, 0x95f30000, + 0x5da92802, 0x48f43cbc, 0xe65aa22d, + 0x8e67b7fa, + 0x819e0000, 0xec570000, 0x66320280, + 0x95f30000, 0x5da92802, 0x48f43cbc, + 0xe65aa22d, 0x8e67b7fa, 0x4d8a0000, + 0x49340000, 0x3c8b0500, 0xaea30000, + 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, + 0x443d3004, + 0x78230000, 0x12fc0000, 0xa93a0b80, + 0x90a50000, 0x713e2879, 0x7ee98924, + 0xf08ca062, 0x636f8bab, 0x02af0000, + 0xb7280000, 0xba1c0300, 0x56980000, + 0xba8d45d3, 0x8048c667, 0xa95c149a, + 0xf4f6ea7b, + 0x02af0000, 0xb7280000, 
0xba1c0300, + 0x56980000, 0xba8d45d3, 0x8048c667, + 0xa95c149a, 0xf4f6ea7b, 0x7a8c0000, + 0xa5d40000, 0x13260880, 0xc63d0000, + 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, + 0x979961d0, + 0xac480000, 0x1ba60000, 0x45fb1380, + 0x03430000, 0x5a85316a, 0x1fb250b6, + 0xfe72c7fe, 0x91e478f6, 0x1e4e0000, + 0xdecf0000, 0x6df80180, 0x77240000, + 0xec47079e, 0xf4a0694e, 0xcda31812, + 0x98aa496e, + 0x1e4e0000, 0xdecf0000, 0x6df80180, + 0x77240000, 0xec47079e, 0xf4a0694e, + 0xcda31812, 0x98aa496e, 0xb2060000, + 0xc5690000, 0x28031200, 0x74670000, + 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, + 0x094e3198, + 0xaec30000, 0x9c4f0001, 0x79d1e000, + 0x2c150000, 0x45cc75b3, 0x6650b736, + 0xab92f78f, 0xa312567b, 0xdb250000, + 0x09290000, 0x49aac000, 0x81e10000, + 0xcafe6b59, 0x42793431, 0x43566b76, + 0xe86cba2e, + 0xdb250000, 0x09290000, 0x49aac000, + 0x81e10000, 0xcafe6b59, 0x42793431, + 0x43566b76, 0xe86cba2e, 0x75e60000, + 0x95660001, 0x307b2000, 0xadf40000, + 0x8f321eea, 0x24298307, 0xe8c49cf9, + 0x4b7eec55, + 0x58430000, 0x807e0000, 0x78330001, + 0xc66b3800, 0xe7375cdc, 0x79ad3fdd, + 0xac73fe6f, 0x3a4479b1, 0x1d5a0000, + 0x2b720000, 0x488d0000, 0xaf611800, + 0x25cb2ec5, 0xc879bfd0, 0x81a20429, + 0x1e7536a6, + 0x1d5a0000, 0x2b720000, 0x488d0000, + 0xaf611800, 0x25cb2ec5, 0xc879bfd0, + 0x81a20429, 0x1e7536a6, 0x45190000, + 0xab0c0000, 0x30be0001, 0x690a2000, + 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, + 0x24314f17, + 0xa53b0000, 0x14260000, 0x4e30001e, + 0x7cae0000, 0x8f9e0dd5, 0x78dfaa3d, + 0xf73168d8, 0x0b1b4946, 0x07ed0000, + 0xb2500000, 0x8774000a, 0x970d0000, + 0x437223ae, 0x48c76ea4, 0xf4786222, + 0x9075b1ce, + 0x07ed0000, 0xb2500000, 0x8774000a, + 0x970d0000, 0x437223ae, 0x48c76ea4, + 0xf4786222, 0x9075b1ce, 0xa2d60000, + 0xa6760000, 0xc9440014, 0xeba30000, + 0xccec2e7b, 0x3018c499, 0x03490afa, + 0x9b6ef888, + 0x88980000, 0x1f940000, 0x7fcf002e, + 0xfb4e0000, 0xf158079a, 0x61ae9167, + 0xa895706c, 0xe6107494, 0x0bc20000, + 0xdb630000, 0x7e88000c, 0x15860000, + 0x91fd48f3, 0x7581bb43, 0xf460449e, 
+ 0xd8b61463, + 0x0bc20000, 0xdb630000, 0x7e88000c, + 0x15860000, 0x91fd48f3, 0x7581bb43, + 0xf460449e, 0xd8b61463, 0x835a0000, + 0xc4f70000, 0x01470022, 0xeec80000, + 0x60a54f69, 0x142f2a24, 0x5cf534f2, + 0x3ea660f7, + 0x52500000, 0x29540000, 0x6a61004e, + 0xf0ff0000, 0x9a317eec, 0x452341ce, + 0xcf568fe5, 0x5303130f, 0x538d0000, + 0xa9fc0000, 0x9ef70006, 0x56ff0000, + 0x0ae4004e, 0x92c5cdf9, 0xa9444018, + 0x7f975691, + 0x538d0000, 0xa9fc0000, 0x9ef70006, + 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, + 0xa9444018, 0x7f975691, 0x01dd0000, + 0x80a80000, 0xf4960048, 0xa6000000, + 0x90d57ea2, 0xd7e68c37, 0x6612cffd, + 0x2c94459e, + 0xe6280000, 0x4c4b0000, 0xa8550000, + 0xd3d002e0, 0xd86130b8, 0x98a7b0da, + 0x289506b4, 0xd75a4897, 0xf0c50000, + 0x59230000, 0x45820000, 0xe18d00c0, + 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, + 0x56a7b19f, + 0xf0c50000, 0x59230000, 0x45820000, + 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, + 0xcbe0fe1c, 0x56a7b19f, 0x16ed0000, + 0x15680000, 0xedd70000, 0x325d0220, + 0xe30c3689, 0x5a4ae643, 0xe375f8a8, + 0x81fdf908, + 0xb4310000, 0x77330000, 0xb15d0000, + 0x7fd004e0, 0x78a26138, 0xd116c35d, + 0xd256d489, 0x4e6f74de, 0xe3060000, + 0xbdc10000, 0x87130000, 0xbff20060, + 0x2eba0a1a, 0x8db53751, 0x73c5ab06, + 0x5bd61539, + 0xe3060000, 0xbdc10000, 0x87130000, + 0xbff20060, 0x2eba0a1a, 0x8db53751, + 0x73c5ab06, 0x5bd61539, 0x57370000, + 0xcaf20000, 0x364e0000, 0xc0220480, + 0x56186b22, 0x5ca3f40c, 0xa1937f8f, + 0x15b961e7, + 0x02f20000, 0xa2810000, 0x873f0000, + 0xe36c7800, 0x1e1d74ef, 0x073d2bd6, + 0xc4c23237, 0x7f32259e, 0xbadd0000, + 0x13ad0000, 0xb7e70000, 0xf7282800, + 0xdf45144d, 0x361ac33a, 0xea5a8d14, + 0x2a2c18f0, + 0xbadd0000, 0x13ad0000, 0xb7e70000, + 0xf7282800, 0xdf45144d, 0x361ac33a, + 0xea5a8d14, 0x2a2c18f0, 0xb82f0000, + 0xb12c0000, 0x30d80000, 0x14445000, + 0xc15860a2, 0x3127e8ec, 0x2e98bf23, + 0x551e3d6e, + 0x1e6c0000, 0xc4420000, 0x8a2e0000, + 0xbcb6b800, 0x2c4413b6, 0x8bfdd3da, + 0x6a0c1bc8, 0xb99dc2eb, 0x92560000, + 0x1eda0000, 0xea510000, 
0xe8b13000, + 0xa93556a5, 0xebfb6199, 0xb15c2254, + 0x33c5244f, + 0x92560000, 0x1eda0000, 0xea510000, + 0xe8b13000, 0xa93556a5, 0xebfb6199, + 0xb15c2254, 0x33c5244f, 0x8c3a0000, + 0xda980000, 0x607f0000, 0x54078800, + 0x85714513, 0x6006b243, 0xdb50399c, + 0x8a58e6a4, + 0x033d0000, 0x08b30000, 0xf33a0000, + 0x3ac20007, 0x51298a50, 0x6b6e661f, + 0x0ea5cfe3, 0xe6da7ffe, 0xa8da0000, + 0x96be0000, 0x5c1d0000, 0x07da0002, + 0x7d669583, 0x1f98708a, 0xbb668808, + 0xda878000, + 0xa8da0000, 0x96be0000, 0x5c1d0000, + 0x07da0002, 0x7d669583, 0x1f98708a, + 0xbb668808, 0xda878000, 0xabe70000, + 0x9e0d0000, 0xaf270000, 0x3d180005, + 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, + 0x3c5dfffe, + 0x01930000, 0xe7820000, 0xedfb0000, + 0xcf0c000b, 0x8dd08d58, 0xbca3b42e, + 0x063661e1, 0x536f9e7b, 0x92280000, + 0xdc850000, 0x57fa0000, 0x56dc0003, + 0xbae92316, 0x5aefa30c, 0x90cef752, + 0x7b1675d7, + 0x92280000, 0xdc850000, 0x57fa0000, + 0x56dc0003, 0xbae92316, 0x5aefa30c, + 0x90cef752, 0x7b1675d7, 0x93bb0000, + 0x3b070000, 0xba010000, 0x99d00008, + 0x3739ae4e, 0xe64c1722, 0x96f896b3, + 0x2879ebac, + 0x5fa80000, 0x56030000, 0x43ae0000, + 0x64f30013, 0x257e86bf, 0x1311944e, + 0x541e95bf, 0x8ea4db69, 0x00440000, + 0x7f480000, 0xda7c0000, 0x2a230001, + 0x3badc9cc, 0xa9b69c87, 0x030a9e60, + 0xbe0a679e, + 0x00440000, 0x7f480000, 0xda7c0000, + 0x2a230001, 0x3badc9cc, 0xa9b69c87, + 0x030a9e60, 0xbe0a679e, 0x5fec0000, + 0x294b0000, 0x99d20000, 0x4ed00012, + 0x1ed34f73, 0xbaa708c9, 0x57140bdf, + 0x30aebcf7, + 0xee930000, 0xd6070000, 0x92c10000, + 0x2b9801e0, 0x9451287c, 0x3b6cfb57, + 0x45312374, 0x201f6a64, 0x7b280000, + 0x57420000, 0xa9e50000, 0x634300a0, + 0x9edb442f, 0x6d9995bb, 0x27f83b03, + 0xc7ff60f0, + 0x7b280000, 0x57420000, 0xa9e50000, + 0x634300a0, 0x9edb442f, 0x6d9995bb, + 0x27f83b03, 0xc7ff60f0, 0x95bb0000, + 0x81450000, 0x3b240000, 0x48db0140, + 0x0a8a6c53, 0x56f56eec, 0x62c91877, + 0xe7e00a94 }; -static __constant__ uint32_t d_T512[64][16] = { - { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), 
SPH_C32(0x5dae0000), - SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), - SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), - SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000), - SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984), - SPH_C32(0x9e69af68) }, - { SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000), - SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), - SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240), - SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000), - SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5), - SPH_C32(0x0c26f262) }, - { SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000), - SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78), - SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400), - SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000), - SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f), - SPH_C32(0xdc24e61f) }, - { SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), - SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), - SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800), - SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000), - SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f), - SPH_C32(0x3daac2da) }, - { SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000), - SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1), - SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800), - SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000), - SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da), - SPH_C32(0x78cace29) }, - { SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000), - SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), - SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400), - SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000), - SPH_C32(0xea982435), 
SPH_C32(0x75b11115), SPH_C32(0x28b67247), - SPH_C32(0x2dd1f9ab) }, - { SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000), - SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745), - SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00), - SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000), - SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f), - SPH_C32(0xbf2c0be2) }, - { SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000), - SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93), - SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000), - SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000), - SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36), - SPH_C32(0x32219526) }, - { SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000), - SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae), - SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001), - SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000), - SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f), - SPH_C32(0xac8e6c88) }, - { SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000), - SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), - SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004), - SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000), - SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96), - SPH_C32(0x7b1bd6b9) }, - { SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000), - SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba), - SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000), - SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000), - SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604), - SPH_C32(0xf746c320) }, - { SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), - SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), - SPH_C32(0x98369604), 
SPH_C32(0xf746c320), SPH_C32(0x231f0009), - SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000), - SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a), - SPH_C32(0x69505b3a) }, - { SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000), - SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25), - SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050), - SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000), - SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2), - SPH_C32(0x8a341574) }, - { SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000), - SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), - SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0), - SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000), - SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc), - SPH_C32(0x450360bf) }, - { SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000), - SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543), - SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060), - SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000), - SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d), - SPH_C32(0xf3d45758) }, - { SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), - SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), - SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110), - SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000), - SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25), - SPH_C32(0x925c44e9) }, - { SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000), - SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514), - SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000), - SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000), - SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315), - SPH_C32(0xa123ff9f) }, - { SPH_C32(0xfe220000), 
SPH_C32(0xa7580500), SPH_C32(0x25d10000), - SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860), - SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000), - SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000), - SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e), - SPH_C32(0x1568ff0f) }, - { SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000), - SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6), - SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000), - SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000), - SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616), - SPH_C32(0xc5c1eb3e) }, - { SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000), - SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145), - SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000), - SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000), - SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6), - SPH_C32(0x1af21fe1) }, - { SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000), - SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae), - SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000), - SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000), - SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17), - SPH_C32(0x857f3c2b) }, - { SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), - SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), - SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000), - SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000), - SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94), - SPH_C32(0x2ba05a55) }, - { SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000), - SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757), - SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001), - SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000), - 
SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba), - SPH_C32(0xfeabf254) }, - { SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000), - SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), - SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002), - SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000), - SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7), - SPH_C32(0xfe1cdc7f) }, - { SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000), - SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea), - SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000), - SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000), - SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea), - SPH_C32(0xb0a51834) }, - { SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), - SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), - SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000), - SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000), - SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae), - SPH_C32(0xa6b8c28d) }, - { SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000), - SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75), - SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000), - SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000), - SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156), - SPH_C32(0x3a4e99d7) }, - { SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), - SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), - SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000), - SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000), - SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6), - SPH_C32(0xe1844257) }, - { SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000), - SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512), - 
SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000), - SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000), - SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37), - SPH_C32(0x2c3b504e) }, - { SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000), - SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856), - SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000), - SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000), - SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4), - SPH_C32(0x524a0d59) }, - { SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000), - SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc), - SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000), - SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000), - SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88), - SPH_C32(0x378dd173) }, - { SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), - SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f), - SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000), - SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000), - SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4), - SPH_C32(0x8b6c72bd) }, - { SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780), - SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418), - SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000), - SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000), - SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d), - SPH_C32(0x8e67b7fa) }, - { SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280), - SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), - SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000), - SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000), - SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec), - SPH_C32(0x443d3004) }, - { 
SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80), - SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924), - SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000), - SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000), - SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a), - SPH_C32(0xf4f6ea7b) }, - { SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300), - SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), - SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000), - SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000), - SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8), - SPH_C32(0x979961d0) }, - { SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380), - SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6), - SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000), - SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000), - SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812), - SPH_C32(0x98aa496e) }, - { SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180), - SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), - SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000), - SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000), - SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec), - SPH_C32(0x094e3198) }, - { SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000), - SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736), - SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000), - SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000), - SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76), - SPH_C32(0xe86cba2e) }, - { SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000), - SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431), - SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000), - SPH_C32(0x95660001), SPH_C32(0x307b2000), 
SPH_C32(0xadf40000), - SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9), - SPH_C32(0x4b7eec55) }, - { SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001), - SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd), - SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000), - SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800), - SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429), - SPH_C32(0x1e7536a6) }, - { SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000), - SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), - SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000), - SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000), - SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46), - SPH_C32(0x24314f17) }, - { SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e), - SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d), - SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000), - SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000), - SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222), - SPH_C32(0x9075b1ce) }, - { SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a), - SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), - SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000), - SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000), - SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa), - SPH_C32(0x9b6ef888) }, - { SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e), - SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167), - SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000), - SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000), - SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e), - SPH_C32(0xd8b61463) }, - { SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c), - SPH_C32(0x15860000), SPH_C32(0x91fd48f3), 
SPH_C32(0x7581bb43), - SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000), - SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000), - SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2), - SPH_C32(0x3ea660f7) }, - { SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e), - SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce), - SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000), - SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000), - SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018), - SPH_C32(0x7f975691) }, - { SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), - SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), - SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000), - SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000), - SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd), - SPH_C32(0x2c94459e) }, - { SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000), - SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da), - SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000), - SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0), - SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c), - SPH_C32(0x56a7b19f) }, - { SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000), - SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), - SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000), - SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220), - SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8), - SPH_C32(0x81fdf908) }, - { SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000), - SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d), - SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000), - SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060), - SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06), - SPH_C32(0x5bd61539) 
}, - { SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000), - SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), - SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000), - SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480), - SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f), - SPH_C32(0x15b961e7) }, - { SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000), - SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6), - SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000), - SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800), - SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14), - SPH_C32(0x2a2c18f0) }, - { SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), - SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), - SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000), - SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000), - SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23), - SPH_C32(0x551e3d6e) }, - { SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000), - SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da), - SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000), - SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000), - SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254), - SPH_C32(0x33c5244f) }, - { SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000), - SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), - SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000), - SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800), - SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c), - SPH_C32(0x8a58e6a4) }, - { SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000), - SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f), - SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000), - SPH_C32(0x96be0000), 
SPH_C32(0x5c1d0000), SPH_C32(0x07da0002), - SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808), - SPH_C32(0xda878000) }, - { SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), - SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a), - SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000), - SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005), - SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb), - SPH_C32(0x3c5dfffe) }, - { SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000), - SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e), - SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000), - SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003), - SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752), - SPH_C32(0x7b1675d7) }, - { SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000), - SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), - SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000), - SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008), - SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3), - SPH_C32(0x2879ebac) }, - { SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000), - SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e), - SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000), - SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001), - SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60), - SPH_C32(0xbe0a679e) }, - { SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000), - SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), - SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000), - SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012), - SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf), - SPH_C32(0x30aebcf7) }, - { SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000), - SPH_C32(0x2b9801e0), 
SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57), - SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000), - SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0), - SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03), - SPH_C32(0xc7ff60f0) }, - { SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000), - SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), - SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000), - SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140), - SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877), - SPH_C32(0xe7e00a94) } -}; - -#define hamsi_s00 m0 -#define hamsi_s01 m1 -#define hamsi_s02 c0 -#define hamsi_s03 c1 -#define hamsi_s04 m2 -#define hamsi_s05 m3 -#define hamsi_s06 c2 -#define hamsi_s07 c3 -#define hamsi_s08 c4 -#define hamsi_s09 c5 -#define hamsi_s0A m4 -#define hamsi_s0B m5 -#define hamsi_s0C c6 -#define hamsi_s0D c7 -#define hamsi_s0E m6 -#define hamsi_s0F m7 -#define hamsi_s10 m8 -#define hamsi_s11 m9 -#define hamsi_s12 c8 -#define hamsi_s13 c9 -#define hamsi_s14 mA -#define hamsi_s15 mB -#define hamsi_s16 cA -#define hamsi_s17 cB -#define hamsi_s18 cC -#define hamsi_s19 cD -#define hamsi_s1A mC -#define hamsi_s1B mD -#define hamsi_s1C cE -#define hamsi_s1D cF -#define hamsi_s1E mE -#define hamsi_s1F mF #define SBOX(a, b, c, d) { \ uint32_t t; \ @@ -496,58 +435,58 @@ static __constant__ uint32_t d_T512[64][16] = { } #define ROUND_BIG(rc, alpha) { \ - hamsi_s00 ^= alpha[0x00]; \ - hamsi_s08 ^= alpha[0x08]; \ - hamsi_s10 ^= alpha[0x10]; \ - hamsi_s18 ^= alpha[0x18]; \ - hamsi_s01 ^= alpha[0x01] ^ (uint32_t)(rc); \ - hamsi_s09 ^= alpha[0x09]; \ - hamsi_s11 ^= alpha[0x11]; \ - hamsi_s19 ^= alpha[0x19]; \ - hamsi_s02 ^= alpha[0x02]; \ - hamsi_s0A ^= alpha[0x0A]; \ - hamsi_s12 ^= alpha[0x12]; \ - hamsi_s1A ^= alpha[0x1A]; \ - hamsi_s03 ^= alpha[0x03]; \ - hamsi_s0B ^= alpha[0x0B]; \ - hamsi_s13 ^= alpha[0x13]; \ - hamsi_s1B ^= alpha[0x1B]; \ - hamsi_s04 ^= alpha[0x04]; \ - 
hamsi_s0C ^= alpha[0x0C]; \ - hamsi_s14 ^= alpha[0x14]; \ - hamsi_s1C ^= alpha[0x1C]; \ - hamsi_s05 ^= alpha[0x05]; \ - hamsi_s0D ^= alpha[0x0D]; \ - hamsi_s15 ^= alpha[0x15]; \ - hamsi_s1D ^= alpha[0x1D]; \ - hamsi_s06 ^= alpha[0x06]; \ - hamsi_s0E ^= alpha[0x0E]; \ - hamsi_s16 ^= alpha[0x16]; \ - hamsi_s1E ^= alpha[0x1E]; \ - hamsi_s07 ^= alpha[0x07]; \ - hamsi_s0F ^= alpha[0x0F]; \ - hamsi_s17 ^= alpha[0x17]; \ - hamsi_s1F ^= alpha[0x1F]; \ - SBOX(hamsi_s00, hamsi_s08, hamsi_s10, hamsi_s18); \ - SBOX(hamsi_s01, hamsi_s09, hamsi_s11, hamsi_s19); \ - SBOX(hamsi_s02, hamsi_s0A, hamsi_s12, hamsi_s1A); \ - SBOX(hamsi_s03, hamsi_s0B, hamsi_s13, hamsi_s1B); \ - SBOX(hamsi_s04, hamsi_s0C, hamsi_s14, hamsi_s1C); \ - SBOX(hamsi_s05, hamsi_s0D, hamsi_s15, hamsi_s1D); \ - SBOX(hamsi_s06, hamsi_s0E, hamsi_s16, hamsi_s1E); \ - SBOX(hamsi_s07, hamsi_s0F, hamsi_s17, hamsi_s1F); \ - HAMSI_L(hamsi_s00, hamsi_s09, hamsi_s12, hamsi_s1B); \ - HAMSI_L(hamsi_s01, hamsi_s0A, hamsi_s13, hamsi_s1C); \ - HAMSI_L(hamsi_s02, hamsi_s0B, hamsi_s14, hamsi_s1D); \ - HAMSI_L(hamsi_s03, hamsi_s0C, hamsi_s15, hamsi_s1E); \ - HAMSI_L(hamsi_s04, hamsi_s0D, hamsi_s16, hamsi_s1F); \ - HAMSI_L(hamsi_s05, hamsi_s0E, hamsi_s17, hamsi_s18); \ - HAMSI_L(hamsi_s06, hamsi_s0F, hamsi_s10, hamsi_s19); \ - HAMSI_L(hamsi_s07, hamsi_s08, hamsi_s11, hamsi_s1A); \ - HAMSI_L(hamsi_s00, hamsi_s02, hamsi_s05, hamsi_s07); \ - HAMSI_L(hamsi_s10, hamsi_s13, hamsi_s15, hamsi_s16); \ - HAMSI_L(hamsi_s09, hamsi_s0B, hamsi_s0C, hamsi_s0E); \ - HAMSI_L(hamsi_s19, hamsi_s1A, hamsi_s1C, hamsi_s1F); \ + m0 ^= alpha[0x00]; \ + c4 ^= alpha[0x08]; \ + m8 ^= alpha[0x10]; \ + cC ^= alpha[0x18]; \ + m1 ^= alpha[0x01] ^ rc; \ + c5 ^= alpha[0x09]; \ + m9 ^= alpha[0x11]; \ + cD ^= alpha[0x19]; \ + c0 ^= alpha[0x02]; \ + m4 ^= alpha[0x0A]; \ + c8 ^= alpha[0x12]; \ + mC ^= alpha[0x1A]; \ + c1 ^= alpha[0x03]; \ + m5 ^= alpha[0x0B]; \ + c9 ^= alpha[0x13]; \ + mD ^= alpha[0x1B]; \ + m2 ^= alpha[0x04]; \ + c6 ^= alpha[0x0C]; \ + mA ^= 
alpha[0x14]; \ + cE ^= alpha[0x1C]; \ + m3 ^= alpha[0x05]; \ + c7 ^= alpha[0x0D]; \ + mB ^= alpha[0x15]; \ + cF ^= alpha[0x1D]; \ + c2 ^= alpha[0x06]; \ + m6 ^= alpha[0x0E]; \ + cA ^= alpha[0x16]; \ + mE ^= alpha[0x1E]; \ + c3 ^= alpha[0x07]; \ + m7 ^= alpha[0x0F]; \ + cB ^= alpha[0x17]; \ + mF ^= alpha[0x1F]; \ + SBOX(m0, c4, m8, cC); \ + SBOX(m1, c5, m9, cD); \ + SBOX(c0, m4, c8, mC); \ + SBOX(c1, m5, c9, mD); \ + SBOX(m2, c6, mA, cE); \ + SBOX(m3, c7, mB, cF); \ + SBOX(c2, m6, cA, mE); \ + SBOX(c3, m7, cB, mF); \ + HAMSI_L(m0, c5, c8, mD); \ + HAMSI_L(m1, m4, c9, cE); \ + HAMSI_L(c0, m5, mA, cF); \ + HAMSI_L(c1, c6, mB, mE); \ + HAMSI_L(m2, c7, cA, mF); \ + HAMSI_L(m3, m6, cB, cC); \ + HAMSI_L(c2, m7, m8, cD); \ + HAMSI_L(c3, c4, m9, mC); \ + HAMSI_L(m0, c0, m3, c3); \ + HAMSI_L(m8, c9, mB, cA); \ + HAMSI_L(c5, m5, c6, m6); \ + HAMSI_L(cD, mC, cE, mF); \ } @@ -563,69 +502,100 @@ static __constant__ uint32_t d_T512[64][16] = { #define T_BIG { \ /* order is important */ \ - cF = (h[0xF] ^= hamsi_s17); \ - cE = (h[0xE] ^= hamsi_s16); \ - cD = (h[0xD] ^= hamsi_s15); \ - cC = (h[0xC] ^= hamsi_s14); \ - cB = (h[0xB] ^= hamsi_s13); \ - cA = (h[0xA] ^= hamsi_s12); \ - c9 = (h[0x9] ^= hamsi_s11); \ - c8 = (h[0x8] ^= hamsi_s10); \ - c7 = (h[0x7] ^= hamsi_s07); \ - c6 = (h[0x6] ^= hamsi_s06); \ - c5 = (h[0x5] ^= hamsi_s05); \ - c4 = (h[0x4] ^= hamsi_s04); \ - c3 = (h[0x3] ^= hamsi_s03); \ - c2 = (h[0x2] ^= hamsi_s02); \ - c1 = (h[0x1] ^= hamsi_s01); \ - c0 = (h[0x0] ^= hamsi_s00); \ + cF = (h[0xF] ^= cB); \ + cE = (h[0xE] ^= cA); \ + cD = (h[0xD] ^= mB); \ + cC = (h[0xC] ^= mA); \ + cB = (h[0xB] ^= c9); \ + cA = (h[0xA] ^= c8); \ + c9 = (h[0x9] ^= m9); \ + c8 = (h[0x8] ^= m8); \ + c7 = (h[0x7] ^= c3); \ + c6 = (h[0x6] ^= c2); \ + c5 = (h[0x5] ^= m3); \ + c4 = (h[0x4] ^= m2); \ + c3 = (h[0x3] ^= c1); \ + c2 = (h[0x2] ^= c0); \ + c1 = (h[0x1] ^= m1); \ + c0 = (h[0x0] ^= m0); \ } -__global__ __launch_bounds__(512,2) -void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t 
startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ __launch_bounds__(256,4) +void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash ) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; - unsigned char *h1 = (unsigned char *)Hash; - - uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265); - uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c); - uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48); - uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d); + const uint32_t d_alpha_n[32] = + { + 0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, + 0xff00aaaa, 0xccccaaaa, 0xf0f0ff00, + 0xaaaacccc, 0xf0f0ff00, 0xf0f0cccc, + 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, + 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, + 0xff00aaaa, 0xccccaaaa, 0xff00f0f0, + 0xff00aaaa, 0xf0f0cccc, 0xf0f0ff00, + 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, + 0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, + 0xccccff00, 0xff00cccc, 0xaaaaf0f0, + 0xff00aaaa, 0xccccf0f0 + }; + + + const uint32_t d_alpha_f[32] = { + 0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, + 0xcaf9f9c0, 0x0ff0f9c0, 0x639ccaf9, + 0xf9c00ff0, 0x639ccaf9, 0x639c0ff0, + 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, + 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, + 0xcaf9f9c0, 0x0ff0f9c0, 0xcaf9639c, + 0xcaf9f9c0, 0x639c0ff0, 0x639ccaf9, + 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, + 0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, + 0x0ff0caf9, 0xcaf90ff0, 0xf9c0639c, + 0xcaf9f9c0, 0x0ff0639c + }; + + const uint32_t nounce = 
(startNounce + thread); + + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = &g_hash[hashPosition*16]; + uint8_t *h1 = (uint8_t *)Hash; + + uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265; + uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c; + uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48; + uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d; uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF; uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; uint32_t *tp, db, dm; - for(int i = 0; i < 64; i += 8) { - - m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; + #pragma unroll 1 + for(int i = 0; i < 64; i += 8) + { + tp = &d_T512[0]; + m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; - tp = &d_T512[0][0]; -#pragma unroll 2 - for (int u = 0; u < 8; u ++) { + for (int u = 0; u < 8; u++) + { db = h1[i+u]; -#pragma unroll 2 - for (int v = 0; v < 8; v ++, db >>= 1) { - dm = -(uint32_t)(db & 1); - m0 ^= dm & *(tp+ 0); m1 ^= dm & *(tp+ 1); - m2 ^= dm & *(tp+ 2); m3 ^= dm & *(tp+ 3); - m4 ^= dm & *(tp+ 4); m5 ^= dm & *(tp+ 5); - m6 ^= dm & *(tp+ 6); m7 ^= dm & *(tp+ 7); - m8 ^= dm & *(tp+ 8); m9 ^= dm & *(tp+ 9); - mA ^= dm & *(tp+10); mB ^= dm & *(tp+11); - mC ^= dm & *(tp+12); mD ^= dm & *(tp+13); - mE ^= dm & *(tp+14); mF ^= dm & *(tp+15); - tp += 16; + for (int v = 0; v < 8; v++, db >>= 1, tp += 16) + { + dm = -(db & 1); + m0 ^= dm & tp[0]; m1 ^= dm & tp[1]; + m2 ^= dm & tp[2]; m3 ^= dm & tp[3]; + m4 ^= dm & tp[4]; m5 ^= dm & tp[5]; + m6 ^= dm & tp[6]; m7 ^= dm & tp[7]; + m8 ^= dm & tp[8]; m9 ^= dm & tp[9]; + mA ^= dm & tp[10]; mB ^= dm & tp[11]; + mC ^= dm & tp[12]; mD ^= dm & tp[13]; + mE ^= dm & tp[14]; mF ^= dm & tp[15]; } } - for( int r = 0; r < 6; r += 2 ) { + for (int r = 0; r < 6; r += 2) 
+ { ROUND_BIG(r, d_alpha_n); ROUND_BIG(r+1, d_alpha_n); } @@ -633,38 +603,251 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * } - tp = &d_T512[0][0] + 112; - - m0 = *(tp+ 0); m1 = *(tp+ 1); - m2 = *(tp+ 2); m3 = *(tp+ 3); - m4 = *(tp+ 4); m5 = *(tp+ 5); - m6 = *(tp+ 6); m7 = *(tp+ 7); - m8 = *(tp+ 8); m9 = *(tp+ 9); - mA = *(tp+10); mB = *(tp+11); - mC = *(tp+12); mD = *(tp+13); - mE = *(tp+14); mF = *(tp+15); - - for( int r = 0; r < 6; r += 2 ) { - ROUND_BIG(r, d_alpha_n); - ROUND_BIG(r+1, d_alpha_n); - } + tp = &d_T512[0] + 112; + + m0 = tp[ 0]; m1 = tp[ 1]; + m2 = tp[ 2]; m3 = tp[ 3]; + m4 = tp[ 4]; m5 = tp[ 5]; + m6 = tp[ 6]; m7 = tp[ 7]; + m8 = tp[ 8]; m9 = tp[ 9]; + mA = tp[10]; mB = tp[11]; + mC = tp[12]; mD = tp[13]; + mE = tp[14]; mF = tp[15]; + + for (int r = 0; r < 6; r += 2) + { + // ROUND_BIG(r, d_alpha_n); + m0 ^= d_alpha_n[0x00]; \ + c4 ^= d_alpha_n[0x08]; \ + m8 ^= d_alpha_n[0x10]; \ + cC ^= d_alpha_n[0x18]; \ + m1 ^= d_alpha_n[0x01] ^ r; \ + c5 ^= d_alpha_n[0x09]; \ + m9 ^= d_alpha_n[0x11]; \ + cD ^= d_alpha_n[0x19]; \ + c0 ^= d_alpha_n[0x02]; \ + m4 ^= d_alpha_n[0x0A]; \ + c8 ^= d_alpha_n[0x12]; \ + mC ^= d_alpha_n[0x1A]; \ + c1 ^= d_alpha_n[0x03]; \ + m5 ^= d_alpha_n[0x0B]; \ + c9 ^= d_alpha_n[0x13]; \ + mD ^= d_alpha_n[0x1B]; \ + m2 ^= d_alpha_n[0x04]; \ + c6 ^= d_alpha_n[0x0C]; \ + mA ^= d_alpha_n[0x14]; \ + cE ^= d_alpha_n[0x1C]; \ + m3 ^= d_alpha_n[0x05]; \ + c7 ^= d_alpha_n[0x0D]; \ + mB ^= d_alpha_n[0x15]; \ + cF ^= d_alpha_n[0x1D]; \ + c2 ^= d_alpha_n[0x06]; \ + m6 ^= d_alpha_n[0x0E]; \ + cA ^= d_alpha_n[0x16]; \ + mE ^= d_alpha_n[0x1E]; \ + c3 ^= d_alpha_n[0x07]; \ + m7 ^= d_alpha_n[0x0F]; \ + cB ^= d_alpha_n[0x17]; \ + mF ^= d_alpha_n[0x1F]; \ + SBOX(m0, c4, m8, cC); \ + SBOX(m1, c5, m9, cD); \ + SBOX(c0, m4, c8, mC); \ + SBOX(c1, m5, c9, mD); \ + SBOX(m2, c6, mA, cE); \ + SBOX(m3, c7, mB, cF); \ + SBOX(c2, m6, cA, mE); \ + SBOX(c3, m7, cB, mF); \ + HAMSI_L(m0, c5, c8, mD); \ + HAMSI_L(m1, m4, c9, cE); 
\ + HAMSI_L(c0, m5, mA, cF); \ + HAMSI_L(c1, c6, mB, mE); \ + HAMSI_L(m2, c7, cA, mF); \ + HAMSI_L(m3, m6, cB, cC); \ + HAMSI_L(c2, m7, m8, cD); \ + HAMSI_L(c3, c4, m9, mC); \ + HAMSI_L(m0, c0, m3, c3); \ + HAMSI_L(m8, c9, mB, cA); \ + HAMSI_L(c5, m5, c6, m6); \ + HAMSI_L(cD, mC, cE, mF); \ + + // ROUND_BIG(r+1, d_alpha_n); + m0 ^= d_alpha_n[0x00]; \ + c4 ^= d_alpha_n[0x08]; \ + m8 ^= d_alpha_n[0x10]; \ + cC ^= d_alpha_n[0x18]; \ + m1 ^= d_alpha_n[0x01] ^ (r+1); \ + c5 ^= d_alpha_n[0x09]; \ + m9 ^= d_alpha_n[0x11]; \ + cD ^= d_alpha_n[0x19]; \ + c0 ^= d_alpha_n[0x02]; \ + m4 ^= d_alpha_n[0x0A]; \ + c8 ^= d_alpha_n[0x12]; \ + mC ^= d_alpha_n[0x1A]; \ + c1 ^= d_alpha_n[0x03]; \ + m5 ^= d_alpha_n[0x0B]; \ + c9 ^= d_alpha_n[0x13]; \ + mD ^= d_alpha_n[0x1B]; \ + m2 ^= d_alpha_n[0x04]; \ + c6 ^= d_alpha_n[0x0C]; \ + mA ^= d_alpha_n[0x14]; \ + cE ^= d_alpha_n[0x1C]; \ + m3 ^= d_alpha_n[0x05]; \ + c7 ^= d_alpha_n[0x0D]; \ + mB ^= d_alpha_n[0x15]; \ + cF ^= d_alpha_n[0x1D]; \ + c2 ^= d_alpha_n[0x06]; \ + m6 ^= d_alpha_n[0x0E]; \ + cA ^= d_alpha_n[0x16]; \ + mE ^= d_alpha_n[0x1E]; \ + c3 ^= d_alpha_n[0x07]; \ + m7 ^= d_alpha_n[0x0F]; \ + cB ^= d_alpha_n[0x17]; \ + mF ^= d_alpha_n[0x1F]; \ + SBOX(m0, c4, m8, cC); \ + SBOX(m1, c5, m9, cD); \ + SBOX(c0, m4, c8, mC); \ + SBOX(c1, m5, c9, mD); \ + SBOX(m2, c6, mA, cE); \ + SBOX(m3, c7, mB, cF); \ + SBOX(c2, m6, cA, mE); \ + SBOX(c3, m7, cB, mF); \ + HAMSI_L(m0, c5, c8, mD); \ + HAMSI_L(m1, m4, c9, cE); \ + HAMSI_L(c0, m5, mA, cF); \ + HAMSI_L(c1, c6, mB, mE); \ + HAMSI_L(m2, c7, cA, mF); \ + HAMSI_L(m3, m6, cB, cC); \ + HAMSI_L(c2, m7, m8, cD); \ + HAMSI_L(c3, c4, m9, mC); \ + HAMSI_L(m0, c0, m3, c3); \ + HAMSI_L(m8, c9, mB, cA); \ + HAMSI_L(c5, m5, c6, m6); \ + HAMSI_L(cD, mC, cE, mF); \ + } T_BIG; - tp = &d_T512[0][0] + 784; - - m0 = *(tp+ 0); m1 = *(tp+ 1); - m2 = *(tp+ 2); m3 = *(tp+ 3); - m4 = *(tp+ 4); m5 = *(tp+ 5); - m6 = *(tp+ 6); m7 = *(tp+ 7); - m8 = *(tp+ 8); m9 = *(tp+ 9); - mA = *(tp+10); mB = *(tp+11); - mC = 
*(tp+12); mD = *(tp+13); - mE = *(tp+14); mF = *(tp+15); - - for( int r = 0; r < 12; r += 2 ) { - ROUND_BIG(r, d_alpha_f); - ROUND_BIG(r+1, d_alpha_f); - } + tp = &d_T512[0] + 784; + + m0 = tp[ 0]; m1 = tp[ 1]; + m2 = tp[ 2]; m3 = tp[ 3]; + m4 = tp[ 4]; m5 = tp[ 5]; + m6 = tp[ 6]; m7 = tp[ 7]; + m8 = tp[ 8]; m9 = tp[ 9]; + mA = tp[10]; mB = tp[11]; + mC = tp[12]; mD = tp[13]; + mE = tp[14]; mF = tp[15]; + +#pragma unroll 1 + for( int r = 0; r < 12; r += 2 ) + { + // ROUND_BIG(r, d_alpha_f); + m0 ^= d_alpha_f[0x00]; \ + c4 ^= d_alpha_f[0x08]; \ + m8 ^= d_alpha_f[0x10]; \ + cC ^= d_alpha_f[0x18]; \ + m1 ^= d_alpha_f[0x01] ^ r; \ + c5 ^= d_alpha_f[0x09]; \ + m9 ^= d_alpha_f[0x11]; \ + cD ^= d_alpha_f[0x19]; \ + c0 ^= d_alpha_f[0x02]; \ + m4 ^= d_alpha_f[0x0A]; \ + c8 ^= d_alpha_f[0x12]; \ + mC ^= d_alpha_f[0x1A]; \ + c1 ^= d_alpha_f[0x03]; \ + m5 ^= d_alpha_f[0x0B]; \ + c9 ^= d_alpha_f[0x13]; \ + mD ^= d_alpha_f[0x1B]; \ + m2 ^= d_alpha_f[0x04]; \ + c6 ^= d_alpha_f[0x0C]; \ + mA ^= d_alpha_f[0x14]; \ + cE ^= d_alpha_f[0x1C]; \ + m3 ^= d_alpha_f[0x05]; \ + c7 ^= d_alpha_f[0x0D]; \ + mB ^= d_alpha_f[0x15]; \ + cF ^= d_alpha_f[0x1D]; \ + c2 ^= d_alpha_f[0x06]; \ + m6 ^= d_alpha_f[0x0E]; \ + cA ^= d_alpha_f[0x16]; \ + mE ^= d_alpha_f[0x1E]; \ + c3 ^= d_alpha_f[0x07]; \ + m7 ^= d_alpha_f[0x0F]; \ + cB ^= d_alpha_f[0x17]; \ + mF ^= d_alpha_f[0x1F]; \ + SBOX(m0, c4, m8, cC); \ + SBOX(m1, c5, m9, cD); \ + SBOX(c0, m4, c8, mC); \ + SBOX(c1, m5, c9, mD); \ + SBOX(m2, c6, mA, cE); \ + SBOX(m3, c7, mB, cF); \ + SBOX(c2, m6, cA, mE); \ + SBOX(c3, m7, cB, mF); \ + HAMSI_L(m0, c5, c8, mD); \ + HAMSI_L(m1, m4, c9, cE); \ + HAMSI_L(c0, m5, mA, cF); \ + HAMSI_L(c1, c6, mB, mE); \ + HAMSI_L(m2, c7, cA, mF); \ + HAMSI_L(m3, m6, cB, cC); \ + HAMSI_L(c2, m7, m8, cD); \ + HAMSI_L(c3, c4, m9, mC); \ + HAMSI_L(m0, c0, m3, c3); \ + HAMSI_L(m8, c9, mB, cA); \ + HAMSI_L(c5, m5, c6, m6); \ + HAMSI_L(cD, mC, cE, mF); \ + + // ROUND_BIG(r+1, d_alpha_n); + m0 ^= d_alpha_f[0x00]; \ + c4 ^= 
d_alpha_f[0x08]; \ + m8 ^= d_alpha_f[0x10]; \ + cC ^= d_alpha_f[0x18]; \ + m1 ^= d_alpha_f[0x01] ^ (r + 1); \ + c5 ^= d_alpha_f[0x09]; \ + m9 ^= d_alpha_f[0x11]; \ + cD ^= d_alpha_f[0x19]; \ + c0 ^= d_alpha_f[0x02]; \ + m4 ^= d_alpha_f[0x0A]; \ + c8 ^= d_alpha_f[0x12]; \ + mC ^= d_alpha_f[0x1A]; \ + c1 ^= d_alpha_f[0x03]; \ + m5 ^= d_alpha_f[0x0B]; \ + c9 ^= d_alpha_f[0x13]; \ + mD ^= d_alpha_f[0x1B]; \ + m2 ^= d_alpha_f[0x04]; \ + c6 ^= d_alpha_f[0x0C]; \ + mA ^= d_alpha_f[0x14]; \ + cE ^= d_alpha_f[0x1C]; \ + m3 ^= d_alpha_f[0x05]; \ + c7 ^= d_alpha_f[0x0D]; \ + mB ^= d_alpha_f[0x15]; \ + cF ^= d_alpha_f[0x1D]; \ + c2 ^= d_alpha_f[0x06]; \ + m6 ^= d_alpha_f[0x0E]; \ + cA ^= d_alpha_f[0x16]; \ + mE ^= d_alpha_f[0x1E]; \ + c3 ^= d_alpha_f[0x07]; \ + m7 ^= d_alpha_f[0x0F]; \ + cB ^= d_alpha_f[0x17]; \ + mF ^= d_alpha_f[0x1F]; \ + SBOX(m0, c4, m8, cC); \ + SBOX(m1, c5, m9, cD); \ + SBOX(c0, m4, c8, mC); \ + SBOX(c1, m5, c9, mD); \ + SBOX(m2, c6, mA, cE); \ + SBOX(m3, c7, mB, cF); \ + SBOX(c2, m6, cA, mE); \ + SBOX(c3, m7, cB, mF); \ + HAMSI_L(m0, c5, c8, mD); \ + HAMSI_L(m1, m4, c9, cE); \ + HAMSI_L(c0, m5, mA, cF); \ + HAMSI_L(c1, c6, mB, mE); \ + HAMSI_L(m2, c7, cA, mF); \ + HAMSI_L(m3, m6, cB, cC); \ + HAMSI_L(c2, m7, m8, cD); \ + HAMSI_L(c3, c4, m9, mC); \ + HAMSI_L(m0, c0, m3, c3); \ + HAMSI_L(m8, c9, mB, cA); \ + HAMSI_L(c5, m5, c6, m6); \ + HAMSI_L(cD, mC, cE, mF); \ + } T_BIG; #pragma unroll 16 @@ -678,12 +861,12 @@ __host__ void x13_hamsi512_cpu_init(int thr_id, uint32_t threads) { } -__host__ void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x13_hamsi512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + 
x13_hamsi512_gpu_hash_64<<>>(threads, startNounce, d_hash); } \ No newline at end of file diff --git a/x13/x13.cu b/x13/x13.cu index 2953306de4..1dd3ee23b6 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -23,57 +23,57 @@ extern "C" #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - - -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t 
*d_nonceVector, uint32_t *d_hash, int order); -//extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +//extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +//extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); -extern void 
x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); -extern uint32_t x13_fugue512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -//extern uint32_t cuda_check_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t *result); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +//extern uint32_t cuda_check_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); -extern void x13_fugue512_cpu_setTarget(const void *ptarget); +extern void x13_fugue512_cpu_setTarget(int thr_id, const void *ptarget); extern void x13_fugue512_cpu_free(int32_t thr_id); //extern void cuda_check_cpu_free(int32_t thr_id); extern void x11_simd512_cpu_free(int32_t thr_id); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse, int order); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, uint32_t *nrmTrue, 
uint32_t *d_noncesFalse, uint32_t *nrmFalse); // X13 Hashfunktion -extern "C" void x13hash(void *output, const void *input) +void x13hash(void *output, const void *input) { // blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13 @@ -149,93 +149,127 @@ extern "C" void x13hash(void *output, const void *input) memcpy(output, hash, 32); } -extern "C" int scanhash_x13(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_x13(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + static THREAD uint32_t *h_found = nullptr; + const uint32_t first_nonce = pdata[19]; - static bool init[MAX_GPUS] = { 0 }; uint32_t endiandata[20]; - unsigned int intensity = 19; // (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << intensity); // 19=256*256*8; - - throughput = min(throughput, (max_nonce - first_nonce)); + int intensity = (device_sm[device_map[thr_id]] > 500) ? 256 * 256 * 26 : 256 * 256 * 13; + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); + uint32_t throughput = min(throughputmax, max_nonce - first_nonce) & 0xfffffc00; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 
256 : 32; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0xf; + ptarget[7] = 0xff; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - quark_groestl512_cpu_init(thr_id, throughput); - quark_bmw512_cpu_init(thr_id, throughput); - x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); - x13_hamsi512_cpu_init(thr_id, throughput); - x13_fugue512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughputmax); + quark_bmw512_cpu_init(thr_id, throughputmax); + x11_simd512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); + x13_hamsi512_cpu_init(thr_id, throughputmax); + x13_fugue512_cpu_init(thr_id, throughputmax); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); + CUDA_SAFE_CALL(cudaMallocHost(&(h_found), 2 * sizeof(uint32_t))); - cuda_check_cpu_init(thr_id, throughput); - init[thr_id] = true; +// cuda_check_cpu_init(thr_id, throughput); + mining_has_stopped[thr_id] = false; + init = true; } for (int k = 0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - quark_blake512_cpu_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); -// x13_fugue512_cpu_setTarget(ptarget); + quark_blake512_cpu_setBlock_80(thr_id, 
(uint64_t *)endiandata); + // cuda_check_cpu_setTarget(ptarget, thr_id); + x13_fugue512_cpu_setTarget(thr_id, ptarget); do { - int order = 0; - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - // uint32_t foundNonce = x13_fugue512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (foundNonce != 0xffffffff) + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash,simdthreads); + 
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x13_fugue512_cpu_hash_64_final(thr_id, throughput, pdata[19], d_hash, h_found); + + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + // h_found[0] = 0xffffffff; + if (h_found[0] != 0xffffffff) { - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); + const uint32_t Htarg = ptarget[7]; + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], h_found[0]); x13hash(vhash64, endiandata); - uint32_t Htarg = ptarget[7]; - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) + if (h_found[1] != 0xffffffff) { - if (opt_benchmark) applog(LOG_INFO, "found second nounce", thr_id, foundNonce, vhash64[7], Htarg); - pdata[21] = secNonce; - res++; + if(opt_verify){ be32enc(&endiandata[19], h_found[1]); + x13hash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nounce %08x", device_map[thr_id], h_found[1]); + } + else + { + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } + } + } - pdata[19] = foundNonce; - if (opt_benchmark) applog(LOG_INFO, "found nounce", thr_id, foundNonce, vhash64[7], Htarg); + pdata[19] = h_found[0]; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d Found nounce %08x", device_map[thr_id], h_found[0]); return res; } else { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + if (vhash64[7] != Htarg) + { + applog(LOG_WARNING, 
"GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); + } } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/x15/cuda_whirlpoolx.cu b/x15/cuda_whirlpoolx.cu new file mode 100644 index 0000000000..1bfbea40a4 --- /dev/null +++ b/x15/cuda_whirlpoolx.cu @@ -0,0 +1,615 @@ +/* + * Built on cbuchner1's implementation, actual hashing code + * based on sphlib 3.0 + */ +#include +#include +#include "cuda_helper.h" + + + +#if __CUDA_ARCH__ > 500 +#define TPB 1024 +#else +#define TPB 256 +#endif + + +#define NONCES_PER_THREAD 16 + +__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) +__constant__ uint2 c_xtra[8]; +__constant__ uint2 c_tmp[72]; +static uint2 *d_xtra[MAX_GPUS]; +static uint64_t *d_tmp[MAX_GPUS]; +__constant__ uint64_t pTarget[1]; + +static uint32_t *h_wxnounce[MAX_GPUS]; +static uint32_t *d_WXNonce[MAX_GPUS]; + +/** + * Whirlpool CUDA kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2014 djm34 & tpruvot & SP & Provos Alexis + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * @author djm34 + * @author tpruvot + * @author SP + * @author Provos Alexis + */ + +__constant__ __align__(128) uint64_t mixTob0Tox[256]; +__constant__ __align__(128) uint64_t mixTob1Tox[256]; + +const uint64_t hmixTob0Tox[256] = { + 0xD83078C018601818,0x2646AF05238C2323,0xB891F97EC63FC6C6,0xFBCD6F13E887E8E8,0xCB13A14C87268787,0x116D62A9B8DAB8B8,0x0902050801040101,0x0D9E6E424F214F4F,0x9B6CEEAD36D83636, + 0xFF510459A6A2A6A6,0x0CB9BDDED26FD2D2,0x0EF706FBF5F3F5F5,0x96F280EF79F97979,0x30DECE5F6FA16F6F,0x6D3FEFFC917E9191,0xF8A407AA52555252,0x47C0FD27609D6060,0x35657689BCCABCBC, + 0x372BCDAC9B569B9B,0x8A018C048E028E8E,0xD25B1571A3B6A3A3,0x6C183C600C300C0C,0x84F68AFF7BF17B7B,0x806AE1B535D43535,0xF53A69E81D741D1D,0xB3DD4753E0A7E0E0,0x21B3ACF6D77BD7D7, + 0x9C99ED5EC22FC2C2,0x435C966D2EB82E2E,0x29967A624B314B4B,0x5DE121A3FEDFFEFE,0xD5AE168257415757,0xBD2A41A815541515,0xE8EEB69F77C17777,0x926EEBA537DC3737,0x9ED7567BE5B3E5E5, + 0x1323D98C9F469F9F,0x23FD17D3F0E7F0F0,0x20947F6A4A354A4A,0x44A9959EDA4FDADA,0xA2B025FA587D5858,0xCF8FCA06C903C9C9,0x7C528D5529A42929,0x5A1422500A280A0A,0x507F4FE1B1FEB1B1, + 0xC95D1A69A0BAA0A0,0x14D6DA7F6BB16B6B,0xD917AB5C852E8585,0x3C677381BDCEBDBD,0x8FBA34D25D695D5D,0x9020508010401010,0x07F503F3F4F7F4F4,0xDD8BC016CB0BCBCB,0xD37CC6ED3EF83E3E, + 
0x2D0A112805140505,0x78CEE61F67816767,0x97D55373E4B7E4E4,0x024EBB25279C2727,0x7382583241194141,0xA70B9D2C8B168B8B,0xF6530151A7A6A7A7,0xB2FA94CF7DE97D7D,0x4937FBDC956E9595, + 0x56AD9F8ED847D8D8,0x70EB308BFBCBFBFB,0xCDC17123EE9FEEEE,0xBBF891C77CED7C7C,0x71CCE31766856666,0x7BA78EA6DD53DDDD,0xAF2E4BB8175C1717,0x458E460247014747,0x1A21DC849E429E9E, + 0xD489C51ECA0FCACA,0x585A99752DB42D2D,0x2E637991BFC6BFBF,0x3F0E1B38071C0707,0xAC472301AD8EADAD,0xB0B42FEA5A755A5A,0xEF1BB56C83368383,0xB666FF8533CC3333,0x5CC6F23F63916363, + 0x12040A1002080202,0x93493839AA92AAAA,0xDEE2A8AF71D97171,0xC68DCF0EC807C8C8,0xD1327DC819641919,0x3B92707249394949,0x5FAF9A86D943D9D9,0x31F91DC3F2EFF2F2,0xA8DB484BE3ABE3E3, + 0xB9B62AE25B715B5B,0xBC0D9234881A8888,0x3E29C8A49A529A9A,0x0B4CBE2D26982626,0xBF64FA8D32C83232,0x597D4AE9B0FAB0B0,0xF2CF6A1BE983E9E9,0x771E33780F3C0F0F,0x33B7A6E6D573D5D5, + 0xF41DBA74803A8080,0x27617C99BEC2BEBE,0xEB87DE26CD13CDCD,0x8968E4BD34D03434,0x3290757A483D4848,0x54E324ABFFDBFFFF,0x8DF48FF77AF57A7A,0x643DEAF4907A9090,0x9DBE3EC25F615F5F, + 0x3D40A01D20802020,0x0FD0D56768BD6868,0xCA3472D01A681A1A,0xB7412C19AE82AEAE,0x7D755EC9B4EAB4B4,0xCEA8199A544D5454,0x7F3BE5EC93769393,0x2F44AA0D22882222,0x63C8E907648D6464, + 0x2AFF12DBF1E3F1F1,0xCCE6A2BF73D17373,0x82245A9012481212,0x7A805D3A401D4040,0x4810284008200808,0x959BE856C32BC3C3,0xDFC57B33EC97ECEC,0x4DAB9096DB4BDBDB,0xC05F1F61A1BEA1A1, + 0x9107831C8D0E8D8D,0xC87AC9F53DF43D3D,0x5B33F1CC97669797,0x0000000000000000,0xF983D436CF1BCFCF,0x6E5687452BAC2B2B,0xE1ECB39776C57676,0xE619B06482328282,0x28B1A9FED67FD6D6, + 0xC33677D81B6C1B1B,0x74775BC1B5EEB5B5,0xBE432911AF86AFAF,0x1DD4DF776AB56A6A,0xEAA00DBA505D5050,0x578A4C1245094545,0x38FB18CBF3EBF3F3,0xAD60F09D30C03030,0xC4C3742BEF9BEFEF, + 0xDA7EC3E53FFC3F3F,0xC7AA1C9255495555,0xDB591079A2B2A2A2,0xE9C96503EA8FEAEA,0x6ACAEC0F65896565,0x036968B9BAD2BABA,0x4A5E93652FBC2F2F,0x8E9DE74EC027C0C0,0x60A181BEDE5FDEDE, + 
0xFC386CE01C701C1C,0x46E72EBBFDD3FDFD,0x1F9A64524D294D4D,0x7639E0E492729292,0xFAEABC8F75C97575,0x360C1E3006180606,0xAE0998248A128A8A,0x4B7940F9B2F2B2B2,0x85D15963E6BFE6E6, + 0x7E1C36700E380E0E,0xE73E63F81F7C1F1F,0x55C4F73762956262,0x3AB5A3EED477D4D4,0x814D3229A89AA8A8,0x5231F4C496629696,0x62EF3A9BF9C3F9F9,0xA397F666C533C5C5,0x104AB13525942525, + 0xABB220F259795959,0xD015AE54842A8484,0xC5E4A7B772D57272,0xEC72DDD539E43939,0x1698615A4C2D4C4C,0x94BC3BCA5E655E5E,0x9FF085E778FD7878,0xE570D8DD38E03838,0x980586148C0A8C8C, + 0x17BFB2C6D163D1D1,0xE4570B41A5AEA5A5,0xA1D94D43E2AFE2E2,0x4EC2F82F61996161,0x427B45F1B3F6B3B3,0x3442A51521842121,0x0825D6949C4A9C9C,0xEE3C66F01E781E1E,0x6186522243114343, + 0xB193FC76C73BC7C7,0x4FE52BB3FCD7FCFC,0x2408142004100404,0xE3A208B251595151,0x252FC7BC995E9999,0x22DAC44F6DA96D6D,0x651A39680D340D0D,0x79E93583FACFFAFA,0x69A384B6DF5BDFDF, + 0xA9FC9BD77EE57E7E,0x1948B43D24902424,0xFE76D7C53BEC3B3B,0x9A4B3D31AB96ABAB,0xF081D13ECE1FCECE,0x9922558811441111,0x8303890C8F068F8F,0x049C6B4A4E254E4E,0x667351D1B7E6B7B7, + 0xE0CB600BEB8BEBEB,0xC178CCFD3CF03C3C,0xFD1FBF7C813E8181,0x4035FED4946A9494,0x1CF30CEBF7FBF7F7,0x186F67A1B9DEB9B9,0x8B265F98134C1313,0x51589C7D2CB02C2C,0x05BBB8D6D36BD3D3, + 0x8CD35C6BE7BBE7E7,0x39DCCB576EA56E6E,0xAA95F36EC437C4C4,0x1B060F18030C0303,0xDCAC138A56455656,0x5E88491A440D4444,0xA0FE9EDF7FE17F7F,0x884F3721A99EA9A9,0x6754824D2AA82A2A, + 0x0A6B6DB1BBD6BBBB,0x879FE246C123C1C1,0xF1A602A253515353,0x72A58BAEDC57DCDC,0x531627580B2C0B0B,0x0127D39C9D4E9D9D,0x2BD8C1476CAD6C6C,0xA462F59531C43131,0xF3E8B98774CD7474, + 0x15F109E3F6FFF6F6,0x4C8C430A46054646,0xA5452609AC8AACAC,0xB50F973C891E8989,0xB42844A014501414,0xBADF425BE1A3E1E1,0xA62C4EB016581616,0xF774D2CD3AE83A3A,0x06D2D06F69B96969, + 0x41122D4809240909,0xD7E0ADA770DD7070,0x6F7154D9B6E2B6B6,0x1EBDB7CED067D0D0,0xD6C77E3BED93EDED,0xE285DB2ECC17CCCC,0x6884572A42154242,0x2C2DC2B4985A9898,0xED550E49A4AAA4A4, + 0x7550885D28A02828,0x86B831DA5C6D5C5C,0x6BED3F93F8C7F8F8,0xC211A44486228686 +}; + 
+/** + * Round constants. + */ +/* ====================================================================== */ + +__device__ __forceinline__ +static uint64_t ROUND_ELT(const uint64_t*const __restrict__ sharedMemory, const uint64_t*const __restrict__ in, const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7) +{ + const uint32_t* const __restrict__ in32 = (uint32_t*)in; + return + sharedMemory[in32[(i0 << 1)] & 0xff] ^ + sharedMemory[__byte_perm(in32[(i1 << 1)], 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(in32[(i2 << 1)], 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(in32[(i3 << 1)], 0, 0x4443) + 768] ^ + sharedMemory[(in32[(i4 << 1) + 1]&0xff) + 1024] ^ + sharedMemory[__byte_perm(in32[(i5 << 1) + 1], 0, 0x4441) + 1280] ^ + sharedMemory[__byte_perm(in32[(i6 << 1) + 1], 0, 0x4442) + 1536] ^ + sharedMemory[__byte_perm(in32[(i7 << 1) + 1], 0, 0x4443) + 1792]; +} + +__device__ __forceinline__ +static uint2 ROUND_ELT2(const uint64_t*const __restrict__ sharedMemory, const uint2*const __restrict__ in, const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7) +{ +// const uint32_t* __restrict__ in32 = (uint32_t*)in; + return + vectorize + ( + sharedMemory[in[(i0)].x & 0xff] ^ + sharedMemory[__byte_perm(in[(i1)].x, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(in[(i2)].x, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(in[(i3)].x, 0, 0x4443) + 768] ^ + sharedMemory[(in[(i4)].y & 0xff) + 1024] ^ + sharedMemory[__byte_perm(in[(i5)].y, 0, 0x4441) + 1280] ^ + sharedMemory[__byte_perm(in[(i6)].y, 0, 0x4442) + 1536] ^ + sharedMemory[__byte_perm(in[(i7)].y, 0, 0x4443) + 1792]); +} + + +#define TRANSFER(dst, src) { \ + dst[0] = src ## 0; \ + dst[1] = src ## 1; \ + dst[2] = src ## 2; \ + dst[3] = src ## 3; \ + dst[4] = src ## 4; \ + dst[5] = src ## 5; \ + dst[6] = src ## 6; \ + dst[7] = src ## 7; \ +} + +#define ROUND(table, in, out, c0, c1, c2, c3, c4, c5, c6, c7) { \ + 
out ## 0 = (ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1)^ c0); \ + out ## 1 = (ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2)^ c1); \ + out ## 2 = (ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3)^ c2); \ + out ## 3 = (ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4)^ c3); \ + out ## 4 = (ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5)^ c4); \ + out ## 5 = (ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6)^ c5); \ + out ## 6 = (ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7)^ c6); \ + out ## 7 = (ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0)^ c7); \ +} + +#define ROUND1(table, in, out, c) { \ + out ## 0 = (ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1)^ c); \ + out ## 1 = ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2); \ + out ## 2 = ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3); \ + out ## 3 = ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4); \ + out ## 4 = ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5); \ + out ## 5 = ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6); \ + out ## 6 = ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7); \ + out ## 7 = ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0); \ +} + +#define ROUND_KSCHED(table, in, out, c) \ + ROUND1(table, in, out, c) \ + TRANSFER(in, out) + +#define ROUND_WENC(table, in, key, out) \ + ROUND(table, in, out, key[0], key[1], key[2],key[3], key[4], key[5], key[6], key[7]) \ + TRANSFER(in, out) + +__device__ __forceinline__ +static void getShared(uint64_t* sharedMemory) +{ + if (threadIdx.x < 256) + { + sharedMemory[threadIdx.x] = mixTob0Tox[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = mixTob1Tox[threadIdx.x]; + sharedMemory[threadIdx.x + 512] = ROTL64(mixTob0Tox[threadIdx.x], 16); + sharedMemory[threadIdx.x + 768] = ROTL64(mixTob0Tox[threadIdx.x], 24); + sharedMemory[threadIdx.x + 1024] = SWAPDWORDS(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 1280] = SWAPDWORDS(sharedMemory[threadIdx.x + 256]); + sharedMemory[threadIdx.x + 1536] = SWAPDWORDS(sharedMemory[threadIdx.x + 512]); + sharedMemory[threadIdx.x + 1792] = 
SWAPDWORDS(sharedMemory[threadIdx.x + 768]); + } +} + + +__global__ __launch_bounds__(256) +void precomputeX(uint32_t threads, uint2*const __restrict__ d_xtra, uint64_t*const __restrict__ d_tmp) +{ + + __shared__ uint64_t sharedMemory[2048]; + const uint64_t InitVector_RC[10] = + { + 0x4F01B887E8C62318, 0x52916F79F5D2A636, 0x357B0CA38E9BBC60, 0x57FE4B2EC2D7E01D, 0xDA4AF09FE5377715, + 0x856BA0B10A29C958, 0x67053ECBF4105DBD, 0xD8957DA78B4127E4, 0x9E4717DD667CEEFB, 0x33835AAD07BF2DCA + }; + + + getShared(sharedMemory); + __syncthreads(); + const unsigned int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + + uint64_t n[8]; + uint64_t h[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + +#pragma unroll 8 + for (int i = 0; i<8; i++) { + n[i] = c_PaddedMessage80[i]; // read data + } + //#pragma unroll 10 + for (unsigned int r = 0; r < 10; r++) { + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); + ROUND_WENC(sharedMemory, n, h, tmp); + } +#pragma unroll 8 + for (int i = 0; i < 8; i++) { + h[i] = xor1(n[i], c_PaddedMessage80[i]); + } + + if (threadIdx.x == 0) + { + d_xtra[0] = vectorize(h[1]); + d_xtra[0].y = cuda_swab32(d_xtra[0].y); + } + uint64_t atLastCalc = xor1(h[3], h[5]); + + ////////////////////////////////// + + n[0] = xor1(c_PaddedMessage80[8], h[0]); + n[1] = c_PaddedMessage80[9]; + n[2] = xor1(0x0000000000000080, h[2]); + n[3] = h[3]; + n[4] = h[4]; + n[5] = h[5]; + n[6] = h[6]; + n[7] = xor1(0x8002000000000000, h[7]); + + uint64_t tmp[8]; + tmp[0] = xor1(ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[0]); + tmp[1] = ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp[2] = ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp[3] = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[4] = ROUND_ELT(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp[5] = ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[6] = ROUND_ELT(sharedMemory, h, 6, 5, 4, 
3, 2, 1, 0, 7); + tmp[7] = ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + + uint64_t tmp2[8]; + uint32_t* n32 = (uint32_t*)n; + tmp2[0] = xor8(sharedMemory[__byte_perm(n32[0], 0, 0x4440)], sharedMemory[__byte_perm(n32[14], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[12], 0, 0x4442) + 512], sharedMemory[__byte_perm(n32[10], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[9], 0, 0x4440) + 1024], sharedMemory[__byte_perm(n32[7], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[5], 0, 0x4442) + 1536], tmp[0]); + + tmp2[1] = xor8(tmp[1], sharedMemory[__byte_perm(n32[0], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[14], 0, 0x4442) + 512], sharedMemory[__byte_perm(n32[12], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[11], 0, 0x4440) + 1024], sharedMemory[__byte_perm(n32[9], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[7], 0, 0x4442) + 1536], sharedMemory[__byte_perm(n32[5], 0, 0x4443) + 1792]); + + tmp2[2] = xor8(sharedMemory[__byte_perm(n32[4], 0, 0x4440)], tmp[2], + sharedMemory[__byte_perm(n32[0], 0, 0x4442) + 512], sharedMemory[__byte_perm(n32[14], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[13], 0, 0x4440) + 1024], sharedMemory[__byte_perm(n32[11], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[9], 0, 0x4442) + 1536], sharedMemory[__byte_perm(n32[7], 0, 0x4443) + 1792]); + + tmp2[3] = xor8(sharedMemory[__byte_perm(n32[6], 0, 0x4440)], sharedMemory[__byte_perm(n32[4], 0, 0x4441) + 256], + tmp[3], sharedMemory[__byte_perm(n32[0], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[15], 0, 0x4440) + 1024], sharedMemory[__byte_perm(n32[13], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[11], 0, 0x4442) + 1536], sharedMemory[__byte_perm(n32[9], 0, 0x4443) + 1792]); + + tmp2[4] = xor8(sharedMemory[__byte_perm(n32[8], 0, 0x4440)], sharedMemory[__byte_perm(n32[6], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[4], 0, 0x4442) + 512], tmp[4], + sharedMemory[__byte_perm(n32[1], 0, 0x4440) + 1024], sharedMemory[__byte_perm(n32[15], 
0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[13], 0, 0x4442) + 1536], sharedMemory[__byte_perm(n32[11], 0, 0x4443) + 1792]); + + tmp2[5] = xor8(sharedMemory[__byte_perm(n32[10], 0, 0x4440)], sharedMemory[__byte_perm(n32[8], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[6], 0, 0x4442) + 512], sharedMemory[__byte_perm(n32[4], 0, 0x4443) + 768], + tmp[5], sharedMemory[__byte_perm(n32[1], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[15], 0, 0x4442) + 1536], sharedMemory[__byte_perm(n32[13], 0, 0x4443) + 1792]); + + tmp2[6] = xor8(sharedMemory[__byte_perm(n32[12], 0, 0x4440)], sharedMemory[__byte_perm(n32[10], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[8], 0, 0x4442) + 512], sharedMemory[__byte_perm(n32[6], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[5], 0, 0x4440) + 1024], tmp[6], + sharedMemory[__byte_perm(n32[1], 0, 0x4442) + 1536], sharedMemory[__byte_perm(n32[15], 0, 0x4443) + 1792]); + + tmp2[7] = xor8(sharedMemory[__byte_perm(n32[14], 0, 0x4440)], sharedMemory[__byte_perm(n32[12], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[10], 0, 0x4442) + 512], sharedMemory[__byte_perm(n32[8], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[7], 0, 0x4440) + 1024], sharedMemory[__byte_perm(n32[5], 0, 0x4441) + 1280], + tmp[7], sharedMemory[__byte_perm(n32[1], 0, 0x4443) + 1792]); + + n[1] ^= h[1]; + tmp2[1] ^= sharedMemory[__byte_perm(n32[2], 0, 0x4440)]; + tmp2[2] ^= sharedMemory[__byte_perm(n32[2], 0, 0x4441) + 256]; + tmp2[3] ^= sharedMemory[__byte_perm(n32[2], 0, 0x4442) + 512]; + tmp2[4] ^= sharedMemory[__byte_perm(n32[2], 0, 0x4443) + 768]; + + d_tmp[threadIdx.x] = tmp2[threadIdx.x]; + + uint64_t tmp3[8]; + tmp3[0] = xor1(ROUND_ELT(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[1]); + tmp3[1] = ROUND_ELT(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2); + tmp3[2] = ROUND_ELT(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3); + tmp3[3] = ROUND_ELT(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4); + tmp3[4] = ROUND_ELT(sharedMemory, tmp, 4, 3, 
2, 1, 0, 7, 6, 5); + tmp3[5] = ROUND_ELT(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6); + tmp3[6] = ROUND_ELT(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7); + tmp3[7] = ROUND_ELT(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0); + + n32 = (uint32_t*)tmp2; + uint64_t tmp4[8]; + tmp4[0] = (sharedMemory[__byte_perm(n32[9], 0, 0x4440) + 1024] ^ sharedMemory[__byte_perm(n32[7], 0, 0x4441) + 1280] ^ + sharedMemory[__byte_perm(n32[5], 0, 0x4442) + 1536] ^ sharedMemory[__byte_perm(n32[3], 0, 0x4443) + 1792]) ^ tmp3[0]; + + tmp4[1] = (sharedMemory[__byte_perm(n32[2], 0, 0x4440)] ^ sharedMemory[__byte_perm(n32[9], 0, 0x4441) + 1280] ^ + sharedMemory[__byte_perm(n32[7], 0, 0x4442) + 1536] ^ sharedMemory[__byte_perm(n32[5], 0, 0x4443) + 1792]) ^ tmp3[1]; + + tmp4[2] = (sharedMemory[__byte_perm(n32[4], 0, 0x4440)] ^ sharedMemory[__byte_perm(n32[2], 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(n32[9], 0, 0x4442) + 1536] ^ sharedMemory[__byte_perm(n32[7], 0, 0x4443) + 1792]) ^ tmp3[2]; + + tmp4[3] = (sharedMemory[__byte_perm(n32[6], 0, 0x4440)] ^ sharedMemory[__byte_perm(n32[4], 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(n32[2], 0, 0x4442) + 512] ^ sharedMemory[__byte_perm(n32[9], 0, 0x4443) + 1792]) ^ tmp3[3]; + + tmp4[4] = (sharedMemory[__byte_perm(n32[8], 0, 0x4440)] ^ sharedMemory[__byte_perm(n32[6], 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(n32[4], 0, 0x4442) + 512] ^ sharedMemory[__byte_perm(n32[2], 0, 0x4443) + 768]) ^ tmp3[4]; + + tmp4[5] = (sharedMemory[__byte_perm(n32[8], 0, 0x4441) + 256] ^ sharedMemory[__byte_perm(n32[6], 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(n32[4], 0, 0x4443) + 768] ^ sharedMemory[__byte_perm(n32[3], 0, 0x4440) + 1024]) ^ tmp3[5]; + + tmp4[6] = (sharedMemory[__byte_perm(n32[8], 0, 0x4442) + 512] ^ sharedMemory[__byte_perm(n32[6], 0, 0x4443) + 768] ^ + sharedMemory[__byte_perm(n32[5], 0, 0x4440) + 1024] ^ sharedMemory[__byte_perm(n32[3], 0, 0x4441) + 1280]) ^ tmp3[6]; + + tmp4[7] = (sharedMemory[__byte_perm(n32[8], 0, 0x4443) + 768] ^ 
sharedMemory[__byte_perm(n32[7], 0, 0x4440) + 1024] ^ + sharedMemory[__byte_perm(n32[5], 0, 0x4441) + 1280] ^ sharedMemory[__byte_perm(n32[3], 0, 0x4442) + 1536]) ^ tmp3[7]; + + d_tmp[threadIdx.x + 16] = tmp4[threadIdx.x]; + + uint64_t tmp5[8]; + tmp5[0] = xor1(ROUND_ELT(sharedMemory, tmp3, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[2]); + tmp5[1] = ROUND_ELT(sharedMemory, tmp3, 1, 0, 7, 6, 5, 4, 3, 2); + tmp5[2] = ROUND_ELT(sharedMemory, tmp3, 2, 1, 0, 7, 6, 5, 4, 3); + tmp5[3] = ROUND_ELT(sharedMemory, tmp3, 3, 2, 1, 0, 7, 6, 5, 4); + tmp5[4] = ROUND_ELT(sharedMemory, tmp3, 4, 3, 2, 1, 0, 7, 6, 5); + tmp5[5] = ROUND_ELT(sharedMemory, tmp3, 5, 4, 3, 2, 1, 0, 7, 6); + tmp5[6] = ROUND_ELT(sharedMemory, tmp3, 6, 5, 4, 3, 2, 1, 0, 7); + tmp5[7] = ROUND_ELT(sharedMemory, tmp3, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x + 8] = tmp5[threadIdx.x]; + + uint64_t tmp6[8]; + tmp6[0] = xor1(ROUND_ELT(sharedMemory, tmp5, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[3]); + tmp6[1] = ROUND_ELT(sharedMemory, tmp5, 1, 0, 7, 6, 5, 4, 3, 2); + tmp6[2] = ROUND_ELT(sharedMemory, tmp5, 2, 1, 0, 7, 6, 5, 4, 3); + tmp6[3] = ROUND_ELT(sharedMemory, tmp5, 3, 2, 1, 0, 7, 6, 5, 4); + tmp6[4] = ROUND_ELT(sharedMemory, tmp5, 4, 3, 2, 1, 0, 7, 6, 5); + tmp6[5] = ROUND_ELT(sharedMemory, tmp5, 5, 4, 3, 2, 1, 0, 7, 6); + tmp6[6] = ROUND_ELT(sharedMemory, tmp5, 6, 5, 4, 3, 2, 1, 0, 7); + tmp6[7] = ROUND_ELT(sharedMemory, tmp5, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x + 24] = tmp6[threadIdx.x]; + + uint64_t tmp7[8]; + tmp7[0] = xor1(ROUND_ELT(sharedMemory, tmp6, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[4]); + tmp7[1] = ROUND_ELT(sharedMemory, tmp6, 1, 0, 7, 6, 5, 4, 3, 2); + tmp7[2] = ROUND_ELT(sharedMemory, tmp6, 2, 1, 0, 7, 6, 5, 4, 3); + tmp7[3] = ROUND_ELT(sharedMemory, tmp6, 3, 2, 1, 0, 7, 6, 5, 4); + tmp7[4] = ROUND_ELT(sharedMemory, tmp6, 4, 3, 2, 1, 0, 7, 6, 5); + tmp7[5] = ROUND_ELT(sharedMemory, tmp6, 5, 4, 3, 2, 1, 0, 7, 6); + tmp7[6] = ROUND_ELT(sharedMemory, tmp6, 6, 5, 4, 3, 2, 1, 0, 7); 
+ tmp7[7] = ROUND_ELT(sharedMemory, tmp6, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x + 32] = tmp7[threadIdx.x]; + //------------------- + uint64_t tmp8[8]; + tmp8[0] = xor1(ROUND_ELT(sharedMemory, tmp7, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[5]); + tmp8[1] = ROUND_ELT(sharedMemory, tmp7, 1, 0, 7, 6, 5, 4, 3, 2); + tmp8[2] = ROUND_ELT(sharedMemory, tmp7, 2, 1, 0, 7, 6, 5, 4, 3); + tmp8[3] = ROUND_ELT(sharedMemory, tmp7, 3, 2, 1, 0, 7, 6, 5, 4); + tmp8[4] = ROUND_ELT(sharedMemory, tmp7, 4, 3, 2, 1, 0, 7, 6, 5); + tmp8[5] = ROUND_ELT(sharedMemory, tmp7, 5, 4, 3, 2, 1, 0, 7, 6); + tmp8[6] = ROUND_ELT(sharedMemory, tmp7, 6, 5, 4, 3, 2, 1, 0, 7); + tmp8[7] = ROUND_ELT(sharedMemory, tmp7, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x + 40] = tmp8[threadIdx.x]; + + uint64_t tmp9[8]; + tmp9[0] = xor1(ROUND_ELT(sharedMemory, tmp8, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[6]); + tmp9[1] = ROUND_ELT(sharedMemory, tmp8, 1, 0, 7, 6, 5, 4, 3, 2); + tmp9[2] = ROUND_ELT(sharedMemory, tmp8, 2, 1, 0, 7, 6, 5, 4, 3); + tmp9[3] = ROUND_ELT(sharedMemory, tmp8, 3, 2, 1, 0, 7, 6, 5, 4); + tmp9[4] = ROUND_ELT(sharedMemory, tmp8, 4, 3, 2, 1, 0, 7, 6, 5); + tmp9[5] = ROUND_ELT(sharedMemory, tmp8, 5, 4, 3, 2, 1, 0, 7, 6); + tmp9[6] = ROUND_ELT(sharedMemory, tmp8, 6, 5, 4, 3, 2, 1, 0, 7); + tmp9[7] = ROUND_ELT(sharedMemory, tmp8, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x + 48] = tmp9[threadIdx.x]; + + uint64_t tmp10[8]; + tmp10[0] = xor1(ROUND_ELT(sharedMemory, tmp9, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[7]); + tmp10[1] = ROUND_ELT(sharedMemory, tmp9, 1, 0, 7, 6, 5, 4, 3, 2); + tmp10[2] = ROUND_ELT(sharedMemory, tmp9, 2, 1, 0, 7, 6, 5, 4, 3); + tmp10[3] = ROUND_ELT(sharedMemory, tmp9, 3, 2, 1, 0, 7, 6, 5, 4); + tmp10[4] = ROUND_ELT(sharedMemory, tmp9, 4, 3, 2, 1, 0, 7, 6, 5); + tmp10[5] = ROUND_ELT(sharedMemory, tmp9, 5, 4, 3, 2, 1, 0, 7, 6); + tmp10[6] = ROUND_ELT(sharedMemory, tmp9, 6, 5, 4, 3, 2, 1, 0, 7); + tmp10[7] = ROUND_ELT(sharedMemory, tmp9, 7, 6, 5, 4, 3, 2, 1, 0); + + + 
d_tmp[threadIdx.x + 56] = tmp10[threadIdx.x]; + + uint64_t tmp11[8]; + tmp11[0] = xor1(ROUND_ELT(sharedMemory, tmp10, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[8]); + tmp11[1] = ROUND_ELT(sharedMemory, tmp10, 1, 0, 7, 6, 5, 4, 3, 2); + tmp11[2] = ROUND_ELT(sharedMemory, tmp10, 2, 1, 0, 7, 6, 5, 4, 3); + tmp11[3] = ROUND_ELT(sharedMemory, tmp10, 3, 2, 1, 0, 7, 6, 5, 4); + tmp11[4] = ROUND_ELT(sharedMemory, tmp10, 4, 3, 2, 1, 0, 7, 6, 5); + tmp11[5] = ROUND_ELT(sharedMemory, tmp10, 5, 4, 3, 2, 1, 0, 7, 6); + tmp11[6] = ROUND_ELT(sharedMemory, tmp10, 6, 5, 4, 3, 2, 1, 0, 7); + tmp11[7] = ROUND_ELT(sharedMemory, tmp10, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x + 64] = tmp11[threadIdx.x]; + + if (threadIdx.x == 1){ + tmp[0] = ROUND_ELT(sharedMemory, tmp11, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[1] = ROUND_ELT(sharedMemory, tmp11, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[4] = xor3(tmp[0], tmp[1], atLastCalc); + d_xtra[1] = vectorize(tmp[4]); + } + } +} +__global__ __launch_bounds__(TPB) +void whirlpoolx(uint32_t threads, uint32_t startNounce, uint32_t *resNounce) +{ + + + uint32_t threadindex = (blockDim.x * blockIdx.x + threadIdx.x); + + + if (threadindex < threads) + { + __shared__ uint64_t sharedMemory[2048]; + getShared(sharedMemory); + __syncthreads(); + const uint32_t numberofthreads = blockDim.x*gridDim.x; + const uint32_t maxnonce = startNounce + threadindex + numberofthreads*NONCES_PER_THREAD - 1; + const uint32_t threadindex = blockIdx.x*blockDim.x + threadIdx.x; + const uint64_t backup = pTarget[0]; +// #pragma unroll + for (uint32_t nounce = startNounce + threadindex; nounce <= maxnonce; nounce += numberofthreads) + { + + uint2 n[8]; + uint2 tmp[8]; + //const uint32_t nounce = startNounce + thread; + + n[1].y = nounce ^ c_xtra[0].y; + + n[0] = vectorize(sharedMemory[(n[1].y & 0xff) + 1792]); + n[5] = vectorize(sharedMemory[__byte_perm(n[1].y, 0, 0x4443) + 1024]); + n[6] = vectorize(sharedMemory[__byte_perm(n[1].y, 0, 0x4442) + 1280]); + n[7] = 
vectorize(sharedMemory[__byte_perm(n[1].y, 0, 0x4441) + 1536]); + uint2 b = (c_tmp[0]) ^ n[0]; + n[5] = (c_tmp[5]) ^ n[5]; + n[6] = (c_tmp[6]) ^ n[6]; + n[7] = (c_tmp[7]) ^ n[7]; + + tmp[0] = vectorize(sharedMemory[__byte_perm(n[5].x, 0, 0x4443) + 768] ^ sharedMemory[__byte_perm(n[6].x, 0, 0x4442) + 512] ^ sharedMemory[__byte_perm(n[7].x, 0, 0x4441) + 256]); + tmp[1] = vectorize(sharedMemory[(n[5].y&0xff) + 1024] ^ sharedMemory[__byte_perm(n[6].x, 0, 0x4443) + 768] ^ sharedMemory[__byte_perm(n[7].x, 0, 0x4442) + 512]); + tmp[2] = vectorize(sharedMemory[__byte_perm(n[5].y, 0, 0x4441) + 1280] ^ sharedMemory[(n[6].y & 0xff) + 1024] ^ sharedMemory[__byte_perm(n[7].x, 0, 0x4443) + 768]); + tmp[3] = vectorize(sharedMemory[__byte_perm(n[5].y, 0, 0x4442) + 1536] ^ sharedMemory[__byte_perm(n[6].y, 0, 0x4441) + 1280] ^ sharedMemory[(n[7].y & 0xff) + 1024]); + tmp[4] = vectorize(sharedMemory[__byte_perm(n[5].y, 0, 0x4443) + 1792] ^ sharedMemory[__byte_perm(n[6].y, 0, 0x4442) + 1536] ^ sharedMemory[__byte_perm(n[7].y, 0, 0x4441) + 1280]); + tmp[5] = vectorize(sharedMemory[(n[5].x &0xff)] ^ sharedMemory[__byte_perm(n[6].y, 0, 0x4443) + 1792] ^ sharedMemory[__byte_perm(n[7].y, 0, 0x4442) + 1536]); + tmp[6] = vectorize(sharedMemory[(n[6].x & 0xff)] ^ sharedMemory[__byte_perm(n[5].x, 0, 0x4441) + 256] ^ sharedMemory[__byte_perm(n[7].y, 0, 0x4443) + 1792]); + tmp[7] = vectorize(sharedMemory[(n[7].x & 0xff)] ^ sharedMemory[__byte_perm(n[6].x, 0, 0x4441) + 256] ^ sharedMemory[__byte_perm(n[5].x, 0, 0x4442) + 512]); + + n[0] = vectorize(sharedMemory[__byte_perm(b.x, 0, 0x4440)]) ^ tmp[0] ^ (c_tmp[0 + 16]); + n[1] = vectorize(sharedMemory[__byte_perm(b.x, 0, 0x4441) + 256]) ^ tmp[1] ^ (c_tmp[1 + 16]); + n[2] = vectorize(sharedMemory[__byte_perm(b.x, 0, 0x4442) + 512]) ^ tmp[2] ^ (c_tmp[2 + 16]); + n[3] = vectorize(sharedMemory[__byte_perm(b.x, 0, 0x4443) + 768]) ^ tmp[3] ^ (c_tmp[3 + 16]); + n[4] = vectorize(sharedMemory[__byte_perm(b.y, 0, 0x4440) + 1024]) ^ tmp[4] ^ (c_tmp[4 + 16]); 
+ n[5] = vectorize(sharedMemory[__byte_perm(b.y, 0, 0x4441) + 1280]) ^ tmp[5] ^ (c_tmp[5 + 16]); + n[6] = vectorize(sharedMemory[__byte_perm(b.y, 0, 0x4442) + 1536]) ^ tmp[6] ^ (c_tmp[6 + 16]); + n[7] = vectorize(sharedMemory[__byte_perm(b.y, 0, 0x4443) + 1792]) ^ tmp[7] ^ (c_tmp[7 + 16]); + + tmp[0] = ROUND_ELT2(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ (c_tmp[0 + 8]); + tmp[1] = ROUND_ELT2(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ (c_tmp[1 + 8]); + tmp[2] = ROUND_ELT2(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ (c_tmp[2 + 8]); + tmp[3] = ROUND_ELT2(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ (c_tmp[3 + 8]); + tmp[4] = ROUND_ELT2(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ (c_tmp[4 + 8]); + tmp[5] = ROUND_ELT2(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ (c_tmp[5 + 8]); + tmp[6] = ROUND_ELT2(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ (c_tmp[6 + 8]); + tmp[7] = ROUND_ELT2(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ (c_tmp[7 + 8]); + + n[0] = ROUND_ELT2(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1) ^ (c_tmp[0 + 24]); + n[1] = ROUND_ELT2(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2) ^ (c_tmp[1 + 24]); + n[2] = ROUND_ELT2(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3) ^ (c_tmp[2 + 24]); + n[3] = ROUND_ELT2(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4) ^ (c_tmp[3 + 24]); + n[4] = ROUND_ELT2(sharedMemory, tmp, 4, 3, 2, 1, 0, 7, 6, 5) ^ (c_tmp[4 + 24]); + n[5] = ROUND_ELT2(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6) ^ (c_tmp[5 + 24]); + n[6] = ROUND_ELT2(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7) ^ (c_tmp[6 + 24]); + n[7] = ROUND_ELT2(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0) ^ (c_tmp[7 + 24]); + + + tmp[0] = ROUND_ELT2(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ (c_tmp[0 + 32]); + tmp[1] = ROUND_ELT2(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ (c_tmp[1 + 32]); + tmp[2] = ROUND_ELT2(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ (c_tmp[2 + 32]); + tmp[3] = ROUND_ELT2(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ (c_tmp[3 + 32]); + tmp[4] = ROUND_ELT2(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ 
(c_tmp[4 + 32]); + tmp[5] = ROUND_ELT2(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ (c_tmp[5 + 32]); + tmp[6] = ROUND_ELT2(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ (c_tmp[6 + 32]); + tmp[7] = ROUND_ELT2(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ (c_tmp[7 + 32]); + + n[0] = ROUND_ELT2(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1) ^ (c_tmp[0 + 40]); + n[1] = ROUND_ELT2(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2) ^ (c_tmp[1 + 40]); + n[2] = ROUND_ELT2(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3) ^ (c_tmp[2 + 40]); + n[3] = ROUND_ELT2(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4) ^ (c_tmp[3 + 40]); + n[4] = ROUND_ELT2(sharedMemory, tmp, 4, 3, 2, 1, 0, 7, 6, 5) ^ (c_tmp[4 + 40]); + n[5] = ROUND_ELT2(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6) ^ (c_tmp[5 + 40]); + n[6] = ROUND_ELT2(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7) ^ (c_tmp[6 + 40]); + n[7] = ROUND_ELT2(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0) ^ (c_tmp[7 + 40]); + + tmp[0] = ROUND_ELT2(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ (c_tmp[0 + 48]); + tmp[1] = ROUND_ELT2(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ (c_tmp[1 + 48]); + tmp[2] = ROUND_ELT2(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ (c_tmp[2 + 48]); + tmp[3] = ROUND_ELT2(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ (c_tmp[3 + 48]); + tmp[4] = ROUND_ELT2(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ (c_tmp[4 + 48]); + tmp[5] = ROUND_ELT2(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ (c_tmp[5 + 48]); + tmp[6] = ROUND_ELT2(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ (c_tmp[6 + 48]); + tmp[7] = ROUND_ELT2(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ (c_tmp[7 + 48]); + + n[0] = ROUND_ELT2(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1) ^ (c_tmp[0 + 56]); + n[1] = ROUND_ELT2(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2) ^ (c_tmp[1 + 56]); + n[2] = ROUND_ELT2(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3) ^ (c_tmp[2 + 56]); + n[3] = ROUND_ELT2(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4) ^ (c_tmp[3 + 56]); + n[4] = ROUND_ELT2(sharedMemory, tmp, 4, 3, 2, 1, 0, 7, 6, 5) ^ (c_tmp[4 + 56]); + n[5] = 
ROUND_ELT2(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6) ^ (c_tmp[5 + 56]); + n[6] = ROUND_ELT2(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7) ^ (c_tmp[6 + 56]); + n[7] = ROUND_ELT2(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0) ^ (c_tmp[7 + 56]); + + tmp[0] = ROUND_ELT2(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ (c_tmp[0 + 64]); + tmp[1] = ROUND_ELT2(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ (c_tmp[1 + 64]); + tmp[2] = ROUND_ELT2(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ (c_tmp[2 + 64]); + tmp[3] = ROUND_ELT2(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ (c_tmp[3 + 64]); + tmp[4] = ROUND_ELT2(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ (c_tmp[4 + 64]); + tmp[5] = ROUND_ELT2(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ (c_tmp[5 + 64]); + tmp[6] = ROUND_ELT2(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ (c_tmp[6 + 64]); + tmp[7] = ROUND_ELT2(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ (c_tmp[7 + 64]); + + if ((devectorize(c_xtra[1] ^ ROUND_ELT2(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4) ^ ROUND_ELT2(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6))) <= backup) + { + uint32_t tmp = atomicExch(resNounce, nounce); + if (tmp != 0xffffffff) + resNounce[1] = tmp; + } + } // thread < threads + } +} + +__host__ extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads) +{ + uint64_t t1[256]; + cudaMemcpyToSymbolAsync(mixTob0Tox, hmixTob0Tox, sizeof(hmixTob0Tox), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); + + for (int i = 0; i < 256; i++) + { + t1[i] = ROTL64(hmixTob0Tox[i], 8); + } + cudaMemcpyToSymbolAsync(mixTob1Tox, t1, sizeof(hmixTob0Tox), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); + cudaMalloc(&d_WXNonce[thr_id], 2 * sizeof(uint32_t)); + cudaMallocHost(&h_wxnounce[thr_id], 2 * sizeof(uint32_t)); + cudaMalloc((void **)&d_xtra[thr_id], 2 * sizeof(uint64_t)); + cudaMalloc((void **)&d_tmp[thr_id], 8 * 9 * sizeof(uint64_t)); +} + +__host__ void whirlpoolx_setBlock_80(int thr_id, void *pdata, const void *ptarget) +{ + uint64_t PaddedMessage[16]; + memcpy(PaddedMessage, pdata, 80); + 
memset((uint8_t*)&PaddedMessage + 80, 0, 48); + *(((uint8_t*)&PaddedMessage) + 80) = 0x80; /* ending */ + cudaMemcpyToSymbolAsync(pTarget, ptarget, 1 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); + cudaMemcpyToSymbolAsync(c_PaddedMessage80, PaddedMessage, 16 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice, gpustream[thr_id]); +} + +__host__ void whirlpoolx_precompute(int thr_id) +{ + dim3 grid(1); + dim3 block(256); + + precomputeX <<>>(8, d_xtra[thr_id], d_tmp[thr_id]); + cudaMemcpyToSymbolAsync(c_xtra, d_xtra[thr_id], 2 * sizeof(uint64_t), 0, cudaMemcpyDeviceToDevice, gpustream[thr_id]); + cudaMemcpyToSymbolAsync(c_tmp, d_tmp[thr_id], 8 * 9 * sizeof(uint64_t), 0, cudaMemcpyDeviceToDevice, gpustream[thr_id]); +} + +__host__ void cpu_whirlpoolx(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *foundnonce) +{ + dim3 grid((threads + TPB*NONCES_PER_THREAD - 1) / TPB / NONCES_PER_THREAD); + dim3 block(TPB); + + cudaMemsetAsync(d_WXNonce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); + whirlpoolx <<>>(threads, startNounce, d_WXNonce[thr_id]); + + cudaMemcpyAsync(h_wxnounce[thr_id], d_WXNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + cudaStreamSynchronize(gpustream[thr_id]); + foundnonce[0] = h_wxnounce[thr_id][0]; + foundnonce[1] = h_wxnounce[thr_id][1]; +} diff --git a/x15/cuda_x14_shabal512.cu b/x15/cuda_x14_shabal512.cu index b942156939..bb7fabfd37 100644 --- a/x15/cuda_x14_shabal512.cu +++ b/x15/cuda_x14_shabal512.cu @@ -2,6 +2,7 @@ * Shabal-512 for X14/X15 (STUB) */ #include "cuda_helper.h" +#include "cuda_vector.h" /* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */ @@ -43,9 +44,6 @@ #define sM 16 -#define C32 SPH_C32 -#define T32(x) (x) - #define O1 13 #define O2 9 #define O3 6 @@ -57,56 +55,52 @@ /* BEGIN -- automatically generated code. 
*/ -#define INPUT_BLOCK_ADD do { \ - B0 = T32(B0 + M0); \ - B1 = T32(B1 + M1); \ - B2 = T32(B2 + M2); \ - B3 = T32(B3 + M3); \ - B4 = T32(B4 + M4); \ - B5 = T32(B5 + M5); \ - B6 = T32(B6 + M6); \ - B7 = T32(B7 + M7); \ - B8 = T32(B8 + M8); \ - B9 = T32(B9 + M9); \ - BA = T32(BA + MA); \ - BB = T32(BB + MB); \ - BC = T32(BC + MC); \ - BD = T32(BD + MD); \ - BE = T32(BE + ME); \ - BF = T32(BF + MF); \ - } while (0) - -#define INPUT_BLOCK_SUB do { \ - C0 = T32(C0 - M0); \ - C1 = T32(C1 - M1); \ - C2 = T32(C2 - M2); \ - C3 = T32(C3 - M3); \ - C4 = T32(C4 - M4); \ - C5 = T32(C5 - M5); \ - C6 = T32(C6 - M6); \ - C7 = T32(C7 - M7); \ - C8 = T32(C8 - M8); \ - C9 = T32(C9 - M9); \ - CA = T32(CA - MA); \ - CB = T32(CB - MB); \ - CC = T32(CC - MC); \ - CD = T32(CD - MD); \ - CE = T32(CE - ME); \ - CF = T32(CF - MF); \ - } while (0) - -#define XOR_W do { \ +#define INPUT_BLOCK_ADD \ + B0 = B0 + M0; \ + B1 = B1 + M1; \ + B2 = B2 + M2; \ + B3 = B3 + M3; \ + B4 = B4 + M4; \ + B5 = B5 + M5; \ + B6 = B6 + M6; \ + B7 = B7 + M7; \ + B8 = B8 + M8; \ + B9 = B9 + M9; \ + BA = BA + MA; \ + BB = BB + MB; \ + BC = BC + MC; \ + BD = BD + MD; \ + BE = BE + ME; \ + BF = BF + MF; \ + +#define INPUT_BLOCK_SUB \ + C0 = C0 - M0; \ + C1 = C1 - M1; \ + C2 = C2 - M2; \ + C3 = C3 - M3; \ + C4 = C4 - M4; \ + C5 = C5 - M5; \ + C6 = C6 - M6; \ + C7 = C7 - M7; \ + C8 = C8 - M8; \ + C9 = C9 - M9; \ + CA = CA - MA; \ + CB = CB - MB; \ + CC = CC - MC; \ + CD = CD - MD; \ + CE = CE - ME; \ + CF = CF - MF; \ + +#define XOR_W \ A00 ^= Wlow; \ A01 ^= Whigh; \ - } while (0) -#define SWAP(v1, v2) do { \ - uint32_t tmp = (v1); \ - (v1) = (v2); \ - (v2) = tmp; \ - } while (0) +#define SWAP(v1, v2) \ + v1^=v2;\ + v2 ^= v1;\ + v1 ^= v2; -#define SWAP_BC do { \ +#define SWAP_BC \ SWAP(B0, C0); \ SWAP(B1, C1); \ SWAP(B2, C2); \ @@ -123,17 +117,15 @@ SWAP(BD, CD); \ SWAP(BE, CE); \ SWAP(BF, CF); \ - } while (0) -#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \ - xa0 = T32((xa0 \ +#define PERM_ELT(xa0, xa1, 
xb0, xb1, xb2, xb3, xc, xm) \ + xa0 = ((xa0 \ ^ (ROTL32(xa1, 15) * 5U) \ ^ xc) * 3U) \ ^ xb1 ^ (xb2 & ~xb3) ^ xm; \ - xb0 = T32(~(ROTL32(xb0, 1) ^ xa0)); \ - } while (0) + xb0 = (~(ROTL32(xb0, 1) ^ xa0)); \ -#define PERM_STEP_0 do { \ +#define PERM_STEP_0 \ PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ @@ -150,9 +142,8 @@ PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ - } while (0) -#define PERM_STEP_1 do { \ +#define PERM_STEP_1 \ PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ @@ -169,9 +160,8 @@ PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ - } while (0) -#define PERM_STEP_2 do { \ +#define PERM_STEP_2 \ PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ @@ -188,9 +178,8 @@ PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ - } while (0) -#define APPLY_P do { \ +#define APPLY_P \ B0 = ROTL32(B0, 17); \ B1 = ROTL32(B1, 17); \ B2 = ROTL32(B2, 17); \ @@ -210,45 +199,44 @@ PERM_STEP_0; \ PERM_STEP_1; \ PERM_STEP_2; \ - A0B = T32(A0B + C6); \ - A0A = T32(A0A + C5); \ - A09 = T32(A09 + C4); \ - A08 = T32(A08 + C3); \ - A07 = T32(A07 + C2); \ - A06 = T32(A06 + C1); \ - A05 = T32(A05 + C0); \ - A04 = T32(A04 + CF); \ - A03 = T32(A03 + CE); \ - A02 = T32(A02 + CD); \ - A01 = T32(A01 + CC); \ - A00 = T32(A00 + CB); \ - A0B = T32(A0B + CA); \ - A0A = T32(A0A + C9); \ - A09 = T32(A09 + C8); \ - A08 = T32(A08 + C7); \ - A07 = T32(A07 + C6); \ - A06 = T32(A06 + C5); \ - A05 = T32(A05 + C4); \ - A04 = T32(A04 + C3); \ - A03 = T32(A03 + C2); \ - A02 
= T32(A02 + C1); \ - A01 = T32(A01 + C0); \ - A00 = T32(A00 + CF); \ - A0B = T32(A0B + CE); \ - A0A = T32(A0A + CD); \ - A09 = T32(A09 + CC); \ - A08 = T32(A08 + CB); \ - A07 = T32(A07 + CA); \ - A06 = T32(A06 + C9); \ - A05 = T32(A05 + C8); \ - A04 = T32(A04 + C7); \ - A03 = T32(A03 + C6); \ - A02 = T32(A02 + C5); \ - A01 = T32(A01 + C4); \ - A00 = T32(A00 + C3); \ - } while (0) - -#define APPLY_P_FINAL do { \ + A0B = (A0B + C6); \ + A0A = (A0A + C5); \ + A09 = (A09 + C4); \ + A08 = (A08 + C3); \ + A07 = (A07 + C2); \ + A06 = (A06 + C1); \ + A05 = (A05 + C0); \ + A04 = (A04 + CF); \ + A03 = (A03 + CE); \ + A02 = (A02 + CD); \ + A01 = (A01 + CC); \ + A00 = (A00 + CB); \ + A0B = (A0B + CA); \ + A0A = (A0A + C9); \ + A09 = (A09 + C8); \ + A08 = (A08 + C7); \ + A07 = (A07 + C6); \ + A06 = (A06 + C5); \ + A05 = (A05 + C4); \ + A04 = (A04 + C3); \ + A03 = (A03 + C2); \ + A02 = (A02 + C1); \ + A01 = (A01 + C0); \ + A00 = (A00 + CF); \ + A0B = (A0B + CE); \ + A0A = (A0A + CD); \ + A09 = (A09 + CC); \ + A08 = (A08 + CB); \ + A07 = (A07 + CA); \ + A06 = (A06 + C9); \ + A05 = (A05 + C8); \ + A04 = (A04 + C7); \ + A03 = (A03 + C6); \ + A02 = (A02 + C5); \ + A01 = (A01 + C4); \ + A00 = (A00 + C3); \ + +#define APPLY_P_FINAL \ B0 = ROTL32(B0, 17); \ B1 = ROTL32(B1, 17); \ B2 = ROTL32(B2, 17); \ @@ -268,135 +256,128 @@ PERM_STEP_0; \ PERM_STEP_1; \ PERM_STEP_2; \ - } while (0) -#define INCR_W do { \ - if ((Wlow = T32(Wlow + 1)) == 0) \ - Whigh = T32(Whigh + 1); \ - } while (0) +#define INCR_W if ((Wlow = (Wlow + 1)) == 0) \ + Whigh = (Whigh + 1); \ + #if 0 /* other hash sizes init */ static const uint32_t A_init_192[] = { - C32(0xFD749ED4), C32(0xB798E530), C32(0x33904B6F), C32(0x46BDA85E), - C32(0x076934B4), C32(0x454B4058), C32(0x77F74527), C32(0xFB4CF465), - C32(0x62931DA9), C32(0xE778C8DB), C32(0x22B3998E), C32(0xAC15CFB9) + 0xFD749ED4), 0xB798E530), 0x33904B6F), 0x46BDA85E), + 0x076934B4), 0x454B4058), 0x77F74527), 0xFB4CF465), + 0x62931DA9), 0xE778C8DB), 0x22B3998E), 
0xAC15CFB9) }; static const uint32_t B_init_192[] = { - C32(0x58BCBAC4), C32(0xEC47A08E), C32(0xAEE933B2), C32(0xDFCBC824), - C32(0xA7944804), C32(0xBF65BDB0), C32(0x5A9D4502), C32(0x59979AF7), - C32(0xC5CEA54E), C32(0x4B6B8150), C32(0x16E71909), C32(0x7D632319), - C32(0x930573A0), C32(0xF34C63D1), C32(0xCAF914B4), C32(0xFDD6612C) + 0x58BCBAC4), 0xEC47A08E), 0xAEE933B2), 0xDFCBC824), + 0xA7944804), 0xBF65BDB0), 0x5A9D4502), 0x59979AF7), + 0xC5CEA54E), 0x4B6B8150), 0x16E71909), 0x7D632319), + 0x930573A0), 0xF34C63D1), 0xCAF914B4), 0xFDD6612C) }; static const uint32_t C_init_192[] = { - C32(0x61550878), C32(0x89EF2B75), C32(0xA1660C46), C32(0x7EF3855B), - C32(0x7297B58C), C32(0x1BC67793), C32(0x7FB1C723), C32(0xB66FC640), - C32(0x1A48B71C), C32(0xF0976D17), C32(0x088CE80A), C32(0xA454EDF3), - C32(0x1C096BF4), C32(0xAC76224B), C32(0x5215781C), C32(0xCD5D2669) + 0x61550878), 0x89EF2B75), 0xA1660C46), 0x7EF3855B), + 0x7297B58C), 0x1BC67793), 0x7FB1C723), 0xB66FC640), + 0x1A48B71C), 0xF0976D17), 0x088CE80A), 0xA454EDF3), + 0x1C096BF4), 0xAC76224B), 0x5215781C), 0xCD5D2669) }; static const uint32_t A_init_224[] = { - C32(0xA5201467), C32(0xA9B8D94A), C32(0xD4CED997), C32(0x68379D7B), - C32(0xA7FC73BA), C32(0xF1A2546B), C32(0x606782BF), C32(0xE0BCFD0F), - C32(0x2F25374E), C32(0x069A149F), C32(0x5E2DFF25), C32(0xFAECF061) + 0xA5201467), 0xA9B8D94A), 0xD4CED997), 0x68379D7B), + 0xA7FC73BA), 0xF1A2546B), 0x606782BF), 0xE0BCFD0F), + 0x2F25374E), 0x069A149F), 0x5E2DFF25), 0xFAECF061) }; static const uint32_t B_init_224[] = { - C32(0xEC9905D8), C32(0xF21850CF), C32(0xC0A746C8), C32(0x21DAD498), - C32(0x35156EEB), C32(0x088C97F2), C32(0x26303E40), C32(0x8A2D4FB5), - C32(0xFEEE44B6), C32(0x8A1E9573), C32(0x7B81111A), C32(0xCBC139F0), - C32(0xA3513861), C32(0x1D2C362E), C32(0x918C580E), C32(0xB58E1B9C) + 0xEC9905D8), 0xF21850CF), 0xC0A746C8), 0x21DAD498), + 0x35156EEB), 0x088C97F2), 0x26303E40), 0x8A2D4FB5), + 0xFEEE44B6), 0x8A1E9573), 0x7B81111A), 0xCBC139F0), + 0xA3513861), 
0x1D2C362E), 0x918C580E), 0xB58E1B9C) }; static const uint32_t C_init_224[] = { - C32(0xE4B573A1), C32(0x4C1A0880), C32(0x1E907C51), C32(0x04807EFD), - C32(0x3AD8CDE5), C32(0x16B21302), C32(0x02512C53), C32(0x2204CB18), - C32(0x99405F2D), C32(0xE5B648A1), C32(0x70AB1D43), C32(0xA10C25C2), - C32(0x16F1AC05), C32(0x38BBEB56), C32(0x9B01DC60), C32(0xB1096D83) + 0xE4B573A1), 0x4C1A0880), 0x1E907C51), 0x04807EFD), + 0x3AD8CDE5), 0x16B21302), 0x02512C53), 0x2204CB18), + 0x99405F2D), 0xE5B648A1), 0x70AB1D43), 0xA10C25C2), + 0x16F1AC05), 0x38BBEB56), 0x9B01DC60), 0xB1096D83) }; static const uint32_t A_init_256[] = { - C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191), - C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C), - C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A) + 0x52F84552), 0xE54B7999), 0x2D8EE3EC), 0xB9645191), + 0xE0078B86), 0xBB7C44C9), 0xD2B5C1CA), 0xB0D2EB8C), + 0x14CE5A45), 0x22AF50DC), 0xEFFDBC6B), 0xEB21B74A) }; static const uint32_t B_init_256[] = { - C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F), - C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002), - C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890), - C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5) + 0xB555C6EE), 0x3E710596), 0xA72A652F), 0x9301515F), + 0xDA28C1FA), 0x696FD868), 0x9CB6BF72), 0x0AFE4002), + 0xA6E03615), 0x5138C1D4), 0xBE216306), 0xB38B8890), + 0x3EA8B96B), 0x3299ACE4), 0x30924DD4), 0x55CB34A5) }; static const uint32_t C_init_256[] = { - C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55), - C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433), - C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F), - C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60) + 0xB405F031), 0xC4233EBA), 0xB3733979), 0xC0DD9D55), + 0xC51C28AE), 0xA327B8E1), 0x56C56167), 0xED614433), + 0x88B59D60), 0x60E2CEBA), 0x758B4B8B), 
0x83E82A7F), + 0xBC968828), 0xE6E00BF7), 0xBA839E55), 0x9B491C60) }; static const uint32_t A_init_384[] = { - C32(0xC8FCA331), C32(0xE55C504E), C32(0x003EBF26), C32(0xBB6B8D83), - C32(0x7B0448C1), C32(0x41B82789), C32(0x0A7C9601), C32(0x8D659CFF), - C32(0xB6E2673E), C32(0xCA54C77B), C32(0x1460FD7E), C32(0x3FCB8F2D) + 0xC8FCA331), 0xE55C504E), 0x003EBF26), 0xBB6B8D83), + 0x7B0448C1), 0x41B82789), 0x0A7C9601), 0x8D659CFF), + 0xB6E2673E), 0xCA54C77B), 0x1460FD7E), 0x3FCB8F2D) }; static const uint32_t B_init_384[] = { - C32(0x527291FC), C32(0x2A16455F), C32(0x78E627E5), C32(0x944F169F), - C32(0x1CA6F016), C32(0xA854EA25), C32(0x8DB98ABE), C32(0xF2C62641), - C32(0x30117DCB), C32(0xCF5C4309), C32(0x93711A25), C32(0xF9F671B8), - C32(0xB01D2116), C32(0x333F4B89), C32(0xB285D165), C32(0x86829B36) + 0x527291FC), 0x2A16455F), 0x78E627E5), 0x944F169F), + 0x1CA6F016), 0xA854EA25), 0x8DB98ABE), 0xF2C62641), + 0x30117DCB), 0xCF5C4309), 0x93711A25), 0xF9F671B8), + 0xB01D2116), 0x333F4B89), 0xB285D165), 0x86829B36) }; static const uint32_t C_init_384[] = { - C32(0xF764B11A), C32(0x76172146), C32(0xCEF6934D), C32(0xC6D28399), - C32(0xFE095F61), C32(0x5E6018B4), C32(0x5048ECF5), C32(0x51353261), - C32(0x6E6E36DC), C32(0x63130DAD), C32(0xA9C69BD6), C32(0x1E90EA0C), - C32(0x7C35073B), C32(0x28D95E6D), C32(0xAA340E0D), C32(0xCB3DEE70) + 0xF764B11A), 0x76172146), 0xCEF6934D), 0xC6D28399), + 0xFE095F61), 0x5E6018B4), 0x5048ECF5), 0x51353261), + 0x6E6E36DC), 0x63130DAD), 0xA9C69BD6), 0x1E90EA0C), + 0x7C35073B), 0x28D95E6D), 0xAA340E0D), 0xCB3DEE70) }; #endif -__device__ __constant__ -static const uint32_t d_A512[] = { - C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632), - C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B), - C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F) -}; - -__device__ __constant__ -static const uint32_t d_B512[] = { - C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640), - C32(0xEB6F56C7), 
C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08), - C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E), - C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B) -}; - -__device__ __constant__ -static const uint32_t d_C512[] = { - C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359), - C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780), - C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A), - C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969) -}; /***************************************************/ // GPU Hash Function __global__ __launch_bounds__(256, 4) -void x14_shabal512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x14_shabal512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash) { - __syncthreads(); - - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + const uint32_t d_A512[] = { + 0x20728DFD, 0x46C0BD53, 0xE782B699,0x55304632, + 0x71B4EF90, 0x0EA9E82C, 0xDBB930F1, 0xFAD06B8B, + 0xBE0CAE40, 0x8BD14410, 0x76D2ADAC, 0x28ACAB7F + }; + + const uint32_t d_B512[] = { + 0xC1099CB7, 0x07B385F3, 0xE7442C26, 0xCC8AD640, + 0xEB6F56C7, 0x1EA81AA9, 0x73B9D314, 0x1DE85D08, + 0x48910A5A, 0x893B22DB, 0xC5A0DF44, 0xBBC4324E, + 0x72D2F240, 0x75941D99, 0x6D8BDE82, 0xA1A7502B + }; + + const uint32_t d_C512[] = { + 0xD9BF68D1, 0x58BAD750, 0x56028CB2, 0x8134F359, + 0xB5D469D8, 0x941A8CC2, 0x418B2A6E, 0x04052780, + 0x7F07D787, 0x5194358F, 0x3C60D665, 0xBE97D79A, + 0x950C3434, 0xAED9A06D, 0x2537DC8D, 0x7CDB5969 + }; + +// if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; // [hashPosition * 8] - + const uint32_t nounce = (startNounce + thread); + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = &g_hash[hashPosition*16]; // [hashPosition * 8] uint32_t A00 = d_A512[0], A01 = d_A512[1], A02 = d_A512[2], A03 = d_A512[3], A04 = d_A512[4], A05 = d_A512[5], A06 = d_A512[6], A07 = d_A512[7], A08 = d_A512[8], A09 = d_A512[9], A0A = d_A512[10], A0B = d_A512[11]; @@ -410,23 +391,31 @@ void x14_shabal512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t CC = d_C512[12], CD = d_C512[13], CE = d_C512[14], CF = d_C512[15]; uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; - M0 = Hash[0]; - M1 = Hash[1]; - M2 = Hash[2]; - M3 = Hash[3]; - M4 = Hash[4]; - M5 = Hash[5]; - M6 = Hash[6]; - M7 = Hash[7]; - - M8 = Hash[8]; - M9 = Hash[9]; - MA = Hash[10]; - MB = Hash[11]; - MC = Hash[12]; - MD = Hash[13]; - ME = Hash[14]; - MF = Hash[15]; + + uint32_t msg[16]; + + uint28 *phash = (uint28*)Hash; + uint28 *outpt = (uint28*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + + M0 = msg[0]; + M1 = msg[1]; + M2 = msg[2]; + M3 = msg[3]; + M4 = msg[4]; + M5 = msg[5]; + M6 = msg[6]; + M7 = msg[7]; + + M8 = msg[8]; + M9 = msg[9]; + MA = msg[10]; + MB = msg[11]; + MC = msg[12]; + MD = msg[13]; + ME = msg[14]; + MF = msg[15]; INPUT_BLOCK_ADD; A00 ^= 1; @@ -470,18 +459,11 @@ void x14_shabal512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t Hash[13] = BD; Hash[14] = BE; Hash[15] = BF; - - //result = (Hash[3] <= target); - - uint32_t *outpHash = (uint32_t*)&g_hash[hashPosition << 3]; // [8 * hashPosition]; - - for (int i = 0; i < 16; i++) - outpHash[i] = Hash[i]; } } // #include -__host__ void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void 
x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { const uint32_t threadsperblock = 64; @@ -489,5 +471,5 @@ __host__ void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t s dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x14_shabal512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x14_shabal512_gpu_hash_64<<>>(threads, startNounce, d_hash); } diff --git a/x15/cuda_x14_shabal512.cu.orig b/x15/cuda_x14_shabal512.cu.orig new file mode 100644 index 0000000000..b6a156178a --- /dev/null +++ b/x15/cuda_x14_shabal512.cu.orig @@ -0,0 +1,474 @@ +/* + * Shabal-512 for X14/X15 (STUB) + */ +#include "cuda_helper.h" + + + +/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */ +/* + * Shabal implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +/* + * Part of this code was automatically generated (the part between + * the "BEGIN" and "END" markers). + */ + +#define sM 16 + +#define O1 13 +#define O2 9 +#define O3 6 + +/* + * We copy the state into local variables, so that the compiler knows + * that it can optimize them at will. + */ + +/* BEGIN -- automatically generated code. */ + +#define INPUT_BLOCK_ADD \ + B0 = B0 + M0; \ + B1 = B1 + M1; \ + B2 = B2 + M2; \ + B3 = B3 + M3; \ + B4 = B4 + M4; \ + B5 = B5 + M5; \ + B6 = B6 + M6; \ + B7 = B7 + M7; \ + B8 = B8 + M8; \ + B9 = B9 + M9; \ + BA = BA + MA; \ + BB = BB + MB; \ + BC = BC + MC; \ + BD = BD + MD; \ + BE = BE + ME; \ + BF = BF + MF; \ + +#define INPUT_BLOCK_SUB \ + C0 = C0 - M0; \ + C1 = C1 - M1; \ + C2 = C2 - M2; \ + C3 = C3 - M3; \ + C4 = C4 - M4; \ + C5 = C5 - M5; \ + C6 = C6 - M6; \ + C7 = C7 - M7; \ + C8 = C8 - M8; \ + C9 = C9 - M9; \ + CA = CA - MA; \ + CB = CB - MB; \ + CC = CC - MC; \ + CD = CD - MD; \ + CE = CE - ME; \ + CF = CF - MF; \ + +#define XOR_W \ + A00 ^= Wlow; \ + A01 ^= Whigh; \ + +#define SWAP(v1, v2) \ + v1^=v2;\ + v2 ^= v1;\ + v1 ^= v2; + +#define SWAP_BC \ + SWAP(B0, C0); \ + SWAP(B1, C1); \ + SWAP(B2, C2); \ + SWAP(B3, C3); \ + SWAP(B4, C4); \ + SWAP(B5, C5); \ + SWAP(B6, C6); \ + SWAP(B7, C7); \ + SWAP(B8, C8); \ + SWAP(B9, C9); \ + SWAP(BA, CA); \ + SWAP(BB, CB); \ + SWAP(BC, CC); \ + SWAP(BD, CD); \ + SWAP(BE, CE); \ + SWAP(BF, CF); \ + +#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ + xa0 = ((xa0 \ + ^ (ROTL32(xa1, 15) * 5U) \ + ^ xc) * 3U) \ + ^ xb1 ^ (xb2 & ~xb3) ^ xm; \ + xb0 = (~(ROTL32(xb0, 1) ^ xa0)); \ + +#define 
PERM_STEP_0 \ + PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ + +#define PERM_STEP_1 \ + PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ + +#define PERM_STEP_2 \ + PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A04, 
A03, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + +#define APPLY_P \ + B0 = ROTL32(B0, 17); \ + B1 = ROTL32(B1, 17); \ + B2 = ROTL32(B2, 17); \ + B3 = ROTL32(B3, 17); \ + B4 = ROTL32(B4, 17); \ + B5 = ROTL32(B5, 17); \ + B6 = ROTL32(B6, 17); \ + B7 = ROTL32(B7, 17); \ + B8 = ROTL32(B8, 17); \ + B9 = ROTL32(B9, 17); \ + BA = ROTL32(BA, 17); \ + BB = ROTL32(BB, 17); \ + BC = ROTL32(BC, 17); \ + BD = ROTL32(BD, 17); \ + BE = ROTL32(BE, 17); \ + BF = ROTL32(BF, 17); \ + PERM_STEP_0; \ + PERM_STEP_1; \ + PERM_STEP_2; \ + A0B = (A0B + C6); \ + A0A = (A0A + C5); \ + A09 = (A09 + C4); \ + A08 = (A08 + C3); \ + A07 = (A07 + C2); \ + A06 = (A06 + C1); \ + A05 = (A05 + C0); \ + A04 = (A04 + CF); \ + A03 = (A03 + CE); \ + A02 = (A02 + CD); \ + A01 = (A01 + CC); \ + A00 = (A00 + CB); \ + A0B = (A0B + CA); \ + A0A = (A0A + C9); \ + A09 = (A09 + C8); \ + A08 = (A08 + C7); \ + A07 = (A07 + C6); \ + A06 = (A06 + C5); \ + A05 = (A05 + C4); \ + A04 = (A04 + C3); \ + A03 = (A03 + C2); \ + A02 = (A02 + C1); \ + A01 = (A01 + C0); \ + A00 = (A00 + CF); \ + A0B = (A0B + CE); \ + A0A = (A0A + CD); \ + A09 = (A09 + CC); \ + A08 = (A08 + CB); \ + A07 = (A07 + CA); \ + A06 = (A06 + C9); \ + A05 = (A05 + C8); \ + A04 = (A04 + C7); \ + A03 = (A03 + C6); \ + A02 = (A02 + C5); \ + A01 = (A01 + C4); \ + A00 = (A00 + C3); \ + +#define APPLY_P_FINAL \ + B0 = ROTL32(B0, 17); \ + B1 = ROTL32(B1, 17); \ + B2 = ROTL32(B2, 17); \ + B3 = ROTL32(B3, 17); \ + B4 = ROTL32(B4, 17); \ + B5 = ROTL32(B5, 17); \ + B6 = ROTL32(B6, 17); \ + B7 = ROTL32(B7, 17); \ + B8 = ROTL32(B8, 17); \ + B9 = ROTL32(B9, 17); \ + BA = ROTL32(BA, 17); \ + BB = ROTL32(BB, 17); \ + BC = ROTL32(BC, 17); \ + BD = 
ROTL32(BD, 17); \ + BE = ROTL32(BE, 17); \ + BF = ROTL32(BF, 17); \ + PERM_STEP_0; \ + PERM_STEP_1; \ + PERM_STEP_2; \ + +#define INCR_W if ((Wlow = (Wlow + 1)) == 0) \ + Whigh = (Whigh + 1); \ + + + +#if 0 /* other hash sizes init */ + +static const uint32_t A_init_192[] = { + 0xFD749ED4), 0xB798E530), 0x33904B6F), 0x46BDA85E), + 0x076934B4), 0x454B4058), 0x77F74527), 0xFB4CF465), + 0x62931DA9), 0xE778C8DB), 0x22B3998E), 0xAC15CFB9) +}; + +static const uint32_t B_init_192[] = { + 0x58BCBAC4), 0xEC47A08E), 0xAEE933B2), 0xDFCBC824), + 0xA7944804), 0xBF65BDB0), 0x5A9D4502), 0x59979AF7), + 0xC5CEA54E), 0x4B6B8150), 0x16E71909), 0x7D632319), + 0x930573A0), 0xF34C63D1), 0xCAF914B4), 0xFDD6612C) +}; + +static const uint32_t C_init_192[] = { + 0x61550878), 0x89EF2B75), 0xA1660C46), 0x7EF3855B), + 0x7297B58C), 0x1BC67793), 0x7FB1C723), 0xB66FC640), + 0x1A48B71C), 0xF0976D17), 0x088CE80A), 0xA454EDF3), + 0x1C096BF4), 0xAC76224B), 0x5215781C), 0xCD5D2669) +}; + +static const uint32_t A_init_224[] = { + 0xA5201467), 0xA9B8D94A), 0xD4CED997), 0x68379D7B), + 0xA7FC73BA), 0xF1A2546B), 0x606782BF), 0xE0BCFD0F), + 0x2F25374E), 0x069A149F), 0x5E2DFF25), 0xFAECF061) +}; + +static const uint32_t B_init_224[] = { + 0xEC9905D8), 0xF21850CF), 0xC0A746C8), 0x21DAD498), + 0x35156EEB), 0x088C97F2), 0x26303E40), 0x8A2D4FB5), + 0xFEEE44B6), 0x8A1E9573), 0x7B81111A), 0xCBC139F0), + 0xA3513861), 0x1D2C362E), 0x918C580E), 0xB58E1B9C) +}; + +static const uint32_t C_init_224[] = { + 0xE4B573A1), 0x4C1A0880), 0x1E907C51), 0x04807EFD), + 0x3AD8CDE5), 0x16B21302), 0x02512C53), 0x2204CB18), + 0x99405F2D), 0xE5B648A1), 0x70AB1D43), 0xA10C25C2), + 0x16F1AC05), 0x38BBEB56), 0x9B01DC60), 0xB1096D83) +}; + +static const uint32_t A_init_256[] = { + 0x52F84552), 0xE54B7999), 0x2D8EE3EC), 0xB9645191), + 0xE0078B86), 0xBB7C44C9), 0xD2B5C1CA), 0xB0D2EB8C), + 0x14CE5A45), 0x22AF50DC), 0xEFFDBC6B), 0xEB21B74A) +}; + +static const uint32_t B_init_256[] = { + 0xB555C6EE), 0x3E710596), 0xA72A652F), 0x9301515F), + 
0xDA28C1FA), 0x696FD868), 0x9CB6BF72), 0x0AFE4002), + 0xA6E03615), 0x5138C1D4), 0xBE216306), 0xB38B8890), + 0x3EA8B96B), 0x3299ACE4), 0x30924DD4), 0x55CB34A5) +}; + +static const uint32_t C_init_256[] = { + 0xB405F031), 0xC4233EBA), 0xB3733979), 0xC0DD9D55), + 0xC51C28AE), 0xA327B8E1), 0x56C56167), 0xED614433), + 0x88B59D60), 0x60E2CEBA), 0x758B4B8B), 0x83E82A7F), + 0xBC968828), 0xE6E00BF7), 0xBA839E55), 0x9B491C60) +}; + +static const uint32_t A_init_384[] = { + 0xC8FCA331), 0xE55C504E), 0x003EBF26), 0xBB6B8D83), + 0x7B0448C1), 0x41B82789), 0x0A7C9601), 0x8D659CFF), + 0xB6E2673E), 0xCA54C77B), 0x1460FD7E), 0x3FCB8F2D) +}; + +static const uint32_t B_init_384[] = { + 0x527291FC), 0x2A16455F), 0x78E627E5), 0x944F169F), + 0x1CA6F016), 0xA854EA25), 0x8DB98ABE), 0xF2C62641), + 0x30117DCB), 0xCF5C4309), 0x93711A25), 0xF9F671B8), + 0xB01D2116), 0x333F4B89), 0xB285D165), 0x86829B36) +}; + +static const uint32_t C_init_384[] = { + 0xF764B11A), 0x76172146), 0xCEF6934D), 0xC6D28399), + 0xFE095F61), 0x5E6018B4), 0x5048ECF5), 0x51353261), + 0x6E6E36DC), 0x63130DAD), 0xA9C69BD6), 0x1E90EA0C), + 0x7C35073B), 0x28D95E6D), 0xAA340E0D), 0xCB3DEE70) +}; +#endif + + +/***************************************************/ +// GPU Hash Function +__global__ __launch_bounds__(256, 4) +void x14_shabal512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + const uint32_t d_A512[] = { + 0x20728DFD, 0x46C0BD53, 0xE782B699,0x55304632, + 0x71B4EF90, 0x0EA9E82C, 0xDBB930F1, 0xFAD06B8B, + 0xBE0CAE40, 0x8BD14410, 0x76D2ADAC, 0x28ACAB7F + }; + + const uint32_t d_B512[] = { + 0xC1099CB7, 0x07B385F3, 0xE7442C26, 0xCC8AD640, + 0xEB6F56C7, 0x1EA81AA9, 0x73B9D314, 0x1DE85D08, + 0x48910A5A, 0x893B22DB, 0xC5A0DF44, 0xBBC4324E, + 0x72D2F240, 0x75941D99, 0x6D8BDE82, 0xA1A7502B + }; + + const uint32_t d_C512[] = { + 0xD9BF68D1, 0x58BAD750, 0x56028CB2, 0x8134F359, + 0xB5D469D8, 0x941A8CC2, 0x418B2A6E, 0x04052780, + 
0x7F07D787, 0x5194358F, 0x3C60D665, 0xBE97D79A, + 0x950C3434, 0xAED9A06D, 0x2537DC8D, 0x7CDB5969 + }; + + if (thread < threads) + { + const uint32_t nounce = (startNounce + thread); + const uint32_t hashPosition = nounce - startNounce; + uint32_t *const Hash = &g_hash[hashPosition*16]; // [hashPosition * 8] + uint32_t tmp; + uint32_t A00 = d_A512[0], A01 = d_A512[1], A02 = d_A512[2], A03 = d_A512[3], + A04 = d_A512[4], A05 = d_A512[5], A06 = d_A512[6], A07 = d_A512[7], + A08 = d_A512[8], A09 = d_A512[9], A0A = d_A512[10], A0B = d_A512[11]; + uint32_t B0 = d_B512[0], B1 = d_B512[1], B2 = d_B512[2], B3 = d_B512[3], + B4 = d_B512[4], B5 = d_B512[5], B6 = d_B512[6], B7 = d_B512[7], + B8 = d_B512[8], B9 = d_B512[9], BA = d_B512[10], BB = d_B512[11], + BC = d_B512[12], BD = d_B512[13], BE = d_B512[14], BF = d_B512[15]; + uint32_t C0 = d_C512[0], C1 = d_C512[1], C2 = d_C512[2], C3 = d_C512[3], + C4 = d_C512[4], C5 = d_C512[5], C6 = d_C512[6], C7 = d_C512[7], + C8 = d_C512[8], C9 = d_C512[9], CA = d_C512[10], CB = d_C512[11], + CC = d_C512[12], CD = d_C512[13], CE = d_C512[14], CF = d_C512[15]; + uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; + + M0 = Hash[0]; + M1 = Hash[1]; + M2 = Hash[2]; + M3 = Hash[3]; + M4 = Hash[4]; + M5 = Hash[5]; + M6 = Hash[6]; + M7 = Hash[7]; + + M8 = Hash[8]; + M9 = Hash[9]; + MA = Hash[10]; + MB = Hash[11]; + MC = Hash[12]; + MD = Hash[13]; + ME = Hash[14]; + MF = Hash[15]; + + INPUT_BLOCK_ADD; + A00 ^= 1; + APPLY_P; + INPUT_BLOCK_SUB; + SWAP_BC; + + M0 = 0x80; + M1 = M2 = M3 = M4 = M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0; + + INPUT_BLOCK_ADD; + A00 ^= 2; + APPLY_P; + + SWAP_BC; + A00 ^= 2; + APPLY_P; + + SWAP_BC; + A00 ^= 2; + APPLY_P; + + SWAP_BC; + A00 ^= 2; + APPLY_P_FINAL; + 
+ Hash[0] = B0; + Hash[1] = B1; + Hash[2] = B2; + Hash[3] = B3; + Hash[4] = B4; + Hash[5] = B5; + Hash[6] = B6; + Hash[7] = B7; + + Hash[8] = B8; + Hash[9] = B9; + Hash[10] = BA; + Hash[11] = BB; + Hash[12] = BC; + Hash[13] = BD; + Hash[14] = BE; + Hash[15] = BF; + } +} + +// #include +__host__ void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 64; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x14_shabal512_gpu_hash_64<<<grid, block>>>(threads, startNounce, d_hash); +} diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu index 4729d9b902..5cb34ec946 100644 --- a/x15/cuda_x15_whirlpool.cu +++ b/x15/cuda_x15_whirlpool.cu @@ -10,25 +10,18 @@ #define USE_SHARED 1 #include "cuda_helper.h" +#include "cuda_vector.h" + + __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) __constant__ uint32_t pTarget[8]; -uint32_t *d_wnounce[MAX_GPUS]; -uint32_t *d_WNonce[MAX_GPUS]; +static uint32_t *h_wnounce[MAX_GPUS]; +static uint32_t *d_WNonce[MAX_GPUS]; #define USE_ALL_TABLES 1 -__constant__ static uint64_t mixTob0Tox[256]; #if USE_ALL_TABLES -__constant__ static uint64_t mixTob1Tox[256]; -__constant__ static uint64_t mixTob2Tox[256]; -__constant__ static uint64_t mixTob3Tox[256]; -__constant__ static uint64_t mixTob4Tox[256]; -__constant__ static uint64_t mixTob5Tox[256]; -__constant__ static uint64_t mixTob6Tox[256]; -__constant__ static uint64_t mixTob7Tox[256]; -#endif /** * Whirlpool CUDA kernel implementation.
@@ -62,1069 +55,7 @@ __constant__ static uint64_t mixTob7Tox[256]; * @author SP */ -static const uint64_t old1_T0[256] = { - SPH_C64(0x78D8C07818281818), SPH_C64(0xAF2605AF23652323), - SPH_C64(0xF9B87EF9C657C6C6), SPH_C64(0x6FFB136FE825E8E8), - SPH_C64(0xA1CB4CA187948787), SPH_C64(0x6211A962B8D5B8B8), - SPH_C64(0x0509080501030101), SPH_C64(0x6E0D426E4FD14F4F), - SPH_C64(0xEE9BADEE365A3636), SPH_C64(0x04FF5904A6F7A6A6), - SPH_C64(0xBD0CDEBDD26BD2D2), SPH_C64(0x060EFB06F502F5F5), - SPH_C64(0x8096EF80798B7979), SPH_C64(0xCE305FCE6FB16F6F), - SPH_C64(0xEF6DFCEF91AE9191), SPH_C64(0x07F8AA0752F65252), - SPH_C64(0xFD4727FD60A06060), SPH_C64(0x76358976BCD9BCBC), - SPH_C64(0xCD37ACCD9BB09B9B), SPH_C64(0x8C8A048C8E8F8E8E), - SPH_C64(0x15D27115A3F8A3A3), SPH_C64(0x3C6C603C0C140C0C), - SPH_C64(0x8A84FF8A7B8D7B7B), SPH_C64(0xE180B5E1355F3535), - SPH_C64(0x69F5E8691D271D1D), SPH_C64(0x47B35347E03DE0E0), - SPH_C64(0xAC21F6ACD764D7D7), SPH_C64(0xED9C5EEDC25BC2C2), - SPH_C64(0x96436D962E722E2E), SPH_C64(0x7A29627A4BDD4B4B), - SPH_C64(0x215DA321FE1FFEFE), SPH_C64(0x16D5821657F95757), - SPH_C64(0x41BDA841153F1515), SPH_C64(0xB6E89FB677997777), - SPH_C64(0xEB92A5EB37593737), SPH_C64(0x569E7B56E532E5E5), - SPH_C64(0xD9138CD99FBC9F9F), SPH_C64(0x1723D317F00DF0F0), - SPH_C64(0x7F206A7F4ADE4A4A), SPH_C64(0x95449E95DA73DADA), - SPH_C64(0x25A2FA2558E85858), SPH_C64(0xCACF06CAC946C9C9), - SPH_C64(0x8D7C558D297B2929), SPH_C64(0x225A50220A1E0A0A), - SPH_C64(0x4F50E14FB1CEB1B1), SPH_C64(0x1AC9691AA0FDA0A0), - SPH_C64(0xDA147FDA6BBD6B6B), SPH_C64(0xABD95CAB85928585), - SPH_C64(0x733C8173BDDABDBD), SPH_C64(0x348FD2345DE75D5D), - SPH_C64(0x5090805010301010), SPH_C64(0x0307F303F401F4F4), - SPH_C64(0xC0DD16C0CB40CBCB), SPH_C64(0xC6D3EDC63E423E3E), - SPH_C64(0x112D2811050F0505), SPH_C64(0xE6781FE667A96767), - SPH_C64(0x53977353E431E4E4), SPH_C64(0xBB0225BB27692727), - SPH_C64(0x5873325841C34141), SPH_C64(0x9DA72C9D8B808B8B), - SPH_C64(0x01F65101A7F4A7A7), SPH_C64(0x94B2CF947D877D7D), - 
SPH_C64(0xFB49DCFB95A29595), SPH_C64(0x9F568E9FD875D8D8), - SPH_C64(0x30708B30FB10FBFB), SPH_C64(0x71CD2371EE2FEEEE), - SPH_C64(0x91BBC7917C847C7C), SPH_C64(0xE37117E366AA6666), - SPH_C64(0x8E7BA68EDD7ADDDD), SPH_C64(0x4BAFB84B17391717), - SPH_C64(0x4645024647C94747), SPH_C64(0xDC1A84DC9EBF9E9E), - SPH_C64(0xC5D41EC5CA43CACA), SPH_C64(0x995875992D772D2D), - SPH_C64(0x792E9179BFDCBFBF), SPH_C64(0x1B3F381B07090707), - SPH_C64(0x23AC0123ADEAADAD), SPH_C64(0x2FB0EA2F5AEE5A5A), - SPH_C64(0xB5EF6CB583988383), SPH_C64(0xFFB685FF33553333), - SPH_C64(0xF25C3FF263A56363), SPH_C64(0x0A12100A02060202), - SPH_C64(0x38933938AAE3AAAA), SPH_C64(0xA8DEAFA871937171), - SPH_C64(0xCFC60ECFC845C8C8), SPH_C64(0x7DD1C87D192B1919), - SPH_C64(0x703B727049DB4949), SPH_C64(0x9A5F869AD976D9D9), - SPH_C64(0x1D31C31DF20BF2F2), SPH_C64(0x48A84B48E338E3E3), - SPH_C64(0x2AB9E22A5BED5B5B), SPH_C64(0x92BC349288858888), - SPH_C64(0xC83EA4C89AB39A9A), SPH_C64(0xBE0B2DBE266A2626), - SPH_C64(0xFABF8DFA32563232), SPH_C64(0x4A59E94AB0CDB0B0), - SPH_C64(0x6AF21B6AE926E9E9), SPH_C64(0x337778330F110F0F), - SPH_C64(0xA633E6A6D562D5D5), SPH_C64(0xBAF474BA809D8080), - SPH_C64(0x7C27997CBEDFBEBE), SPH_C64(0xDEEB26DECD4ACDCD), - SPH_C64(0xE489BDE4345C3434), SPH_C64(0x75327A7548D84848), - SPH_C64(0x2454AB24FF1CFFFF), SPH_C64(0x8F8DF78F7A8E7A7A), - SPH_C64(0xEA64F4EA90AD9090), SPH_C64(0x3E9DC23E5FE15F5F), - SPH_C64(0xA03D1DA020602020), SPH_C64(0xD50F67D568B86868), - SPH_C64(0x72CAD0721A2E1A1A), SPH_C64(0x2CB7192CAEEFAEAE), - SPH_C64(0x5E7DC95EB4C1B4B4), SPH_C64(0x19CE9A1954FC5454), - SPH_C64(0xE57FECE593A89393), SPH_C64(0xAA2F0DAA22662222), - SPH_C64(0xE96307E964AC6464), SPH_C64(0x122ADB12F10EF1F1), - SPH_C64(0xA2CCBFA273957373), SPH_C64(0x5A82905A12361212), - SPH_C64(0x5D7A3A5D40C04040), SPH_C64(0x2848402808180808), - SPH_C64(0xE89556E8C358C3C3), SPH_C64(0x7BDF337BEC29ECEC), - SPH_C64(0x904D9690DB70DBDB), SPH_C64(0x1FC0611FA1FEA1A1), - SPH_C64(0x83911C838D8A8D8D), SPH_C64(0xC9C8F5C93D473D3D), - 
SPH_C64(0xF15BCCF197A49797), SPH_C64(0x0000000000000000), - SPH_C64(0xD4F936D4CF4CCFCF), SPH_C64(0x876E45872B7D2B2B), - SPH_C64(0xB3E197B3769A7676), SPH_C64(0xB0E664B0829B8282), - SPH_C64(0xA928FEA9D667D6D6), SPH_C64(0x77C3D8771B2D1B1B), - SPH_C64(0x5B74C15BB5C2B5B5), SPH_C64(0x29BE1129AFECAFAF), - SPH_C64(0xDF1D77DF6ABE6A6A), SPH_C64(0x0DEABA0D50F05050), - SPH_C64(0x4C57124C45CF4545), SPH_C64(0x1838CB18F308F3F3), - SPH_C64(0xF0AD9DF030503030), SPH_C64(0x74C42B74EF2CEFEF), - SPH_C64(0xC3DAE5C33F413F3F), SPH_C64(0x1CC7921C55FF5555), - SPH_C64(0x10DB7910A2FBA2A2), SPH_C64(0x65E90365EA23EAEA), - SPH_C64(0xEC6A0FEC65AF6565), SPH_C64(0x6803B968BAD3BABA), - SPH_C64(0x934A65932F712F2F), SPH_C64(0xE78E4EE7C05DC0C0), - SPH_C64(0x8160BE81DE7FDEDE), SPH_C64(0x6CFCE06C1C241C1C), - SPH_C64(0x2E46BB2EFD1AFDFD), SPH_C64(0x641F52644DD74D4D), - SPH_C64(0xE076E4E092AB9292), SPH_C64(0xBCFA8FBC759F7575), - SPH_C64(0x1E36301E060A0606), SPH_C64(0x98AE24988A838A8A), - SPH_C64(0x404BF940B2CBB2B2), SPH_C64(0x59856359E637E6E6), - SPH_C64(0x367E70360E120E0E), SPH_C64(0x63E7F8631F211F1F), - SPH_C64(0xF75537F762A66262), SPH_C64(0xA33AEEA3D461D4D4), - SPH_C64(0x32812932A8E5A8A8), SPH_C64(0xF452C4F496A79696), - SPH_C64(0x3A629B3AF916F9F9), SPH_C64(0xF6A366F6C552C5C5), - SPH_C64(0xB11035B1256F2525), SPH_C64(0x20ABF22059EB5959), - SPH_C64(0xAED054AE84918484), SPH_C64(0xA7C5B7A772967272), - SPH_C64(0xDDECD5DD394B3939), SPH_C64(0x61165A614CD44C4C), - SPH_C64(0x3B94CA3B5EE25E5E), SPH_C64(0x859FE78578887878), - SPH_C64(0xD8E5DDD838483838), SPH_C64(0x869814868C898C8C), - SPH_C64(0xB217C6B2D16ED1D1), SPH_C64(0x0BE4410BA5F2A5A5), - SPH_C64(0x4DA1434DE23BE2E2), SPH_C64(0xF84E2FF861A36161), - SPH_C64(0x4542F145B3C8B3B3), SPH_C64(0xA53415A521632121), - SPH_C64(0xD60894D69CB99C9C), SPH_C64(0x66EEF0661E221E1E), - SPH_C64(0x5261225243C54343), SPH_C64(0xFCB176FCC754C7C7), - SPH_C64(0x2B4FB32BFC19FCFC), SPH_C64(0x14242014040C0404), - SPH_C64(0x08E3B20851F35151), SPH_C64(0xC725BCC799B69999), - 
SPH_C64(0xC4224FC46DB76D6D), SPH_C64(0x396568390D170D0D), - SPH_C64(0x35798335FA13FAFA), SPH_C64(0x8469B684DF7CDFDF), - SPH_C64(0x9BA9D79B7E827E7E), SPH_C64(0xB4193DB4246C2424), - SPH_C64(0xD7FEC5D73B4D3B3B), SPH_C64(0x3D9A313DABE0ABAB), - SPH_C64(0xD1F03ED1CE4FCECE), SPH_C64(0x5599885511331111), - SPH_C64(0x89830C898F8C8F8F), SPH_C64(0x6B044A6B4ED24E4E), - SPH_C64(0x5166D151B7C4B7B7), SPH_C64(0x60E00B60EB20EBEB), - SPH_C64(0xCCC1FDCC3C443C3C), SPH_C64(0xBFFD7CBF819E8181), - SPH_C64(0xFE40D4FE94A19494), SPH_C64(0x0C1CEB0CF704F7F7), - SPH_C64(0x6718A167B9D6B9B9), SPH_C64(0x5F8B985F13351313), - SPH_C64(0x9C517D9C2C742C2C), SPH_C64(0xB805D6B8D368D3D3), - SPH_C64(0x5C8C6B5CE734E7E7), SPH_C64(0xCB3957CB6EB26E6E), - SPH_C64(0xF3AA6EF3C451C4C4), SPH_C64(0x0F1B180F03050303), - SPH_C64(0x13DC8A1356FA5656), SPH_C64(0x495E1A4944CC4444), - SPH_C64(0x9EA0DF9E7F817F7F), SPH_C64(0x37882137A9E6A9A9), - SPH_C64(0x82674D822A7E2A2A), SPH_C64(0x6D0AB16DBBD0BBBB), - SPH_C64(0xE28746E2C15EC1C1), SPH_C64(0x02F1A20253F55353), - SPH_C64(0x8B72AE8BDC79DCDC), SPH_C64(0x275358270B1D0B0B), - SPH_C64(0xD3019CD39DBA9D9D), SPH_C64(0xC12B47C16CB46C6C), - SPH_C64(0xF5A495F531533131), SPH_C64(0xB9F387B9749C7474), - SPH_C64(0x0915E309F607F6F6), SPH_C64(0x434C0A4346CA4646), - SPH_C64(0x26A50926ACE9ACAC), SPH_C64(0x97B53C9789868989), - SPH_C64(0x44B4A044143C1414), SPH_C64(0x42BA5B42E13EE1E1), - SPH_C64(0x4EA6B04E163A1616), SPH_C64(0xD2F7CDD23A4E3A3A), - SPH_C64(0xD0066FD069BB6969), SPH_C64(0x2D41482D091B0909), - SPH_C64(0xADD7A7AD70907070), SPH_C64(0x546FD954B6C7B6B6), - SPH_C64(0xB71ECEB7D06DD0D0), SPH_C64(0x7ED63B7EED2AEDED), - SPH_C64(0xDBE22EDBCC49CCCC), SPH_C64(0x57682A5742C64242), - SPH_C64(0xC22CB4C298B59898), SPH_C64(0x0EED490EA4F1A4A4), - SPH_C64(0x88755D8828782828), SPH_C64(0x3186DA315CE45C5C), - SPH_C64(0x3F6B933FF815F8F8), SPH_C64(0xA4C244A486978686) -}; - - -static const uint64_t old1_T1[256] = { - SPH_C64(0xD8C0781828181878), SPH_C64(0x2605AF23652323AF), - SPH_C64(0xB87EF9C657C6C6F9), 
SPH_C64(0xFB136FE825E8E86F), - SPH_C64(0xCB4CA187948787A1), SPH_C64(0x11A962B8D5B8B862), - SPH_C64(0x0908050103010105), SPH_C64(0x0D426E4FD14F4F6E), - SPH_C64(0x9BADEE365A3636EE), SPH_C64(0xFF5904A6F7A6A604), - SPH_C64(0x0CDEBDD26BD2D2BD), SPH_C64(0x0EFB06F502F5F506), - SPH_C64(0x96EF80798B797980), SPH_C64(0x305FCE6FB16F6FCE), - SPH_C64(0x6DFCEF91AE9191EF), SPH_C64(0xF8AA0752F6525207), - SPH_C64(0x4727FD60A06060FD), SPH_C64(0x358976BCD9BCBC76), - SPH_C64(0x37ACCD9BB09B9BCD), SPH_C64(0x8A048C8E8F8E8E8C), - SPH_C64(0xD27115A3F8A3A315), SPH_C64(0x6C603C0C140C0C3C), - SPH_C64(0x84FF8A7B8D7B7B8A), SPH_C64(0x80B5E1355F3535E1), - SPH_C64(0xF5E8691D271D1D69), SPH_C64(0xB35347E03DE0E047), - SPH_C64(0x21F6ACD764D7D7AC), SPH_C64(0x9C5EEDC25BC2C2ED), - SPH_C64(0x436D962E722E2E96), SPH_C64(0x29627A4BDD4B4B7A), - SPH_C64(0x5DA321FE1FFEFE21), SPH_C64(0xD5821657F9575716), - SPH_C64(0xBDA841153F151541), SPH_C64(0xE89FB677997777B6), - SPH_C64(0x92A5EB37593737EB), SPH_C64(0x9E7B56E532E5E556), - SPH_C64(0x138CD99FBC9F9FD9), SPH_C64(0x23D317F00DF0F017), - SPH_C64(0x206A7F4ADE4A4A7F), SPH_C64(0x449E95DA73DADA95), - SPH_C64(0xA2FA2558E8585825), SPH_C64(0xCF06CAC946C9C9CA), - SPH_C64(0x7C558D297B29298D), SPH_C64(0x5A50220A1E0A0A22), - SPH_C64(0x50E14FB1CEB1B14F), SPH_C64(0xC9691AA0FDA0A01A), - SPH_C64(0x147FDA6BBD6B6BDA), SPH_C64(0xD95CAB85928585AB), - SPH_C64(0x3C8173BDDABDBD73), SPH_C64(0x8FD2345DE75D5D34), - SPH_C64(0x9080501030101050), SPH_C64(0x07F303F401F4F403), - SPH_C64(0xDD16C0CB40CBCBC0), SPH_C64(0xD3EDC63E423E3EC6), - SPH_C64(0x2D2811050F050511), SPH_C64(0x781FE667A96767E6), - SPH_C64(0x977353E431E4E453), SPH_C64(0x0225BB27692727BB), - SPH_C64(0x73325841C3414158), SPH_C64(0xA72C9D8B808B8B9D), - SPH_C64(0xF65101A7F4A7A701), SPH_C64(0xB2CF947D877D7D94), - SPH_C64(0x49DCFB95A29595FB), SPH_C64(0x568E9FD875D8D89F), - SPH_C64(0x708B30FB10FBFB30), SPH_C64(0xCD2371EE2FEEEE71), - SPH_C64(0xBBC7917C847C7C91), SPH_C64(0x7117E366AA6666E3), - SPH_C64(0x7BA68EDD7ADDDD8E), 
SPH_C64(0xAFB84B173917174B), - SPH_C64(0x45024647C9474746), SPH_C64(0x1A84DC9EBF9E9EDC), - SPH_C64(0xD41EC5CA43CACAC5), SPH_C64(0x5875992D772D2D99), - SPH_C64(0x2E9179BFDCBFBF79), SPH_C64(0x3F381B070907071B), - SPH_C64(0xAC0123ADEAADAD23), SPH_C64(0xB0EA2F5AEE5A5A2F), - SPH_C64(0xEF6CB583988383B5), SPH_C64(0xB685FF33553333FF), - SPH_C64(0x5C3FF263A56363F2), SPH_C64(0x12100A020602020A), - SPH_C64(0x933938AAE3AAAA38), SPH_C64(0xDEAFA871937171A8), - SPH_C64(0xC60ECFC845C8C8CF), SPH_C64(0xD1C87D192B19197D), - SPH_C64(0x3B727049DB494970), SPH_C64(0x5F869AD976D9D99A), - SPH_C64(0x31C31DF20BF2F21D), SPH_C64(0xA84B48E338E3E348), - SPH_C64(0xB9E22A5BED5B5B2A), SPH_C64(0xBC34928885888892), - SPH_C64(0x3EA4C89AB39A9AC8), SPH_C64(0x0B2DBE266A2626BE), - SPH_C64(0xBF8DFA32563232FA), SPH_C64(0x59E94AB0CDB0B04A), - SPH_C64(0xF21B6AE926E9E96A), SPH_C64(0x7778330F110F0F33), - SPH_C64(0x33E6A6D562D5D5A6), SPH_C64(0xF474BA809D8080BA), - SPH_C64(0x27997CBEDFBEBE7C), SPH_C64(0xEB26DECD4ACDCDDE), - SPH_C64(0x89BDE4345C3434E4), SPH_C64(0x327A7548D8484875), - SPH_C64(0x54AB24FF1CFFFF24), SPH_C64(0x8DF78F7A8E7A7A8F), - SPH_C64(0x64F4EA90AD9090EA), SPH_C64(0x9DC23E5FE15F5F3E), - SPH_C64(0x3D1DA020602020A0), SPH_C64(0x0F67D568B86868D5), - SPH_C64(0xCAD0721A2E1A1A72), SPH_C64(0xB7192CAEEFAEAE2C), - SPH_C64(0x7DC95EB4C1B4B45E), SPH_C64(0xCE9A1954FC545419), - SPH_C64(0x7FECE593A89393E5), SPH_C64(0x2F0DAA22662222AA), - SPH_C64(0x6307E964AC6464E9), SPH_C64(0x2ADB12F10EF1F112), - SPH_C64(0xCCBFA273957373A2), SPH_C64(0x82905A123612125A), - SPH_C64(0x7A3A5D40C040405D), SPH_C64(0x4840280818080828), - SPH_C64(0x9556E8C358C3C3E8), SPH_C64(0xDF337BEC29ECEC7B), - SPH_C64(0x4D9690DB70DBDB90), SPH_C64(0xC0611FA1FEA1A11F), - SPH_C64(0x911C838D8A8D8D83), SPH_C64(0xC8F5C93D473D3DC9), - SPH_C64(0x5BCCF197A49797F1), SPH_C64(0x0000000000000000), - SPH_C64(0xF936D4CF4CCFCFD4), SPH_C64(0x6E45872B7D2B2B87), - SPH_C64(0xE197B3769A7676B3), SPH_C64(0xE664B0829B8282B0), - SPH_C64(0x28FEA9D667D6D6A9), 
SPH_C64(0xC3D8771B2D1B1B77), - SPH_C64(0x74C15BB5C2B5B55B), SPH_C64(0xBE1129AFECAFAF29), - SPH_C64(0x1D77DF6ABE6A6ADF), SPH_C64(0xEABA0D50F050500D), - SPH_C64(0x57124C45CF45454C), SPH_C64(0x38CB18F308F3F318), - SPH_C64(0xAD9DF030503030F0), SPH_C64(0xC42B74EF2CEFEF74), - SPH_C64(0xDAE5C33F413F3FC3), SPH_C64(0xC7921C55FF55551C), - SPH_C64(0xDB7910A2FBA2A210), SPH_C64(0xE90365EA23EAEA65), - SPH_C64(0x6A0FEC65AF6565EC), SPH_C64(0x03B968BAD3BABA68), - SPH_C64(0x4A65932F712F2F93), SPH_C64(0x8E4EE7C05DC0C0E7), - SPH_C64(0x60BE81DE7FDEDE81), SPH_C64(0xFCE06C1C241C1C6C), - SPH_C64(0x46BB2EFD1AFDFD2E), SPH_C64(0x1F52644DD74D4D64), - SPH_C64(0x76E4E092AB9292E0), SPH_C64(0xFA8FBC759F7575BC), - SPH_C64(0x36301E060A06061E), SPH_C64(0xAE24988A838A8A98), - SPH_C64(0x4BF940B2CBB2B240), SPH_C64(0x856359E637E6E659), - SPH_C64(0x7E70360E120E0E36), SPH_C64(0xE7F8631F211F1F63), - SPH_C64(0x5537F762A66262F7), SPH_C64(0x3AEEA3D461D4D4A3), - SPH_C64(0x812932A8E5A8A832), SPH_C64(0x52C4F496A79696F4), - SPH_C64(0x629B3AF916F9F93A), SPH_C64(0xA366F6C552C5C5F6), - SPH_C64(0x1035B1256F2525B1), SPH_C64(0xABF22059EB595920), - SPH_C64(0xD054AE84918484AE), SPH_C64(0xC5B7A772967272A7), - SPH_C64(0xECD5DD394B3939DD), SPH_C64(0x165A614CD44C4C61), - SPH_C64(0x94CA3B5EE25E5E3B), SPH_C64(0x9FE7857888787885), - SPH_C64(0xE5DDD838483838D8), SPH_C64(0x9814868C898C8C86), - SPH_C64(0x17C6B2D16ED1D1B2), SPH_C64(0xE4410BA5F2A5A50B), - SPH_C64(0xA1434DE23BE2E24D), SPH_C64(0x4E2FF861A36161F8), - SPH_C64(0x42F145B3C8B3B345), SPH_C64(0x3415A521632121A5), - SPH_C64(0x0894D69CB99C9CD6), SPH_C64(0xEEF0661E221E1E66), - SPH_C64(0x61225243C5434352), SPH_C64(0xB176FCC754C7C7FC), - SPH_C64(0x4FB32BFC19FCFC2B), SPH_C64(0x242014040C040414), - SPH_C64(0xE3B20851F3515108), SPH_C64(0x25BCC799B69999C7), - SPH_C64(0x224FC46DB76D6DC4), SPH_C64(0x6568390D170D0D39), - SPH_C64(0x798335FA13FAFA35), SPH_C64(0x69B684DF7CDFDF84), - SPH_C64(0xA9D79B7E827E7E9B), SPH_C64(0x193DB4246C2424B4), - SPH_C64(0xFEC5D73B4D3B3BD7), 
SPH_C64(0x9A313DABE0ABAB3D), - SPH_C64(0xF03ED1CE4FCECED1), SPH_C64(0x9988551133111155), - SPH_C64(0x830C898F8C8F8F89), SPH_C64(0x044A6B4ED24E4E6B), - SPH_C64(0x66D151B7C4B7B751), SPH_C64(0xE00B60EB20EBEB60), - SPH_C64(0xC1FDCC3C443C3CCC), SPH_C64(0xFD7CBF819E8181BF), - SPH_C64(0x40D4FE94A19494FE), SPH_C64(0x1CEB0CF704F7F70C), - SPH_C64(0x18A167B9D6B9B967), SPH_C64(0x8B985F133513135F), - SPH_C64(0x517D9C2C742C2C9C), SPH_C64(0x05D6B8D368D3D3B8), - SPH_C64(0x8C6B5CE734E7E75C), SPH_C64(0x3957CB6EB26E6ECB), - SPH_C64(0xAA6EF3C451C4C4F3), SPH_C64(0x1B180F030503030F), - SPH_C64(0xDC8A1356FA565613), SPH_C64(0x5E1A4944CC444449), - SPH_C64(0xA0DF9E7F817F7F9E), SPH_C64(0x882137A9E6A9A937), - SPH_C64(0x674D822A7E2A2A82), SPH_C64(0x0AB16DBBD0BBBB6D), - SPH_C64(0x8746E2C15EC1C1E2), SPH_C64(0xF1A20253F5535302), - SPH_C64(0x72AE8BDC79DCDC8B), SPH_C64(0x5358270B1D0B0B27), - SPH_C64(0x019CD39DBA9D9DD3), SPH_C64(0x2B47C16CB46C6CC1), - SPH_C64(0xA495F531533131F5), SPH_C64(0xF387B9749C7474B9), - SPH_C64(0x15E309F607F6F609), SPH_C64(0x4C0A4346CA464643), - SPH_C64(0xA50926ACE9ACAC26), SPH_C64(0xB53C978986898997), - SPH_C64(0xB4A044143C141444), SPH_C64(0xBA5B42E13EE1E142), - SPH_C64(0xA6B04E163A16164E), SPH_C64(0xF7CDD23A4E3A3AD2), - SPH_C64(0x066FD069BB6969D0), SPH_C64(0x41482D091B09092D), - SPH_C64(0xD7A7AD70907070AD), SPH_C64(0x6FD954B6C7B6B654), - SPH_C64(0x1ECEB7D06DD0D0B7), SPH_C64(0xD63B7EED2AEDED7E), - SPH_C64(0xE22EDBCC49CCCCDB), SPH_C64(0x682A5742C6424257), - SPH_C64(0x2CB4C298B59898C2), SPH_C64(0xED490EA4F1A4A40E), - SPH_C64(0x755D882878282888), SPH_C64(0x86DA315CE45C5C31), - SPH_C64(0x6B933FF815F8F83F), SPH_C64(0xC244A486978686A4) -}; - -static const uint64_t old1_T2[256] = { - SPH_C64(0xC0781828181878D8), SPH_C64(0x05AF23652323AF26), - SPH_C64(0x7EF9C657C6C6F9B8), SPH_C64(0x136FE825E8E86FFB), - SPH_C64(0x4CA187948787A1CB), SPH_C64(0xA962B8D5B8B86211), - SPH_C64(0x0805010301010509), SPH_C64(0x426E4FD14F4F6E0D), - SPH_C64(0xADEE365A3636EE9B), SPH_C64(0x5904A6F7A6A604FF), - 
SPH_C64(0xDEBDD26BD2D2BD0C), SPH_C64(0xFB06F502F5F5060E), - SPH_C64(0xEF80798B79798096), SPH_C64(0x5FCE6FB16F6FCE30), - SPH_C64(0xFCEF91AE9191EF6D), SPH_C64(0xAA0752F6525207F8), - SPH_C64(0x27FD60A06060FD47), SPH_C64(0x8976BCD9BCBC7635), - SPH_C64(0xACCD9BB09B9BCD37), SPH_C64(0x048C8E8F8E8E8C8A), - SPH_C64(0x7115A3F8A3A315D2), SPH_C64(0x603C0C140C0C3C6C), - SPH_C64(0xFF8A7B8D7B7B8A84), SPH_C64(0xB5E1355F3535E180), - SPH_C64(0xE8691D271D1D69F5), SPH_C64(0x5347E03DE0E047B3), - SPH_C64(0xF6ACD764D7D7AC21), SPH_C64(0x5EEDC25BC2C2ED9C), - SPH_C64(0x6D962E722E2E9643), SPH_C64(0x627A4BDD4B4B7A29), - SPH_C64(0xA321FE1FFEFE215D), SPH_C64(0x821657F9575716D5), - SPH_C64(0xA841153F151541BD), SPH_C64(0x9FB677997777B6E8), - SPH_C64(0xA5EB37593737EB92), SPH_C64(0x7B56E532E5E5569E), - SPH_C64(0x8CD99FBC9F9FD913), SPH_C64(0xD317F00DF0F01723), - SPH_C64(0x6A7F4ADE4A4A7F20), SPH_C64(0x9E95DA73DADA9544), - SPH_C64(0xFA2558E8585825A2), SPH_C64(0x06CAC946C9C9CACF), - SPH_C64(0x558D297B29298D7C), SPH_C64(0x50220A1E0A0A225A), - SPH_C64(0xE14FB1CEB1B14F50), SPH_C64(0x691AA0FDA0A01AC9), - SPH_C64(0x7FDA6BBD6B6BDA14), SPH_C64(0x5CAB85928585ABD9), - SPH_C64(0x8173BDDABDBD733C), SPH_C64(0xD2345DE75D5D348F), - SPH_C64(0x8050103010105090), SPH_C64(0xF303F401F4F40307), - SPH_C64(0x16C0CB40CBCBC0DD), SPH_C64(0xEDC63E423E3EC6D3), - SPH_C64(0x2811050F0505112D), SPH_C64(0x1FE667A96767E678), - SPH_C64(0x7353E431E4E45397), SPH_C64(0x25BB27692727BB02), - SPH_C64(0x325841C341415873), SPH_C64(0x2C9D8B808B8B9DA7), - SPH_C64(0x5101A7F4A7A701F6), SPH_C64(0xCF947D877D7D94B2), - SPH_C64(0xDCFB95A29595FB49), SPH_C64(0x8E9FD875D8D89F56), - SPH_C64(0x8B30FB10FBFB3070), SPH_C64(0x2371EE2FEEEE71CD), - SPH_C64(0xC7917C847C7C91BB), SPH_C64(0x17E366AA6666E371), - SPH_C64(0xA68EDD7ADDDD8E7B), SPH_C64(0xB84B173917174BAF), - SPH_C64(0x024647C947474645), SPH_C64(0x84DC9EBF9E9EDC1A), - SPH_C64(0x1EC5CA43CACAC5D4), SPH_C64(0x75992D772D2D9958), - SPH_C64(0x9179BFDCBFBF792E), SPH_C64(0x381B070907071B3F), - 
SPH_C64(0x0123ADEAADAD23AC), SPH_C64(0xEA2F5AEE5A5A2FB0), - SPH_C64(0x6CB583988383B5EF), SPH_C64(0x85FF33553333FFB6), - SPH_C64(0x3FF263A56363F25C), SPH_C64(0x100A020602020A12), - SPH_C64(0x3938AAE3AAAA3893), SPH_C64(0xAFA871937171A8DE), - SPH_C64(0x0ECFC845C8C8CFC6), SPH_C64(0xC87D192B19197DD1), - SPH_C64(0x727049DB4949703B), SPH_C64(0x869AD976D9D99A5F), - SPH_C64(0xC31DF20BF2F21D31), SPH_C64(0x4B48E338E3E348A8), - SPH_C64(0xE22A5BED5B5B2AB9), SPH_C64(0x34928885888892BC), - SPH_C64(0xA4C89AB39A9AC83E), SPH_C64(0x2DBE266A2626BE0B), - SPH_C64(0x8DFA32563232FABF), SPH_C64(0xE94AB0CDB0B04A59), - SPH_C64(0x1B6AE926E9E96AF2), SPH_C64(0x78330F110F0F3377), - SPH_C64(0xE6A6D562D5D5A633), SPH_C64(0x74BA809D8080BAF4), - SPH_C64(0x997CBEDFBEBE7C27), SPH_C64(0x26DECD4ACDCDDEEB), - SPH_C64(0xBDE4345C3434E489), SPH_C64(0x7A7548D848487532), - SPH_C64(0xAB24FF1CFFFF2454), SPH_C64(0xF78F7A8E7A7A8F8D), - SPH_C64(0xF4EA90AD9090EA64), SPH_C64(0xC23E5FE15F5F3E9D), - SPH_C64(0x1DA020602020A03D), SPH_C64(0x67D568B86868D50F), - SPH_C64(0xD0721A2E1A1A72CA), SPH_C64(0x192CAEEFAEAE2CB7), - SPH_C64(0xC95EB4C1B4B45E7D), SPH_C64(0x9A1954FC545419CE), - SPH_C64(0xECE593A89393E57F), SPH_C64(0x0DAA22662222AA2F), - SPH_C64(0x07E964AC6464E963), SPH_C64(0xDB12F10EF1F1122A), - SPH_C64(0xBFA273957373A2CC), SPH_C64(0x905A123612125A82), - SPH_C64(0x3A5D40C040405D7A), SPH_C64(0x4028081808082848), - SPH_C64(0x56E8C358C3C3E895), SPH_C64(0x337BEC29ECEC7BDF), - SPH_C64(0x9690DB70DBDB904D), SPH_C64(0x611FA1FEA1A11FC0), - SPH_C64(0x1C838D8A8D8D8391), SPH_C64(0xF5C93D473D3DC9C8), - SPH_C64(0xCCF197A49797F15B), SPH_C64(0x0000000000000000), - SPH_C64(0x36D4CF4CCFCFD4F9), SPH_C64(0x45872B7D2B2B876E), - SPH_C64(0x97B3769A7676B3E1), SPH_C64(0x64B0829B8282B0E6), - SPH_C64(0xFEA9D667D6D6A928), SPH_C64(0xD8771B2D1B1B77C3), - SPH_C64(0xC15BB5C2B5B55B74), SPH_C64(0x1129AFECAFAF29BE), - SPH_C64(0x77DF6ABE6A6ADF1D), SPH_C64(0xBA0D50F050500DEA), - SPH_C64(0x124C45CF45454C57), SPH_C64(0xCB18F308F3F31838), - 
SPH_C64(0x9DF030503030F0AD), SPH_C64(0x2B74EF2CEFEF74C4), - SPH_C64(0xE5C33F413F3FC3DA), SPH_C64(0x921C55FF55551CC7), - SPH_C64(0x7910A2FBA2A210DB), SPH_C64(0x0365EA23EAEA65E9), - SPH_C64(0x0FEC65AF6565EC6A), SPH_C64(0xB968BAD3BABA6803), - SPH_C64(0x65932F712F2F934A), SPH_C64(0x4EE7C05DC0C0E78E), - SPH_C64(0xBE81DE7FDEDE8160), SPH_C64(0xE06C1C241C1C6CFC), - SPH_C64(0xBB2EFD1AFDFD2E46), SPH_C64(0x52644DD74D4D641F), - SPH_C64(0xE4E092AB9292E076), SPH_C64(0x8FBC759F7575BCFA), - SPH_C64(0x301E060A06061E36), SPH_C64(0x24988A838A8A98AE), - SPH_C64(0xF940B2CBB2B2404B), SPH_C64(0x6359E637E6E65985), - SPH_C64(0x70360E120E0E367E), SPH_C64(0xF8631F211F1F63E7), - SPH_C64(0x37F762A66262F755), SPH_C64(0xEEA3D461D4D4A33A), - SPH_C64(0x2932A8E5A8A83281), SPH_C64(0xC4F496A79696F452), - SPH_C64(0x9B3AF916F9F93A62), SPH_C64(0x66F6C552C5C5F6A3), - SPH_C64(0x35B1256F2525B110), SPH_C64(0xF22059EB595920AB), - SPH_C64(0x54AE84918484AED0), SPH_C64(0xB7A772967272A7C5), - SPH_C64(0xD5DD394B3939DDEC), SPH_C64(0x5A614CD44C4C6116), - SPH_C64(0xCA3B5EE25E5E3B94), SPH_C64(0xE78578887878859F), - SPH_C64(0xDDD838483838D8E5), SPH_C64(0x14868C898C8C8698), - SPH_C64(0xC6B2D16ED1D1B217), SPH_C64(0x410BA5F2A5A50BE4), - SPH_C64(0x434DE23BE2E24DA1), SPH_C64(0x2FF861A36161F84E), - SPH_C64(0xF145B3C8B3B34542), SPH_C64(0x15A521632121A534), - SPH_C64(0x94D69CB99C9CD608), SPH_C64(0xF0661E221E1E66EE), - SPH_C64(0x225243C543435261), SPH_C64(0x76FCC754C7C7FCB1), - SPH_C64(0xB32BFC19FCFC2B4F), SPH_C64(0x2014040C04041424), - SPH_C64(0xB20851F3515108E3), SPH_C64(0xBCC799B69999C725), - SPH_C64(0x4FC46DB76D6DC422), SPH_C64(0x68390D170D0D3965), - SPH_C64(0x8335FA13FAFA3579), SPH_C64(0xB684DF7CDFDF8469), - SPH_C64(0xD79B7E827E7E9BA9), SPH_C64(0x3DB4246C2424B419), - SPH_C64(0xC5D73B4D3B3BD7FE), SPH_C64(0x313DABE0ABAB3D9A), - SPH_C64(0x3ED1CE4FCECED1F0), SPH_C64(0x8855113311115599), - SPH_C64(0x0C898F8C8F8F8983), SPH_C64(0x4A6B4ED24E4E6B04), - SPH_C64(0xD151B7C4B7B75166), SPH_C64(0x0B60EB20EBEB60E0), - 
SPH_C64(0xFDCC3C443C3CCCC1), SPH_C64(0x7CBF819E8181BFFD), - SPH_C64(0xD4FE94A19494FE40), SPH_C64(0xEB0CF704F7F70C1C), - SPH_C64(0xA167B9D6B9B96718), SPH_C64(0x985F133513135F8B), - SPH_C64(0x7D9C2C742C2C9C51), SPH_C64(0xD6B8D368D3D3B805), - SPH_C64(0x6B5CE734E7E75C8C), SPH_C64(0x57CB6EB26E6ECB39), - SPH_C64(0x6EF3C451C4C4F3AA), SPH_C64(0x180F030503030F1B), - SPH_C64(0x8A1356FA565613DC), SPH_C64(0x1A4944CC4444495E), - SPH_C64(0xDF9E7F817F7F9EA0), SPH_C64(0x2137A9E6A9A93788), - SPH_C64(0x4D822A7E2A2A8267), SPH_C64(0xB16DBBD0BBBB6D0A), - SPH_C64(0x46E2C15EC1C1E287), SPH_C64(0xA20253F5535302F1), - SPH_C64(0xAE8BDC79DCDC8B72), SPH_C64(0x58270B1D0B0B2753), - SPH_C64(0x9CD39DBA9D9DD301), SPH_C64(0x47C16CB46C6CC12B), - SPH_C64(0x95F531533131F5A4), SPH_C64(0x87B9749C7474B9F3), - SPH_C64(0xE309F607F6F60915), SPH_C64(0x0A4346CA4646434C), - SPH_C64(0x0926ACE9ACAC26A5), SPH_C64(0x3C978986898997B5), - SPH_C64(0xA044143C141444B4), SPH_C64(0x5B42E13EE1E142BA), - SPH_C64(0xB04E163A16164EA6), SPH_C64(0xCDD23A4E3A3AD2F7), - SPH_C64(0x6FD069BB6969D006), SPH_C64(0x482D091B09092D41), - SPH_C64(0xA7AD70907070ADD7), SPH_C64(0xD954B6C7B6B6546F), - SPH_C64(0xCEB7D06DD0D0B71E), SPH_C64(0x3B7EED2AEDED7ED6), - SPH_C64(0x2EDBCC49CCCCDBE2), SPH_C64(0x2A5742C642425768), - SPH_C64(0xB4C298B59898C22C), SPH_C64(0x490EA4F1A4A40EED), - SPH_C64(0x5D88287828288875), SPH_C64(0xDA315CE45C5C3186), - SPH_C64(0x933FF815F8F83F6B), SPH_C64(0x44A486978686A4C2) -}; - -static const uint64_t old1_T3[256] = { - SPH_C64(0x781828181878D8C0), SPH_C64(0xAF23652323AF2605), - SPH_C64(0xF9C657C6C6F9B87E), SPH_C64(0x6FE825E8E86FFB13), - SPH_C64(0xA187948787A1CB4C), SPH_C64(0x62B8D5B8B86211A9), - SPH_C64(0x0501030101050908), SPH_C64(0x6E4FD14F4F6E0D42), - SPH_C64(0xEE365A3636EE9BAD), SPH_C64(0x04A6F7A6A604FF59), - SPH_C64(0xBDD26BD2D2BD0CDE), SPH_C64(0x06F502F5F5060EFB), - SPH_C64(0x80798B79798096EF), SPH_C64(0xCE6FB16F6FCE305F), - SPH_C64(0xEF91AE9191EF6DFC), SPH_C64(0x0752F6525207F8AA), - SPH_C64(0xFD60A06060FD4727), 
SPH_C64(0x76BCD9BCBC763589), - SPH_C64(0xCD9BB09B9BCD37AC), SPH_C64(0x8C8E8F8E8E8C8A04), - SPH_C64(0x15A3F8A3A315D271), SPH_C64(0x3C0C140C0C3C6C60), - SPH_C64(0x8A7B8D7B7B8A84FF), SPH_C64(0xE1355F3535E180B5), - SPH_C64(0x691D271D1D69F5E8), SPH_C64(0x47E03DE0E047B353), - SPH_C64(0xACD764D7D7AC21F6), SPH_C64(0xEDC25BC2C2ED9C5E), - SPH_C64(0x962E722E2E96436D), SPH_C64(0x7A4BDD4B4B7A2962), - SPH_C64(0x21FE1FFEFE215DA3), SPH_C64(0x1657F9575716D582), - SPH_C64(0x41153F151541BDA8), SPH_C64(0xB677997777B6E89F), - SPH_C64(0xEB37593737EB92A5), SPH_C64(0x56E532E5E5569E7B), - SPH_C64(0xD99FBC9F9FD9138C), SPH_C64(0x17F00DF0F01723D3), - SPH_C64(0x7F4ADE4A4A7F206A), SPH_C64(0x95DA73DADA95449E), - SPH_C64(0x2558E8585825A2FA), SPH_C64(0xCAC946C9C9CACF06), - SPH_C64(0x8D297B29298D7C55), SPH_C64(0x220A1E0A0A225A50), - SPH_C64(0x4FB1CEB1B14F50E1), SPH_C64(0x1AA0FDA0A01AC969), - SPH_C64(0xDA6BBD6B6BDA147F), SPH_C64(0xAB85928585ABD95C), - SPH_C64(0x73BDDABDBD733C81), SPH_C64(0x345DE75D5D348FD2), - SPH_C64(0x5010301010509080), SPH_C64(0x03F401F4F40307F3), - SPH_C64(0xC0CB40CBCBC0DD16), SPH_C64(0xC63E423E3EC6D3ED), - SPH_C64(0x11050F0505112D28), SPH_C64(0xE667A96767E6781F), - SPH_C64(0x53E431E4E4539773), SPH_C64(0xBB27692727BB0225), - SPH_C64(0x5841C34141587332), SPH_C64(0x9D8B808B8B9DA72C), - SPH_C64(0x01A7F4A7A701F651), SPH_C64(0x947D877D7D94B2CF), - SPH_C64(0xFB95A29595FB49DC), SPH_C64(0x9FD875D8D89F568E), - SPH_C64(0x30FB10FBFB30708B), SPH_C64(0x71EE2FEEEE71CD23), - SPH_C64(0x917C847C7C91BBC7), SPH_C64(0xE366AA6666E37117), - SPH_C64(0x8EDD7ADDDD8E7BA6), SPH_C64(0x4B173917174BAFB8), - SPH_C64(0x4647C94747464502), SPH_C64(0xDC9EBF9E9EDC1A84), - SPH_C64(0xC5CA43CACAC5D41E), SPH_C64(0x992D772D2D995875), - SPH_C64(0x79BFDCBFBF792E91), SPH_C64(0x1B070907071B3F38), - SPH_C64(0x23ADEAADAD23AC01), SPH_C64(0x2F5AEE5A5A2FB0EA), - SPH_C64(0xB583988383B5EF6C), SPH_C64(0xFF33553333FFB685), - SPH_C64(0xF263A56363F25C3F), SPH_C64(0x0A020602020A1210), - SPH_C64(0x38AAE3AAAA389339), 
SPH_C64(0xA871937171A8DEAF), - SPH_C64(0xCFC845C8C8CFC60E), SPH_C64(0x7D192B19197DD1C8), - SPH_C64(0x7049DB4949703B72), SPH_C64(0x9AD976D9D99A5F86), - SPH_C64(0x1DF20BF2F21D31C3), SPH_C64(0x48E338E3E348A84B), - SPH_C64(0x2A5BED5B5B2AB9E2), SPH_C64(0x928885888892BC34), - SPH_C64(0xC89AB39A9AC83EA4), SPH_C64(0xBE266A2626BE0B2D), - SPH_C64(0xFA32563232FABF8D), SPH_C64(0x4AB0CDB0B04A59E9), - SPH_C64(0x6AE926E9E96AF21B), SPH_C64(0x330F110F0F337778), - SPH_C64(0xA6D562D5D5A633E6), SPH_C64(0xBA809D8080BAF474), - SPH_C64(0x7CBEDFBEBE7C2799), SPH_C64(0xDECD4ACDCDDEEB26), - SPH_C64(0xE4345C3434E489BD), SPH_C64(0x7548D8484875327A), - SPH_C64(0x24FF1CFFFF2454AB), SPH_C64(0x8F7A8E7A7A8F8DF7), - SPH_C64(0xEA90AD9090EA64F4), SPH_C64(0x3E5FE15F5F3E9DC2), - SPH_C64(0xA020602020A03D1D), SPH_C64(0xD568B86868D50F67), - SPH_C64(0x721A2E1A1A72CAD0), SPH_C64(0x2CAEEFAEAE2CB719), - SPH_C64(0x5EB4C1B4B45E7DC9), SPH_C64(0x1954FC545419CE9A), - SPH_C64(0xE593A89393E57FEC), SPH_C64(0xAA22662222AA2F0D), - SPH_C64(0xE964AC6464E96307), SPH_C64(0x12F10EF1F1122ADB), - SPH_C64(0xA273957373A2CCBF), SPH_C64(0x5A123612125A8290), - SPH_C64(0x5D40C040405D7A3A), SPH_C64(0x2808180808284840), - SPH_C64(0xE8C358C3C3E89556), SPH_C64(0x7BEC29ECEC7BDF33), - SPH_C64(0x90DB70DBDB904D96), SPH_C64(0x1FA1FEA1A11FC061), - SPH_C64(0x838D8A8D8D83911C), SPH_C64(0xC93D473D3DC9C8F5), - SPH_C64(0xF197A49797F15BCC), SPH_C64(0x0000000000000000), - SPH_C64(0xD4CF4CCFCFD4F936), SPH_C64(0x872B7D2B2B876E45), - SPH_C64(0xB3769A7676B3E197), SPH_C64(0xB0829B8282B0E664), - SPH_C64(0xA9D667D6D6A928FE), SPH_C64(0x771B2D1B1B77C3D8), - SPH_C64(0x5BB5C2B5B55B74C1), SPH_C64(0x29AFECAFAF29BE11), - SPH_C64(0xDF6ABE6A6ADF1D77), SPH_C64(0x0D50F050500DEABA), - SPH_C64(0x4C45CF45454C5712), SPH_C64(0x18F308F3F31838CB), - SPH_C64(0xF030503030F0AD9D), SPH_C64(0x74EF2CEFEF74C42B), - SPH_C64(0xC33F413F3FC3DAE5), SPH_C64(0x1C55FF55551CC792), - SPH_C64(0x10A2FBA2A210DB79), SPH_C64(0x65EA23EAEA65E903), - SPH_C64(0xEC65AF6565EC6A0F), 
SPH_C64(0x68BAD3BABA6803B9), - SPH_C64(0x932F712F2F934A65), SPH_C64(0xE7C05DC0C0E78E4E), - SPH_C64(0x81DE7FDEDE8160BE), SPH_C64(0x6C1C241C1C6CFCE0), - SPH_C64(0x2EFD1AFDFD2E46BB), SPH_C64(0x644DD74D4D641F52), - SPH_C64(0xE092AB9292E076E4), SPH_C64(0xBC759F7575BCFA8F), - SPH_C64(0x1E060A06061E3630), SPH_C64(0x988A838A8A98AE24), - SPH_C64(0x40B2CBB2B2404BF9), SPH_C64(0x59E637E6E6598563), - SPH_C64(0x360E120E0E367E70), SPH_C64(0x631F211F1F63E7F8), - SPH_C64(0xF762A66262F75537), SPH_C64(0xA3D461D4D4A33AEE), - SPH_C64(0x32A8E5A8A8328129), SPH_C64(0xF496A79696F452C4), - SPH_C64(0x3AF916F9F93A629B), SPH_C64(0xF6C552C5C5F6A366), - SPH_C64(0xB1256F2525B11035), SPH_C64(0x2059EB595920ABF2), - SPH_C64(0xAE84918484AED054), SPH_C64(0xA772967272A7C5B7), - SPH_C64(0xDD394B3939DDECD5), SPH_C64(0x614CD44C4C61165A), - SPH_C64(0x3B5EE25E5E3B94CA), SPH_C64(0x8578887878859FE7), - SPH_C64(0xD838483838D8E5DD), SPH_C64(0x868C898C8C869814), - SPH_C64(0xB2D16ED1D1B217C6), SPH_C64(0x0BA5F2A5A50BE441), - SPH_C64(0x4DE23BE2E24DA143), SPH_C64(0xF861A36161F84E2F), - SPH_C64(0x45B3C8B3B34542F1), SPH_C64(0xA521632121A53415), - SPH_C64(0xD69CB99C9CD60894), SPH_C64(0x661E221E1E66EEF0), - SPH_C64(0x5243C54343526122), SPH_C64(0xFCC754C7C7FCB176), - SPH_C64(0x2BFC19FCFC2B4FB3), SPH_C64(0x14040C0404142420), - SPH_C64(0x0851F3515108E3B2), SPH_C64(0xC799B69999C725BC), - SPH_C64(0xC46DB76D6DC4224F), SPH_C64(0x390D170D0D396568), - SPH_C64(0x35FA13FAFA357983), SPH_C64(0x84DF7CDFDF8469B6), - SPH_C64(0x9B7E827E7E9BA9D7), SPH_C64(0xB4246C2424B4193D), - SPH_C64(0xD73B4D3B3BD7FEC5), SPH_C64(0x3DABE0ABAB3D9A31), - SPH_C64(0xD1CE4FCECED1F03E), SPH_C64(0x5511331111559988), - SPH_C64(0x898F8C8F8F89830C), SPH_C64(0x6B4ED24E4E6B044A), - SPH_C64(0x51B7C4B7B75166D1), SPH_C64(0x60EB20EBEB60E00B), - SPH_C64(0xCC3C443C3CCCC1FD), SPH_C64(0xBF819E8181BFFD7C), - SPH_C64(0xFE94A19494FE40D4), SPH_C64(0x0CF704F7F70C1CEB), - SPH_C64(0x67B9D6B9B96718A1), SPH_C64(0x5F133513135F8B98), - SPH_C64(0x9C2C742C2C9C517D), 
SPH_C64(0xB8D368D3D3B805D6), - SPH_C64(0x5CE734E7E75C8C6B), SPH_C64(0xCB6EB26E6ECB3957), - SPH_C64(0xF3C451C4C4F3AA6E), SPH_C64(0x0F030503030F1B18), - SPH_C64(0x1356FA565613DC8A), SPH_C64(0x4944CC4444495E1A), - SPH_C64(0x9E7F817F7F9EA0DF), SPH_C64(0x37A9E6A9A9378821), - SPH_C64(0x822A7E2A2A82674D), SPH_C64(0x6DBBD0BBBB6D0AB1), - SPH_C64(0xE2C15EC1C1E28746), SPH_C64(0x0253F5535302F1A2), - SPH_C64(0x8BDC79DCDC8B72AE), SPH_C64(0x270B1D0B0B275358), - SPH_C64(0xD39DBA9D9DD3019C), SPH_C64(0xC16CB46C6CC12B47), - SPH_C64(0xF531533131F5A495), SPH_C64(0xB9749C7474B9F387), - SPH_C64(0x09F607F6F60915E3), SPH_C64(0x4346CA4646434C0A), - SPH_C64(0x26ACE9ACAC26A509), SPH_C64(0x978986898997B53C), - SPH_C64(0x44143C141444B4A0), SPH_C64(0x42E13EE1E142BA5B), - SPH_C64(0x4E163A16164EA6B0), SPH_C64(0xD23A4E3A3AD2F7CD), - SPH_C64(0xD069BB6969D0066F), SPH_C64(0x2D091B09092D4148), - SPH_C64(0xAD70907070ADD7A7), SPH_C64(0x54B6C7B6B6546FD9), - SPH_C64(0xB7D06DD0D0B71ECE), SPH_C64(0x7EED2AEDED7ED63B), - SPH_C64(0xDBCC49CCCCDBE22E), SPH_C64(0x5742C6424257682A), - SPH_C64(0xC298B59898C22CB4), SPH_C64(0x0EA4F1A4A40EED49), - SPH_C64(0x882878282888755D), SPH_C64(0x315CE45C5C3186DA), - SPH_C64(0x3FF815F8F83F6B93), SPH_C64(0xA486978686A4C244) -}; - -static const uint64_t old1_T4[256] = { - SPH_C64(0x1828181878D8C078), SPH_C64(0x23652323AF2605AF), - SPH_C64(0xC657C6C6F9B87EF9), SPH_C64(0xE825E8E86FFB136F), - SPH_C64(0x87948787A1CB4CA1), SPH_C64(0xB8D5B8B86211A962), - SPH_C64(0x0103010105090805), SPH_C64(0x4FD14F4F6E0D426E), - SPH_C64(0x365A3636EE9BADEE), SPH_C64(0xA6F7A6A604FF5904), - SPH_C64(0xD26BD2D2BD0CDEBD), SPH_C64(0xF502F5F5060EFB06), - SPH_C64(0x798B79798096EF80), SPH_C64(0x6FB16F6FCE305FCE), - SPH_C64(0x91AE9191EF6DFCEF), SPH_C64(0x52F6525207F8AA07), - SPH_C64(0x60A06060FD4727FD), SPH_C64(0xBCD9BCBC76358976), - SPH_C64(0x9BB09B9BCD37ACCD), SPH_C64(0x8E8F8E8E8C8A048C), - SPH_C64(0xA3F8A3A315D27115), SPH_C64(0x0C140C0C3C6C603C), - SPH_C64(0x7B8D7B7B8A84FF8A), SPH_C64(0x355F3535E180B5E1), - 
SPH_C64(0x1D271D1D69F5E869), SPH_C64(0xE03DE0E047B35347), - SPH_C64(0xD764D7D7AC21F6AC), SPH_C64(0xC25BC2C2ED9C5EED), - SPH_C64(0x2E722E2E96436D96), SPH_C64(0x4BDD4B4B7A29627A), - SPH_C64(0xFE1FFEFE215DA321), SPH_C64(0x57F9575716D58216), - SPH_C64(0x153F151541BDA841), SPH_C64(0x77997777B6E89FB6), - SPH_C64(0x37593737EB92A5EB), SPH_C64(0xE532E5E5569E7B56), - SPH_C64(0x9FBC9F9FD9138CD9), SPH_C64(0xF00DF0F01723D317), - SPH_C64(0x4ADE4A4A7F206A7F), SPH_C64(0xDA73DADA95449E95), - SPH_C64(0x58E8585825A2FA25), SPH_C64(0xC946C9C9CACF06CA), - SPH_C64(0x297B29298D7C558D), SPH_C64(0x0A1E0A0A225A5022), - SPH_C64(0xB1CEB1B14F50E14F), SPH_C64(0xA0FDA0A01AC9691A), - SPH_C64(0x6BBD6B6BDA147FDA), SPH_C64(0x85928585ABD95CAB), - SPH_C64(0xBDDABDBD733C8173), SPH_C64(0x5DE75D5D348FD234), - SPH_C64(0x1030101050908050), SPH_C64(0xF401F4F40307F303), - SPH_C64(0xCB40CBCBC0DD16C0), SPH_C64(0x3E423E3EC6D3EDC6), - SPH_C64(0x050F0505112D2811), SPH_C64(0x67A96767E6781FE6), - SPH_C64(0xE431E4E453977353), SPH_C64(0x27692727BB0225BB), - SPH_C64(0x41C3414158733258), SPH_C64(0x8B808B8B9DA72C9D), - SPH_C64(0xA7F4A7A701F65101), SPH_C64(0x7D877D7D94B2CF94), - SPH_C64(0x95A29595FB49DCFB), SPH_C64(0xD875D8D89F568E9F), - SPH_C64(0xFB10FBFB30708B30), SPH_C64(0xEE2FEEEE71CD2371), - SPH_C64(0x7C847C7C91BBC791), SPH_C64(0x66AA6666E37117E3), - SPH_C64(0xDD7ADDDD8E7BA68E), SPH_C64(0x173917174BAFB84B), - SPH_C64(0x47C9474746450246), SPH_C64(0x9EBF9E9EDC1A84DC), - SPH_C64(0xCA43CACAC5D41EC5), SPH_C64(0x2D772D2D99587599), - SPH_C64(0xBFDCBFBF792E9179), SPH_C64(0x070907071B3F381B), - SPH_C64(0xADEAADAD23AC0123), SPH_C64(0x5AEE5A5A2FB0EA2F), - SPH_C64(0x83988383B5EF6CB5), SPH_C64(0x33553333FFB685FF), - SPH_C64(0x63A56363F25C3FF2), SPH_C64(0x020602020A12100A), - SPH_C64(0xAAE3AAAA38933938), SPH_C64(0x71937171A8DEAFA8), - SPH_C64(0xC845C8C8CFC60ECF), SPH_C64(0x192B19197DD1C87D), - SPH_C64(0x49DB4949703B7270), SPH_C64(0xD976D9D99A5F869A), - SPH_C64(0xF20BF2F21D31C31D), SPH_C64(0xE338E3E348A84B48), - 
SPH_C64(0x5BED5B5B2AB9E22A), SPH_C64(0x8885888892BC3492), - SPH_C64(0x9AB39A9AC83EA4C8), SPH_C64(0x266A2626BE0B2DBE), - SPH_C64(0x32563232FABF8DFA), SPH_C64(0xB0CDB0B04A59E94A), - SPH_C64(0xE926E9E96AF21B6A), SPH_C64(0x0F110F0F33777833), - SPH_C64(0xD562D5D5A633E6A6), SPH_C64(0x809D8080BAF474BA), - SPH_C64(0xBEDFBEBE7C27997C), SPH_C64(0xCD4ACDCDDEEB26DE), - SPH_C64(0x345C3434E489BDE4), SPH_C64(0x48D8484875327A75), - SPH_C64(0xFF1CFFFF2454AB24), SPH_C64(0x7A8E7A7A8F8DF78F), - SPH_C64(0x90AD9090EA64F4EA), SPH_C64(0x5FE15F5F3E9DC23E), - SPH_C64(0x20602020A03D1DA0), SPH_C64(0x68B86868D50F67D5), - SPH_C64(0x1A2E1A1A72CAD072), SPH_C64(0xAEEFAEAE2CB7192C), - SPH_C64(0xB4C1B4B45E7DC95E), SPH_C64(0x54FC545419CE9A19), - SPH_C64(0x93A89393E57FECE5), SPH_C64(0x22662222AA2F0DAA), - SPH_C64(0x64AC6464E96307E9), SPH_C64(0xF10EF1F1122ADB12), - SPH_C64(0x73957373A2CCBFA2), SPH_C64(0x123612125A82905A), - SPH_C64(0x40C040405D7A3A5D), SPH_C64(0x0818080828484028), - SPH_C64(0xC358C3C3E89556E8), SPH_C64(0xEC29ECEC7BDF337B), - SPH_C64(0xDB70DBDB904D9690), SPH_C64(0xA1FEA1A11FC0611F), - SPH_C64(0x8D8A8D8D83911C83), SPH_C64(0x3D473D3DC9C8F5C9), - SPH_C64(0x97A49797F15BCCF1), SPH_C64(0x0000000000000000), - SPH_C64(0xCF4CCFCFD4F936D4), SPH_C64(0x2B7D2B2B876E4587), - SPH_C64(0x769A7676B3E197B3), SPH_C64(0x829B8282B0E664B0), - SPH_C64(0xD667D6D6A928FEA9), SPH_C64(0x1B2D1B1B77C3D877), - SPH_C64(0xB5C2B5B55B74C15B), SPH_C64(0xAFECAFAF29BE1129), - SPH_C64(0x6ABE6A6ADF1D77DF), SPH_C64(0x50F050500DEABA0D), - SPH_C64(0x45CF45454C57124C), SPH_C64(0xF308F3F31838CB18), - SPH_C64(0x30503030F0AD9DF0), SPH_C64(0xEF2CEFEF74C42B74), - SPH_C64(0x3F413F3FC3DAE5C3), SPH_C64(0x55FF55551CC7921C), - SPH_C64(0xA2FBA2A210DB7910), SPH_C64(0xEA23EAEA65E90365), - SPH_C64(0x65AF6565EC6A0FEC), SPH_C64(0xBAD3BABA6803B968), - SPH_C64(0x2F712F2F934A6593), SPH_C64(0xC05DC0C0E78E4EE7), - SPH_C64(0xDE7FDEDE8160BE81), SPH_C64(0x1C241C1C6CFCE06C), - SPH_C64(0xFD1AFDFD2E46BB2E), SPH_C64(0x4DD74D4D641F5264), - 
SPH_C64(0x92AB9292E076E4E0), SPH_C64(0x759F7575BCFA8FBC), - SPH_C64(0x060A06061E36301E), SPH_C64(0x8A838A8A98AE2498), - SPH_C64(0xB2CBB2B2404BF940), SPH_C64(0xE637E6E659856359), - SPH_C64(0x0E120E0E367E7036), SPH_C64(0x1F211F1F63E7F863), - SPH_C64(0x62A66262F75537F7), SPH_C64(0xD461D4D4A33AEEA3), - SPH_C64(0xA8E5A8A832812932), SPH_C64(0x96A79696F452C4F4), - SPH_C64(0xF916F9F93A629B3A), SPH_C64(0xC552C5C5F6A366F6), - SPH_C64(0x256F2525B11035B1), SPH_C64(0x59EB595920ABF220), - SPH_C64(0x84918484AED054AE), SPH_C64(0x72967272A7C5B7A7), - SPH_C64(0x394B3939DDECD5DD), SPH_C64(0x4CD44C4C61165A61), - SPH_C64(0x5EE25E5E3B94CA3B), SPH_C64(0x78887878859FE785), - SPH_C64(0x38483838D8E5DDD8), SPH_C64(0x8C898C8C86981486), - SPH_C64(0xD16ED1D1B217C6B2), SPH_C64(0xA5F2A5A50BE4410B), - SPH_C64(0xE23BE2E24DA1434D), SPH_C64(0x61A36161F84E2FF8), - SPH_C64(0xB3C8B3B34542F145), SPH_C64(0x21632121A53415A5), - SPH_C64(0x9CB99C9CD60894D6), SPH_C64(0x1E221E1E66EEF066), - SPH_C64(0x43C5434352612252), SPH_C64(0xC754C7C7FCB176FC), - SPH_C64(0xFC19FCFC2B4FB32B), SPH_C64(0x040C040414242014), - SPH_C64(0x51F3515108E3B208), SPH_C64(0x99B69999C725BCC7), - SPH_C64(0x6DB76D6DC4224FC4), SPH_C64(0x0D170D0D39656839), - SPH_C64(0xFA13FAFA35798335), SPH_C64(0xDF7CDFDF8469B684), - SPH_C64(0x7E827E7E9BA9D79B), SPH_C64(0x246C2424B4193DB4), - SPH_C64(0x3B4D3B3BD7FEC5D7), SPH_C64(0xABE0ABAB3D9A313D), - SPH_C64(0xCE4FCECED1F03ED1), SPH_C64(0x1133111155998855), - SPH_C64(0x8F8C8F8F89830C89), SPH_C64(0x4ED24E4E6B044A6B), - SPH_C64(0xB7C4B7B75166D151), SPH_C64(0xEB20EBEB60E00B60), - SPH_C64(0x3C443C3CCCC1FDCC), SPH_C64(0x819E8181BFFD7CBF), - SPH_C64(0x94A19494FE40D4FE), SPH_C64(0xF704F7F70C1CEB0C), - SPH_C64(0xB9D6B9B96718A167), SPH_C64(0x133513135F8B985F), - SPH_C64(0x2C742C2C9C517D9C), SPH_C64(0xD368D3D3B805D6B8), - SPH_C64(0xE734E7E75C8C6B5C), SPH_C64(0x6EB26E6ECB3957CB), - SPH_C64(0xC451C4C4F3AA6EF3), SPH_C64(0x030503030F1B180F), - SPH_C64(0x56FA565613DC8A13), SPH_C64(0x44CC4444495E1A49), - 
SPH_C64(0x7F817F7F9EA0DF9E), SPH_C64(0xA9E6A9A937882137), - SPH_C64(0x2A7E2A2A82674D82), SPH_C64(0xBBD0BBBB6D0AB16D), - SPH_C64(0xC15EC1C1E28746E2), SPH_C64(0x53F5535302F1A202), - SPH_C64(0xDC79DCDC8B72AE8B), SPH_C64(0x0B1D0B0B27535827), - SPH_C64(0x9DBA9D9DD3019CD3), SPH_C64(0x6CB46C6CC12B47C1), - SPH_C64(0x31533131F5A495F5), SPH_C64(0x749C7474B9F387B9), - SPH_C64(0xF607F6F60915E309), SPH_C64(0x46CA4646434C0A43), - SPH_C64(0xACE9ACAC26A50926), SPH_C64(0x8986898997B53C97), - SPH_C64(0x143C141444B4A044), SPH_C64(0xE13EE1E142BA5B42), - SPH_C64(0x163A16164EA6B04E), SPH_C64(0x3A4E3A3AD2F7CDD2), - SPH_C64(0x69BB6969D0066FD0), SPH_C64(0x091B09092D41482D), - SPH_C64(0x70907070ADD7A7AD), SPH_C64(0xB6C7B6B6546FD954), - SPH_C64(0xD06DD0D0B71ECEB7), SPH_C64(0xED2AEDED7ED63B7E), - SPH_C64(0xCC49CCCCDBE22EDB), SPH_C64(0x42C6424257682A57), - SPH_C64(0x98B59898C22CB4C2), SPH_C64(0xA4F1A4A40EED490E), - SPH_C64(0x2878282888755D88), SPH_C64(0x5CE45C5C3186DA31), - SPH_C64(0xF815F8F83F6B933F), SPH_C64(0x86978686A4C244A4) -}; - -static const uint64_t old1_T5[256] = { - SPH_C64(0x28181878D8C07818), SPH_C64(0x652323AF2605AF23), - SPH_C64(0x57C6C6F9B87EF9C6), SPH_C64(0x25E8E86FFB136FE8), - SPH_C64(0x948787A1CB4CA187), SPH_C64(0xD5B8B86211A962B8), - SPH_C64(0x0301010509080501), SPH_C64(0xD14F4F6E0D426E4F), - SPH_C64(0x5A3636EE9BADEE36), SPH_C64(0xF7A6A604FF5904A6), - SPH_C64(0x6BD2D2BD0CDEBDD2), SPH_C64(0x02F5F5060EFB06F5), - SPH_C64(0x8B79798096EF8079), SPH_C64(0xB16F6FCE305FCE6F), - SPH_C64(0xAE9191EF6DFCEF91), SPH_C64(0xF6525207F8AA0752), - SPH_C64(0xA06060FD4727FD60), SPH_C64(0xD9BCBC76358976BC), - SPH_C64(0xB09B9BCD37ACCD9B), SPH_C64(0x8F8E8E8C8A048C8E), - SPH_C64(0xF8A3A315D27115A3), SPH_C64(0x140C0C3C6C603C0C), - SPH_C64(0x8D7B7B8A84FF8A7B), SPH_C64(0x5F3535E180B5E135), - SPH_C64(0x271D1D69F5E8691D), SPH_C64(0x3DE0E047B35347E0), - SPH_C64(0x64D7D7AC21F6ACD7), SPH_C64(0x5BC2C2ED9C5EEDC2), - SPH_C64(0x722E2E96436D962E), SPH_C64(0xDD4B4B7A29627A4B), - SPH_C64(0x1FFEFE215DA321FE), 
SPH_C64(0xF9575716D5821657), - SPH_C64(0x3F151541BDA84115), SPH_C64(0x997777B6E89FB677), - SPH_C64(0x593737EB92A5EB37), SPH_C64(0x32E5E5569E7B56E5), - SPH_C64(0xBC9F9FD9138CD99F), SPH_C64(0x0DF0F01723D317F0), - SPH_C64(0xDE4A4A7F206A7F4A), SPH_C64(0x73DADA95449E95DA), - SPH_C64(0xE8585825A2FA2558), SPH_C64(0x46C9C9CACF06CAC9), - SPH_C64(0x7B29298D7C558D29), SPH_C64(0x1E0A0A225A50220A), - SPH_C64(0xCEB1B14F50E14FB1), SPH_C64(0xFDA0A01AC9691AA0), - SPH_C64(0xBD6B6BDA147FDA6B), SPH_C64(0x928585ABD95CAB85), - SPH_C64(0xDABDBD733C8173BD), SPH_C64(0xE75D5D348FD2345D), - SPH_C64(0x3010105090805010), SPH_C64(0x01F4F40307F303F4), - SPH_C64(0x40CBCBC0DD16C0CB), SPH_C64(0x423E3EC6D3EDC63E), - SPH_C64(0x0F0505112D281105), SPH_C64(0xA96767E6781FE667), - SPH_C64(0x31E4E453977353E4), SPH_C64(0x692727BB0225BB27), - SPH_C64(0xC341415873325841), SPH_C64(0x808B8B9DA72C9D8B), - SPH_C64(0xF4A7A701F65101A7), SPH_C64(0x877D7D94B2CF947D), - SPH_C64(0xA29595FB49DCFB95), SPH_C64(0x75D8D89F568E9FD8), - SPH_C64(0x10FBFB30708B30FB), SPH_C64(0x2FEEEE71CD2371EE), - SPH_C64(0x847C7C91BBC7917C), SPH_C64(0xAA6666E37117E366), - SPH_C64(0x7ADDDD8E7BA68EDD), SPH_C64(0x3917174BAFB84B17), - SPH_C64(0xC947474645024647), SPH_C64(0xBF9E9EDC1A84DC9E), - SPH_C64(0x43CACAC5D41EC5CA), SPH_C64(0x772D2D995875992D), - SPH_C64(0xDCBFBF792E9179BF), SPH_C64(0x0907071B3F381B07), - SPH_C64(0xEAADAD23AC0123AD), SPH_C64(0xEE5A5A2FB0EA2F5A), - SPH_C64(0x988383B5EF6CB583), SPH_C64(0x553333FFB685FF33), - SPH_C64(0xA56363F25C3FF263), SPH_C64(0x0602020A12100A02), - SPH_C64(0xE3AAAA38933938AA), SPH_C64(0x937171A8DEAFA871), - SPH_C64(0x45C8C8CFC60ECFC8), SPH_C64(0x2B19197DD1C87D19), - SPH_C64(0xDB4949703B727049), SPH_C64(0x76D9D99A5F869AD9), - SPH_C64(0x0BF2F21D31C31DF2), SPH_C64(0x38E3E348A84B48E3), - SPH_C64(0xED5B5B2AB9E22A5B), SPH_C64(0x85888892BC349288), - SPH_C64(0xB39A9AC83EA4C89A), SPH_C64(0x6A2626BE0B2DBE26), - SPH_C64(0x563232FABF8DFA32), SPH_C64(0xCDB0B04A59E94AB0), - SPH_C64(0x26E9E96AF21B6AE9), 
SPH_C64(0x110F0F337778330F), - SPH_C64(0x62D5D5A633E6A6D5), SPH_C64(0x9D8080BAF474BA80), - SPH_C64(0xDFBEBE7C27997CBE), SPH_C64(0x4ACDCDDEEB26DECD), - SPH_C64(0x5C3434E489BDE434), SPH_C64(0xD8484875327A7548), - SPH_C64(0x1CFFFF2454AB24FF), SPH_C64(0x8E7A7A8F8DF78F7A), - SPH_C64(0xAD9090EA64F4EA90), SPH_C64(0xE15F5F3E9DC23E5F), - SPH_C64(0x602020A03D1DA020), SPH_C64(0xB86868D50F67D568), - SPH_C64(0x2E1A1A72CAD0721A), SPH_C64(0xEFAEAE2CB7192CAE), - SPH_C64(0xC1B4B45E7DC95EB4), SPH_C64(0xFC545419CE9A1954), - SPH_C64(0xA89393E57FECE593), SPH_C64(0x662222AA2F0DAA22), - SPH_C64(0xAC6464E96307E964), SPH_C64(0x0EF1F1122ADB12F1), - SPH_C64(0x957373A2CCBFA273), SPH_C64(0x3612125A82905A12), - SPH_C64(0xC040405D7A3A5D40), SPH_C64(0x1808082848402808), - SPH_C64(0x58C3C3E89556E8C3), SPH_C64(0x29ECEC7BDF337BEC), - SPH_C64(0x70DBDB904D9690DB), SPH_C64(0xFEA1A11FC0611FA1), - SPH_C64(0x8A8D8D83911C838D), SPH_C64(0x473D3DC9C8F5C93D), - SPH_C64(0xA49797F15BCCF197), SPH_C64(0x0000000000000000), - SPH_C64(0x4CCFCFD4F936D4CF), SPH_C64(0x7D2B2B876E45872B), - SPH_C64(0x9A7676B3E197B376), SPH_C64(0x9B8282B0E664B082), - SPH_C64(0x67D6D6A928FEA9D6), SPH_C64(0x2D1B1B77C3D8771B), - SPH_C64(0xC2B5B55B74C15BB5), SPH_C64(0xECAFAF29BE1129AF), - SPH_C64(0xBE6A6ADF1D77DF6A), SPH_C64(0xF050500DEABA0D50), - SPH_C64(0xCF45454C57124C45), SPH_C64(0x08F3F31838CB18F3), - SPH_C64(0x503030F0AD9DF030), SPH_C64(0x2CEFEF74C42B74EF), - SPH_C64(0x413F3FC3DAE5C33F), SPH_C64(0xFF55551CC7921C55), - SPH_C64(0xFBA2A210DB7910A2), SPH_C64(0x23EAEA65E90365EA), - SPH_C64(0xAF6565EC6A0FEC65), SPH_C64(0xD3BABA6803B968BA), - SPH_C64(0x712F2F934A65932F), SPH_C64(0x5DC0C0E78E4EE7C0), - SPH_C64(0x7FDEDE8160BE81DE), SPH_C64(0x241C1C6CFCE06C1C), - SPH_C64(0x1AFDFD2E46BB2EFD), SPH_C64(0xD74D4D641F52644D), - SPH_C64(0xAB9292E076E4E092), SPH_C64(0x9F7575BCFA8FBC75), - SPH_C64(0x0A06061E36301E06), SPH_C64(0x838A8A98AE24988A), - SPH_C64(0xCBB2B2404BF940B2), SPH_C64(0x37E6E659856359E6), - SPH_C64(0x120E0E367E70360E), 
SPH_C64(0x211F1F63E7F8631F), - SPH_C64(0xA66262F75537F762), SPH_C64(0x61D4D4A33AEEA3D4), - SPH_C64(0xE5A8A832812932A8), SPH_C64(0xA79696F452C4F496), - SPH_C64(0x16F9F93A629B3AF9), SPH_C64(0x52C5C5F6A366F6C5), - SPH_C64(0x6F2525B11035B125), SPH_C64(0xEB595920ABF22059), - SPH_C64(0x918484AED054AE84), SPH_C64(0x967272A7C5B7A772), - SPH_C64(0x4B3939DDECD5DD39), SPH_C64(0xD44C4C61165A614C), - SPH_C64(0xE25E5E3B94CA3B5E), SPH_C64(0x887878859FE78578), - SPH_C64(0x483838D8E5DDD838), SPH_C64(0x898C8C869814868C), - SPH_C64(0x6ED1D1B217C6B2D1), SPH_C64(0xF2A5A50BE4410BA5), - SPH_C64(0x3BE2E24DA1434DE2), SPH_C64(0xA36161F84E2FF861), - SPH_C64(0xC8B3B34542F145B3), SPH_C64(0x632121A53415A521), - SPH_C64(0xB99C9CD60894D69C), SPH_C64(0x221E1E66EEF0661E), - SPH_C64(0xC543435261225243), SPH_C64(0x54C7C7FCB176FCC7), - SPH_C64(0x19FCFC2B4FB32BFC), SPH_C64(0x0C04041424201404), - SPH_C64(0xF3515108E3B20851), SPH_C64(0xB69999C725BCC799), - SPH_C64(0xB76D6DC4224FC46D), SPH_C64(0x170D0D396568390D), - SPH_C64(0x13FAFA35798335FA), SPH_C64(0x7CDFDF8469B684DF), - SPH_C64(0x827E7E9BA9D79B7E), SPH_C64(0x6C2424B4193DB424), - SPH_C64(0x4D3B3BD7FEC5D73B), SPH_C64(0xE0ABAB3D9A313DAB), - SPH_C64(0x4FCECED1F03ED1CE), SPH_C64(0x3311115599885511), - SPH_C64(0x8C8F8F89830C898F), SPH_C64(0xD24E4E6B044A6B4E), - SPH_C64(0xC4B7B75166D151B7), SPH_C64(0x20EBEB60E00B60EB), - SPH_C64(0x443C3CCCC1FDCC3C), SPH_C64(0x9E8181BFFD7CBF81), - SPH_C64(0xA19494FE40D4FE94), SPH_C64(0x04F7F70C1CEB0CF7), - SPH_C64(0xD6B9B96718A167B9), SPH_C64(0x3513135F8B985F13), - SPH_C64(0x742C2C9C517D9C2C), SPH_C64(0x68D3D3B805D6B8D3), - SPH_C64(0x34E7E75C8C6B5CE7), SPH_C64(0xB26E6ECB3957CB6E), - SPH_C64(0x51C4C4F3AA6EF3C4), SPH_C64(0x0503030F1B180F03), - SPH_C64(0xFA565613DC8A1356), SPH_C64(0xCC4444495E1A4944), - SPH_C64(0x817F7F9EA0DF9E7F), SPH_C64(0xE6A9A937882137A9), - SPH_C64(0x7E2A2A82674D822A), SPH_C64(0xD0BBBB6D0AB16DBB), - SPH_C64(0x5EC1C1E28746E2C1), SPH_C64(0xF5535302F1A20253), - SPH_C64(0x79DCDC8B72AE8BDC), 
SPH_C64(0x1D0B0B275358270B), - SPH_C64(0xBA9D9DD3019CD39D), SPH_C64(0xB46C6CC12B47C16C), - SPH_C64(0x533131F5A495F531), SPH_C64(0x9C7474B9F387B974), - SPH_C64(0x07F6F60915E309F6), SPH_C64(0xCA4646434C0A4346), - SPH_C64(0xE9ACAC26A50926AC), SPH_C64(0x86898997B53C9789), - SPH_C64(0x3C141444B4A04414), SPH_C64(0x3EE1E142BA5B42E1), - SPH_C64(0x3A16164EA6B04E16), SPH_C64(0x4E3A3AD2F7CDD23A), - SPH_C64(0xBB6969D0066FD069), SPH_C64(0x1B09092D41482D09), - SPH_C64(0x907070ADD7A7AD70), SPH_C64(0xC7B6B6546FD954B6), - SPH_C64(0x6DD0D0B71ECEB7D0), SPH_C64(0x2AEDED7ED63B7EED), - SPH_C64(0x49CCCCDBE22EDBCC), SPH_C64(0xC6424257682A5742), - SPH_C64(0xB59898C22CB4C298), SPH_C64(0xF1A4A40EED490EA4), - SPH_C64(0x78282888755D8828), SPH_C64(0xE45C5C3186DA315C), - SPH_C64(0x15F8F83F6B933FF8), SPH_C64(0x978686A4C244A486) -}; - -static const uint64_t old1_T6[256] = { - SPH_C64(0x181878D8C0781828), SPH_C64(0x2323AF2605AF2365), - SPH_C64(0xC6C6F9B87EF9C657), SPH_C64(0xE8E86FFB136FE825), - SPH_C64(0x8787A1CB4CA18794), SPH_C64(0xB8B86211A962B8D5), - SPH_C64(0x0101050908050103), SPH_C64(0x4F4F6E0D426E4FD1), - SPH_C64(0x3636EE9BADEE365A), SPH_C64(0xA6A604FF5904A6F7), - SPH_C64(0xD2D2BD0CDEBDD26B), SPH_C64(0xF5F5060EFB06F502), - SPH_C64(0x79798096EF80798B), SPH_C64(0x6F6FCE305FCE6FB1), - SPH_C64(0x9191EF6DFCEF91AE), SPH_C64(0x525207F8AA0752F6), - SPH_C64(0x6060FD4727FD60A0), SPH_C64(0xBCBC76358976BCD9), - SPH_C64(0x9B9BCD37ACCD9BB0), SPH_C64(0x8E8E8C8A048C8E8F), - SPH_C64(0xA3A315D27115A3F8), SPH_C64(0x0C0C3C6C603C0C14), - SPH_C64(0x7B7B8A84FF8A7B8D), SPH_C64(0x3535E180B5E1355F), - SPH_C64(0x1D1D69F5E8691D27), SPH_C64(0xE0E047B35347E03D), - SPH_C64(0xD7D7AC21F6ACD764), SPH_C64(0xC2C2ED9C5EEDC25B), - SPH_C64(0x2E2E96436D962E72), SPH_C64(0x4B4B7A29627A4BDD), - SPH_C64(0xFEFE215DA321FE1F), SPH_C64(0x575716D5821657F9), - SPH_C64(0x151541BDA841153F), SPH_C64(0x7777B6E89FB67799), - SPH_C64(0x3737EB92A5EB3759), SPH_C64(0xE5E5569E7B56E532), - SPH_C64(0x9F9FD9138CD99FBC), SPH_C64(0xF0F01723D317F00D), - 
SPH_C64(0x4A4A7F206A7F4ADE), SPH_C64(0xDADA95449E95DA73), - SPH_C64(0x585825A2FA2558E8), SPH_C64(0xC9C9CACF06CAC946), - SPH_C64(0x29298D7C558D297B), SPH_C64(0x0A0A225A50220A1E), - SPH_C64(0xB1B14F50E14FB1CE), SPH_C64(0xA0A01AC9691AA0FD), - SPH_C64(0x6B6BDA147FDA6BBD), SPH_C64(0x8585ABD95CAB8592), - SPH_C64(0xBDBD733C8173BDDA), SPH_C64(0x5D5D348FD2345DE7), - SPH_C64(0x1010509080501030), SPH_C64(0xF4F40307F303F401), - SPH_C64(0xCBCBC0DD16C0CB40), SPH_C64(0x3E3EC6D3EDC63E42), - SPH_C64(0x0505112D2811050F), SPH_C64(0x6767E6781FE667A9), - SPH_C64(0xE4E453977353E431), SPH_C64(0x2727BB0225BB2769), - SPH_C64(0x41415873325841C3), SPH_C64(0x8B8B9DA72C9D8B80), - SPH_C64(0xA7A701F65101A7F4), SPH_C64(0x7D7D94B2CF947D87), - SPH_C64(0x9595FB49DCFB95A2), SPH_C64(0xD8D89F568E9FD875), - SPH_C64(0xFBFB30708B30FB10), SPH_C64(0xEEEE71CD2371EE2F), - SPH_C64(0x7C7C91BBC7917C84), SPH_C64(0x6666E37117E366AA), - SPH_C64(0xDDDD8E7BA68EDD7A), SPH_C64(0x17174BAFB84B1739), - SPH_C64(0x47474645024647C9), SPH_C64(0x9E9EDC1A84DC9EBF), - SPH_C64(0xCACAC5D41EC5CA43), SPH_C64(0x2D2D995875992D77), - SPH_C64(0xBFBF792E9179BFDC), SPH_C64(0x07071B3F381B0709), - SPH_C64(0xADAD23AC0123ADEA), SPH_C64(0x5A5A2FB0EA2F5AEE), - SPH_C64(0x8383B5EF6CB58398), SPH_C64(0x3333FFB685FF3355), - SPH_C64(0x6363F25C3FF263A5), SPH_C64(0x02020A12100A0206), - SPH_C64(0xAAAA38933938AAE3), SPH_C64(0x7171A8DEAFA87193), - SPH_C64(0xC8C8CFC60ECFC845), SPH_C64(0x19197DD1C87D192B), - SPH_C64(0x4949703B727049DB), SPH_C64(0xD9D99A5F869AD976), - SPH_C64(0xF2F21D31C31DF20B), SPH_C64(0xE3E348A84B48E338), - SPH_C64(0x5B5B2AB9E22A5BED), SPH_C64(0x888892BC34928885), - SPH_C64(0x9A9AC83EA4C89AB3), SPH_C64(0x2626BE0B2DBE266A), - SPH_C64(0x3232FABF8DFA3256), SPH_C64(0xB0B04A59E94AB0CD), - SPH_C64(0xE9E96AF21B6AE926), SPH_C64(0x0F0F337778330F11), - SPH_C64(0xD5D5A633E6A6D562), SPH_C64(0x8080BAF474BA809D), - SPH_C64(0xBEBE7C27997CBEDF), SPH_C64(0xCDCDDEEB26DECD4A), - SPH_C64(0x3434E489BDE4345C), SPH_C64(0x484875327A7548D8), - 
SPH_C64(0xFFFF2454AB24FF1C), SPH_C64(0x7A7A8F8DF78F7A8E), - SPH_C64(0x9090EA64F4EA90AD), SPH_C64(0x5F5F3E9DC23E5FE1), - SPH_C64(0x2020A03D1DA02060), SPH_C64(0x6868D50F67D568B8), - SPH_C64(0x1A1A72CAD0721A2E), SPH_C64(0xAEAE2CB7192CAEEF), - SPH_C64(0xB4B45E7DC95EB4C1), SPH_C64(0x545419CE9A1954FC), - SPH_C64(0x9393E57FECE593A8), SPH_C64(0x2222AA2F0DAA2266), - SPH_C64(0x6464E96307E964AC), SPH_C64(0xF1F1122ADB12F10E), - SPH_C64(0x7373A2CCBFA27395), SPH_C64(0x12125A82905A1236), - SPH_C64(0x40405D7A3A5D40C0), SPH_C64(0x0808284840280818), - SPH_C64(0xC3C3E89556E8C358), SPH_C64(0xECEC7BDF337BEC29), - SPH_C64(0xDBDB904D9690DB70), SPH_C64(0xA1A11FC0611FA1FE), - SPH_C64(0x8D8D83911C838D8A), SPH_C64(0x3D3DC9C8F5C93D47), - SPH_C64(0x9797F15BCCF197A4), SPH_C64(0x0000000000000000), - SPH_C64(0xCFCFD4F936D4CF4C), SPH_C64(0x2B2B876E45872B7D), - SPH_C64(0x7676B3E197B3769A), SPH_C64(0x8282B0E664B0829B), - SPH_C64(0xD6D6A928FEA9D667), SPH_C64(0x1B1B77C3D8771B2D), - SPH_C64(0xB5B55B74C15BB5C2), SPH_C64(0xAFAF29BE1129AFEC), - SPH_C64(0x6A6ADF1D77DF6ABE), SPH_C64(0x50500DEABA0D50F0), - SPH_C64(0x45454C57124C45CF), SPH_C64(0xF3F31838CB18F308), - SPH_C64(0x3030F0AD9DF03050), SPH_C64(0xEFEF74C42B74EF2C), - SPH_C64(0x3F3FC3DAE5C33F41), SPH_C64(0x55551CC7921C55FF), - SPH_C64(0xA2A210DB7910A2FB), SPH_C64(0xEAEA65E90365EA23), - SPH_C64(0x6565EC6A0FEC65AF), SPH_C64(0xBABA6803B968BAD3), - SPH_C64(0x2F2F934A65932F71), SPH_C64(0xC0C0E78E4EE7C05D), - SPH_C64(0xDEDE8160BE81DE7F), SPH_C64(0x1C1C6CFCE06C1C24), - SPH_C64(0xFDFD2E46BB2EFD1A), SPH_C64(0x4D4D641F52644DD7), - SPH_C64(0x9292E076E4E092AB), SPH_C64(0x7575BCFA8FBC759F), - SPH_C64(0x06061E36301E060A), SPH_C64(0x8A8A98AE24988A83), - SPH_C64(0xB2B2404BF940B2CB), SPH_C64(0xE6E659856359E637), - SPH_C64(0x0E0E367E70360E12), SPH_C64(0x1F1F63E7F8631F21), - SPH_C64(0x6262F75537F762A6), SPH_C64(0xD4D4A33AEEA3D461), - SPH_C64(0xA8A832812932A8E5), SPH_C64(0x9696F452C4F496A7), - SPH_C64(0xF9F93A629B3AF916), SPH_C64(0xC5C5F6A366F6C552), - 
SPH_C64(0x2525B11035B1256F), SPH_C64(0x595920ABF22059EB), - SPH_C64(0x8484AED054AE8491), SPH_C64(0x7272A7C5B7A77296), - SPH_C64(0x3939DDECD5DD394B), SPH_C64(0x4C4C61165A614CD4), - SPH_C64(0x5E5E3B94CA3B5EE2), SPH_C64(0x7878859FE7857888), - SPH_C64(0x3838D8E5DDD83848), SPH_C64(0x8C8C869814868C89), - SPH_C64(0xD1D1B217C6B2D16E), SPH_C64(0xA5A50BE4410BA5F2), - SPH_C64(0xE2E24DA1434DE23B), SPH_C64(0x6161F84E2FF861A3), - SPH_C64(0xB3B34542F145B3C8), SPH_C64(0x2121A53415A52163), - SPH_C64(0x9C9CD60894D69CB9), SPH_C64(0x1E1E66EEF0661E22), - SPH_C64(0x43435261225243C5), SPH_C64(0xC7C7FCB176FCC754), - SPH_C64(0xFCFC2B4FB32BFC19), SPH_C64(0x040414242014040C), - SPH_C64(0x515108E3B20851F3), SPH_C64(0x9999C725BCC799B6), - SPH_C64(0x6D6DC4224FC46DB7), SPH_C64(0x0D0D396568390D17), - SPH_C64(0xFAFA35798335FA13), SPH_C64(0xDFDF8469B684DF7C), - SPH_C64(0x7E7E9BA9D79B7E82), SPH_C64(0x2424B4193DB4246C), - SPH_C64(0x3B3BD7FEC5D73B4D), SPH_C64(0xABAB3D9A313DABE0), - SPH_C64(0xCECED1F03ED1CE4F), SPH_C64(0x1111559988551133), - SPH_C64(0x8F8F89830C898F8C), SPH_C64(0x4E4E6B044A6B4ED2), - SPH_C64(0xB7B75166D151B7C4), SPH_C64(0xEBEB60E00B60EB20), - SPH_C64(0x3C3CCCC1FDCC3C44), SPH_C64(0x8181BFFD7CBF819E), - SPH_C64(0x9494FE40D4FE94A1), SPH_C64(0xF7F70C1CEB0CF704), - SPH_C64(0xB9B96718A167B9D6), SPH_C64(0x13135F8B985F1335), - SPH_C64(0x2C2C9C517D9C2C74), SPH_C64(0xD3D3B805D6B8D368), - SPH_C64(0xE7E75C8C6B5CE734), SPH_C64(0x6E6ECB3957CB6EB2), - SPH_C64(0xC4C4F3AA6EF3C451), SPH_C64(0x03030F1B180F0305), - SPH_C64(0x565613DC8A1356FA), SPH_C64(0x4444495E1A4944CC), - SPH_C64(0x7F7F9EA0DF9E7F81), SPH_C64(0xA9A937882137A9E6), - SPH_C64(0x2A2A82674D822A7E), SPH_C64(0xBBBB6D0AB16DBBD0), - SPH_C64(0xC1C1E28746E2C15E), SPH_C64(0x535302F1A20253F5), - SPH_C64(0xDCDC8B72AE8BDC79), SPH_C64(0x0B0B275358270B1D), - SPH_C64(0x9D9DD3019CD39DBA), SPH_C64(0x6C6CC12B47C16CB4), - SPH_C64(0x3131F5A495F53153), SPH_C64(0x7474B9F387B9749C), - SPH_C64(0xF6F60915E309F607), SPH_C64(0x4646434C0A4346CA), - 
SPH_C64(0xACAC26A50926ACE9), SPH_C64(0x898997B53C978986), - SPH_C64(0x141444B4A044143C), SPH_C64(0xE1E142BA5B42E13E), - SPH_C64(0x16164EA6B04E163A), SPH_C64(0x3A3AD2F7CDD23A4E), - SPH_C64(0x6969D0066FD069BB), SPH_C64(0x09092D41482D091B), - SPH_C64(0x7070ADD7A7AD7090), SPH_C64(0xB6B6546FD954B6C7), - SPH_C64(0xD0D0B71ECEB7D06D), SPH_C64(0xEDED7ED63B7EED2A), - SPH_C64(0xCCCCDBE22EDBCC49), SPH_C64(0x424257682A5742C6), - SPH_C64(0x9898C22CB4C298B5), SPH_C64(0xA4A40EED490EA4F1), - SPH_C64(0x282888755D882878), SPH_C64(0x5C5C3186DA315CE4), - SPH_C64(0xF8F83F6B933FF815), SPH_C64(0x8686A4C244A48697) -}; - -static const uint64_t old1_T7[256] = { - SPH_C64(0x1878D8C078182818), SPH_C64(0x23AF2605AF236523), - SPH_C64(0xC6F9B87EF9C657C6), SPH_C64(0xE86FFB136FE825E8), - SPH_C64(0x87A1CB4CA1879487), SPH_C64(0xB86211A962B8D5B8), - SPH_C64(0x0105090805010301), SPH_C64(0x4F6E0D426E4FD14F), - SPH_C64(0x36EE9BADEE365A36), SPH_C64(0xA604FF5904A6F7A6), - SPH_C64(0xD2BD0CDEBDD26BD2), SPH_C64(0xF5060EFB06F502F5), - SPH_C64(0x798096EF80798B79), SPH_C64(0x6FCE305FCE6FB16F), - SPH_C64(0x91EF6DFCEF91AE91), SPH_C64(0x5207F8AA0752F652), - SPH_C64(0x60FD4727FD60A060), SPH_C64(0xBC76358976BCD9BC), - SPH_C64(0x9BCD37ACCD9BB09B), SPH_C64(0x8E8C8A048C8E8F8E), - SPH_C64(0xA315D27115A3F8A3), SPH_C64(0x0C3C6C603C0C140C), - SPH_C64(0x7B8A84FF8A7B8D7B), SPH_C64(0x35E180B5E1355F35), - SPH_C64(0x1D69F5E8691D271D), SPH_C64(0xE047B35347E03DE0), - SPH_C64(0xD7AC21F6ACD764D7), SPH_C64(0xC2ED9C5EEDC25BC2), - SPH_C64(0x2E96436D962E722E), SPH_C64(0x4B7A29627A4BDD4B), - SPH_C64(0xFE215DA321FE1FFE), SPH_C64(0x5716D5821657F957), - SPH_C64(0x1541BDA841153F15), SPH_C64(0x77B6E89FB6779977), - SPH_C64(0x37EB92A5EB375937), SPH_C64(0xE5569E7B56E532E5), - SPH_C64(0x9FD9138CD99FBC9F), SPH_C64(0xF01723D317F00DF0), - SPH_C64(0x4A7F206A7F4ADE4A), SPH_C64(0xDA95449E95DA73DA), - SPH_C64(0x5825A2FA2558E858), SPH_C64(0xC9CACF06CAC946C9), - SPH_C64(0x298D7C558D297B29), SPH_C64(0x0A225A50220A1E0A), - SPH_C64(0xB14F50E14FB1CEB1), 
SPH_C64(0xA01AC9691AA0FDA0), - SPH_C64(0x6BDA147FDA6BBD6B), SPH_C64(0x85ABD95CAB859285), - SPH_C64(0xBD733C8173BDDABD), SPH_C64(0x5D348FD2345DE75D), - SPH_C64(0x1050908050103010), SPH_C64(0xF40307F303F401F4), - SPH_C64(0xCBC0DD16C0CB40CB), SPH_C64(0x3EC6D3EDC63E423E), - SPH_C64(0x05112D2811050F05), SPH_C64(0x67E6781FE667A967), - SPH_C64(0xE453977353E431E4), SPH_C64(0x27BB0225BB276927), - SPH_C64(0x415873325841C341), SPH_C64(0x8B9DA72C9D8B808B), - SPH_C64(0xA701F65101A7F4A7), SPH_C64(0x7D94B2CF947D877D), - SPH_C64(0x95FB49DCFB95A295), SPH_C64(0xD89F568E9FD875D8), - SPH_C64(0xFB30708B30FB10FB), SPH_C64(0xEE71CD2371EE2FEE), - SPH_C64(0x7C91BBC7917C847C), SPH_C64(0x66E37117E366AA66), - SPH_C64(0xDD8E7BA68EDD7ADD), SPH_C64(0x174BAFB84B173917), - SPH_C64(0x474645024647C947), SPH_C64(0x9EDC1A84DC9EBF9E), - SPH_C64(0xCAC5D41EC5CA43CA), SPH_C64(0x2D995875992D772D), - SPH_C64(0xBF792E9179BFDCBF), SPH_C64(0x071B3F381B070907), - SPH_C64(0xAD23AC0123ADEAAD), SPH_C64(0x5A2FB0EA2F5AEE5A), - SPH_C64(0x83B5EF6CB5839883), SPH_C64(0x33FFB685FF335533), - SPH_C64(0x63F25C3FF263A563), SPH_C64(0x020A12100A020602), - SPH_C64(0xAA38933938AAE3AA), SPH_C64(0x71A8DEAFA8719371), - SPH_C64(0xC8CFC60ECFC845C8), SPH_C64(0x197DD1C87D192B19), - SPH_C64(0x49703B727049DB49), SPH_C64(0xD99A5F869AD976D9), - SPH_C64(0xF21D31C31DF20BF2), SPH_C64(0xE348A84B48E338E3), - SPH_C64(0x5B2AB9E22A5BED5B), SPH_C64(0x8892BC3492888588), - SPH_C64(0x9AC83EA4C89AB39A), SPH_C64(0x26BE0B2DBE266A26), - SPH_C64(0x32FABF8DFA325632), SPH_C64(0xB04A59E94AB0CDB0), - SPH_C64(0xE96AF21B6AE926E9), SPH_C64(0x0F337778330F110F), - SPH_C64(0xD5A633E6A6D562D5), SPH_C64(0x80BAF474BA809D80), - SPH_C64(0xBE7C27997CBEDFBE), SPH_C64(0xCDDEEB26DECD4ACD), - SPH_C64(0x34E489BDE4345C34), SPH_C64(0x4875327A7548D848), - SPH_C64(0xFF2454AB24FF1CFF), SPH_C64(0x7A8F8DF78F7A8E7A), - SPH_C64(0x90EA64F4EA90AD90), SPH_C64(0x5F3E9DC23E5FE15F), - SPH_C64(0x20A03D1DA0206020), SPH_C64(0x68D50F67D568B868), - SPH_C64(0x1A72CAD0721A2E1A), 
SPH_C64(0xAE2CB7192CAEEFAE), - SPH_C64(0xB45E7DC95EB4C1B4), SPH_C64(0x5419CE9A1954FC54), - SPH_C64(0x93E57FECE593A893), SPH_C64(0x22AA2F0DAA226622), - SPH_C64(0x64E96307E964AC64), SPH_C64(0xF1122ADB12F10EF1), - SPH_C64(0x73A2CCBFA2739573), SPH_C64(0x125A82905A123612), - SPH_C64(0x405D7A3A5D40C040), SPH_C64(0x0828484028081808), - SPH_C64(0xC3E89556E8C358C3), SPH_C64(0xEC7BDF337BEC29EC), - SPH_C64(0xDB904D9690DB70DB), SPH_C64(0xA11FC0611FA1FEA1), - SPH_C64(0x8D83911C838D8A8D), SPH_C64(0x3DC9C8F5C93D473D), - SPH_C64(0x97F15BCCF197A497), SPH_C64(0x0000000000000000), - SPH_C64(0xCFD4F936D4CF4CCF), SPH_C64(0x2B876E45872B7D2B), - SPH_C64(0x76B3E197B3769A76), SPH_C64(0x82B0E664B0829B82), - SPH_C64(0xD6A928FEA9D667D6), SPH_C64(0x1B77C3D8771B2D1B), - SPH_C64(0xB55B74C15BB5C2B5), SPH_C64(0xAF29BE1129AFECAF), - SPH_C64(0x6ADF1D77DF6ABE6A), SPH_C64(0x500DEABA0D50F050), - SPH_C64(0x454C57124C45CF45), SPH_C64(0xF31838CB18F308F3), - SPH_C64(0x30F0AD9DF0305030), SPH_C64(0xEF74C42B74EF2CEF), - SPH_C64(0x3FC3DAE5C33F413F), SPH_C64(0x551CC7921C55FF55), - SPH_C64(0xA210DB7910A2FBA2), SPH_C64(0xEA65E90365EA23EA), - SPH_C64(0x65EC6A0FEC65AF65), SPH_C64(0xBA6803B968BAD3BA), - SPH_C64(0x2F934A65932F712F), SPH_C64(0xC0E78E4EE7C05DC0), - SPH_C64(0xDE8160BE81DE7FDE), SPH_C64(0x1C6CFCE06C1C241C), - SPH_C64(0xFD2E46BB2EFD1AFD), SPH_C64(0x4D641F52644DD74D), - SPH_C64(0x92E076E4E092AB92), SPH_C64(0x75BCFA8FBC759F75), - SPH_C64(0x061E36301E060A06), SPH_C64(0x8A98AE24988A838A), - SPH_C64(0xB2404BF940B2CBB2), SPH_C64(0xE659856359E637E6), - SPH_C64(0x0E367E70360E120E), SPH_C64(0x1F63E7F8631F211F), - SPH_C64(0x62F75537F762A662), SPH_C64(0xD4A33AEEA3D461D4), - SPH_C64(0xA832812932A8E5A8), SPH_C64(0x96F452C4F496A796), - SPH_C64(0xF93A629B3AF916F9), SPH_C64(0xC5F6A366F6C552C5), - SPH_C64(0x25B11035B1256F25), SPH_C64(0x5920ABF22059EB59), - SPH_C64(0x84AED054AE849184), SPH_C64(0x72A7C5B7A7729672), - SPH_C64(0x39DDECD5DD394B39), SPH_C64(0x4C61165A614CD44C), - SPH_C64(0x5E3B94CA3B5EE25E), 
SPH_C64(0x78859FE785788878), - SPH_C64(0x38D8E5DDD8384838), SPH_C64(0x8C869814868C898C), - SPH_C64(0xD1B217C6B2D16ED1), SPH_C64(0xA50BE4410BA5F2A5), - SPH_C64(0xE24DA1434DE23BE2), SPH_C64(0x61F84E2FF861A361), - SPH_C64(0xB34542F145B3C8B3), SPH_C64(0x21A53415A5216321), - SPH_C64(0x9CD60894D69CB99C), SPH_C64(0x1E66EEF0661E221E), - SPH_C64(0x435261225243C543), SPH_C64(0xC7FCB176FCC754C7), - SPH_C64(0xFC2B4FB32BFC19FC), SPH_C64(0x0414242014040C04), - SPH_C64(0x5108E3B20851F351), SPH_C64(0x99C725BCC799B699), - SPH_C64(0x6DC4224FC46DB76D), SPH_C64(0x0D396568390D170D), - SPH_C64(0xFA35798335FA13FA), SPH_C64(0xDF8469B684DF7CDF), - SPH_C64(0x7E9BA9D79B7E827E), SPH_C64(0x24B4193DB4246C24), - SPH_C64(0x3BD7FEC5D73B4D3B), SPH_C64(0xAB3D9A313DABE0AB), - SPH_C64(0xCED1F03ED1CE4FCE), SPH_C64(0x1155998855113311), - SPH_C64(0x8F89830C898F8C8F), SPH_C64(0x4E6B044A6B4ED24E), - SPH_C64(0xB75166D151B7C4B7), SPH_C64(0xEB60E00B60EB20EB), - SPH_C64(0x3CCCC1FDCC3C443C), SPH_C64(0x81BFFD7CBF819E81), - SPH_C64(0x94FE40D4FE94A194), SPH_C64(0xF70C1CEB0CF704F7), - SPH_C64(0xB96718A167B9D6B9), SPH_C64(0x135F8B985F133513), - SPH_C64(0x2C9C517D9C2C742C), SPH_C64(0xD3B805D6B8D368D3), - SPH_C64(0xE75C8C6B5CE734E7), SPH_C64(0x6ECB3957CB6EB26E), - SPH_C64(0xC4F3AA6EF3C451C4), SPH_C64(0x030F1B180F030503), - SPH_C64(0x5613DC8A1356FA56), SPH_C64(0x44495E1A4944CC44), - SPH_C64(0x7F9EA0DF9E7F817F), SPH_C64(0xA937882137A9E6A9), - SPH_C64(0x2A82674D822A7E2A), SPH_C64(0xBB6D0AB16DBBD0BB), - SPH_C64(0xC1E28746E2C15EC1), SPH_C64(0x5302F1A20253F553), - SPH_C64(0xDC8B72AE8BDC79DC), SPH_C64(0x0B275358270B1D0B), - SPH_C64(0x9DD3019CD39DBA9D), SPH_C64(0x6CC12B47C16CB46C), - SPH_C64(0x31F5A495F5315331), SPH_C64(0x74B9F387B9749C74), - SPH_C64(0xF60915E309F607F6), SPH_C64(0x46434C0A4346CA46), - SPH_C64(0xAC26A50926ACE9AC), SPH_C64(0x8997B53C97898689), - SPH_C64(0x1444B4A044143C14), SPH_C64(0xE142BA5B42E13EE1), - SPH_C64(0x164EA6B04E163A16), SPH_C64(0x3AD2F7CDD23A4E3A), - SPH_C64(0x69D0066FD069BB69), 
SPH_C64(0x092D41482D091B09), - SPH_C64(0x70ADD7A7AD709070), SPH_C64(0xB6546FD954B6C7B6), - SPH_C64(0xD0B71ECEB7D06DD0), SPH_C64(0xED7ED63B7EED2AED), - SPH_C64(0xCCDBE22EDBCC49CC), SPH_C64(0x4257682A5742C642), - SPH_C64(0x98C22CB4C298B598), SPH_C64(0xA40EED490EA4F1A4), - SPH_C64(0x2888755D88287828), SPH_C64(0x5C3186DA315CE45C), - SPH_C64(0xF83F6B933FF815F8), SPH_C64(0x86A4C244A4869786) -}; - -static const uint64_t old1_RC[10] = { - SPH_C64(0x4F01B887E8C62318), - SPH_C64(0x52916F79F5D2A636), - SPH_C64(0x357B0CA38E9BBC60), - SPH_C64(0x57FE4B2EC2D7E01D), - SPH_C64(0xDA4AF09FE5377715), - SPH_C64(0x856BA0B10A29C958), - SPH_C64(0x67053ECBF4105DBD), - SPH_C64(0xD8957DA78B4127E4), - SPH_C64(0x9E4717DD667CEEFB), - SPH_C64(0x33835AAD07BF2DCA) -}; - -static const uint64_t plain_T0[256] = { +__constant__ __align__(64) uint64_t mixTob0Tox[256] = { SPH_C64(0xD83078C018601818), SPH_C64(0x2646AF05238C2323), SPH_C64(0xB891F97EC63FC6C6), SPH_C64(0xFBCD6F13E887E8E8), SPH_C64(0xCB13A14C87268787), SPH_C64(0x116D62A9B8DAB8B8), @@ -1255,7 +186,10 @@ static const uint64_t plain_T0[256] = { SPH_C64(0x6BED3F93F8C7F8F8), SPH_C64(0xC211A44486228686) }; -static const uint64_t plain_T1[256] = { +#if USE_ALL_TABLES + +/* +__constant__ __align__(64) uint64_t mixTob1Tox[256] = { SPH_C64(0x3078C018601818D8), SPH_C64(0x46AF05238C232326), SPH_C64(0x91F97EC63FC6C6B8), SPH_C64(0xCD6F13E887E8E8FB), SPH_C64(0x13A14C87268787CB), SPH_C64(0x6D62A9B8DAB8B811), @@ -2171,466 +1105,703 @@ static const uint64_t plain_T7[256] = { SPH_C64(0x287550885D28A028), SPH_C64(0x5C86B831DA5C6D5C), SPH_C64(0xF86BED3F93F8C7F8), SPH_C64(0x86C211A444862286) }; +*/ /** * Round constants. 
*/ -__constant__ uint64_t InitVector_RC[10]; - -static const uint64_t plain_RC[10] = { - SPH_C64(0x4F01B887E8C62318), - SPH_C64(0x52916F79F5D2A636), - SPH_C64(0x357B0CA38E9BBC60), - SPH_C64(0x57FE4B2EC2D7E01D), - SPH_C64(0xDA4AF09FE5377715), - SPH_C64(0x856BA0B10A29C958), - SPH_C64(0x67053ECBF4105DBD), - SPH_C64(0xD8957DA78B4127E4), - SPH_C64(0x9E4717DD667CEEFB), - SPH_C64(0x33835AAD07BF2DCA) -}; - -/* ====================================================================== */ - - -#define TRANSFER(dst, src) { \ - dst[0] = src ## 0; \ - dst[1] = src ## 1; \ - dst[2] = src ## 2; \ - dst[3] = src ## 3; \ - dst[4] = src ## 4; \ - dst[5] = src ## 5; \ - dst[6] = src ## 6; \ - dst[7] = src ## 7; \ -} +#endif #if !USE_ALL_TABLES -#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF) - -/* method disabled to reduce code size */ -__device__ __forceinline__ -static uint64_t table_skew(uint64_t val, int num) { - return ROTL64(val, 8 * num); -} __device__ __forceinline__ -static uint64_t ROUND_ELT(const uint64_t* sharedMemory, uint64_t* __restrict__ in, - int i0,int i1,int i2,int i3,int i4,int i5,int i6,int i7) +static uint2 ROUND_ELT(const uint2*const __restrict__ sharedMemory, const uint2*const __restrict__ in, +const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7) { - uint32_t idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7; - idx0 = BYTE(in[i0], 0); - idx1 = BYTE(in[i1], 1); - idx2 = BYTE(in[i2], 2); - idx3 = BYTE(in[i3], 3); - idx4 = BYTE(in[i4], 4); - idx5 = BYTE(in[i5], 5); - idx6 = BYTE(in[i6], 6); - idx7 = BYTE(in[i7], 7); - - return xor8( - sharedMemory[idx0], - table_skew(sharedMemory[idx1], 1), - table_skew(sharedMemory[idx2], 2), - table_skew(sharedMemory[idx3], 3), - table_skew(sharedMemory[idx4], 4), - table_skew(sharedMemory[idx5], 5), - table_skew(sharedMemory[idx6], 6), - table_skew(sharedMemory[idx7], 7) - ); -} + return( + sharedMemory[__byte_perm(in[(i0)].x, 0, 0x4440)] ^ 
ROL2(sharedMemory[__byte_perm(in[(i1)].x, 0, 0x4441)], 8) ^ ROL2(sharedMemory[__byte_perm(in[(i2)].x, 0, 0x4442)], 16) ^ + ROL2(sharedMemory[__byte_perm(in[(i3)].x, 0, 0x4443)], 24) ^ sharedMemory[__byte_perm(in[(i4)].y, 0, 0x4440) + 256] ^ ROL2(sharedMemory[__byte_perm(in[(i5)].y, 0, 0x4441) + 256], 8) ^ + ROL2(sharedMemory[__byte_perm(in[(i6)].y, 0, 0x4442) + 256], 16) ^ ROL2(sharedMemory[__byte_perm(in[(i7)].y, 0, 0x4443) + 256], 24)); -#else -__device__ __forceinline__ -static uint64_t ROUND_ELT(const uint64_t* sharedMemory, uint64_t* __restrict__ in, +} +#else +__device__ uint2 ROUND_ELT(const uint2*const __restrict__ sharedMemory, uint2* const __restrict__ in, const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7) { - uint32_t* in32 = (uint32_t*)in; - return (sharedMemory[__byte_perm(in32[(i0 << 1)], 0, 0x4440)] ^ sharedMemory[__byte_perm(in32[(i1 << 1)], 0, 0x4441) + 256] ^ - sharedMemory[__byte_perm(in32[(i2 << 1)], 0, 0x4442) + 512] ^ sharedMemory[__byte_perm(in32[(i3 << 1)], 0, 0x4443) + 768] ^ - sharedMemory[__byte_perm(in32[(i4 << 1) + 1], 0, 0x4440) + 1024] ^ sharedMemory[__byte_perm(in32[(i5 << 1) + 1], 0, 0x4441) + 1280] ^ - sharedMemory[__byte_perm(in32[(i6 << 1) + 1], 0, 0x4442) + 1536] ^ sharedMemory[__byte_perm(in32[(i7 << 1) + 1], 0, 0x4443) + 1792]); + return (sharedMemory[__byte_perm(in[i0].x, 0, 0x4440)] ^ + sharedMemory[__byte_perm(in[i1].x, 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(in[i2].x, 0, 0x4442) + 512] ^ + sharedMemory[__byte_perm(in[i3].x, 0, 0x4443) + 768] ^ + SWAPDWORDS2(sharedMemory[__byte_perm(in[i4].y, 0, 0x4440)]) ^ + SWAPDWORDS2(sharedMemory[__byte_perm(in[i5].y, 0, 0x4441) + 256]) ^ + SWAPDWORDS2(sharedMemory[__byte_perm(in[i6].y, 0, 0x4442) + 512]) ^ + SWAPDWORDS2(sharedMemory[__byte_perm(in[i7].y, 0, 0x4443) + 768])); } #endif /* USE_ALL_TABLES */ -#define ROUND(table, in, out, c0, c1, c2, c3, c4, c5, c6, c7) { \ - out ## 0 = xor1(ROUND_ELT(table, in, 0, 7, 6, 
5, 4, 3, 2, 1), c0); \ - out ## 1 = xor1(ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2), c1); \ - out ## 2 = xor1(ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3), c2); \ - out ## 3 = xor1(ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4), c3); \ - out ## 4 = xor1(ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5), c4); \ - out ## 5 = xor1(ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6), c5); \ - out ## 6 = xor1(ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7), c6); \ - out ## 7 = xor1(ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0), c7); \ -} - -#define ROUND1(table, in, out,c) { \ - out ## 0 = xor1(ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1),c); \ - out ## 1 = ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2); \ - out ## 2 = ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3); \ - out ## 3 = ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4); \ - out ## 4 = ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5); \ - out ## 5 = ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6); \ - out ## 6 = ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7); \ - out ## 7 = ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0); \ -} - -#define ROUND_KSCHED(table, in, out, c) \ - ROUND1(table, in, out,c) \ - TRANSFER(in, out) - -#define ROUND_WENC(table, in, key, out) \ - ROUND(table, in, out, key[0], key[1], key[2],key[3], key[4], key[5], key[6], key[7]) \ - TRANSFER(in, out) -__global__ +__global__ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) { - __shared__ uint64_t sharedMemory[2048]; + /* +#if USE_ALL_TABLES + __shared__ uint2 sharedMemory[256*4]; +#else + __shared__ uint2 sharedMemory[256*2]; +#endif if (threadIdx.x < 256) { - sharedMemory[threadIdx.x] = mixTob0Tox[threadIdx.x]; - #if USE_ALL_TABLES - sharedMemory[threadIdx.x+256] = mixTob1Tox[threadIdx.x]; - sharedMemory[threadIdx.x+512] = mixTob2Tox[threadIdx.x]; - sharedMemory[threadIdx.x+768] = mixTob3Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1024] = mixTob4Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1280] = mixTob5Tox[threadIdx.x]; - 
sharedMemory[threadIdx.x+1536] = mixTob6Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x]; - #endif +#if USE_ALL_TABLES + sharedMemory[threadIdx.x] = vectorize(mixTob0Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 256] = vectorize(mixTob1Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 512] = ROL2(vectorize(mixTob0Tox[threadIdx.x]), 16); + sharedMemory[threadIdx.x + 768] = ROL2(vectorize(mixTob1Tox[threadIdx.x]), 16); +// sharedMemory[threadIdx.x + 1024] = SWAPDWORDS2(sharedMemory[threadIdx.x]); +// sharedMemory[threadIdx.x + 1280] = SWAPDWORDS2(sharedMemory[threadIdx.x + 256]); +// sharedMemory[threadIdx.x + 1536] = SWAPDWORDS2(sharedMemory[threadIdx.x + 512]); +// sharedMemory[threadIdx.x + 1792] = SWAPDWORDS2(sharedMemory[threadIdx.x + 768]); +#else + sharedMemory[threadIdx.x] = vectorize(mixTob0Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 256] = SWAPDWORDS2(sharedMemory[threadIdx.x]); +#endif } - - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = startNounce + thread; + const uint2 InitVector_RC[10] = + { + { 0xE8C62318UL, 0x4F01B887UL }, + { 0xF5D2A636UL, 0x52916F79UL }, + { 0x8E9BBC60UL, 0x357B0CA3UL }, + { 0xC2D7E01DUL, 0x57FE4B2EUL }, + { 0xE5377715UL, 0xDA4AF09FUL }, + { 0x0A29C958UL, 0x856BA0B1UL }, + { 0xF4105DBDUL, 0x67053ECBUL }, + { 0x8B4127E4UL, 0xD8957DA7UL }, + { 0x667CEEFBUL, 0x9E4717DDUL }, + { 0x07BF2DCAUL, 0x33835AADUL } + }; + + const uint32_t nounce = startNounce + thread; union { uint8_t h1[64]; uint32_t h4[16]; - uint64_t h8[8]; + uint2 h8[8]; } hash; - uint64_t state[8]; - uint64_t n[8]; - uint64_t h[8]; + uint2 state[8]; + uint2 n[8]; + uint2 h[8]; - #pragma unroll 8 - for (int i=0; i<8; i++) { - n[i] = c_PaddedMessage80[i]; // read data - h[i] = 0; // read state +#pragma unroll 8 + for (int i = 0; i<8; i++) { + n[i] = vectorize(c_PaddedMessage80[i]); // read data + h[i] = make_uint2(0,0); 
// read state } - #pragma unroll 10 - for (unsigned r=0; r < 10; r++) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); - ROUND_WENC(sharedMemory, n, h, tmp); +//#pragma unroll 10 + for (int i = 0; i < 10; i++) + { + uint2 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + tmp0 = ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1)^ InitVector_RC[i]; + tmp1 = ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp2 = ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp3 = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp4 = ROUND_ELT(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp5 = ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp6 = ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp7 = ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + h[0] = tmp0; + h[1] = tmp1; + h[2] = tmp2; + h[3] = tmp3; + h[4] = tmp4; + h[5] = tmp5; + h[6] = tmp6; + h[7] = tmp7; + tmp0 = ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ h[0]; + tmp1 = ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ h[1]; + tmp2 = ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ h[2]; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ h[3]; + tmp4 = ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ h[4]; + tmp5 = ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ h[5]; + tmp6 = ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ h[6]; + tmp7 = ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ h[7]; + n[0] = tmp0; + n[1] = tmp1; + n[2] = tmp2; + n[3] = tmp3; + n[4] = tmp4; + n[5] = tmp5; + n[6] = tmp6; + n[7] = tmp7; } - #pragma unroll 8 - for (int i=0; i < 8; i++) { - state[i] = xor1(n[i],c_PaddedMessage80[i]); +#pragma unroll 8 + for (int i = 0; i < 8; i++) { + state[i] = n[i]^vectorize(c_PaddedMessage80[i]); } /// round 2 /////// ////////////////////////////////// - #pragma unroll 8 - for (int i=0; i<8; i++) { +#pragma unroll 8 + for (int i = 0; i<8; i++) { h[i] = state[i]; 
//read state } - n[0] = xor1(c_PaddedMessage80[8], h[0]); - n[1] = xor1(REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce)), h[1]); - n[2] = xor1(0x0000000000000080, h[2]); + n[0] = vectorize(c_PaddedMessage80[8])^ h[0]; + n[1] = vectorize(REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce)))^ h[1]; + n[2].y = h[2].y; + n[2].x = h[2].x ^ 0x80; n[3] = h[3]; n[4] = h[4]; n[5] = h[5]; n[6] = h[6]; - n[7] = xor1(0x8002000000000000, h[7]); - - #pragma unroll 10 - for (unsigned r=0; r < 10; r++) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); - ROUND_WENC(sharedMemory, n, h, tmp); + n[7].x = h[7].x; + n[7].y = h[7].y ^ 0x80020000; + +//#pragma unroll 10 + for (int i = 0; i < 10; i++) + { + uint2 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + tmp0 = ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1)^InitVector_RC[i]; + tmp1 = ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp2 = ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp3 = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp4 = ROUND_ELT(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp5 = ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp6 = ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp7 = ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + h[0] = tmp0; + h[1] = tmp1; + h[2] = tmp2; + h[3] = tmp3; + h[4] = tmp4; + h[5] = tmp5; + h[6] = tmp6; + h[7] = tmp7; + tmp0 = ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1)^ h[0]; + tmp1 = ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ h[1]; + tmp2 = ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ h[2]; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ h[3]; + tmp4 = ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ h[4]; + tmp5 = ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ h[5]; + tmp6 = ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ h[6]; + tmp7 = ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ h[7]; + 
n[0] = tmp0; + n[1] = tmp1; + n[2] = tmp2; + n[3] = tmp3; + n[4] = tmp4; + n[5] = tmp5; + n[6] = tmp6; + n[7] = tmp7; } - state[0] = xor3(state[0], n[0], c_PaddedMessage80[8]); - state[1] = xor3(state[1], n[1], REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce)) ); - state[2] = xor3(state[2], n[2], 0x0000000000000080); - state[3] = xor1(state[3], n[3]); - state[4] = xor1(state[4], n[4]); - state[5] = xor1(state[5], n[5]); - state[6] = xor1(state[6], n[6]); - state[7] = xor3(state[7], n[7], 0x8002000000000000); - - #pragma unroll 8 + state[0] = state[0] ^ n[0] ^ vectorize(c_PaddedMessage80[8]); + state[1] = state[1] ^ n[1] ^ vectorize(REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce))); + state[2].y = state[2].y ^ n[2].y; + state[2].x = state[2].x ^ n[2].x ^ 0x80; + state[3] = state[3] ^ n[3]; + state[4] = state[4] ^ n[4]; + state[5] = state[5] ^ n[5]; + state[6] = state[6] ^ n[6]; + state[7].x = state[7].x ^ n[7].x; + state[7].y = state[7].y ^ n[7].y ^ 0x80020000; + +#pragma unroll 8 for (unsigned i = 0; i < 8; i++) hash.h8[i] = state[i]; uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; - #pragma unroll 16 - for (int i=0; i<16; i++) +#pragma unroll 16 + for (int i = 0; i<16; i++) outHash[i] = hash.h4[i]; - } // thread < threads + */ } -__global__ -void x15_whirlpool_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - __shared__ uint64_t sharedMemory[2048]; +__constant__ uint2 precalc[8 * 9] = { + { 0xf889ab3b, 0x24aed1ea }, + { 0x66454544, 0xafcbe945 }, + { 0xa4a4fe70, 0x89b2a4c5 }, + { 0xe1a9fac5, 0xa0e1cce1 }, + { 0x5cc0ac48, 0xfcb8fcfc }, + { 0x260ef78f, 0x698f8f90 }, + { 0x07147996, 0x797985d7 }, + { 0x68f8a8f8, 0xf878c8b8 }, + { 0xdbbf19d3, 0x58704630 }, + { 0xd1235b29, 0xdb37cfaf }, + { 0xc28a2c01, 0x98ac958b }, + { 0xb19e6381, 0xa706b2c0 }, + { 0x7a605e44, 0xdb09b2b0 }, + { 0xcf2c5b73, 0x71bc8cbc }, + { 0x240967dc, 0xd3ddedef }, + { 0xf03b8d7b, 0x197d3bd7 }, + { 0xc1aabe38, 0x866511de }, + 
{ 0xd0f37c68, 0x7f33874a }, + { 0xdbfa37f3, 0x57f0ad98 }, + { 0x5842e2c5, 0xbc8d35ee }, + { 0xe8f00911, 0x7e246e99 }, + { 0xedd6c501, 0x0134b010 }, + { 0xf152c9fb, 0xd3ec287b }, + { 0x0cdc5632, 0x4027f1c7 }, + { 0x20a525af, 0x14cf9b94 }, + { 0xa92636c1, 0x4d53c4e3 }, + { 0x867d0fe6, 0xe1f94077 }, + { 0xbbe65d91, 0x29066ae2 }, + { 0xcc545a96, 0x8d5efe4c }, + { 0xcb31e9be, 0xa63a3262 }, + { 0x18597bb1, 0x476a8496 }, + { 0x36c9f0d4, 0x31af5927 }, + { 0xc0b5f9e2, 0xb00b3725 }, + { 0xa2cb2b39, 0xa5948416 }, + { 0xcef88a60, 0x148c34fa }, + { 0x6437a57a, 0x19928c41 }, + { 0xa146f3b3, 0x893f83fa }, + { 0x483f4997, 0x7ccf0278 }, + { 0xbae8addc, 0x238f001e }, + { 0x494f7792, 0x3d32b0ed }, + { 0x82634175, 0x2fff4d77 }, + { 0xd038faff, 0x00460355 }, + { 0x49027dbf, 0x61f3983e }, + { 0xc260a8f4, 0x0bcee59a }, + { 0x445adfc8, 0x279d5dee }, + { 0x555af423, 0xa4007504 }, + { 0x121016b0, 0x8ce2f902 }, + { 0x29cd30ac, 0x1d333368 }, + { 0x82f16b03, 0x89ad8468 }, + { 0x62c64099, 0x637146d8 }, + { 0x173e434c, 0x10c2194b }, + { 0xd3cf9ce2, 0xc586ff4c }, + { 0xa011ff21, 0x5326df42 }, + { 0xcb008e1b, 0x134be46c }, + { 0xf73b12a6, 0xceb747a3 }, + { 0x0e9018d9, 0xca33283b }, + { 0x7a671cd0, 0xf92c9a0a }, + { 0x532f942a, 0xb2b6634a }, + { 0x46224288, 0xb4a8acfe }, + { 0xc75c4a47, 0x5935583d }, + { 0x5d92a674, 0xa16f5ca5 }, + { 0x8ce61777, 0x395c73c4 }, + { 0x0b3b2a08, 0xc61aec53 }, + { 0xeb58f62a, 0x62e74d81 }, + { 0xb6489548, 0x3abcee01 }, + { 0xc66b0da5, 0x818eed6b }, + { 0xcf3dcee0, 0x755a2688 }, + { 0xdb4a8cc2, 0xe99cf6c0 }, + { 0xd59cb754, 0x1385717f }, + { 0x8a4b4143, 0x7b0b7d97 }, + { 0xbb351963, 0x7a15f6db }, + { 0xf64e7a6a, 0x27820137 } +}; - if (threadIdx.x < 256) { - sharedMemory[threadIdx.x] = mixTob0Tox[threadIdx.x]; - #if USE_ALL_TABLES - sharedMemory[threadIdx.x+256] = mixTob1Tox[threadIdx.x]; - sharedMemory[threadIdx.x+512] = mixTob2Tox[threadIdx.x]; - sharedMemory[threadIdx.x+768] = mixTob3Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1024] = mixTob4Tox[threadIdx.x]; - 
sharedMemory[threadIdx.x+1280] = mixTob5Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1536] = mixTob6Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x]; - #endif - } - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) +__global__ __launch_bounds__(threadsperblock, 2) +void x15_whirlpool_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +{ +#if USE_ALL_TABLES + __shared__ uint2 sharedMemory[256*4]; +#else + __shared__ uint2 sharedMemory[512]; +#endif + if (threadIdx.x < 256) + { +#if USE_ALL_TABLES + sharedMemory[threadIdx.x] = vectorize(mixTob0Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 256] = ROL8(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 512] = ROL16(sharedMemory[threadIdx.x]); + sharedMemory[threadIdx.x + 768] = ROL16(sharedMemory[threadIdx.x + 256]); +// sharedMemory[threadIdx.x + 1024] = SWAPDWORDS2(sharedMemory[threadIdx.x]); +// sharedMemory[threadIdx.x + 1280] = SWAPDWORDS2(sharedMemory[threadIdx.x + 256]); +// sharedMemory[threadIdx.x + 1536] = SWAPDWORDS2(sharedMemory[threadIdx.x + 512]); +// sharedMemory[threadIdx.x + 1792] = SWAPDWORDS2(sharedMemory[threadIdx.x + 768]); +#else + sharedMemory[threadIdx.x] = vectorize(mixTob0Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 256] = SWAPDWORDS2(sharedMemory[threadIdx.x]); +#endif + } + __syncthreads(); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = g_nonceVector ? 
g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = (startNounce + thread); + const uint2 InitVector_RC[10] = + { + { 0xE8C62318UL, 0x4F01B887UL }, + { 0xF5D2A636UL, 0x52916F79UL }, + { 0x8E9BBC60UL, 0x357B0CA3UL }, + { 0xC2D7E01DUL, 0x57FE4B2EUL }, + { 0xE5377715UL, 0xDA4AF09FUL }, + { 0x0A29C958UL, 0x856BA0B1UL }, + { 0xF4105DBDUL, 0x67053ECBUL }, + { 0x8B4127E4UL, 0xD8957DA7UL }, + { 0x667CEEFBUL, 0x9E4717DDUL }, + { 0x07BF2DCAUL, 0x33835AADUL } + }; + uint32_t hashPosition = (nounce - startNounce) << 3; - uint64_t hash[8], state[8], n[8], h[8] = { 0 }; - uint8_t i; - - #pragma unroll 8 - for (i=0; i<8; i++) - n[i] = hash[i] = g_hash[hashPosition + i]; - - #pragma unroll 10 - for (i=0; i < 10; i++) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[i]); - ROUND_WENC(sharedMemory, n, h, tmp); + uint2 hash[8], state[8], n[8], h[8]; + int i; + + uint28 *phash = (uint28*)&g_hash[hashPosition]; + uint28 *outpt = (uint28*)hash; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + for (i = 0; i < 8; i++) + { + n[i] = hash[i]; } - #pragma unroll 8 - for (i=0; i<8; i++) - state[i] = xor1(n[i], hash[i]); - - #pragma unroll 8 - for (i=0; i < 8; i++) { - h[i] = state[i]; +//#pragma unroll 8 +// for (i = 0; i < 8; i++) +// n[i] = hash[i] = vectorize(g_hash[hashPosition + i]); + + uint2 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, pre; + + pre = make_uint2( 0x28282828, 0x28282828); + + tmp0 = ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ make_uint2(3236825904UL, 1730777263UL); + tmp1 = ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ pre; + tmp2 = ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ pre; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ pre; + tmp4 = ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ pre; + tmp5 = ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ pre; + tmp6 = ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ pre; + n[7] = 
ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ pre; + + n[0] = tmp0; + n[1] = tmp1; + n[2] = tmp2; + n[3] = tmp3; + n[4] = tmp4; + n[5] = tmp5; + n[6] = tmp6; + + #pragma unroll 1 + for (i = 0; i < 8*9; i+=8) + { + tmp0 = ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ precalc[i]; + tmp1 = ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ precalc[i+1]; + tmp2 = ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ precalc[i+2]; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ precalc[i+3]; + tmp4 = ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ precalc[i+4]; + tmp5 = ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ precalc[i+5]; + tmp6 = ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ precalc[i+6]; + n[7] = ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ precalc[i + 7]; + n[0] = tmp0; + n[1] = tmp1; + n[2] = tmp2; + n[3] = tmp3; + n[4] = tmp4; + n[5] = tmp5; + n[6] = tmp6; } - n[0] = xor1(0x80, state[0]); + + +#pragma unroll 8 + for (i = 0; i<8; i++) + h[i] = state[i] = n[i] ^ hash[i]; + + n[0] = state[0]; n[1] = state[1]; n[2] = state[2]; n[3] = state[3]; n[4] = state[4]; n[5] = state[5]; n[6] = state[6]; - n[7] = xor1(0x2000000000000, state[7]); - - #pragma unroll 10 - for (i=0; i < 10; i++) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[i]); - ROUND_WENC(sharedMemory, n, h, tmp); + n[7] = state[7]; + n[0].x ^= 0x80; + n[7].y ^= 0x20000; + +#pragma unroll 10 + for (i = 0; i < 10; i++) + { + uint2 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + tmp0 = ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1) ^ InitVector_RC[i]; + tmp1 = ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp2 = ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp3 = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp4 = ROUND_ELT(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp5 = ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp6 = ROUND_ELT(sharedMemory, h, 6, 
5, 4, 3, 2, 1, 0, 7); + h[7] = ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + h[0] = tmp0; + h[1] = tmp1; + h[2] = tmp2; + h[3] = tmp3; + h[4] = tmp4; + h[5] = tmp5; + h[6] = tmp6; + tmp0 = ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ h[0]; + tmp1 = ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ h[1]; + tmp2 = ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ h[2]; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ h[3]; + tmp4 = ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ h[4]; + tmp5 = ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ h[5]; + tmp6 = ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ h[6]; + n[7] = ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ h[7]; + n[0] = tmp0; + n[1] = tmp1; + n[2] = tmp2; + n[3] = tmp3; + n[4] = tmp4; + n[5] = tmp5; + n[6] = tmp6; } - state[0] = xor3(state[0], n[0], 0x80); - state[1] = xor1(state[1], n[1]); - state[2] = xor1(state[2], n[2]); - state[3] = xor1(state[3], n[3]); - state[4] = xor1(state[4], n[4]); - state[5] = xor1(state[5], n[5]); - state[6] = xor1(state[6], n[6]); - state[7] = xor3(state[7], n[7], 0x2000000000000); - - #pragma unroll 8 - for (i=0; i < 8; i++) - g_hash[hashPosition + i] = state[i]; + state[0].y = state[0].y ^ n[0].y; + state[0].x = state[0].x ^ n[0].x ^ 0x80; + state[1] = state[1] ^ n[1]; + state[2] = state[2] ^ n[2]; + state[3] = state[3] ^ n[3]; + state[4] = state[4] ^ n[4]; + state[5] = state[5] ^ n[5]; + state[6] = state[6] ^ n[6]; + state[7].x = state[7].x ^ n[7].x; + state[7].y = state[7].y ^ n[7].y ^ 0x20000; + +#pragma unroll 8 + for (i = 0; i < 8; i++) + g_hash[hashPosition + i] = devectorize(state[i]); } } -__global__ -void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint32_t *resNounce) +__global__ +void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resNounce) { - __shared__ uint64_t sharedMemory[2048]; + /* 
+#if USE_ALL_TABLES + __shared__ uint2 sharedMemory[256*4]; +#else + __shared__ uint2 sharedMemory[256*2]; +#endif if (threadIdx.x < 256) { - sharedMemory[threadIdx.x] = mixTob0Tox[threadIdx.x]; - #if USE_ALL_TABLES - sharedMemory[threadIdx.x+256] = mixTob1Tox[threadIdx.x]; - sharedMemory[threadIdx.x+512] = mixTob2Tox[threadIdx.x]; - sharedMemory[threadIdx.x+768] = mixTob3Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1024] = mixTob4Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1280] = mixTob5Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1536] = mixTob6Tox[threadIdx.x]; - sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x]; - #endif +#if USE_ALL_TABLES + sharedMemory[threadIdx.x] = vectorize(mixTob0Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 256] = vectorize(mixTob1Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 512] = ROL2(vectorize(mixTob0Tox[threadIdx.x]), 16); + sharedMemory[threadIdx.x + 768] = ROL2(vectorize(mixTob1Tox[threadIdx.x]), 16); +// sharedMemory[threadIdx.x + 1024] = SWAPDWORDS2(sharedMemory[threadIdx.x]); +// sharedMemory[threadIdx.x + 1280] = SWAPDWORDS2(sharedMemory[threadIdx.x + 256]); +// sharedMemory[threadIdx.x + 1536] = SWAPDWORDS2(sharedMemory[threadIdx.x + 512]); +// sharedMemory[threadIdx.x + 1792] = SWAPDWORDS2(sharedMemory[threadIdx.x + 768]); +#else + sharedMemory[threadIdx.x] = vectorize(mixTob0Tox[threadIdx.x]); + sharedMemory[threadIdx.x + 256] = SWAPDWORDS2(sharedMemory[threadIdx.x]); +#endif } - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = g_nonceVector ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint64_t *inpHash = (uint64_t*) &g_hash[8 * hashPosition]; - uint64_t h8[8]; - - #pragma unroll 8 - for (int i=0; i<8; i++) { - h8[i] = inpHash[i]; + const uint64_t target = ((uint64_t*)pTarget)[3]; + + const uint32_t nounce = (startNounce + thread); + + const uint2 InitVector_RC[10] = + { + { 0xE8C62318UL, 0x4F01B887UL }, + { 0xF5D2A636UL, 0x52916F79UL }, + { 0x8E9BBC60UL, 0x357B0CA3UL }, + { 0xC2D7E01DUL, 0x57FE4B2EUL }, + { 0xE5377715UL, 0xDA4AF09FUL }, + { 0x0A29C958UL, 0x856BA0B1UL }, + { 0xF4105DBDUL, 0x67053ECBUL }, + { 0x8B4127E4UL, 0xD8957DA7UL }, + { 0x667CEEFBUL, 0x9E4717DDUL }, + { 0x07BF2DCAUL, 0x33835AADUL } + }; + const uint32_t hashPosition = nounce - startNounce; + uint64_t *inpHash = (uint64_t*)&g_hash[8 * hashPosition]; + uint2 h8[8]; + +#pragma unroll 8 + for (int i = 0; i < 8; i++) { + h8[i] = vectorize(inpHash[i]); } - uint64_t state[8]; - uint64_t n[8]; - uint64_t h[8]; + uint2 state[8]; + uint2 n[8]; + uint2 h[8]; - #pragma unroll 8 - for (int i=0; i<8; i++) { +#pragma unroll 8 + for (int i = 0; i < 8; i++) { n[i] = h8[i]; - h[i] = 0; + h[i] = vectorizelow(0); } - #pragma unroll 10 - for (unsigned r=0; r < 10; r++) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); - ROUND_WENC(sharedMemory, n, h, tmp); +#pragma unroll 10 + for (int r = 0; r < 10; r++) { + uint2 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + tmp0 = ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1) ^ InitVector_RC[r]; + tmp1 = ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp2 = ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp3 = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp4 = ROUND_ELT(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp5 = ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp6 = ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp7 = ROUND_ELT(sharedMemory, h, 
7, 6, 5, 4, 3, 2, 1, 0); + h[0] = tmp0; + h[1] = tmp1; + h[2] = tmp2; + h[3] = tmp3; + h[4] = tmp4; + h[5] = tmp5; + h[6] = tmp6; + h[7] = tmp7; + tmp0 = ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ h[0]; + tmp1 = ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ h[1]; + tmp2 = ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ h[2]; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ h[3]; + tmp4 = ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ h[4]; + tmp5 = ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ h[5]; + tmp6 = ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ h[6]; + tmp7 = ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ h[7]; + n[0] = tmp0; + n[1] = tmp1; + n[2] = tmp2; + n[3] = tmp3; + n[4] = tmp4; + n[5] = tmp5; + n[6] = tmp6; + n[7] = tmp7; + } - #pragma unroll 8 - for (int i=0; i<8; i++) { - state[i] = xor1(n[i], h8[i]); +#pragma unroll 8 + for (int i = 0; i < 8; i++) { + state[i] = n[i] ^ h8[i]; } - #pragma unroll 8 - for (int i=0; i<8; i++) { +#pragma unroll 8 + for (int i = 0; i < 8; i++) { h[i] = state[i]; } - n[0] = xor1(0x80, state[0]); + n[0].y = state[0].y; + n[0].x = state[0].x ^ 0x80; n[1] = state[1]; n[2] = state[2]; n[3] = state[3]; n[4] = state[4]; n[5] = state[5]; n[6] = state[6]; - n[7] = xor1(0x2000000000000, state[7]); - - #pragma unroll 10 - for (unsigned r=0; r < 10; r++) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); - ROUND_WENC(sharedMemory, n, h, tmp); + n[7].x = state[7].x; + n[7].y = state[7].y ^ 0x20000; +#pragma unroll 9 + for (int r = 0; r < 9; r++) + { + uint2 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + tmp0 = ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1) ^ InitVector_RC[r]; + tmp1 = ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp2 = ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp3 = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp4 = ROUND_ELT(sharedMemory, h, 4, 3, 2, 
1, 0, 7, 6, 5); + tmp5 = ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp6 = ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp7 = ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + h[0] = tmp0; + h[1] = tmp1; + h[2] = tmp2; + h[3] = tmp3; + h[4] = tmp4; + h[5] = tmp5; + h[6] = tmp6; + h[7] = tmp7; + tmp0 = ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1) ^ h[0]; + tmp1 = ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2) ^ h[1]; + tmp2 = ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3) ^ h[2]; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ h[3]; + tmp4 = ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5) ^ h[4]; + tmp5 = ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6) ^ h[5]; + tmp6 = ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7) ^ h[6]; + tmp7 = ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0) ^ h[7]; + n[0] = tmp0; + n[1] = tmp1; + n[2] = tmp2; + n[3] = tmp3; + n[4] = tmp4; + n[5] = tmp5; + n[6] = tmp6; + n[7] = tmp7; } + uint2 tmp3; + tmp3 = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + h[3] = tmp3; + tmp3 = ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4) ^ h[3]; + n[3] = tmp3; + + state[3] = state[3] ^ n[3]; + + if(devectorize(state[3]) <= target) + { + uint32_t tmp = atomicExch(resNounce, nounce); + if (tmp != 0xffffffff) + resNounce[1] = tmp; - state[0] = xor3(state[0], n[0], 0x80); - state[1] = xor1(state[1], n[1]); - state[2] = xor1(state[2], n[2]); - state[3] = xor1(state[3], n[3]); - state[4] = xor1(state[4], n[4]); - state[5] = xor1(state[5], n[5]); - state[6] = xor1(state[6], n[6]); - state[7] = xor3(state[7], n[7], 0x2000000000000); - - bool rc = (state[3] <= ((uint64_t*)pTarget)[3]); - if (rc && resNounce[0] > nounce) - resNounce[0] = nounce; + } } + */ } __host__ -extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode) +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, const int mode) { - switch (mode) { - case 0: /* x15 with rotated T1-T7 (based on T0) */ - 
cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice); -#if USE_ALL_TABLES - cudaMemcpyToSymbol(mixTob1Tox, plain_T1, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob2Tox, plain_T2, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob3Tox, plain_T3, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob4Tox, plain_T4, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob5Tox, plain_T5, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob6Tox, plain_T6, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob7Tox, plain_T7, (256*8), 0, cudaMemcpyHostToDevice); -#endif - break; - - case 1: /* old whirlpool */ - cudaMemcpyToSymbol(InitVector_RC, old1_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob0Tox, old1_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob1Tox, old1_T1, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob2Tox, old1_T2, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob3Tox, old1_T3, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob4Tox, old1_T4, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob5Tox, old1_T5, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob6Tox, old1_T6, (256*8), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(mixTob7Tox, old1_T7, (256*8), 0, cudaMemcpyHostToDevice); - cudaMalloc(&d_WNonce[thr_id], sizeof(uint32_t)); - cudaMallocHost(&d_wnounce[thr_id], sizeof(uint32_t)); - break; - } +// cudaMemcpyToSymbolAsync(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice); +//#if USE_ALL_TABLES +// cudaMemcpyToSymbolAsync(mixTob1Tox, plain_T1, (256*8), 0, cudaMemcpyHostToDevice); +// cudaMemcpyToSymbolAsync(mixTob2Tox, plain_T2, (256*8), 0, cudaMemcpyHostToDevice); +// cudaMemcpyToSymbolAsync(mixTob3Tox, 
plain_T3, (256*8), 0, cudaMemcpyHostToDevice); +// cudaMemcpyToSymbolAsync(mixTob4Tox, plain_T4, (256*8), 0, cudaMemcpyHostToDevice); +// cudaMemcpyToSymbolAsync(mixTob5Tox, plain_T5, (256*8), 0, cudaMemcpyHostToDevice); +// cudaMemcpyToSymbolAsync(mixTob6Tox, plain_T6, (256*8), 0, cudaMemcpyHostToDevice); +// cudaMemcpyToSymbolAsync(mixTob7Tox, plain_T7, (256*8), 0, cudaMemcpyHostToDevice); +//#endif } __host__ extern void x15_whirlpool_cpu_free(int thr_id) { cudaFree(d_WNonce[thr_id]); - cudaFreeHost(d_wnounce[thr_id]); + cudaFreeHost(h_wnounce[thr_id]); } __host__ -extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { dim3 grid((threads + threadsperblock-1) / threadsperblock); dim3 block(threadsperblock); - x15_whirlpool_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x15_whirlpool_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash); //MyStreamSynchronize(NULL, order, thr_id); } __host__ -extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +extern uint32_t* whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { - uint32_t result = 0xffffffff; - - dim3 grid((threads + threadsperblock-1) / threadsperblock); + dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cudaMemset(d_WNonce[thr_id], 0xff, sizeof(uint32_t)); + cudaMemsetAsync(d_WNonce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); - oldwhirlpool_gpu_finalhash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector,d_WNonce[thr_id]); + //oldwhirlpool_gpu_finalhash_64 << >>(threads, startNounce, (uint64_t*)d_hash, d_WNonce[thr_id]); //MyStreamSynchronize(NULL, order, thr_id); - 
cudaMemcpy(d_wnounce[thr_id], d_WNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - result = *d_wnounce[thr_id]; - return result; + CUDA_SAFE_CALL(cudaMemcpyAsync(h_wnounce[thr_id], d_WNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id])); cudaStreamSynchronize(gpustream[thr_id]); + return h_wnounce[thr_id]; } __host__ -void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order) +void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) { - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1) / threadsperblock); - dim3 block(threadsperblock); - - oldwhirlpool_gpu_hash_80<<>>(threads, startNounce, d_outputHash); } __host__ -void whirlpool512_setBlock_80(void *pdata, const void *ptarget) +void whirlpool512_setBlock_80(int thr_id, void *pdata, const void *ptarget) { - unsigned char PaddedMessage[128]; - memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 48); - PaddedMessage[80] = 0x80; /* ending */ - cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); } diff --git a/x15/whirlpool.cu b/x15/whirlpool.cu index 4fb891a619..0f891407c3 100644 --- a/x15/whirlpool.cu +++ b/x15/whirlpool.cu @@ -1,26 +1,23 @@ /* - * whirlpool routine (djm) + * whirlpool routine djm&SP */ extern "C" { #include "sph/sph_whirlpool.h" -#include "miner.h" } - +#include "miner.h" #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode); -extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern 
void whirlpool512_setBlock_80(void *pdata, const void *ptarget); -extern void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void whirlpool512_setBlock_80(int thr_id, void *pdata, const void *ptarget); +extern void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern uint32_t* whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); // CPU Hash function -extern "C" void wcoinhash(void *state, const void *input) +void wcoinhash(void *state, const void *input) { sph_whirlpool_context ctx_whirlpool; @@ -49,74 +46,101 @@ extern "C" void wcoinhash(void *state, const void *input) memcpy(state, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_whc(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_whc(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + const uint32_t first_nonce = pdata[19]; uint32_t endiandata[20]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8; - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << 20); // 19=256*256*8; + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ptarget[7] = 0x0000ff; + + static THREAD volatile bool init = false; + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + 
CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (16 * sizeof(uint32_t))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - if (!init[thr_id]) { - CUDA_CALL_OR_RET_X(cudaSetDevice(device_map[thr_id]), 0); - // Konstanten kopieren, Speicher belegen - cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); - x15_whirlpool_cpu_init(thr_id, throughput, 1 /* old whirlpool */); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); + x15_whirlpool_cpu_init(thr_id, throughputmax, 1 /* old whirlpool */); + mining_has_stopped[thr_id] = false; - init[thr_id] = true; + init = true; } for (int k=0; k < 20; k++) { - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); } - whirlpool512_setBlock_80((void*)endiandata, ptarget); + whirlpool512_setBlock_80(thr_id, (void*)endiandata, ptarget); do { - uint32_t foundNonce; - int order = 0; + uint32_t* foundNonce; - whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); - foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - if (foundNonce != UINT32_MAX) + foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], d_hash); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce[0] != UINT32_MAX) { const uint32_t Htarg 
= ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); + uint32_t vhash64[8]={0}; + if(opt_verify){ be32enc(&endiandata[19], foundNonce[0]); wcoinhash(vhash64, endiandata); - - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { int res = 1; *hashes_done = pdata[19] - first_nonce + throughput; - #if 0 - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); - if (secNonce != 0) { - pdata[21] = secNonce; - res++; + if (foundNonce[1] != UINT32_MAX) + { + if(opt_verify){ be32enc(&endiandata[19], foundNonce[1]); + wcoinhash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + if (opt_benchmark) applog(LOG_INFO, "GPU #%d: found second nounce %08x", device_map[thr_id], foundNonce[1]); + pdata[21] = foundNonce[1]; + res++; + } + else + { + if (vhash64[7] != Htarg) + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce[1]); + } } - #endif - pdata[19] = foundNonce; + pdata[19] = foundNonce[0]; + if (opt_benchmark) applog(LOG_INFO, "GPU #%d: found nounce %08x", device_map[thr_id], foundNonce[0]); + return res; } - else if (vhash64[7] > Htarg) { - applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg); - } - else { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + else + { + if (vhash64[7] != Htarg) + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce[0]); } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/x15/whirlpoolx.cu b/x15/whirlpoolx.cu new 
file mode 100644 index 0000000000..be0b18b336 --- /dev/null +++ b/x15/whirlpoolx.cu @@ -0,0 +1,117 @@ +/* + * whirlpool routine (djm) + * whirlpoolx routine (provos alexis) + */ +extern "C" +{ +#include "sph/sph_whirlpool.h" +} +#include "miner.h" + + +#include "cuda_helper.h" + +extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads); +extern void whirlpoolx_setBlock_80(int thr_id, void *pdata, const void *ptarget); +extern void cpu_whirlpoolx(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *foundNonce); +extern void whirlpoolx_precompute(int thr_id); + +// CPU Hash function +extern "C" void whirlxHash(void *state, const void *input) +{ + + sph_whirlpool_context ctx_whirlpool; + + unsigned char hash[64]; + unsigned char hash_xored[32]; + + memset(hash, 0, sizeof(hash)); + + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, input, 80); + sph_whirlpool_close(&ctx_whirlpool, hash); + + + for (uint32_t i = 0; i < 32; i++){ + hash_xored[i] = hash[i] ^ hash[i + 16]; + } + memcpy(state, hash_xored, 32); +} + +int scanhash_whirlpoolx(int thr_id, uint32_t *pdata, uint32_t *ptarget, uint32_t max_nonce, uint32_t *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + uint32_t endiandata[20]; + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, (1 << 27)); + uint32_t throughput = min(throughputmax, max_nonce - first_nonce) & 0xfffffc00; + + if (opt_benchmark) + ptarget[7] = 0x5; + + static THREAD volatile bool init = false; + if(!init) + { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + whirlpoolx_cpu_init(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; + init = true; + } + + for (int k=0; k < 20; k++) + { + be32enc(&endiandata[k], pdata[k]); + } + + whirlpoolx_setBlock_80(thr_id, (void*)endiandata, &ptarget[6]); + 
whirlpoolx_precompute(thr_id); + do { + uint32_t foundNonce[2]; + cpu_whirlpoolx(thr_id, throughput, pdata[19], foundNonce); + CUDA_SAFE_CALL(cudaGetLastError()); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t vhash64[8]={0}; + /* check now with the CPU to confirm */ + if(opt_verify){ be32enc(&endiandata[19], foundNonce[0]); + whirlxHash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + int res = 1; + *hashes_done = pdata[19] - first_nonce + throughput; + if (foundNonce[1] != UINT32_MAX) + { + if(opt_verify){ be32enc(&endiandata[19], foundNonce[1]); + whirlxHash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = foundNonce[1]; + res++; + if (opt_benchmark) applog(LOG_INFO, "GPU #%d: found nonce %08x", device_map[thr_id], foundNonce[1]); + } + else + { + if (vhash64[7] != Htarg) + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce[1]); + } + } + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d: found nonce %08x", device_map[thr_id], foundNonce[0], vhash64[7]); + pdata[19] = foundNonce[0]; + return res; + } + else + { + if(vhash64[7] != Htarg) + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce[0]); + } + } + pdata[19] += throughput; + } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + *hashes_done = pdata[19] - first_nonce ; + return 0; +} diff --git a/x15/x14.cu b/x15/x14.cu index 447ecb9f33..bb22ffe6b8 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -26,47 +26,47 @@ extern "C" { #include "cuda_helper.h" -// Memory for the hash functions -static uint32_t *d_hash[MAX_GPUS]; - -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void 
quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); 
+extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); -extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); -extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); 
-extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse, int order); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse); // X14 CPU Hash function -extern "C" void x14hash(void *output, const void *input) +void x14hash(void *output, const void *input) { unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; #define hashB hash+64 @@ -147,93 +147,118 @@ extern "C" void x14hash(void *output, const void *input) memcpy(output, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_x14(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_x14(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + const uint32_t first_nonce = pdata[19]; uint32_t endiandata[20]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8; - throughput = min(throughput, max_nonce - first_nonce); + int intensity = (device_sm[device_map[thr_id]] > 500) ? 256 * 256 * 20 : 256 * 256 * 10; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 
256 : 32; + + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); // 19=256*256*8; + uint32_t throughput = min(throughputmax, max_nonce - first_nonce) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000f; + ptarget[7] = 0x000f; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - quark_groestl512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughputmax); quark_skein512_cpu_init(thr_id); - quark_bmw512_cpu_init(thr_id, throughput); - x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); - x13_hamsi512_cpu_init(thr_id, throughput); - x13_fugue512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughputmax); + x11_simd512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); + x13_hamsi512_cpu_init(thr_id, throughputmax); + x13_fugue512_cpu_init(thr_id, throughputmax); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); - cuda_check_cpu_init(thr_id, throughput); - init[thr_id] = true; + cuda_check_cpu_init(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; + init = true; } for (int k = 0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - 
quark_blake512_cpu_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + quark_blake512_cpu_setBlock_80(thr_id, (uint64_t *)endiandata); + cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // MyStreamSynchronize(NULL, 1, thr_id); - - uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (foundNonce != UINT32_MAX) + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + 
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash,simdthreads); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + + uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce != UINT32_MAX) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; + uint32_t vhash64[8]={0}; /* check now with the CPU to confirm */ - be32enc(&endiandata[19], foundNonce); + if(opt_verify){ be32enc(&endiandata[19], foundNonce); x14hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); + uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash, foundNonce); *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; + if (secNonce != 0) + { + if(opt_verify){ be32enc(&endiandata[19], secNonce); + x14hash(vhash64, endiandata); + + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = secNonce; + res++; + } + else + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], secNonce); + } } pdata[19] = foundNonce; return res; } else { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce); } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && 
((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/x15/x15.cu b/x15/x15.cu index 811b7c51c6..61611de101 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -27,53 +27,53 @@ extern "C" { #include "cuda_helper.h" -// Memory for the hash functions -static uint32_t *d_hash[MAX_GPUS]; - -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t 
startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); -extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t 
*d_hash, int order); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); -extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode); -extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x15_whirlpool_cpu_free(int thr_id); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse, int order); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse); // X15 CPU Hash function -extern "C" void x15hash(void *output, const void *input) +void x15hash(void *output, const void *input) { sph_blake512_context ctx_blake; sph_bmw512_context ctx_bmw; @@ -159,98 +159,126 @@ extern "C" void x15hash(void *output, const void *input) memcpy(output, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; - -extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, - const 
uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_x15(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { + static THREAD uint32_t *d_hash = nullptr; + const uint32_t first_nonce = pdata[19]; - uint32_t endiandata[20]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8; - throughput = min(throughput, (max_nonce - first_nonce)); + int intensity = 256 * 256 * 13; + if (device_sm[device_map[thr_id]] == 520) intensity = 256 * 256 * 22; + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); // 19=256*256*8; + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 256 : 32; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0fF; + ptarget[7] = 0x0fF; - if (!init[thr_id]) + static THREAD volatile bool init = false; + if(!init) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - quark_groestl512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughputmax); quark_skein512_cpu_init(thr_id); - quark_bmw512_cpu_init(thr_id, throughput); - x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); - x13_hamsi512_cpu_init(thr_id, throughput); - x13_fugue512_cpu_init(thr_id, throughput); - x15_whirlpool_cpu_init(thr_id, throughput, 0); - - 
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); - - cuda_check_cpu_init(thr_id, throughput); - init[thr_id] = true; + quark_bmw512_cpu_init(thr_id, throughputmax); + x11_simd512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); + x13_hamsi512_cpu_init(thr_id, throughputmax); + x13_fugue512_cpu_init(thr_id, throughputmax); + x15_whirlpool_cpu_init(thr_id, throughputmax, 0); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash, 16 * sizeof(uint32_t) * throughputmax)); + + cuda_check_cpu_init(thr_id, throughputmax); + mining_has_stopped[thr_id] = false; + init = true; } - + + uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - quark_blake512_cpu_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + quark_blake512_cpu_setBlock_80(thr_id, (uint64_t *)endiandata); + cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); 
- x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); -// MyStreamSynchronize(NULL, 1, thr_id); - - uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (foundNonce != UINT32_MAX) + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash,simdthreads); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], d_hash); + + uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash); + if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} + if(foundNonce != UINT32_MAX) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; + uint32_t vhash64[8]={0}; /* check now with the CPU to confirm */ - be32enc(&endiandata[19], foundNonce); + if(opt_verify){ be32enc(&endiandata[19], foundNonce); x15hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if (vhash64[7] <= 
Htarg && fulltest(vhash64, ptarget)) { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); + uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash, foundNonce); *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; + if (secNonce != 0) + { + if(opt_verify){ be32enc(&endiandata[19], secNonce); + x15hash(vhash64, endiandata); + } if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + pdata[21] = secNonce; + res++; + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d: found nounce %08x", device_map[thr_id], secNonce); + } + else + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], secNonce); + } } - if (opt_benchmark) applog(LOG_INFO, "found nounce", thr_id, foundNonce, vhash64[7], Htarg); + if (opt_benchmark) + applog(LOG_INFO, "GPU #%d: found nounce %08x", device_map[thr_id], foundNonce); pdata[19] = foundNonce; return res; } else { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce); } } - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; } diff --git a/x17/cuda_x17_haval512.cu b/x17/cuda_x17_haval512.cu index a8cf28fcac..3596682e0f 100644 --- a/x17/cuda_x17_haval512.cu +++ b/x17/cuda_x17_haval512.cu @@ -43,211 +43,10 @@ #include "cuda_helper.h" -#define SPH_ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) - -static __constant__ uint32_t initVector[8]; - -static const uint32_t c_initVector[8] = { - SPH_C32(0x243F6A88), - 
SPH_C32(0x85A308D3), - SPH_C32(0x13198A2E), - SPH_C32(0x03707344), - SPH_C32(0xA4093822), - SPH_C32(0x299F31D0), - SPH_C32(0x082EFA98), - SPH_C32(0xEC4E6C89) -}; - -#define PASS1(n, in) { \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 1], SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[ 2], SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[ 5], SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[ 6], SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[ 7], SPH_C32(0x00000000)); \ - \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 8], SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[10], SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[11], SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[12], SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[13], SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[14], SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[15], SPH_C32(0x00000000)); \ - \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[16], SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[17], SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[18], SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[19], SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[20], SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[21], SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[22], SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, 
s4, s3, s2, s1, in[23], SPH_C32(0x00000000)); \ - \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[24], SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[25], SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[26], SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[27], SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[28], SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[29], SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[30], SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[31], SPH_C32(0x00000000)); \ -} - -#define PASS2(n, in) { \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], SPH_C32(0x452821E6)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[14], SPH_C32(0x38D01377)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[26], SPH_C32(0xBE5466CF)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[18], SPH_C32(0x34E90C6C)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[11], SPH_C32(0xC0AC29B7)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[28], SPH_C32(0xC97C50DD)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 7], SPH_C32(0x3F84D5B5)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[16], SPH_C32(0xB5470917)); \ - \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], SPH_C32(0x9216D5D9)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[23], SPH_C32(0x8979FB1B)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[20], SPH_C32(0xD1310BA6)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[22], SPH_C32(0x98DFB5AC)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], SPH_C32(0x2FFD72DB)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[10], SPH_C32(0xD01ADFB7)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 4], SPH_C32(0xB8E1AFED)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 8], SPH_C32(0x6A267E96)); \ - \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[30], 
SPH_C32(0xBA7C9045)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], SPH_C32(0xF12C7F99)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0x24A19947)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[ 9], SPH_C32(0xB3916CF7)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[17], SPH_C32(0x0801F2E2)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[24], SPH_C32(0x858EFC16)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[29], SPH_C32(0x636920D8)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 6], SPH_C32(0x71574E69)); \ - \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0xA458FEA3)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[12], SPH_C32(0xF4933D7E)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[15], SPH_C32(0x0D95748F)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[13], SPH_C32(0x728EB658)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], SPH_C32(0x718BCD58)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[25], SPH_C32(0x82154AEE)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[31], SPH_C32(0x7B54A41D)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[27], SPH_C32(0xC25A59B5)); \ -} +static uint32_t *d_nonce[MAX_GPUS]; -#define PASS3(n, in) { \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0x9C30D539)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x2AF26013)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 4], SPH_C32(0xC5D1B023)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[20], SPH_C32(0x286085F0)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[28], SPH_C32(0xCA417918)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[17], SPH_C32(0xB8DB38EF)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 8], SPH_C32(0x8E79DCB0)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[22], SPH_C32(0x603A180E)); \ - \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[29], SPH_C32(0x6C9E0E8B)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[14], 
SPH_C32(0xB01E8A3E)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[25], SPH_C32(0xD71577C1)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[12], SPH_C32(0xBD314B27)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[24], SPH_C32(0x78AF2FDA)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[30], SPH_C32(0x55605C60)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[16], SPH_C32(0xE65525F3)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[26], SPH_C32(0xAA55AB94)); \ - \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[31], SPH_C32(0x57489862)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[15], SPH_C32(0x63E81440)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 7], SPH_C32(0x55CA396A)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], SPH_C32(0x2AAB10B6)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], SPH_C32(0xB4CC5C34)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[ 0], SPH_C32(0x1141E8CE)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[18], SPH_C32(0xA15486AF)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[27], SPH_C32(0x7C72E993)); \ - \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[13], SPH_C32(0xB3EE1411)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], SPH_C32(0x636FBC2A)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0x2BA9C55D)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[10], SPH_C32(0x741831F6)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[23], SPH_C32(0xCE5C3E16)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[11], SPH_C32(0x9B87931E)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 5], SPH_C32(0xAFD6BA33)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[ 2], SPH_C32(0x6C24CF5C)); \ -} - -#define PASS4(n, in) { \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[24], SPH_C32(0x7A325381)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 4], SPH_C32(0x28958677)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 0], SPH_C32(0x3B8F4898)); \ - STEP(n, 
4, s4, s3, s2, s1, s0, s7, s6, s5, in[14], SPH_C32(0x6B4BB9AF)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], SPH_C32(0xC4BFE81B)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[ 7], SPH_C32(0x66282193)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[28], SPH_C32(0x61D809CC)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[23], SPH_C32(0xFB21A991)); \ - \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[26], SPH_C32(0x487CAC60)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], SPH_C32(0x5DEC8032)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[30], SPH_C32(0xEF845D5D)); \ - STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[20], SPH_C32(0xE98575B1)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[18], SPH_C32(0xDC262302)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[25], SPH_C32(0xEB651B88)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[19], SPH_C32(0x23893E81)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 3], SPH_C32(0xD396ACC5)); \ - \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[22], SPH_C32(0x0F6D6FF3)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[11], SPH_C32(0x83F44239)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[31], SPH_C32(0x2E0B4482)); \ - STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[21], SPH_C32(0xA4842004)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 8], SPH_C32(0x69C8F04A)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[27], SPH_C32(0x9E1F9B5E)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[12], SPH_C32(0x21C66842)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 9], SPH_C32(0xF6E96C9A)); \ - \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[ 1], SPH_C32(0x670C9C61)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[29], SPH_C32(0xABD388F0)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 5], SPH_C32(0x6A51A0D2)); \ - STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[15], SPH_C32(0xD8542F68)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[17], 
SPH_C32(0x960FA728)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[10], SPH_C32(0xAB5133A3)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[16], SPH_C32(0x6EEF0B6C)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[13], SPH_C32(0x137A3BE4)); \ -} - -#define PASS5(n, in) { \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[27], SPH_C32(0xBA3BF050)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], SPH_C32(0x7EFB2A98)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0xA1F1651D)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[26], SPH_C32(0x39AF0176)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[17], SPH_C32(0x66CA593E)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[11], SPH_C32(0x82430E88)); \ - STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[20], SPH_C32(0x8CEE8619)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[29], SPH_C32(0x456F9FB4)); \ - \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0x7D84A5C3)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 0], SPH_C32(0x3B8B5EBE)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[12], SPH_C32(0xE06F75D8)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[ 7], SPH_C32(0x85C12073)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[13], SPH_C32(0x401A449F)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 8], SPH_C32(0x56C16AA6)); \ - STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[31], SPH_C32(0x4ED3AA62)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[10], SPH_C32(0x363F7706)); \ - \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], SPH_C32(0x1BFEDF72)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x429B023D)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[14], SPH_C32(0x37D0D724)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[30], SPH_C32(0xD00A1248)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[18], SPH_C32(0xDB0FEAD3)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 6], SPH_C32(0x49F1C09B)); \ - STEP(n, 
5, s1, s0, s7, s6, s5, s4, s3, s2, in[28], SPH_C32(0x075372C9)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[24], SPH_C32(0x80991B7B)); \ - \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 2], SPH_C32(0x25D479D8)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[23], SPH_C32(0xF6E8DEF7)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[16], SPH_C32(0xE3FE501A)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[22], SPH_C32(0xB6794C3B)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], SPH_C32(0x976CE0BD)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 1], SPH_C32(0x04C006BA)); \ - STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[25], SPH_C32(0xC1A94FB6)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[15], SPH_C32(0x409F60C4)); \ -} +#define SPH_ROTL32(x, n) ROTL32(x, n) +#define SPH_ROTR32(x, n) ROTR32(x, n) #define F1(x6, x5, x4, x3, x2, x1, x0) \ (((x1) & ((x0) ^ (x4))) ^ ((x2) & (x5)) ^ ((x3) & (x6)) ^ (x0)) @@ -269,126 +68,278 @@ static const uint32_t c_initVector[8] = { (((x0) & ~(((x1) & (x2) & (x3)) ^ (x5))) \ ^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6))) -#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \ - F1(x3, x4, x1, x0, x5, x2, x6) -#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \ - F2(x6, x2, x1, x0, x3, x4, x5) -#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \ - F3(x2, x6, x0, x4, x3, x1, x5) -#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \ - F4(x1, x5, x3, x2, x0, x4, x6) -#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \ - F5(x2, x5, x0, x6, x4, x3, x1) +#define STEP1(x7, x6, x5, x4, x3, x2, x1, x0, w) { \ + uint32_t t = F1(x3, x4, x1, x0, x5, x2, x6); \ + (x7) =(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 11) \ + + (w)); \ + } + +#define STEP2(x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \ + uint32_t t = F2(x6, x2, x1, x0, x3, x4, x5); \ + (x7) =(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 11) \ + + (w) + (c)); \ + } +#define STEP3(x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \ + uint32_t t = F3(x2, x6, x0, x4, x3, x1, x5); \ + (x7) =(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 
11) \ + + (w) + (c)); \ + } -#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \ - uint32_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ +#define STEP4(x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \ + uint32_t t = F4(x1, x5, x3, x2, x0, x4, x6); \ (x7) =(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 11) \ + (w) + (c)); \ } +#define STEP5(x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \ + uint32_t t = F5(x2, x5, x0, x6, x4, x3, x1); \ + (x7) =(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 11) \ + + (w) + (c)); \ + } __global__ -void x17_haval256_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x17_haval256_gpu_hash_64(uint32_t threads, uint32_t startNounce, const uint64_t *const __restrict__ g_hash, uint32_t target, uint32_t *const __restrict__ ret) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); +// if (thread < threads) { - uint32_t nounce = g_nonceVector ? 
g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; - union { - uint8_t h1[64]; - uint32_t h4[16]; - uint64_t h8[8]; - } hash; - - uint32_t u0, u1, u2, u3, u4, u5, u6, u7; - uint32_t s0,s1,s2,s3,s4,s5,s6,s7; - uint32_t buf[32]; - - s0 = initVector[0]; - s1 = initVector[1]; - s2 = initVector[2]; - s3 = initVector[3]; - s4 = initVector[4]; - s5 = initVector[5]; - s6 = initVector[6]; - s7 = initVector[7]; - - u0 = s0; - u1 = s1; - u2 = s2; - u3 = s3; - u4 = s4; - u5 = s5; - u6 = s6; - u7 = s7; - - #pragma unroll - for (int i=0; i<16; i++) { - hash.h4[i]= inpHash[i]; + uint32_t *inpHash = (uint32_t*)&g_hash[8 * thread]; + uint32_t hash[16]; + + uint32_t buf[32] = {0}; + + uint32_t s0 = 0x243F6A88; + uint32_t s1 = 0x85A308D3; + uint32_t s2 = 0x13198A2E; + uint32_t s3 = 0x03707344; + uint32_t s4 = 0xA4093822; + uint32_t s5 = 0x299F31D0; + uint32_t s6 = 0x082EFA98; + uint32_t s7 = 0xEC4E6C89; + +#pragma unroll + for(int i = 0; i<16; i++) + { + hash[i] = inpHash[i]; } -///////// input big ///////////////////// + ///////// input big ///////////////////// + +#pragma unroll + for(int i = 0; i<16; i++) + { + buf[i] = hash[i]; + } - #pragma unroll - for (int i=0; i<32; i++) { - if (i<16) { - buf[i]=hash.h4[i]; - } else { - buf[i]=0; - } + buf[16] = 0x00000001; + buf[29] = 0x40290000; + buf[30] = 0x00000200; + + STEP1(s7, s6, s5, s4, s3, s2, s1, s0, buf[ 0]); + STEP1(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 1]); + STEP1(s5, s4, s3, s2, s1, s0, s7, s6, buf[ 2]); + STEP1(s4, s3, s2, s1, s0, s7, s6, s5, buf[ 3]); + STEP1(s3, s2, s1, s0, s7, s6, s5, s4, buf[ 4]); + STEP1(s2, s1, s0, s7, s6, s5, s4, s3, buf[ 5]); + STEP1(s1, s0, s7, s6, s5, s4, s3, s2, buf[ 6]); + STEP1(s0, s7, s6, s5, s4, s3, s2, s1, buf[ 7]); + STEP1(s7, s6, s5, s4, s3, s2, s1, s0, buf[ 8]); + STEP1(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 9]); + STEP1(s5, s4, s3, s2, s1, s0, s7, s6, buf[10]); + STEP1(s4, s3, s2, s1, s0, s7, 
s6, s5, buf[11]); + STEP1(s3, s2, s1, s0, s7, s6, s5, s4, buf[12]); + STEP1(s2, s1, s0, s7, s6, s5, s4, s3, buf[13]); + STEP1(s1, s0, s7, s6, s5, s4, s3, s2, buf[14]); + STEP1(s0, s7, s6, s5, s4, s3, s2, s1, buf[15]); + STEP1(s7, s6, s5, s4, s3, s2, s1, s0, buf[16]); + STEP1(s6, s5, s4, s3, s2, s1, s0, s7, buf[17]); + STEP1(s5, s4, s3, s2, s1, s0, s7, s6, buf[18]); + STEP1(s4, s3, s2, s1, s0, s7, s6, s5, buf[19]); + STEP1(s3, s2, s1, s0, s7, s6, s5, s4, buf[20]); + STEP1(s2, s1, s0, s7, s6, s5, s4, s3, buf[21]); + STEP1(s1, s0, s7, s6, s5, s4, s3, s2, buf[22]); + STEP1(s0, s7, s6, s5, s4, s3, s2, s1, buf[23]); + STEP1(s7, s6, s5, s4, s3, s2, s1, s0, buf[24]); + STEP1(s6, s5, s4, s3, s2, s1, s0, s7, buf[25]); + STEP1(s5, s4, s3, s2, s1, s0, s7, s6, buf[26]); + STEP1(s4, s3, s2, s1, s0, s7, s6, s5, buf[27]); + STEP1(s3, s2, s1, s0, s7, s6, s5, s4, buf[28]); + STEP1(s2, s1, s0, s7, s6, s5, s4, s3, buf[29]); + STEP1(s1, s0, s7, s6, s5, s4, s3, s2, buf[30]); + STEP1(s0, s7, s6, s5, s4, s3, s2, s1, buf[31]); + + STEP2(s7, s6, s5, s4, s3, s2, s1, s0, buf[ 5], SPH_C32(0x452821E6)); + STEP2(s6, s5, s4, s3, s2, s1, s0, s7, buf[14], SPH_C32(0x38D01377)); + STEP2(s5, s4, s3, s2, s1, s0, s7, s6, buf[26], SPH_C32(0xBE5466CF)); + STEP2(s4, s3, s2, s1, s0, s7, s6, s5, buf[18], SPH_C32(0x34E90C6C)); + STEP2(s3, s2, s1, s0, s7, s6, s5, s4, buf[11], SPH_C32(0xC0AC29B7)); + STEP2(s2, s1, s0, s7, s6, s5, s4, s3, buf[28], SPH_C32(0xC97C50DD)); + STEP2(s1, s0, s7, s6, s5, s4, s3, s2, buf[ 7], SPH_C32(0x3F84D5B5)); + STEP2(s0, s7, s6, s5, s4, s3, s2, s1, buf[16], SPH_C32(0xB5470917)); + STEP2(s7, s6, s5, s4, s3, s2, s1, s0, buf[ 0], SPH_C32(0x9216D5D9)); + STEP2(s6, s5, s4, s3, s2, s1, s0, s7, buf[23], SPH_C32(0x8979FB1B)); + STEP2(s5, s4, s3, s2, s1, s0, s7, s6, buf[20], SPH_C32(0xD1310BA6)); + STEP2(s4, s3, s2, s1, s0, s7, s6, s5, buf[22], SPH_C32(0x98DFB5AC)); + STEP2(s3, s2, s1, s0, s7, s6, s5, s4, buf[ 1], SPH_C32(0x2FFD72DB)); + STEP2(s2, s1, s0, s7, s6, s5, s4, s3, buf[10], 
SPH_C32(0xD01ADFB7)); + STEP2(s1, s0, s7, s6, s5, s4, s3, s2, buf[ 4], SPH_C32(0xB8E1AFED)); + STEP2(s0, s7, s6, s5, s4, s3, s2, s1, buf[ 8], SPH_C32(0x6A267E96)); + STEP2(s7, s6, s5, s4, s3, s2, s1, s0, buf[30], SPH_C32(0xBA7C9045)); + STEP2(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 3], SPH_C32(0xF12C7F99)); + STEP2(s5, s4, s3, s2, s1, s0, s7, s6, buf[21], SPH_C32(0x24A19947)); + STEP2(s4, s3, s2, s1, s0, s7, s6, s5, buf[ 9], SPH_C32(0xB3916CF7)); + STEP2(s3, s2, s1, s0, s7, s6, s5, s4, buf[17], SPH_C32(0x0801F2E2)); + STEP2(s2, s1, s0, s7, s6, s5, s4, s3, buf[24], SPH_C32(0x858EFC16)); + STEP2(s1, s0, s7, s6, s5, s4, s3, s2, buf[29], SPH_C32(0x636920D8)); + STEP2(s0, s7, s6, s5, s4, s3, s2, s1, buf[ 6], SPH_C32(0x71574E69)); + STEP2(s7, s6, s5, s4, s3, s2, s1, s0, buf[19], SPH_C32(0xA458FEA3)); + STEP2(s6, s5, s4, s3, s2, s1, s0, s7, buf[12], SPH_C32(0xF4933D7E)); + STEP2(s5, s4, s3, s2, s1, s0, s7, s6, buf[15], SPH_C32(0x0D95748F)); + STEP2(s4, s3, s2, s1, s0, s7, s6, s5, buf[13], SPH_C32(0x728EB658)); + STEP2(s3, s2, s1, s0, s7, s6, s5, s4, buf[ 2], SPH_C32(0x718BCD58)); + STEP2(s2, s1, s0, s7, s6, s5, s4, s3, buf[25], SPH_C32(0x82154AEE)); + STEP2(s1, s0, s7, s6, s5, s4, s3, s2, buf[31], SPH_C32(0x7B54A41D)); + STEP2(s0, s7, s6, s5, s4, s3, s2, s1, buf[27], SPH_C32(0xC25A59B5)); + STEP3(s7, s6, s5, s4, s3, s2, s1, s0, buf[19], SPH_C32(0x9C30D539)); + STEP3(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 9], SPH_C32(0x2AF26013)); + STEP3(s5, s4, s3, s2, s1, s0, s7, s6, buf[ 4], SPH_C32(0xC5D1B023)); + STEP3(s4, s3, s2, s1, s0, s7, s6, s5, buf[20], SPH_C32(0x286085F0)); + STEP3(s3, s2, s1, s0, s7, s6, s5, s4, buf[28], SPH_C32(0xCA417918)); + STEP3(s2, s1, s0, s7, s6, s5, s4, s3, buf[17], SPH_C32(0xB8DB38EF)); + STEP3(s1, s0, s7, s6, s5, s4, s3, s2, buf[ 8], SPH_C32(0x8E79DCB0)); + STEP3(s0, s7, s6, s5, s4, s3, s2, s1, buf[22], SPH_C32(0x603A180E)); + STEP3(s7, s6, s5, s4, s3, s2, s1, s0, buf[29], SPH_C32(0x6C9E0E8B)); + STEP3(s6, s5, s4, s3, s2, s1, s0, s7, buf[14], 
SPH_C32(0xB01E8A3E)); + STEP3(s5, s4, s3, s2, s1, s0, s7, s6, buf[25], SPH_C32(0xD71577C1)); + STEP3(s4, s3, s2, s1, s0, s7, s6, s5, buf[12], SPH_C32(0xBD314B27)); + STEP3(s3, s2, s1, s0, s7, s6, s5, s4, buf[24], SPH_C32(0x78AF2FDA)); + STEP3(s2, s1, s0, s7, s6, s5, s4, s3, buf[30], SPH_C32(0x55605C60)); + STEP3(s1, s0, s7, s6, s5, s4, s3, s2, buf[16], SPH_C32(0xE65525F3)); + STEP3(s0, s7, s6, s5, s4, s3, s2, s1, buf[26], SPH_C32(0xAA55AB94)); + STEP3(s7, s6, s5, s4, s3, s2, s1, s0, buf[31], SPH_C32(0x57489862)); + STEP3(s6, s5, s4, s3, s2, s1, s0, s7, buf[15], SPH_C32(0x63E81440)); + STEP3(s5, s4, s3, s2, s1, s0, s7, s6, buf[ 7], SPH_C32(0x55CA396A)); + STEP3(s4, s3, s2, s1, s0, s7, s6, s5, buf[ 3], SPH_C32(0x2AAB10B6)); + STEP3(s3, s2, s1, s0, s7, s6, s5, s4, buf[ 1], SPH_C32(0xB4CC5C34)); + STEP3(s2, s1, s0, s7, s6, s5, s4, s3, buf[ 0], SPH_C32(0x1141E8CE)); + STEP3(s1, s0, s7, s6, s5, s4, s3, s2, buf[18], SPH_C32(0xA15486AF)); + STEP3(s0, s7, s6, s5, s4, s3, s2, s1, buf[27], SPH_C32(0x7C72E993)); + STEP3(s7, s6, s5, s4, s3, s2, s1, s0, buf[13], SPH_C32(0xB3EE1411)); + STEP3(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 6], SPH_C32(0x636FBC2A)); + STEP3(s5, s4, s3, s2, s1, s0, s7, s6, buf[21], SPH_C32(0x2BA9C55D)); + STEP3(s4, s3, s2, s1, s0, s7, s6, s5, buf[10], SPH_C32(0x741831F6)); + STEP3(s3, s2, s1, s0, s7, s6, s5, s4, buf[23], SPH_C32(0xCE5C3E16)); + STEP3(s2, s1, s0, s7, s6, s5, s4, s3, buf[11], SPH_C32(0x9B87931E)); + STEP3(s1, s0, s7, s6, s5, s4, s3, s2, buf[ 5], SPH_C32(0xAFD6BA33)); + STEP3(s0, s7, s6, s5, s4, s3, s2, s1, buf[ 2], SPH_C32(0x6C24CF5C)); + + STEP4(s7, s6, s5, s4, s3, s2, s1, s0, buf[24], SPH_C32(0x7A325381)); + STEP4(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 4], SPH_C32(0x28958677)); + STEP4(s5, s4, s3, s2, s1, s0, s7, s6, buf[ 0], SPH_C32(0x3B8F4898)); + STEP4(s4, s3, s2, s1, s0, s7, s6, s5, buf[14], SPH_C32(0x6B4BB9AF)); + STEP4(s3, s2, s1, s0, s7, s6, s5, s4, buf[ 2], SPH_C32(0xC4BFE81B)); + STEP4(s2, s1, s0, s7, s6, s5, s4, s3, buf[ 7], 
SPH_C32(0x66282193)); + STEP4(s1, s0, s7, s6, s5, s4, s3, s2, buf[28], SPH_C32(0x61D809CC)); + STEP4(s0, s7, s6, s5, s4, s3, s2, s1, buf[23], SPH_C32(0xFB21A991)); + STEP4(s7, s6, s5, s4, s3, s2, s1, s0, buf[26], SPH_C32(0x487CAC60)); + STEP4(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 6], SPH_C32(0x5DEC8032)); + STEP4(s5, s4, s3, s2, s1, s0, s7, s6, buf[30], SPH_C32(0xEF845D5D)); + STEP4(s4, s3, s2, s1, s0, s7, s6, s5, buf[20], SPH_C32(0xE98575B1)); + STEP4(s3, s2, s1, s0, s7, s6, s5, s4, buf[18], SPH_C32(0xDC262302)); + STEP4(s2, s1, s0, s7, s6, s5, s4, s3, buf[25], SPH_C32(0xEB651B88)); + STEP4(s1, s0, s7, s6, s5, s4, s3, s2, buf[19], SPH_C32(0x23893E81)); + STEP4(s0, s7, s6, s5, s4, s3, s2, s1, buf[ 3], SPH_C32(0xD396ACC5)); + STEP4(s7, s6, s5, s4, s3, s2, s1, s0, buf[22], SPH_C32(0x0F6D6FF3)); + STEP4(s6, s5, s4, s3, s2, s1, s0, s7, buf[11], SPH_C32(0x83F44239)); + STEP4(s5, s4, s3, s2, s1, s0, s7, s6, buf[31], SPH_C32(0x2E0B4482)); + STEP4(s4, s3, s2, s1, s0, s7, s6, s5, buf[21], SPH_C32(0xA4842004)); + STEP4(s3, s2, s1, s0, s7, s6, s5, s4, buf[ 8], SPH_C32(0x69C8F04A)); + STEP4(s2, s1, s0, s7, s6, s5, s4, s3, buf[27], SPH_C32(0x9E1F9B5E)); + STEP4(s1, s0, s7, s6, s5, s4, s3, s2, buf[12], SPH_C32(0x21C66842)); + STEP4(s0, s7, s6, s5, s4, s3, s2, s1, buf[ 9], SPH_C32(0xF6E96C9A)); + STEP4(s7, s6, s5, s4, s3, s2, s1, s0, buf[ 1], SPH_C32(0x670C9C61)); + STEP4(s6, s5, s4, s3, s2, s1, s0, s7, buf[29], SPH_C32(0xABD388F0)); + STEP4(s5, s4, s3, s2, s1, s0, s7, s6, buf[ 5], SPH_C32(0x6A51A0D2)); + STEP4(s4, s3, s2, s1, s0, s7, s6, s5, buf[15], SPH_C32(0xD8542F68)); + STEP4(s3, s2, s1, s0, s7, s6, s5, s4, buf[17], SPH_C32(0x960FA728)); + STEP4(s2, s1, s0, s7, s6, s5, s4, s3, buf[10], SPH_C32(0xAB5133A3)); + STEP4(s1, s0, s7, s6, s5, s4, s3, s2, buf[16], SPH_C32(0x6EEF0B6C)); + STEP4(s0, s7, s6, s5, s4, s3, s2, s1, buf[13], SPH_C32(0x137A3BE4)); + + STEP5(s7, s6, s5, s4, s3, s2, s1, s0, buf[27], SPH_C32(0xBA3BF050)); + STEP5(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 3], 
SPH_C32(0x7EFB2A98)); + STEP5(s5, s4, s3, s2, s1, s0, s7, s6, buf[21], SPH_C32(0xA1F1651D)); + STEP5(s4, s3, s2, s1, s0, s7, s6, s5, buf[26], SPH_C32(0x39AF0176)); + STEP5(s3, s2, s1, s0, s7, s6, s5, s4, buf[17], SPH_C32(0x66CA593E)); + STEP5(s2, s1, s0, s7, s6, s5, s4, s3, buf[11], SPH_C32(0x82430E88)); + STEP5(s1, s0, s7, s6, s5, s4, s3, s2, buf[20], SPH_C32(0x8CEE8619)); + STEP5(s0, s7, s6, s5, s4, s3, s2, s1, buf[29], SPH_C32(0x456F9FB4)); + + STEP5(s7, s6, s5, s4, s3, s2, s1, s0, buf[19], SPH_C32(0x7D84A5C3)); + STEP5(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 0], SPH_C32(0x3B8B5EBE)); + STEP5(s5, s4, s3, s2, s1, s0, s7, s6, buf[12], SPH_C32(0xE06F75D8)); + STEP5(s4, s3, s2, s1, s0, s7, s6, s5, buf[ 7], SPH_C32(0x85C12073)); + STEP5(s3, s2, s1, s0, s7, s6, s5, s4, buf[13], SPH_C32(0x401A449F)); + STEP5(s2, s1, s0, s7, s6, s5, s4, s3, buf[ 8], SPH_C32(0x56C16AA6)); + STEP5(s1, s0, s7, s6, s5, s4, s3, s2, buf[31], SPH_C32(0x4ED3AA62)); + STEP5(s0, s7, s6, s5, s4, s3, s2, s1, buf[10], SPH_C32(0x363F7706)); + + STEP5(s7, s6, s5, s4, s3, s2, s1, s0, buf[ 5], SPH_C32(0x1BFEDF72)); + STEP5(s6, s5, s4, s3, s2, s1, s0, s7, buf[ 9], SPH_C32(0x429B023D)); + STEP5(s5, s4, s3, s2, s1, s0, s7, s6, buf[14], SPH_C32(0x37D0D724)); + STEP5(s4, s3, s2, s1, s0, s7, s6, s5, buf[30], SPH_C32(0xD00A1248)); + STEP5(s3, s2, s1, s0, s7, s6, s5, s4, buf[18], SPH_C32(0xDB0FEAD3)); + STEP5(s2, s1, s0, s7, s6, s5, s4, s3, buf[ 6], SPH_C32(0x49F1C09B)); + STEP5(s1, s0, s7, s6, s5, s4, s3, s2, buf[28], SPH_C32(0x075372C9)); + STEP5(s0, s7, s6, s5, s4, s3, s2, s1, buf[24], SPH_C32(0x80991B7B)); + + STEP5(s7, s6, s5, s4, s3, s2, s1, s0, buf[ 2], SPH_C32(0x25D479D8)); + /* + STEP5(s6, s5, s4, s3, s2, s1, s0, s7, buf[23], SPH_C32(0xF6E8DEF7)); + STEP5(s5, s4, s3, s2, s1, s0, s7, s6, buf[16], SPH_C32(0xE3FE501A)); + STEP5(s4, s3, s2, s1, s0, s7, s6, s5, buf[22], SPH_C32(0xB6794C3B)); + STEP5(s3, s2, s1, s0, s7, s6, s5, s4, buf[ 4], SPH_C32(0x976CE0BD)); + STEP5(s2, s1, s0, s7, s6, s5, s4, s3, buf[ 1], 
SPH_C32(0x04C006BA)); + STEP5(s1, s0, s7, s6, s5, s4, s3, s2, buf[25], SPH_C32(0xC1A94FB6)); + STEP5(s0, s7, s6, s5, s4, s3, s2, s1, buf[15], SPH_C32(0x409F60C4)); + + inpHash[0] = s0 + 0x243F6A88; + inpHash[1] = s1 + 0x85A308D3; + inpHash[2] = s2 + 0x13198A2E; + inpHash[3] = s3 + 0x03707344; + inpHash[4] = s4 + 0xA4093822; + inpHash[5] = s5 + 0x299F31D0; + inpHash[6] = s6 + 0x082EFA98; + inpHash[7] = s7 + 0xEC4E6C89; + */ + if(s7 + 0xEC4E6C89 <= target) + { + uint32_t tmp = atomicExch(ret, startNounce + thread); + if(tmp != 0xffffffff) + ret[1] = tmp; } - buf[16]=0x00000001; - buf[29]=0x40290000; - buf[30]=0x00000200; - - PASS1(5, buf); - PASS2(5, buf); - PASS3(5, buf); - PASS4(5, buf); - PASS5(5, buf); - - s0 = (s0 + u0); - s2 = (s2 + u2); - s3 = (s3 + u3); - s4 = (s4 + u4); - s5 = (s5 + u5); - s6 = (s6 + u6); - s7 = (s7 + u7); - - hash.h4[0]=s0; - hash.h4[1]=s1; - hash.h4[2]=s2; - hash.h4[3]=s3; - hash.h4[4]=s4; - hash.h4[5]=s5; - hash.h4[6]=s6; - hash.h4[7]=s7; - - #pragma unroll 16 - for (int u = 0; u < 16; u ++) - inpHash[u] = hash.h4[u]; } // threads } __host__ void x17_haval256_cpu_init(int thr_id, uint32_t threads) { - cudaMemcpyToSymbol(initVector,c_initVector,sizeof(c_initVector),0, cudaMemcpyHostToDevice); + cudaMalloc(&d_nonce[thr_id], 2 * sizeof(uint32_t)); } __host__ -void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t target, uint32_t *result) { - const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. 
NICHT ÄNDERN + const uint32_t threadsperblock = 512; - // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); + cudaMemsetAsync(d_nonce[thr_id], 0xff, 2 * sizeof(uint32_t), gpustream[thr_id]); - x17_haval256_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x17_haval256_gpu_hash_64 <<>>(threads, startNounce, (uint64_t*)d_hash, target, d_nonce[thr_id]); + cudaMemcpyAsync(result, d_nonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, gpustream[thr_id]); + CUDA_SAFE_CALL(cudaStreamSynchronize(gpustream[thr_id])); } diff --git a/x17/cuda_x17_sha512.cu b/x17/cuda_x17_sha512.cu index 100a5f96b6..8abdce07f0 100644 --- a/x17/cuda_x17_sha512.cu +++ b/x17/cuda_x17_sha512.cu @@ -40,22 +40,21 @@ #include "cuda_helper.h" -#define SWAP64(u64) cuda_swab64(u64) -#define SPH_ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n))) -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) -static __constant__ uint64_t H_512[8]; +#define SWAP64(u64) cuda_swab64(u64) + +#define SPH_ROTL32(x, n) ROTL32(x, n) +#define SPH_ROTR32(x, n) ROTR32(x, n) -static const uint64_t H512[8] = { +static __constant__ uint64_t H_512[8] = { SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), - SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), - SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), - SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) + SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), + SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), + SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) }; -static __constant__ uint64_t K_512[80]; -static const uint64_t K512[80] = { +static __constant__ uint64_t K_512[80] = { SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD), SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC), SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019), @@ -98,24 +97,6 @@ static const uint64_t K512[80] = { 
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) }; - -#define SHA3_STEP(ord,r,i) { \ - uint64_t T1, T2; \ - int a = 8-ord; \ - T1 = r[(7+a)&7] + BSG5_1(r[(4+a)&7]) + CH(r[(4+a)&7], r[(5+a)&7], r[(6+a)&7]) + K_512[i] + W[i]; \ - T2 = BSG5_0(r[(0+a)&7]) + MAJ(r[(0+a)&7], r[(1+a)&7], r[(2+a)&7]); \ - r[(3+a)&7] = r[(3+a)&7] + T1; \ - r[(7+a)&7] = T1 + T2; \ - } - -#define SHA3_STEP2(truc,ord,r,i) { \ - uint64_t T1, T2; \ - int a = 8-ord; \ - T1 = Tone(truc,r,W,a,i); \ - T2 = BSG5_0(r[(0+a)&7]) + MAJ(r[(0+a)&7], r[(1+a)&7], r[(2+a)&7]); \ - r[(3+a)&7] = r[(3+a)&7] + T1; \ - r[(7+a)&7] = T1 + T2; \ - } //#define BSG5_0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39)) #define BSG5_0(x) xor3(ROTR64(x, 28),ROTR64(x, 34),ROTR64(x, 39)) @@ -133,83 +114,95 @@ static const uint64_t K512[80] = { //#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) #define MAJ(x, y, z) andor(x,y,z) +#define SHA3_STEP(ord,r,i) { \ + uint64_t T1, T2; \ + int a = 8-ord; \ + T1 = r[(7+a)&7] + BSG5_1(r[(4+a)&7]) + CH(r[(4+a)&7], r[(5+a)&7], r[(6+a)&7]) + K_512[i] + W[i]; \ + T2 = BSG5_0(r[(0+a)&7]) + MAJ(r[(0+a)&7], r[(1+a)&7], r[(2+a)&7]); \ + r[(3+a)&7] = r[(3+a)&7] + T1; \ + r[(7+a)&7] = T1 + T2; \ + } + __device__ __forceinline__ uint64_t Tone(const uint64_t* sharedMemory, uint64_t r[8], uint64_t W[80], uint32_t a, uint32_t i) { - uint64_t h = r[(7 + a) & 7]; uint64_t e = r[(4 + a) & 7]; - uint64_t f = r[(5 + a) & 7]; - uint64_t g = r[(6 + a) & 7]; //uint64_t BSG51 = ROTR64(e, 14) ^ ROTR64(e, 18) ^ ROTR64(e, 41); - uint64_t BSG51 = xor3(ROTR64(e, 14),ROTR64(e, 18),ROTR64(e, 41)); + uint64_t BSG51 = xor3(ROTR64(e, 14), ROTR64(e, 18), ROTR64(e, 41)); + //uint64_t CHl = (((f) ^ (g)) & (e)) ^ (g); - uint64_t CHl = xandx(e,f,g); - uint64_t result = h+BSG51+CHl+sharedMemory[i]+W[i]; + uint64_t CHl = xandx(e, r[(5 + a) & 7], r[(6 + a) & 7]); + uint64_t result = r[(7 + a) & 7] + BSG51 + CHl + sharedMemory[i] + W[i]; return result; } -__global__ __launch_bounds__(256,3) -void 
x17_sha512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +#define SHA3_STEP2(truc,ord,r,i) { \ + uint64_t T1, T2; \ + int a = 8-ord; \ + T1 = Tone(truc,r,W,a,i); \ + T2 = BSG5_0(r[(0+a)&7]) + MAJ(r[(0+a)&7], r[(1+a)&7], r[(2+a)&7]); \ + r[(3+a)&7] = r[(3+a)&7] + T1; \ + r[(7+a)&7] = T1 + T2; \ + } + +#define TPB 128 +__global__ __launch_bounds__(TPB,6) +void x17_sha512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + // if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; - union { - uint8_t h1[64]; - uint32_t h4[16]; - uint64_t h8[8]; - } hash; - - #pragma unroll - for (int i=0;i<16;i++) { - hash.h4[i]= inpHash[i]; - } - uint64_t W[80]; - uint64_t r[8]; + uint64_t *inpHash = &g_hash[8 * thread]; + uint64_t hash[8]; - #pragma unroll 71 - for (int i=9;i<80;i++) { - W[i]=0; +#pragma unroll + for(int i = 0; i<8; i++) + { + hash[i] = inpHash[i]; } + uint64_t W[80] = {0}; + uint64_t r[8]; - #pragma unroll - for (int i = 0; i < 8; i ++) { - W[i] = SWAP64(hash.h8[i]); +#pragma unroll + for(int i = 0; i < 8; i++) + { + W[i] = SWAP64(hash[i]); r[i] = H_512[i]; } W[8] = 0x8000000000000000; - W[15]= 0x0000000000000200; - - #pragma unroll 64 - for (int i = 16; i < 80; i ++) - W[i] = SSG5_1(W[i - 2]) + W[i - 7] - + SSG5_0(W[i - 15]) + W[i - 16]; - - #pragma unroll 10 - for (int i = 0; i < 80; i += 8) { - #pragma unroll 8 - for (int ord=0;ord<8;ord++) { - SHA3_STEP2(K_512,ord,r,i+ord); + W[15] = 0x0000000000000200; + +#pragma unroll 64 + for(int i = 16; i < 80; i++) + W[i] = SSG5_1(W[i - 2]) + W[i - 7] + SSG5_0(W[i - 15]) + W[i - 16]; + +#pragma unroll 10 + for(int i = 0; i < 80; i += 8) + { 
+#pragma unroll 8 + for(int ord = 0; ord<8; ord++) + { + SHA3_STEP2(K_512, ord, r, i + ord); } } - #pragma unroll 8 - for (int i = 0; i < 8; i++) { +#pragma unroll 8 + for(int i = 0; i < 8; i++) + { r[i] = r[i] + H_512[i]; } - #pragma unroll 8 - for(int i=0;i<8;i++) { - hash.h8[i] = SWAP64(r[i]); +#pragma unroll 8 + for(int i = 0; i<8; i++) + { + hash[i] = SWAP64(r[i]); } - #pragma unroll 16 - for (int u = 0; u < 16; u ++) { - inpHash[u] = hash.h4[u]; +#pragma unroll 16 + for(int u = 0; u < 8; u++) + { + inpHash[u] = hash[u]; } } } @@ -217,16 +210,14 @@ void x17_sha512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_ __host__ void x17_sha512_cpu_init(int thr_id, uint32_t threads) { - cudaMemcpyToSymbol(K_512,K512,80*sizeof(uint64_t),0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(H_512,H512,sizeof(H512),0, cudaMemcpyHostToDevice); } __host__ -void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash) { - const uint32_t threadsperblock = 64; + const uint32_t threadsperblock = TPB; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x17_sha512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x17_sha512_gpu_hash_64<<>>(threads, startNounce, d_hash ); } diff --git a/x17/x17.cu b/x17/x17.cu index 97ab886ab6..e9fe13c717 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -32,55 +32,58 @@ extern "C" #include "cuda_helper.h" static uint32_t *d_hash[MAX_GPUS]; +static THREAD uint32_t *h_found = nullptr; -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_init(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint64_t *pdata); +extern void 
quark_blake512_cpu_setBlock_80_multi(int thr_id, uint64_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_80_multi(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); extern void quark_skein512_cpu_init(int thr_id); -extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); -extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void cuda_jh512Keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, 
int order); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); -extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const uint32_t simdthreads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); -extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); -extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); -extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); -extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag); -extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t 
*d_hash); extern void x17_sha512_cpu_init(int thr_id, uint32_t threads); -extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce,uint64_t *d_hash); extern void x17_haval256_cpu_init(int thr_id, uint32_t threads); -extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t target, uint32_t *result); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse, - int order); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t *inpHashes, + const uint32_t *d_noncesTrue, uint32_t *nrmTrue, uint32_t *d_noncesFalse, uint32_t *nrmFalse); // X17 Hashfunktion -extern "C" void x17hash(void *output, const void *input) +void x17hash(void *output, const void *input) { // blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13-shabal14-whirlpool15-sha512-haval17 @@ -176,102 +179,137 @@ extern "C" void x17hash(void *output, const void *input) memcpy(output, hash, 32); } -static bool init[MAX_GPUS] = { 0 }; +static volatile bool init[MAX_GPUS] = { false }; -extern "C" int scanhash_x17(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern int scanhash_x17(int thr_id, uint32_t *pdata, + uint32_t *ptarget, uint32_t max_nonce, + uint32_t *hashes_done) { const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1U 
<< 19); // 19=256*256*8; - throughput = min(throughput, (max_nonce - first_nonce)); + int intensity = 256 * 256 * 9; + uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 256 : 32; + if (device_sm[device_map[thr_id]] == 520) intensity = 256 * 256 * 15; + uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, intensity); // 19=256*256*8; + uint32_t throughput = min(throughputmax, (max_nonce - first_nonce)) & 0xfffffc00; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00f; + ptarget[7] = 0x03f; if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); - cudaDeviceReset(); + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); + get_cuda_arch(&cuda_arch[thr_id]); +#if defined WIN32 && !defined _WIN64 + // 2GB limit for cudaMalloc + if(throughputmax > 0x7fffffffULL / (64 * sizeof(uint4))) + { + applog(LOG_ERR, "intensity too high"); + mining_has_stopped[thr_id] = true; + cudaStreamDestroy(gpustream[thr_id]); + proper_exit(2); + } +#endif - quark_groestl512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughputmax); quark_skein512_cpu_init(thr_id); - quark_bmw512_cpu_init(thr_id, throughput); - x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); - x13_hamsi512_cpu_init(thr_id, throughput); - x13_fugue512_cpu_init(thr_id, throughput); - x15_whirlpool_cpu_init(thr_id, throughput, 0); - x17_sha512_cpu_init(thr_id, throughput); - x17_haval256_cpu_init(thr_id, throughput); - - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); - - cuda_check_cpu_init(thr_id, throughput); - + quark_bmw512_cpu_init(thr_id, throughputmax); + x11_simd512_cpu_init(thr_id, throughputmax); + x11_echo512_cpu_init(thr_id, throughputmax); + x13_hamsi512_cpu_init(thr_id, throughputmax); + x13_fugue512_cpu_init(thr_id, 
throughputmax); + x15_whirlpool_cpu_init(thr_id, throughputmax, 0); + x17_sha512_cpu_init(thr_id, throughputmax); + x17_haval256_cpu_init(thr_id, throughputmax); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughputmax)); + CUDA_SAFE_CALL(cudaMallocHost(&(h_found), 2 * sizeof(uint32_t))); + + mining_has_stopped[thr_id] = false; init[thr_id] = true; } uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - quark_blake512_cpu_setBlock_80((void*)endiandata); - cuda_check_cpu_setTarget(ptarget); + quark_blake512_cpu_setBlock_80(thr_id, (uint64_t *)endiandata); + cuda_check_cpu_setTarget(ptarget, thr_id); do { - int order = 0; - // Hash with CUDA - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x17_sha512_cpu_hash_64(thr_id, throughput, 
pdata[19], NULL, d_hash[thr_id], order++); - x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - // MyStreamSynchronize(NULL, 1, thr_id); - - uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); - if (foundNonce != UINT32_MAX) + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id]); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id]); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id]); + cuda_jh512Keccak512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19],d_hash[thr_id]); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], simdthreads); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19],d_hash[thr_id]); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19],d_hash[thr_id]); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], (uint64_t*)d_hash[thr_id]); + x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], ptarget[7], h_found); + + if(stop_mining) { mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr); } + if(h_found[0] != 0xffffffff) { const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); - x17hash(vhash64, endiandata); + uint32_t vhash64[8] = {0}; + if(opt_verify) + { + be32enc(&endiandata[19], h_found[0]); + x17hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + } if(vhash64[7] <= Htarg && 
fulltest(vhash64, ptarget)) + { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce); *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; + if(h_found[1] != 0xffffffff) + { + if(opt_verify) + { + be32enc(&endiandata[19], h_found[1]); + x13hash(vhash64, endiandata); + } if(vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) + { + + pdata[21] = h_found[1]; + res++; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d Found second nonce %08x", device_map[thr_id], h_found[1]); + } + else + { + if(vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[1]); + } + } + } - if (opt_benchmark) applog(LOG_INFO, "found nounce", thr_id, foundNonce, vhash64[7], Htarg); - pdata[19] = foundNonce; + pdata[19] = h_found[0]; + if(opt_benchmark) + applog(LOG_INFO, "GPU #%d Found nonce %08x", device_map[thr_id], h_found[0]); return res; } else { - applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + if(vhash64[7] != Htarg) + { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], h_found[0]); + } } } - - pdata[19] += throughput; + pdata[19] += throughput; CUDA_SAFE_CALL(cudaGetLastError()); } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce ; return 0; }