diff --git a/src/crypto/slow-hash.c b/src/crypto/slow-hash.c index 88eb751a6..5f2320d5f 100644 --- a/src/crypto/slow-hash.c +++ b/src/crypto/slow-hash.c @@ -40,9 +40,6 @@ #include "oaes_lib.h" #include "variant2_int_sqrt.h" #include "variant4_random_math.h" -#include "CryptonightR_JIT.h" - -#include #define MEMORY (1 << 21) // 2MB scratchpad #define ITER (1 << 20) @@ -54,41 +51,6 @@ extern void aesb_single_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey); extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey); -static void local_abort(const char *msg) -{ - fprintf(stderr, "%s\n", msg); -#ifdef NDEBUG - _exit(1); -#else - abort(); -#endif -} - -volatile int use_v4_jit_flag = -1; - -static inline int use_v4_jit(void) -{ -#if defined(__x86_64__) - - if (use_v4_jit_flag != -1) - return use_v4_jit_flag; - - const char *env = getenv("MONERO_USE_CNV4_JIT"); - if (!env) { - use_v4_jit_flag = 1; - } - else if (!strcmp(env, "0") || !strcmp(env, "no")) { - use_v4_jit_flag = 0; - } - else { - use_v4_jit_flag = 1; - } - return use_v4_jit_flag; -#else - return 0; -#endif -} - #define VARIANT1_1(p) \ do if (variant == 1) \ { \ @@ -155,74 +117,48 @@ static inline int use_v4_jit(void) #define VARIANT2_SHUFFLE_ADD_SSE2(base_ptr, offset) \ do if (variant >= 2) \ { \ - __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \ + const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \ const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \ const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ - if (variant >= 4) \ - { \ - chunk1 = _mm_xor_si128(chunk1, chunk2); \ - _c = _mm_xor_si128(_c, chunk3); \ - _c = _mm_xor_si128(_c, chunk1); \ - } \ } while (0) #define VARIANT2_SHUFFLE_ADD_NEON(base_ptr, offset) \ do if (variant >= 2) \ { \ - uint64x2_t chunk1 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x10))); \ + const uint64x2_t chunk1 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x10))); \ const uint64x2_t chunk2 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x20))); \ const uint64x2_t chunk3 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x30))); \ vst1q_u64(U64((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \ vst1q_u64(U64((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \ vst1q_u64(U64((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \ - if (variant >= 4) \ - { \ - chunk1 = veorq_u64(chunk1, chunk2); \ - _c = vreinterpretq_u8_u64(veorq_u64(vreinterpretq_u64_u8(_c), chunk3)); \ - _c = vreinterpretq_u8_u64(veorq_u64(vreinterpretq_u64_u8(_c), chunk1)); \ - } \ } while (0) -#define VARIANT2_PORTABLE_SHUFFLE_ADD(out, a_, base_ptr, offset) \ +#define VARIANT2_PORTABLE_SHUFFLE_ADD(base_ptr, offset) \ do if (variant >= 2) \ { \ uint64_t* chunk1 = U64((base_ptr) + ((offset) ^ 0x10)); \ uint64_t* chunk2 = U64((base_ptr) + ((offset) ^ 0x20)); \ uint64_t* chunk3 = U64((base_ptr) + ((offset) ^ 0x30)); \ \ - uint64_t chunk1_old[2] = { SWAP64LE(chunk1[0]), SWAP64LE(chunk1[1]) }; \ - const uint64_t chunk2_old[2] = { SWAP64LE(chunk2[0]), SWAP64LE(chunk2[1]) }; \ - const uint64_t chunk3_old[2] = { SWAP64LE(chunk3[0]), SWAP64LE(chunk3[1]) }; \ + const uint64_t chunk1_old[2] = { chunk1[0], chunk1[1] }; \ \ uint64_t b1[2]; \ memcpy_swap64le(b1, b + 16, 2); \ - chunk1[0] = SWAP64LE(chunk3_old[0] + b1[0]); \ - chunk1[1] = SWAP64LE(chunk3_old[1] + b1[1]); \ + chunk1[0] = SWAP64LE(SWAP64LE(chunk3[0]) + b1[0]); \ + chunk1[1] = SWAP64LE(SWAP64LE(chunk3[1]) + b1[1]); \ \ uint64_t a0[2]; \ - memcpy_swap64le(a0, a_, 2); \ - chunk3[0] = SWAP64LE(chunk2_old[0] + a0[0]); \ - chunk3[1] = SWAP64LE(chunk2_old[1] + a0[1]); \ + memcpy_swap64le(a0, a, 2); \ + chunk3[0] = SWAP64LE(SWAP64LE(chunk2[0]) + a0[0]); \ + chunk3[1] = SWAP64LE(SWAP64LE(chunk2[1]) + a0[1]); \ \ uint64_t b0[2]; \ memcpy_swap64le(b0, b, 2); \ - chunk2[0] = SWAP64LE(chunk1_old[0] + b0[0]); \ - chunk2[1] = SWAP64LE(chunk1_old[1] + b0[1]); \ - if (variant >= 4) \ - { \ - uint64_t out_copy[2]; \ - memcpy_swap64le(out_copy, out, 2); \ - chunk1_old[0] ^= chunk2_old[0]; \ - chunk1_old[1] ^= chunk2_old[1]; \ - out_copy[0] ^= chunk3_old[0]; \ - out_copy[1] ^= chunk3_old[1]; \ - out_copy[0] ^= chunk1_old[0]; \ - out_copy[1] ^= chunk1_old[1]; \ - memcpy_swap64le(out, out_copy, 2); \ - } \ + chunk2[0] = SWAP64LE(SWAP64LE(chunk1_old[0]) + b0[0]); \ + chunk2[1] = SWAP64LE(SWAP64LE(chunk1_old[1]) + b0[1]); \ } while (0) #define VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr) \ @@ -265,18 +201,18 @@ static inline int use_v4_jit(void) #endif #define VARIANT2_2_PORTABLE() \ - if (variant == 2 || variant == 3) { \ + if (variant >= 2) { \ xor_blocks(long_state + (j ^ 0x10), d); \ xor_blocks(d, long_state + (j ^ 0x20)); \ } #define VARIANT2_2() \ - do if (variant == 2 || variant == 3) \ + do if (variant >= 2) \ { \ - *U64(local_hp_state + (j ^ 0x10)) ^= SWAP64LE(hi); \ - *(U64(local_hp_state + (j ^ 0x10)) + 1) ^= SWAP64LE(lo); \ - hi ^= SWAP64LE(*U64(local_hp_state + (j ^ 0x20))); \ - lo ^= SWAP64LE(*(U64(local_hp_state + (j ^ 0x20)) + 1)); \ + *U64(hp_state + (j ^ 0x10)) ^= SWAP64LE(hi); \ + *(U64(hp_state + (j ^ 0x10)) + 1) ^= SWAP64LE(lo); \ + hi ^= SWAP64LE(*U64(hp_state + (j ^ 0x20))); \ + lo ^= SWAP64LE(*(U64(hp_state + (j ^ 0x20)) + 1)); \ } while (0) #define V4_REG_LOAD(dst, src) \ @@ -289,56 +225,34 @@ static inline int use_v4_jit(void) } while (0) #define VARIANT4_RANDOM_MATH_INIT() \ - v4_reg r[9]; \ - struct V4_Instruction code[NUM_INSTRUCTIONS_MAX + 1]; \ - int jit = use_v4_jit(); \ + v4_reg r[8]; \ + struct V4_Instruction code[TOTAL_LATENCY * ALU_COUNT + 1]; \ do if (variant >= 4) \ { \ for (int i = 0; i < 4; ++i) \ V4_REG_LOAD(r + i, (uint8_t*)(state.hs.w + 12) + sizeof(v4_reg) * i); \ v4_random_math_init(code, height); \ - if (jit) \ - { \ - int ret = v4_generate_JIT_code(code, hp_jitfunc, 4096); \ - if (ret < 0) \ - local_abort("Error generating CryptonightR code"); \ - } \ } while (0) #define VARIANT4_RANDOM_MATH(a, b, r, _b, _b1) \ do if (variant >= 4) \ { \ - uint64_t t[2]; \ - memcpy(t, b, sizeof(uint64_t)); \ + uint64_t t; \ + memcpy(&t, b, sizeof(uint64_t)); \ \ if (sizeof(v4_reg) == sizeof(uint32_t)) \ - t[0] ^= SWAP64LE((r[0] + r[1]) | ((uint64_t)(r[2] + r[3]) << 32)); \ + t ^= SWAP64LE((r[0] + r[1]) | ((uint64_t)(r[2] + r[3]) << 32)); \ else \ - t[0] ^= SWAP64LE((r[0] + r[1]) ^ (r[2] + r[3])); \ + t ^= SWAP64LE((r[0] + r[1]) ^ (r[2] + r[3])); \ \ - memcpy(b, t, sizeof(uint64_t)); \ + memcpy(b, &t, sizeof(uint64_t)); \ \ V4_REG_LOAD(r + 4, a); \ V4_REG_LOAD(r + 5, (uint64_t*)(a) + 1); \ V4_REG_LOAD(r + 6, _b); \ V4_REG_LOAD(r + 7, _b1); \ - V4_REG_LOAD(r + 8, (uint64_t*)(_b1) + 1); \ \ - if (jit) \ - (*hp_jitfunc)(r); \ - else \ - v4_random_math(code, r); \ - \ - memcpy(t, a, sizeof(uint64_t) * 2); \ - \ - if (sizeof(v4_reg) == sizeof(uint32_t)) { \ - t[0] ^= SWAP64LE(r[2] | ((uint64_t)(r[3]) << 32)); \ - t[1] ^= SWAP64LE(r[0] | ((uint64_t)(r[1]) << 32)); \ - } else { \ - t[0] ^= SWAP64LE(r[2] ^ r[3]); \ - t[1] ^= SWAP64LE(r[0] ^ r[1]); \ - } \ - memcpy(a, t, sizeof(uint64_t) * 2); \ + v4_random_math(code, r); \ } while (0) @@ -404,7 +318,7 @@ static inline int use_v4_jit(void) #define pre_aes() \ j = state_index(a); \ - _c = _mm_load_si128(R128(&local_hp_state[j])); \ + _c = _mm_load_si128(R128(&hp_state[j])); \ _a = _mm_load_si128(R128(a)); \ /* @@ -417,20 +331,20 @@ static inline int use_v4_jit(void) * This code is based upon an optimized implementation by dga. */ #define post_aes() \ - VARIANT2_SHUFFLE_ADD_SSE2(local_hp_state, j); \ + VARIANT2_SHUFFLE_ADD_SSE2(hp_state, j); \ _mm_store_si128(R128(c), _c); \ - _mm_store_si128(R128(&local_hp_state[j]), _mm_xor_si128(_b, _c)); \ - VARIANT1_1(&local_hp_state[j]); \ + _mm_store_si128(R128(&hp_state[j]), _mm_xor_si128(_b, _c)); \ + VARIANT1_1(&hp_state[j]); \ j = state_index(c); \ - p = U64(&local_hp_state[j]); \ + p = U64(&hp_state[j]); \ b[0] = p[0]; b[1] = p[1]; \ VARIANT2_INTEGER_MATH_SSE2(b, c); \ VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \ __mul(); \ VARIANT2_2(); \ - VARIANT2_SHUFFLE_ADD_SSE2(local_hp_state, j); \ + VARIANT2_SHUFFLE_ADD_SSE2(hp_state, j); \ a[0] += hi; a[1] += lo; \ - p = U64(&local_hp_state[j]); \ + p = U64(&hp_state[j]); \ p[0] = a[0]; p[1] = a[1]; \ a[0] ^= b[0]; a[1] ^= b[1]; \ VARIANT1_2(p + 1); \ @@ -457,9 +371,6 @@ union cn_slow_hash_state THREADV uint8_t *hp_state = NULL; THREADV int hp_allocated = 0; -THREADV v4_random_math_JIT_func hp_jitfunc = NULL; -THREADV uint8_t *hp_jitfunc_memory = NULL; -THREADV int hp_jitfunc_allocated = 0; #if defined(_MSC_VER) #define cpuid(info,x) __cpuidex(info,x,0) @@ -755,10 +666,10 @@ void cn_slow_hash_allocate_state(void) #if defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \ defined(__DragonFly__) || defined(__NetBSD__) hp_state = mmap(0, MEMORY, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, -1, 0); + MAP_PRIVATE | MAP_ANON, 0, 0); #else hp_state = mmap(0, MEMORY, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, 0, 0); #endif if(hp_state == MAP_FAILED) hp_state = NULL; @@ -769,35 +680,6 @@ void cn_slow_hash_allocate_state(void) hp_allocated = 0; hp_state = (uint8_t *) malloc(MEMORY); } - - -#if defined(_MSC_VER) || defined(__MINGW32__) - hp_jitfunc_memory = (uint8_t *) VirtualAlloc(hp_jitfunc_memory, 4096 + 4095, - MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE); -#else -#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \ - defined(__DragonFly__) || defined(__NetBSD__) -#ifdef __NetBSD__ -#define RESERVED_FLAGS PROT_MPROTECT(PROT_EXEC) -#else -#define RESERVED_FLAGS 0 -#endif - hp_jitfunc_memory = mmap(0, 4096 + 4096, PROT_READ | PROT_WRITE | RESERVED_FLAGS, - MAP_PRIVATE | MAP_ANON, -1, 0); -#else - hp_jitfunc_memory = mmap(0, 4096 + 4096, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); -#endif - if(hp_jitfunc_memory == MAP_FAILED) - hp_jitfunc_memory = NULL; -#endif - hp_jitfunc_allocated = 1; - if (hp_jitfunc_memory == NULL) - { - hp_jitfunc_allocated = 0; - hp_jitfunc_memory = malloc(4096 + 4095); - } - hp_jitfunc = (v4_random_math_JIT_func)((size_t)(hp_jitfunc_memory + 4095) & ~4095); } /** @@ -820,22 +702,8 @@ void cn_slow_hash_free_state(void) #endif } - if(!hp_jitfunc_allocated) - free(hp_jitfunc_memory); - else - { -#if defined(_MSC_VER) || defined(__MINGW32__) - VirtualFree(hp_jitfunc_memory, 0, MEM_RELEASE); -#else - munmap(hp_jitfunc_memory, 4096 + 4095); -#endif - } - hp_state = NULL; hp_allocated = 0; - hp_jitfunc = NULL; - hp_jitfunc_memory = NULL; - hp_jitfunc_allocated = 0; } /** @@ -919,7 +787,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) { aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK); - memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE); + memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE); } } else @@ -931,7 +799,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int for(j = 0; j < INIT_SIZE_BLK; j++) aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data); - memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE); + memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE); } } @@ -979,7 +847,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) { // add the xor to the pseudo round - aes_pseudo_round_xor(text, text, expandedKey, &local_hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK); + aes_pseudo_round_xor(text, text, expandedKey, &hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK); } } else @@ -989,7 +857,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int { for(j = 0; j < INIT_SIZE_BLK; j++) { - xor_blocks(&text[j * AES_BLOCK_SIZE], &local_hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]); + xor_blocks(&text[j * AES_BLOCK_SIZE], &hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]); aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data); } } @@ -1033,8 +901,6 @@ void cn_slow_hash_free_state(void) #define U64(x) ((uint64_t *) (x)) -#define hp_jitfunc ((v4_random_math_JIT_func)NULL) - STATIC INLINE void xor64(uint64_t *a, const uint64_t b) { *a ^= b; @@ -1069,24 +935,24 @@ union cn_slow_hash_state #define pre_aes() \ j = state_index(a); \ - _c = vld1q_u8(&local_hp_state[j]); \ + _c = vld1q_u8(&hp_state[j]); \ _a = vld1q_u8((const uint8_t *)a); \ #define post_aes() \ - VARIANT2_SHUFFLE_ADD_NEON(local_hp_state, j); \ + VARIANT2_SHUFFLE_ADD_NEON(hp_state, j); \ vst1q_u8((uint8_t *)c, _c); \ - vst1q_u8(&local_hp_state[j], veorq_u8(_b, _c)); \ - VARIANT1_1(&local_hp_state[j]); \ + vst1q_u8(&hp_state[j], veorq_u8(_b, _c)); \ + VARIANT1_1(&hp_state[j]); \ j = state_index(c); \ - p = U64(&local_hp_state[j]); \ + p = U64(&hp_state[j]); \ b[0] = p[0]; b[1] = p[1]; \ VARIANT2_PORTABLE_INTEGER_MATH(b, c); \ VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \ __mul(); \ VARIANT2_2(); \ - VARIANT2_SHUFFLE_ADD_NEON(local_hp_state, j); \ + VARIANT2_SHUFFLE_ADD_NEON(hp_state, j); \ a[0] += hi; a[1] += lo; \ - p = U64(&local_hp_state[j]); \ + p = U64(&hp_state[j]); \ p[0] = a[0]; p[1] = a[1]; \ a[0] ^= b[0]; a[1] ^= b[1]; \ VARIANT1_2(p + 1); \ @@ -1100,47 +966,47 @@ union cn_slow_hash_state */ static void aes_expand_key(const uint8_t *key, uint8_t *expandedKey) { static const int rcon[] = { - 0x01,0x01,0x01,0x01, - 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d, // rotate-n-splat - 0x1b,0x1b,0x1b,0x1b }; + 0x01,0x01,0x01,0x01, + 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d, // rotate-n-splat + 0x1b,0x1b,0x1b,0x1b }; __asm__( -" eor v0.16b,v0.16b,v0.16b\n" -" ld1 {v3.16b},[%0],#16\n" -" ld1 {v1.4s,v2.4s},[%2],#32\n" -" ld1 {v4.16b},[%0]\n" -" mov w2,#5\n" -" st1 {v3.4s},[%1],#16\n" +" eor v0.16b,v0.16b,v0.16b\n" +" ld1 {v3.16b},[%0],#16\n" +" ld1 {v1.4s,v2.4s},[%2],#32\n" +" ld1 {v4.16b},[%0]\n" +" mov w2,#5\n" +" st1 {v3.4s},[%1],#16\n" "\n" "1:\n" -" tbl v6.16b,{v4.16b},v2.16b\n" -" ext v5.16b,v0.16b,v3.16b,#12\n" -" st1 {v4.4s},[%1],#16\n" -" aese v6.16b,v0.16b\n" -" subs w2,w2,#1\n" +" tbl v6.16b,{v4.16b},v2.16b\n" +" ext v5.16b,v0.16b,v3.16b,#12\n" +" st1 {v4.4s},[%1],#16\n" +" aese v6.16b,v0.16b\n" +" subs w2,w2,#1\n" "\n" -" eor v3.16b,v3.16b,v5.16b\n" -" ext v5.16b,v0.16b,v5.16b,#12\n" -" eor v3.16b,v3.16b,v5.16b\n" -" ext v5.16b,v0.16b,v5.16b,#12\n" -" eor v6.16b,v6.16b,v1.16b\n" -" eor v3.16b,v3.16b,v5.16b\n" -" shl v1.16b,v1.16b,#1\n" -" eor v3.16b,v3.16b,v6.16b\n" -" st1 {v3.4s},[%1],#16\n" -" b.eq 2f\n" +" eor v3.16b,v3.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v3.16b,v3.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v6.16b,v6.16b,v1.16b\n" +" eor v3.16b,v3.16b,v5.16b\n" +" shl v1.16b,v1.16b,#1\n" +" eor v3.16b,v3.16b,v6.16b\n" +" st1 {v3.4s},[%1],#16\n" +" b.eq 2f\n" "\n" -" dup v6.4s,v3.s[3] // just splat\n" -" ext v5.16b,v0.16b,v4.16b,#12\n" -" aese v6.16b,v0.16b\n" +" dup v6.4s,v3.s[3] // just splat\n" +" ext v5.16b,v0.16b,v4.16b,#12\n" +" aese v6.16b,v0.16b\n" "\n" -" eor v4.16b,v4.16b,v5.16b\n" -" ext v5.16b,v0.16b,v5.16b,#12\n" -" eor v4.16b,v4.16b,v5.16b\n" -" ext v5.16b,v0.16b,v5.16b,#12\n" -" eor v4.16b,v4.16b,v5.16b\n" +" eor v4.16b,v4.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v4.16b,v4.16b,v5.16b\n" +" ext v5.16b,v0.16b,v5.16b,#12\n" +" eor v4.16b,v4.16b,v5.16b\n" "\n" -" eor v4.16b,v4.16b,v6.16b\n" -" b 1b\n" +" eor v4.16b,v4.16b,v6.16b\n" +" b 1b\n" "\n" "2:\n" : : "r"(key), "r"(expandedKey), "r"(rcon)); } @@ -1155,71 +1021,71 @@ __asm__( */ STATIC INLINE void aes_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey, int nblocks) { - const uint8x16_t *k = (const uint8x16_t *)expandedKey, zero = {0}; - uint8x16_t tmp; - int i; + const uint8x16_t *k = (const uint8x16_t *)expandedKey, zero = {0}; + uint8x16_t tmp; + int i; - for (i=0; i= 2) { copy_block(b + AES_BLOCK_SIZE, b); } - copy_block(b, c1); - copy_block(a, a1); + copy_block(b, a); + copy_block(a, c1); } memcpy(text, state.init, INIT_SIZE_BYTE); diff --git a/src/crypto/variant4_random_math.h b/src/crypto/variant4_random_math.h index f3e41a001..2c190287b 100644 --- a/src/crypto/variant4_random_math.h +++ b/src/crypto/variant4_random_math.h @@ -10,11 +10,8 @@ enum V4_Settings TOTAL_LATENCY = 15 * 3, // Always generate at least 60 instructions - NUM_INSTRUCTIONS_MIN = 60, - - // Never generate more than 70 instructions (final RET instruction doesn't count here) - NUM_INSTRUCTIONS_MAX = 70, - + NUM_INSTRUCTIONS = 60, + // Available ALUs for MUL // Modern CPUs typically have only 1 ALU which can do multiplications ALU_COUNT_MUL = 1, @@ -39,9 +36,10 @@ enum V4_InstructionList // V4_InstructionDefinition is used to generate code from random data // Every random sequence of bytes is a valid code // -// There are 9 registers in total: +// There are 8 registers in total: // - 4 variable registers -// - 5 constant registers initialized from loop variables +// - 4 constant registers initialized from loop variables +// // This is why dst_index is 2 bits enum V4_InstructionDefinition { @@ -59,9 +57,9 @@ struct V4_Instruction }; #ifndef FORCEINLINE -#if defined(__GNUC__) +#ifdef __GNUC__ #define FORCEINLINE __attribute__((always_inline)) inline -#elif defined(_MSC_VER) +#elif _MSC_VER #define FORCEINLINE __forceinline #else #define FORCEINLINE inline @@ -69,9 +67,9 @@ struct V4_Instruction #endif #ifndef UNREACHABLE_CODE -#if defined(__GNUC__) +#ifdef __GNUC__ #define UNREACHABLE_CODE __builtin_unreachable() -#elif defined(_MSC_VER) +#elif _MSC_VER #define UNREACHABLE_CODE __assume(false) #else #define UNREACHABLE_CODE @@ -143,16 +141,16 @@ static FORCEINLINE void v4_random_math(const struct V4_Instruction* code, v4_reg // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency // I've checked all block heights < 10,000,000 and here is the distribution of program sizes: // - // 60 27960 - // 61 105054 - // 62 2452759 - // 63 5115997 - // 64 1022269 - // 65 1109635 - // 66 153145 - // 67 8550 - // 68 4529 - // 69 102 + // 60 28495 + // 61 106077 + // 62 2455855 + // 63 5114930 + // 64 1020868 + // 65 1109026 + // 66 151756 + // 67 8429 + // 68 4477 + // 69 87 // Unroll 70 instructions here V4_EXEC_10(0); // instructions 0-9 @@ -178,7 +176,6 @@ static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed } // Generates as many random math operations as possible with given latency and ALU restrictions -// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_t height) { // MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle @@ -200,7 +197,6 @@ static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_ memset(data, 0, sizeof(data)); uint64_t tmp = SWAP64LE(height); memcpy(data, &tmp, sizeof(uint64_t)); - data[20] = -38; // change seed // Set data_index past the last byte in data // to trigger full data update with blake hash @@ -208,22 +204,18 @@ static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_ size_t data_index = sizeof(data); int code_size; - - // There is a small chance (1.8%) that register R8 won't be used in the generated program - // So we keep track of it and try again if it's not used - bool r8_used; do { - int latency[9]; - int asic_latency[9]; + int latency[8]; + int asic_latency[8]; // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution // byte 0: current value of the destination register // byte 1: instruction opcode // byte 2: current value of the source register // - // Registers R4-R8 are constant and are treated as having the same value because when we do + // Registers R4-R7 are constant and are treated as having the same value because when we do // the same operation twice with two constant source registers, it can be optimized into a single operation - uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; + uint32_t inst_data[8] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT]; bool is_rotation[V4_INSTRUCTION_COUNT]; @@ -242,7 +234,6 @@ static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_ code_size = 0; int total_iterations = 0; - r8_used = false; // Generate random code to achieve minimal required latency for our abstract CPU // Try to get this latency for all 4 registers @@ -286,9 +277,9 @@ static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_ // Don't do ADD/SUB/XOR with the same register if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) { - // Use register R8 as source instead - b = 8; - src_index = 8; + // a is always < 4, so we don't need to check bounds here + b = a + 4; + src_index = b; } // Don't do rotation with the same destination twice because it's equal to a single rotation @@ -368,11 +359,6 @@ static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_ code[code_size].src_index = src_index; code[code_size].C = 0; - if (src_index == 8) - { - r8_used = true; - } - if (opcode == ADD) { // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too @@ -387,7 +373,7 @@ static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_ } ++code_size; - if (code_size >= NUM_INSTRUCTIONS_MIN) + if (code_size >= NUM_INSTRUCTIONS) { break; } @@ -402,7 +388,7 @@ static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_ // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC // Get this latency for at least 1 of the 4 registers const int prev_code_size = code_size; - while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) + while ((asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) { int min_idx = 0; int max_idx = 0; @@ -424,11 +410,9 @@ static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_ ++code_size; } - // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time - // It never does more than 4 iterations for all block heights < 10,000,000 - } while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); + // There is ~99.8% chance that code_size >= NUM_INSTRUCTIONS here, so second iteration is required rarely + } while (code_size < NUM_INSTRUCTIONS); - // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here // Add final instruction to stop the interpreter code[code_size].opcode = RET; code[code_size].dst_index = 0;