Mirror of https://git.wownero.com/wownero/wownero.git (synced 2024-08-15 01:03:23 +00:00)
Wownero fails to compile on armv7a. To fix we can:

- update src/crypto/slow-hash.c to the latest version that Monero currently has
- modify variant4_random_math.h to facilitate the changes in slow-hash.c

In short: src/crypto/slow-hash.c is now up to date with upstream Monero. The next Wownero version will have these changes automatically as the codebase follows Monero, rendering this commit obsolete in the process.
parent 07864cc53f
commit 44c482913f
2 changed files with 463 additions and 210 deletions
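Before the diff itself, a minimal sketch of how a caller exercises the code paths this patch touches. It is an illustration only, based on what the diff shows: the cn_slow_hash() signature with its height argument, the per-thread cn_slow_hash_allocate_state()/cn_slow_hash_free_state() pair, and the MONERO_USE_CNV4_JIT / MONERO_USE_SOFTWARE_AES environment toggles. The 32-byte output size and the forward declarations are assumptions, not taken from this commit.

// Sketch: drive the updated slow-hash code paths (hypothetical caller, not from the repo).
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Assumed prototypes; in the tree they come from the crypto headers.
void cn_slow_hash_allocate_state(void);
void cn_slow_hash_free_state(void);
void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height);

int main(void)
{
    // Runtime toggles honoured by the updated slow-hash.c:
    //   MONERO_USE_CNV4_JIT=0     -> skip the CryptonightR JIT (it is x86_64-only anyway)
    //   MONERO_USE_SOFTWARE_AES=1 -> force the software AES path
    setenv("MONERO_USE_CNV4_JIT", "0", 1);

    const char blob[] = "example hashing blob";
    char hash[32] = {0};       // CryptoNight-style 32-byte digest (assumption)
    uint64_t height = 123456;  // variant >= 4 derives its random-math program from the height

    cn_slow_hash_allocate_state();  // per-thread scratchpad (and, after this patch, JIT buffer)
    cn_slow_hash(blob, sizeof(blob) - 1, hash, 4, 0, height);
    cn_slow_hash_free_state();

    for (int i = 0; i < 32; i++)
        printf("%02x", (unsigned char)hash[i]);
    printf("\n");
    return 0;
}

The environment checks correspond to the use_v4_jit() and force_software_aes() helpers added in the first hunks below.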
src/crypto/slow-hash.c
@@ -1,4 +1,4 @@
-// Copyright (c) 2014-2020, The Monero Project
+// Copyright (c) 2014-2022, The Monero Project
 //
 // All rights reserved.
 //
@@ -40,6 +40,9 @@
 #include "oaes_lib.h"
 #include "variant2_int_sqrt.h"
 #include "variant4_random_math.h"
+#include "CryptonightR_JIT.h"
 
+#include <errno.h>
+
 #define MEMORY (1 << 21) // 2MB scratchpad
 #define ITER (1 << 20)
@@ -48,9 +51,72 @@
 #define INIT_SIZE_BLK 8
 #define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
 
+#if defined(_MSC_VER)
+#define THREADV __declspec(thread)
+#else
+#define THREADV __thread
+#endif
+
 extern void aesb_single_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
 extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
 
+static void local_abort(const char *msg)
+{
+  fprintf(stderr, "%s\n", msg);
+#ifdef NDEBUG
+  _exit(1);
+#else
+  abort();
+#endif
+}
+
+volatile int use_v4_jit_flag = -1;
+
+static inline int use_v4_jit(void)
+{
+#if defined(__x86_64__)
+
+  if (use_v4_jit_flag != -1)
+    return use_v4_jit_flag;
+
+  const char *env = getenv("MONERO_USE_CNV4_JIT");
+  if (!env) {
+    use_v4_jit_flag = 1;
+  }
+  else if (!strcmp(env, "0") || !strcmp(env, "no")) {
+    use_v4_jit_flag = 0;
+  }
+  else {
+    use_v4_jit_flag = 1;
+  }
+  return use_v4_jit_flag;
+#else
+  return 0;
+#endif
+}
+
+#if defined(__x86_64__) || defined(__aarch64__)
+static inline int force_software_aes(void)
+{
+  static int use = -1;
+
+  if (use != -1)
+    return use;
+
+  const char *env = getenv("MONERO_USE_SOFTWARE_AES");
+  if (!env) {
+    use = 0;
+  }
+  else if (!strcmp(env, "0") || !strcmp(env, "no")) {
+    use = 0;
+  }
+  else {
+    use = 1;
+  }
+  return use;
+}
+#endif
+
 #define VARIANT1_1(p) \
   do if (variant == 1) \
   { \
@@ -117,48 +183,74 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex
 #define VARIANT2_SHUFFLE_ADD_SSE2(base_ptr, offset) \
   do if (variant >= 2) \
   { \
-    const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
+    __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
     const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
     const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
     _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
     _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
     _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
+    if (variant >= 4) \
+    { \
+      chunk1 = _mm_xor_si128(chunk1, chunk2); \
+      _c = _mm_xor_si128(_c, chunk3); \
+      _c = _mm_xor_si128(_c, chunk1); \
+    } \
   } while (0)
 
 #define VARIANT2_SHUFFLE_ADD_NEON(base_ptr, offset) \
   do if (variant >= 2) \
   { \
-    const uint64x2_t chunk1 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x10))); \
+    uint64x2_t chunk1 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x10))); \
     const uint64x2_t chunk2 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x20))); \
     const uint64x2_t chunk3 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x30))); \
     vst1q_u64(U64((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \
     vst1q_u64(U64((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \
     vst1q_u64(U64((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \
+    if (variant >= 4) \
+    { \
+      chunk1 = veorq_u64(chunk1, chunk2); \
+      _c = vreinterpretq_u8_u64(veorq_u64(vreinterpretq_u64_u8(_c), chunk3)); \
+      _c = vreinterpretq_u8_u64(veorq_u64(vreinterpretq_u64_u8(_c), chunk1)); \
+    } \
   } while (0)
 
-#define VARIANT2_PORTABLE_SHUFFLE_ADD(base_ptr, offset) \
+#define VARIANT2_PORTABLE_SHUFFLE_ADD(out, a_, base_ptr, offset) \
   do if (variant >= 2) \
   { \
     uint64_t* chunk1 = U64((base_ptr) + ((offset) ^ 0x10)); \
     uint64_t* chunk2 = U64((base_ptr) + ((offset) ^ 0x20)); \
     uint64_t* chunk3 = U64((base_ptr) + ((offset) ^ 0x30)); \
     \
-    const uint64_t chunk1_old[2] = { chunk1[0], chunk1[1] }; \
+    uint64_t chunk1_old[2] = { SWAP64LE(chunk1[0]), SWAP64LE(chunk1[1]) }; \
+    const uint64_t chunk2_old[2] = { SWAP64LE(chunk2[0]), SWAP64LE(chunk2[1]) }; \
+    const uint64_t chunk3_old[2] = { SWAP64LE(chunk3[0]), SWAP64LE(chunk3[1]) }; \
     \
     uint64_t b1[2]; \
     memcpy_swap64le(b1, b + 16, 2); \
-    chunk1[0] = SWAP64LE(SWAP64LE(chunk3[0]) + b1[0]); \
-    chunk1[1] = SWAP64LE(SWAP64LE(chunk3[1]) + b1[1]); \
+    chunk1[0] = SWAP64LE(chunk3_old[0] + b1[0]); \
+    chunk1[1] = SWAP64LE(chunk3_old[1] + b1[1]); \
     \
     uint64_t a0[2]; \
-    memcpy_swap64le(a0, a, 2); \
-    chunk3[0] = SWAP64LE(SWAP64LE(chunk2[0]) + a0[0]); \
-    chunk3[1] = SWAP64LE(SWAP64LE(chunk2[1]) + a0[1]); \
+    memcpy_swap64le(a0, a_, 2); \
+    chunk3[0] = SWAP64LE(chunk2_old[0] + a0[0]); \
+    chunk3[1] = SWAP64LE(chunk2_old[1] + a0[1]); \
     \
     uint64_t b0[2]; \
     memcpy_swap64le(b0, b, 2); \
-    chunk2[0] = SWAP64LE(SWAP64LE(chunk1_old[0]) + b0[0]); \
-    chunk2[1] = SWAP64LE(SWAP64LE(chunk1_old[1]) + b0[1]); \
+    chunk2[0] = SWAP64LE(chunk1_old[0] + b0[0]); \
+    chunk2[1] = SWAP64LE(chunk1_old[1] + b0[1]); \
+    if (variant >= 4) \
+    { \
+      uint64_t out_copy[2]; \
+      memcpy_swap64le(out_copy, out, 2); \
+      chunk1_old[0] ^= chunk2_old[0]; \
+      chunk1_old[1] ^= chunk2_old[1]; \
+      out_copy[0] ^= chunk3_old[0]; \
+      out_copy[1] ^= chunk3_old[1]; \
+      out_copy[0] ^= chunk1_old[0]; \
+      out_copy[1] ^= chunk1_old[1]; \
+      memcpy_swap64le(out, out_copy, 2); \
+    } \
   } while (0)
 
 #define VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr) \
@@ -201,18 +293,18 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex
 #endif
 
 #define VARIANT2_2_PORTABLE() \
-  if (variant >= 2) { \
+  if (variant == 2 || variant == 3) { \
     xor_blocks(long_state + (j ^ 0x10), d); \
     xor_blocks(d, long_state + (j ^ 0x20)); \
   }
 
 #define VARIANT2_2() \
-  do if (variant >= 2) \
+  do if (variant == 2 || variant == 3) \
   { \
-    *U64(hp_state + (j ^ 0x10)) ^= SWAP64LE(hi); \
-    *(U64(hp_state + (j ^ 0x10)) + 1) ^= SWAP64LE(lo); \
-    hi ^= SWAP64LE(*U64(hp_state + (j ^ 0x20))); \
-    lo ^= SWAP64LE(*(U64(hp_state + (j ^ 0x20)) + 1)); \
+    *U64(local_hp_state + (j ^ 0x10)) ^= SWAP64LE(hi); \
+    *(U64(local_hp_state + (j ^ 0x10)) + 1) ^= SWAP64LE(lo); \
+    hi ^= SWAP64LE(*U64(local_hp_state + (j ^ 0x20))); \
+    lo ^= SWAP64LE(*(U64(local_hp_state + (j ^ 0x20)) + 1)); \
   } while (0)
 
 #define V4_REG_LOAD(dst, src) \
@@ -225,34 +317,56 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex
   } while (0)
 
 #define VARIANT4_RANDOM_MATH_INIT() \
-  v4_reg r[8]; \
-  struct V4_Instruction code[TOTAL_LATENCY * ALU_COUNT + 1]; \
+  v4_reg r[9]; \
+  struct V4_Instruction code[NUM_INSTRUCTIONS_MAX + 1]; \
+  int jit = use_v4_jit(); \
   do if (variant >= 4) \
   { \
     for (int i = 0; i < 4; ++i) \
       V4_REG_LOAD(r + i, (uint8_t*)(state.hs.w + 12) + sizeof(v4_reg) * i); \
     v4_random_math_init(code, height); \
+    if (jit) \
+    { \
+      int ret = v4_generate_JIT_code(code, hp_jitfunc, 4096); \
+      if (ret < 0) \
+        local_abort("Error generating CryptonightR code"); \
+    } \
   } while (0)
 
 #define VARIANT4_RANDOM_MATH(a, b, r, _b, _b1) \
   do if (variant >= 4) \
   { \
-    uint64_t t; \
-    memcpy(&t, b, sizeof(uint64_t)); \
+    uint64_t t[2]; \
+    memcpy(t, b, sizeof(uint64_t)); \
     \
     if (sizeof(v4_reg) == sizeof(uint32_t)) \
-      t ^= SWAP64LE((r[0] + r[1]) | ((uint64_t)(r[2] + r[3]) << 32)); \
+      t[0] ^= SWAP64LE((r[0] + r[1]) | ((uint64_t)(r[2] + r[3]) << 32)); \
     else \
-      t ^= SWAP64LE((r[0] + r[1]) ^ (r[2] + r[3])); \
+      t[0] ^= SWAP64LE((r[0] + r[1]) ^ (r[2] + r[3])); \
     \
-    memcpy(b, &t, sizeof(uint64_t)); \
+    memcpy(b, t, sizeof(uint64_t)); \
     \
     V4_REG_LOAD(r + 4, a); \
     V4_REG_LOAD(r + 5, (uint64_t*)(a) + 1); \
     V4_REG_LOAD(r + 6, _b); \
     V4_REG_LOAD(r + 7, _b1); \
+    V4_REG_LOAD(r + 8, (uint64_t*)(_b1) + 1); \
     \
+    if (jit) \
+      (*hp_jitfunc)(r); \
+    else \
     v4_random_math(code, r); \
+    \
+    memcpy(t, a, sizeof(uint64_t) * 2); \
+    \
+    if (sizeof(v4_reg) == sizeof(uint32_t)) { \
+      t[0] ^= SWAP64LE(r[2] | ((uint64_t)(r[3]) << 32)); \
+      t[1] ^= SWAP64LE(r[0] | ((uint64_t)(r[1]) << 32)); \
+    } else { \
+      t[0] ^= SWAP64LE(r[2] ^ r[3]); \
+      t[1] ^= SWAP64LE(r[0] ^ r[1]); \
+    } \
+    memcpy(a, t, sizeof(uint64_t) * 2); \
   } while (0)
 
 
@@ -318,7 +432,7 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex
 
 #define pre_aes() \
   j = state_index(a); \
-  _c = _mm_load_si128(R128(&hp_state[j])); \
+  _c = _mm_load_si128(R128(&local_hp_state[j])); \
   _a = _mm_load_si128(R128(a)); \
 
 /*
@@ -331,32 +445,26 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex
 * This code is based upon an optimized implementation by dga.
 */
 #define post_aes() \
-  VARIANT2_SHUFFLE_ADD_SSE2(hp_state, j); \
+  VARIANT2_SHUFFLE_ADD_SSE2(local_hp_state, j); \
   _mm_store_si128(R128(c), _c); \
-  _mm_store_si128(R128(&hp_state[j]), _mm_xor_si128(_b, _c)); \
-  VARIANT1_1(&hp_state[j]); \
+  _mm_store_si128(R128(&local_hp_state[j]), _mm_xor_si128(_b, _c)); \
+  VARIANT1_1(&local_hp_state[j]); \
   j = state_index(c); \
-  p = U64(&hp_state[j]); \
+  p = U64(&local_hp_state[j]); \
   b[0] = p[0]; b[1] = p[1]; \
   VARIANT2_INTEGER_MATH_SSE2(b, c); \
   VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \
   __mul(); \
   VARIANT2_2(); \
-  VARIANT2_SHUFFLE_ADD_SSE2(hp_state, j); \
+  VARIANT2_SHUFFLE_ADD_SSE2(local_hp_state, j); \
   a[0] += hi; a[1] += lo; \
-  p = U64(&hp_state[j]); \
+  p = U64(&local_hp_state[j]); \
   p[0] = a[0]; p[1] = a[1]; \
   a[0] ^= b[0]; a[1] ^= b[1]; \
   VARIANT1_2(p + 1); \
   _b1 = _b; \
   _b = _c; \
 
-#if defined(_MSC_VER)
-#define THREADV __declspec(thread)
-#else
-#define THREADV __thread
-#endif
-
 #pragma pack(push, 1)
 union cn_slow_hash_state
 {
@@ -371,6 +479,9 @@ union cn_slow_hash_state
 
 THREADV uint8_t *hp_state = NULL;
 THREADV int hp_allocated = 0;
+THREADV v4_random_math_JIT_func hp_jitfunc = NULL;
+THREADV uint8_t *hp_jitfunc_memory = NULL;
+THREADV int hp_jitfunc_allocated = 0;
 
 #if defined(_MSC_VER)
 #define cpuid(info,x) __cpuidex(info,x,0)
@@ -409,25 +520,6 @@ STATIC INLINE void xor64(uint64_t *a, const uint64_t b)
 * @return true if the CPU supports AES, false otherwise
 */
 
-STATIC INLINE int force_software_aes(void)
-{
-  static int use = -1;
-
-  if (use != -1)
-    return use;
-
-  const char *env = getenv("MONERO_USE_SOFTWARE_AES");
-  if (!env) {
-    use = 0;
-  }
-  else if (!strcmp(env, "0") || !strcmp(env, "no")) {
-    use = 0;
-  }
-  else {
-    use = 1;
-  }
-  return use;
-}
 
 STATIC INLINE int check_aes_hw(void)
 {
@@ -666,10 +758,10 @@ void cn_slow_hash_allocate_state(void)
 #if defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \
   defined(__DragonFly__) || defined(__NetBSD__)
     hp_state = mmap(0, MEMORY, PROT_READ | PROT_WRITE,
-      MAP_PRIVATE | MAP_ANON, 0, 0);
+      MAP_PRIVATE | MAP_ANON, -1, 0);
 #else
     hp_state = mmap(0, MEMORY, PROT_READ | PROT_WRITE,
-      MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, 0, 0);
+      MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 #endif
     if(hp_state == MAP_FAILED)
       hp_state = NULL;
@@ -680,6 +772,35 @@ void cn_slow_hash_allocate_state(void)
     hp_allocated = 0;
     hp_state = (uint8_t *) malloc(MEMORY);
   }
 
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+  hp_jitfunc_memory = (uint8_t *) VirtualAlloc(hp_jitfunc_memory, 4096 + 4095,
+    MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
+#else
+#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \
+  defined(__DragonFly__) || defined(__NetBSD__)
+#ifdef __NetBSD__
+#define RESERVED_FLAGS PROT_MPROTECT(PROT_EXEC)
+#else
+#define RESERVED_FLAGS 0
+#endif
+  hp_jitfunc_memory = mmap(0, 4096 + 4096, PROT_READ | PROT_WRITE | RESERVED_FLAGS,
+    MAP_PRIVATE | MAP_ANON, -1, 0);
+#else
+  hp_jitfunc_memory = mmap(0, 4096 + 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
+    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+#endif
+  if(hp_jitfunc_memory == MAP_FAILED)
+    hp_jitfunc_memory = NULL;
+#endif
+  hp_jitfunc_allocated = 1;
+  if (hp_jitfunc_memory == NULL)
+  {
+    hp_jitfunc_allocated = 0;
+    hp_jitfunc_memory = malloc(4096 + 4095);
+  }
+
+  hp_jitfunc = (v4_random_math_JIT_func)((size_t)(hp_jitfunc_memory + 4095) & ~4095);
 }
 
 /**
@@ -702,8 +823,22 @@ void cn_slow_hash_free_state(void)
 #endif
   }
 
+  if(!hp_jitfunc_allocated)
+    free(hp_jitfunc_memory);
+  else
+  {
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    VirtualFree(hp_jitfunc_memory, 0, MEM_RELEASE);
+#else
+    munmap(hp_jitfunc_memory, 4096 + 4095);
+#endif
+  }
+
   hp_state = NULL;
   hp_allocated = 0;
+  hp_jitfunc = NULL;
+  hp_jitfunc_memory = NULL;
+  hp_jitfunc_allocated = 0;
 }
 
 /**
@@ -787,7 +922,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
     for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
     {
       aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK);
-      memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
+      memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
     }
   }
   else
@@ -799,7 +934,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
       for(j = 0; j < INIT_SIZE_BLK; j++)
         aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
 
-      memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
+      memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
     }
   }
 
@@ -847,7 +982,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
     for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
     {
       // add the xor to the pseudo round
-      aes_pseudo_round_xor(text, text, expandedKey, &hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
+      aes_pseudo_round_xor(text, text, expandedKey, &local_hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
     }
   }
   else
@@ -857,7 +992,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
   {
     for(j = 0; j < INIT_SIZE_BLK; j++)
     {
-      xor_blocks(&text[j * AES_BLOCK_SIZE], &hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
+      xor_blocks(&text[j * AES_BLOCK_SIZE], &local_hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
       aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
     }
   }
@@ -877,6 +1012,44 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
 }
 
 #elif !defined NO_AES && (defined(__arm__) || defined(__aarch64__))
+#ifdef __aarch64__
+#include <sys/mman.h>
+THREADV uint8_t *hp_state = NULL;
+THREADV int hp_malloced = 0;
+
+void cn_slow_hash_allocate_state(void)
+{
+  if(hp_state != NULL)
+    return;
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0
+#endif
+  hp_state = mmap(0, MEMORY, PROT_READ | PROT_WRITE,
+    MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0);
+
+  if(hp_state == MAP_FAILED)
+    hp_state = NULL;
+  if(hp_state == NULL)
+  {
+    hp_malloced = 1;
+    hp_state = (uint8_t *) malloc(MEMORY);
+  }
+}
+
+void cn_slow_hash_free_state(void)
+{
+  if(hp_state == NULL)
+    return;
+
+  if (hp_malloced)
+    free(hp_state);
+  else
+    munmap(hp_state, MEMORY);
+  hp_state = NULL;
+  hp_malloced = 0;
+}
+#else
 void cn_slow_hash_allocate_state(void)
 {
   // Do nothing, this is just to maintain compatibility with the upgraded slow-hash.c
@@ -888,6 +1061,7 @@ void cn_slow_hash_free_state(void)
   // As above
   return;
 }
+#endif
 
 #if defined(__GNUC__)
 #define RDATA_ALIGN16 __attribute__ ((aligned(16)))
@@ -901,6 +1075,8 @@ void cn_slow_hash_free_state(void)
 
 #define U64(x) ((uint64_t *) (x))
 
+#define hp_jitfunc ((v4_random_math_JIT_func)NULL)
+
 STATIC INLINE void xor64(uint64_t *a, const uint64_t b)
 {
   *a ^= b;
@@ -926,6 +1102,23 @@ union cn_slow_hash_state
 * and moving between vector and regular registers stalls the pipeline.
 */
 #include <arm_neon.h>
+#ifndef __APPLE__
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+#endif
+
+STATIC INLINE int check_aes_hw(void)
+{
+#ifdef __APPLE__
+  return 1;
+#else
+  static int supported = -1;
+
+  if(supported < 0)
+    supported = (getauxval(AT_HWCAP) & HWCAP_AES) != 0;
+  return supported;
+#endif
+}
 
 #define TOTALBLOCKS (MEMORY / AES_BLOCK_SIZE)
 
@@ -935,24 +1128,24 @@ union cn_slow_hash_state
 
 #define pre_aes() \
   j = state_index(a); \
-  _c = vld1q_u8(&hp_state[j]); \
+  _c = vld1q_u8(&local_hp_state[j]); \
   _a = vld1q_u8((const uint8_t *)a); \
 
 #define post_aes() \
-  VARIANT2_SHUFFLE_ADD_NEON(hp_state, j); \
+  VARIANT2_SHUFFLE_ADD_NEON(local_hp_state, j); \
   vst1q_u8((uint8_t *)c, _c); \
-  vst1q_u8(&hp_state[j], veorq_u8(_b, _c)); \
-  VARIANT1_1(&hp_state[j]); \
+  vst1q_u8(&local_hp_state[j], veorq_u8(_b, _c)); \
+  VARIANT1_1(&local_hp_state[j]); \
   j = state_index(c); \
-  p = U64(&hp_state[j]); \
+  p = U64(&local_hp_state[j]); \
   b[0] = p[0]; b[1] = p[1]; \
   VARIANT2_PORTABLE_INTEGER_MATH(b, c); \
   VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \
   __mul(); \
   VARIANT2_2(); \
-  VARIANT2_SHUFFLE_ADD_NEON(hp_state, j); \
+  VARIANT2_SHUFFLE_ADD_NEON(local_hp_state, j); \
   a[0] += hi; a[1] += lo; \
-  p = U64(&hp_state[j]); \
+  p = U64(&local_hp_state[j]); \
   p[0] = a[0]; p[1] = a[1]; \
   a[0] ^= b[0]; a[1] ^= b[1]; \
   VARIANT1_2(p + 1); \
@@ -1022,7 +1215,6 @@ __asm__(
 STATIC INLINE void aes_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey, int nblocks)
 {
   const uint8x16_t *k = (const uint8x16_t *)expandedKey, zero = {0};
-  uint8x16_t tmp;
   int i;
 
   for (i=0; i<nblocks; i++)
@@ -1057,7 +1249,6 @@ STATIC INLINE void aes_pseudo_round_xor(const uint8_t *in, uint8_t *out, const u
 {
   const uint8x16_t *k = (const uint8x16_t *)expandedKey;
   const uint8x16_t *x = (const uint8x16_t *)xor;
-  uint8x16_t tmp;
   int i;
 
   for (i=0; i<nblocks; i++)
@@ -1110,16 +1301,17 @@ STATIC INLINE void aligned_free(void *ptr)
 }
 #endif /* FORCE_USE_HEAP */
 
+STATIC INLINE void xor_blocks(uint8_t* a, const uint8_t* b)
+{
+  U64(a)[0] ^= U64(b)[0];
+  U64(a)[1] ^= U64(b)[1];
+}
+
 void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height)
 {
   RDATA_ALIGN16 uint8_t expandedKey[240];
-
-#ifndef FORCE_USE_HEAP
-  RDATA_ALIGN16 uint8_t hp_state[MEMORY];
-#else
-  uint8_t *hp_state = (uint8_t *)aligned_malloc(MEMORY,16);
-#endif
+  uint8_t *local_hp_state;
 
   uint8_t text[INIT_SIZE_BYTE];
   RDATA_ALIGN16 uint64_t a[2];
   RDATA_ALIGN16 uint64_t b[4];
@@ -1130,12 +1322,22 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
 
   size_t i, j;
   uint64_t *p = NULL;
+  oaes_ctx *aes_ctx = NULL;
+  int useAes = !force_software_aes() && check_aes_hw();
+
   static void (*const extra_hashes[4])(const void *, size_t, char *) =
   {
     hash_extra_blake, hash_extra_groestl, hash_extra_jh, hash_extra_skein
   };
 
+  // this isn't supposed to happen, but guard against it for now.
+  if(hp_state == NULL)
+    cn_slow_hash_allocate_state();
+
+  // locals to avoid constant TLS dereferencing
+  local_hp_state = hp_state;
+
+  // locals to avoid constant TLS dereferencing
 /* CryptoNight Step 1: Use Keccak1600 to initialize the 'state' (and 'text') buffers from the data. */
 
   if (prehashed) {
@@ -1153,11 +1355,26 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
   * the 2MB large random access buffer.
   */
 
+  if(useAes)
+  {
     aes_expand_key(state.hs.b, expandedKey);
     for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
     {
       aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK);
-      memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
+      memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
+    }
+  }
+  else
+  {
+    aes_ctx = (oaes_ctx *) oaes_alloc();
+    oaes_key_import_data(aes_ctx, state.hs.b, AES_KEY_SIZE);
+    for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
+    {
+      for(j = 0; j < INIT_SIZE_BLK; j++)
+        aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
+
+      memcpy(&local_hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
+    }
   }
 
   U64(a)[0] = U64(&state.k[0])[0] ^ U64(&state.k[32])[0];
@@ -1173,6 +1390,8 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
   _b = vld1q_u8((const uint8_t *)b);
   _b1 = vld1q_u8(((const uint8_t *)b) + AES_BLOCK_SIZE);
 
+  if(useAes)
+  {
   for(i = 0; i < ITER / 2; i++)
   {
     pre_aes();
@@ -1181,6 +1400,17 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
     _c = veorq_u8(_c, _a);
     post_aes();
   }
+  }
+  else
+  {
+    for(i = 0; i < ITER / 2; i++)
+    {
+      pre_aes();
+      aesb_single_round((uint8_t *) &_c, (uint8_t *) &_c, (uint8_t *) &_a);
+      post_aes();
+    }
+
+  }
 
 /* CryptoNight Step 4: Sequentially pass through the mixing buffer and use 10 rounds
 * of AES encryption to mix the random data back into the 'text' buffer. 'text'
@@ -1188,11 +1418,27 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
 
   memcpy(text, state.init, INIT_SIZE_BYTE);
 
+  if(useAes)
+  {
     aes_expand_key(&state.hs.b[32], expandedKey);
     for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
     {
       // add the xor to the pseudo round
-      aes_pseudo_round_xor(text, text, expandedKey, &hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
+      aes_pseudo_round_xor(text, text, expandedKey, &local_hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
+    }
+  }
+  else
+  {
+    oaes_key_import_data(aes_ctx, &state.hs.b[32], AES_KEY_SIZE);
+    for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
+    {
+      for(j = 0; j < INIT_SIZE_BLK; j++)
+      {
+        xor_blocks(&text[j * AES_BLOCK_SIZE], &local_hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
+        aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
+      }
+    }
+    oaes_free((OAES_CTX **) &aes_ctx);
   }
 
 /* CryptoNight Step 5: Apply Keccak to the state again, and then
@@ -1205,10 +1451,6 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
   memcpy(state.init, text, INIT_SIZE_BYTE);
   hash_permutation(&state.hs);
   extra_hashes[state.hs.b[0] & 3](&state, 200, hash);
 
-#ifdef FORCE_USE_HEAP
-  aligned_free(hp_state);
-#endif
 }
 #else /* aarch64 && crypto */
@@ -1330,6 +1572,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
 {
   uint8_t text[INIT_SIZE_BYTE];
   uint8_t a[AES_BLOCK_SIZE];
+  uint8_t a1[AES_BLOCK_SIZE];
   uint8_t b[AES_BLOCK_SIZE * 2];
   uint8_t c[AES_BLOCK_SIZE];
   uint8_t c1[AES_BLOCK_SIZE];
|
@ -1389,10 +1632,10 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
|
||||||
// Iteration 1
|
// Iteration 1
|
||||||
j = state_index(a);
|
j = state_index(a);
|
||||||
p = &long_state[j];
|
p = &long_state[j];
|
||||||
aesb_single_round(p, p, a);
|
aesb_single_round(p, c1, a);
|
||||||
copy_block(c1, p);
|
|
||||||
|
|
||||||
VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
|
VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
|
||||||
|
copy_block(p, c1);
|
||||||
xor_blocks(p, b);
|
xor_blocks(p, b);
|
||||||
VARIANT1_1(p);
|
VARIANT1_1(p);
|
||||||
|
|
||||||
|
@@ -1401,14 +1644,15 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
     p = &long_state[j];
     copy_block(c, p);
 
+    copy_block(a1, a);
     VARIANT2_PORTABLE_INTEGER_MATH(c, c1);
-    VARIANT4_RANDOM_MATH(a, c, r, b, b + AES_BLOCK_SIZE);
+    VARIANT4_RANDOM_MATH(a1, c, r, b, b + AES_BLOCK_SIZE);
     mul(c1, c, d);
     VARIANT2_2_PORTABLE();
-    VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
-    sum_half_blocks(a, d);
-    swap_blocks(a, c);
-    xor_blocks(a, c);
+    VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
+    sum_half_blocks(a1, d);
+    swap_blocks(a1, c);
+    xor_blocks(a1, c);
     VARIANT1_2(U64(c) + 1);
     copy_block(p, c);
 
@@ -1416,6 +1660,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
       copy_block(b + AES_BLOCK_SIZE, b);
     }
     copy_block(b, c1);
+    copy_block(a, a1);
   }
 
   memcpy(text, state.init, INIT_SIZE_BYTE);
@@ -1443,7 +1688,9 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
 #else
 // Portable implementation as a fallback
 
-void slow_hash_allocate_state(void)
+#define hp_jitfunc ((v4_random_math_JIT_func)NULL)
+
+void cn_slow_hash_allocate_state(void)
 {
   // Do nothing, this is just to maintain compatibility with the upgraded slow-hash.c
   return;
@@ -1536,6 +1783,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
   union cn_slow_hash_state state;
   uint8_t text[INIT_SIZE_BYTE];
   uint8_t a[AES_BLOCK_SIZE];
+  uint8_t a1[AES_BLOCK_SIZE];
   uint8_t b[AES_BLOCK_SIZE * 2];
   uint8_t c1[AES_BLOCK_SIZE];
   uint8_t c2[AES_BLOCK_SIZE];
@@ -1579,7 +1827,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
     j = e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE;
     copy_block(c1, &long_state[j]);
     aesb_single_round(c1, c1, a);
-    VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
+    VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
     copy_block(&long_state[j], c1);
     xor_blocks(&long_state[j], b);
     assert(j == e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE);
@@ -1587,23 +1835,22 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
     /* Iteration 2 */
     j = e2i(c1, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE;
     copy_block(c2, &long_state[j]);
+    copy_block(a1, a);
     VARIANT2_PORTABLE_INTEGER_MATH(c2, c1);
-    VARIANT4_RANDOM_MATH(a, c2, r, b, b + AES_BLOCK_SIZE);
+    VARIANT4_RANDOM_MATH(a1, c2, r, b, b + AES_BLOCK_SIZE);
     mul(c1, c2, d);
     VARIANT2_2_PORTABLE();
-    VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
-    swap_blocks(a, c1);
-    sum_half_blocks(c1, d);
-    swap_blocks(c1, c2);
-    xor_blocks(c1, c2);
+    VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
+    sum_half_blocks(a1, d);
+    swap_blocks(a1, c2);
+    xor_blocks(a1, c2);
     VARIANT1_2(c2 + 8);
     copy_block(&long_state[j], c2);
-    assert(j == e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE);
     if (variant >= 2) {
       copy_block(b + AES_BLOCK_SIZE, b);
     }
-    copy_block(b, a);
-    copy_block(a, c1);
+    copy_block(b, c1);
+    copy_block(a, a1);
   }
 
   memcpy(text, state.init, INIT_SIZE_BYTE);
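Of the changes above, the portable VARIANT2_PORTABLE_SHUFFLE_ADD rework is the least obvious: the macro now receives the output block (out) and a separate copy of a (a_), and for variant >= 4 it folds the three neighbouring scratchpad chunks back into the output. Ignoring the SWAP64LE/memcpy_swap64le byte-order plumbing, the added mixing amounts to the sketch below (names are illustrative, not from the source):

#include <stdint.h>

// Illustration of the extra variant >= 4 step in VARIANT2_PORTABLE_SHUFFLE_ADD:
// the 16-byte output block is xored with the pre-shuffle values of the three
// neighbouring scratchpad chunks.
static void v4_shuffle_xor(uint64_t out[2],
                           const uint64_t chunk1_old[2],
                           const uint64_t chunk2_old[2],
                           const uint64_t chunk3_old[2])
{
    for (int i = 0; i < 2; i++)
        out[i] ^= chunk1_old[i] ^ chunk2_old[i] ^ chunk3_old[i];
}

The SSE2 and NEON variants of the same step xor chunk1 ^ chunk2 and chunk3 directly into the _c register instead.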
src/crypto/variant4_random_math.h
@@ -12,6 +12,12 @@ enum V4_Settings
   // Always generate at least 60 instructions
   NUM_INSTRUCTIONS = 60,
 
+  // Always generate at least 60 instructions
+  NUM_INSTRUCTIONS_MIN = 60,
+
+  // Never generate more than 70 instructions (final RET instruction doesn't count here)
+  NUM_INSTRUCTIONS_MAX = 70,
+
   // Available ALUs for MUL
   // Modern CPUs typically have only 1 ALU which can do multiplications
   ALU_COUNT_MUL = 1,