From 1aa786561972165ac4c510ec2bfdff747663e063 Mon Sep 17 00:00:00 2001 From: tevador Date: Tue, 14 May 2019 09:13:38 +0200 Subject: [PATCH] Abstracted away from x86 intrinsics --- src/aes_hash.cpp | 68 ++++----- src/allocator.cpp | 5 +- src/dataset.cpp | 2 +- src/instructions_portable.cpp | 44 +++--- src/intrin_portable.h | 271 ++++++++++++++++++---------------- src/soft_aes.cpp | 28 ++-- src/soft_aes.h | 12 +- src/virtual_machine.cpp | 10 +- src/vm_interpreted.cpp | 64 ++++---- src/vm_interpreted.hpp | 12 +- 10 files changed, 267 insertions(+), 249 deletions(-) diff --git a/src/aes_hash.cpp b/src/aes_hash.cpp index 9fbbb02..00ae5b0 100644 --- a/src/aes_hash.cpp +++ b/src/aes_hash.cpp @@ -36,21 +36,21 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) { const uint8_t* inptr = (uint8_t*)input; const uint8_t* inputEnd = inptr + inputSize; - __m128i state0, state1, state2, state3; - __m128i in0, in1, in2, in3; + rx_vec_i128 state0, state1, state2, state3; + rx_vec_i128 in0, in1, in2, in3; //intial state - state0 = _mm_set_epi32(0x8d3126fd, 0x1146d167, 0x887af5ab, 0xc4778e00); - state1 = _mm_set_epi32(0x19fe9fa1, 0x58da632b, 0x1b95af89, 0xb834ef4b); - state2 = _mm_set_epi32(0x1bb2cd74, 0xc35ad744, 0xab283a00, 0x7742dd3a); - state3 = _mm_set_epi32(0xbb30a58a, 0x49593c57, 0xdc5d97cc, 0xe18b449a); + state0 = rx_set_int_vec_i128(0x8d3126fd, 0x1146d167, 0x887af5ab, 0xc4778e00); + state1 = rx_set_int_vec_i128(0x19fe9fa1, 0x58da632b, 0x1b95af89, 0xb834ef4b); + state2 = rx_set_int_vec_i128(0x1bb2cd74, 0xc35ad744, 0xab283a00, 0x7742dd3a); + state3 = rx_set_int_vec_i128(0xbb30a58a, 0x49593c57, 0xdc5d97cc, 0xe18b449a); //process 64 bytes at a time in 4 lanes while (inptr < inputEnd) { - in0 = _mm_load_si128((__m128i*)inptr + 0); - in1 = _mm_load_si128((__m128i*)inptr + 1); - in2 = _mm_load_si128((__m128i*)inptr + 2); - in3 = _mm_load_si128((__m128i*)inptr + 3); + in0 = rx_load_vec_i128((rx_vec_i128*)inptr + 0); + in1 = rx_load_vec_i128((rx_vec_i128*)inptr + 1); + in2 = rx_load_vec_i128((rx_vec_i128*)inptr + 2); + in3 = rx_load_vec_i128((rx_vec_i128*)inptr + 3); state0 = aesenc(state0, in0); state1 = aesdec(state1, in1); @@ -61,8 +61,8 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) { } //two extra rounds to achieve full diffusion - __m128i xkey0 = _mm_set_epi32(0x83951283, 0xe4c5593d, 0x2a5a929c, 0x11cbf247); - __m128i xkey1 = _mm_set_epi32(0xff215bb2, 0xabbc2523, 0x477bef0b, 0xce816c95); + rx_vec_i128 xkey0 = rx_set_int_vec_i128(0x83951283, 0xe4c5593d, 0x2a5a929c, 0x11cbf247); + rx_vec_i128 xkey1 = rx_set_int_vec_i128(0xff215bb2, 0xabbc2523, 0x477bef0b, 0xce816c95); state0 = aesenc(state0, xkey0); state1 = aesdec(state1, xkey0); @@ -75,10 +75,10 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) { state3 = aesdec(state3, xkey1); //output hash - _mm_store_si128((__m128i*)hash + 0, state0); - _mm_store_si128((__m128i*)hash + 1, state1); - _mm_store_si128((__m128i*)hash + 2, state2); - _mm_store_si128((__m128i*)hash + 3, state3); + rx_store_vec_i128((rx_vec_i128*)hash + 0, state0); + rx_store_vec_i128((rx_vec_i128*)hash + 1, state1); + rx_store_vec_i128((rx_vec_i128*)hash + 2, state2); + rx_store_vec_i128((rx_vec_i128*)hash + 3, state3); } template void hashAes1Rx4(const void *input, size_t inputSize, void *hash); @@ -99,18 +99,18 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) { const uint8_t* outptr = (uint8_t*)buffer; const uint8_t* outputEnd = outptr + outputSize; - __m128i state0, state1, state2, state3; - __m128i key0, key1, key2, key3; + rx_vec_i128 state0, state1, state2, state3; + rx_vec_i128 key0, key1, key2, key3; - key0 = _mm_set_epi32(0xdf20a2e3, 0xca329132, 0x454ff6d5, 0x84eeec2d); - key1 = _mm_set_epi32(0x1deb5971, 0xfed0387f, 0xf10fc578, 0x017b63d0); - key2 = _mm_set_epi32(0xdfc926b3, 0xa517ceb4, 0x2f2c70a1, 0x327d7a52); - key3 = _mm_set_epi32(0x341cf31c, 0xa0ece0a9, 0x3d17da5e, 0x5c8d77d3); + key0 = rx_set_int_vec_i128(0xdf20a2e3, 0xca329132, 0x454ff6d5, 0x84eeec2d); + key1 = rx_set_int_vec_i128(0x1deb5971, 0xfed0387f, 0xf10fc578, 0x017b63d0); + key2 = rx_set_int_vec_i128(0xdfc926b3, 0xa517ceb4, 0x2f2c70a1, 0x327d7a52); + key3 = rx_set_int_vec_i128(0x341cf31c, 0xa0ece0a9, 0x3d17da5e, 0x5c8d77d3); - state0 = _mm_load_si128((__m128i*)state + 0); - state1 = _mm_load_si128((__m128i*)state + 1); - state2 = _mm_load_si128((__m128i*)state + 2); - state3 = _mm_load_si128((__m128i*)state + 3); + state0 = rx_load_vec_i128((rx_vec_i128*)state + 0); + state1 = rx_load_vec_i128((rx_vec_i128*)state + 1); + state2 = rx_load_vec_i128((rx_vec_i128*)state + 2); + state3 = rx_load_vec_i128((rx_vec_i128*)state + 3); while (outptr < outputEnd) { state0 = aesdec(state0, key0); @@ -118,18 +118,18 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) { state2 = aesdec(state2, key2); state3 = aesenc(state3, key3); - _mm_store_si128((__m128i*)outptr + 0, state0); - _mm_store_si128((__m128i*)outptr + 1, state1); - _mm_store_si128((__m128i*)outptr + 2, state2); - _mm_store_si128((__m128i*)outptr + 3, state3); + rx_store_vec_i128((rx_vec_i128*)outptr + 0, state0); + rx_store_vec_i128((rx_vec_i128*)outptr + 1, state1); + rx_store_vec_i128((rx_vec_i128*)outptr + 2, state2); + rx_store_vec_i128((rx_vec_i128*)outptr + 3, state3); outptr += 64; } - _mm_store_si128((__m128i*)state + 0, state0); - _mm_store_si128((__m128i*)state + 1, state1); - _mm_store_si128((__m128i*)state + 2, state2); - _mm_store_si128((__m128i*)state + 3, state3); + rx_store_vec_i128((rx_vec_i128*)state + 0, state0); + rx_store_vec_i128((rx_vec_i128*)state + 1, state1); + rx_store_vec_i128((rx_vec_i128*)state + 2, state2); + rx_store_vec_i128((rx_vec_i128*)state + 3, state3); } template void fillAes1Rx4(void *state, size_t outputSize, void *buffer); diff --git a/src/allocator.cpp b/src/allocator.cpp index 2c344a8..1ce65c5 100644 --- a/src/allocator.cpp +++ b/src/allocator.cpp @@ -27,7 +27,7 @@ namespace randomx { template void* AlignedAllocator::allocMemory(size_t count) { - void *mem = _mm_malloc(count, alignment); + void *mem = rx_aligned_alloc(count, alignment); if (mem == nullptr) throw std::bad_alloc(); return mem; @@ -35,11 +35,10 @@ namespace randomx { template void AlignedAllocator::freeMemory(void* ptr, size_t count) { - _mm_free(ptr); + rx_aligned_free(ptr); } template class AlignedAllocator; - template class AlignedAllocator;; void* LargePageAllocator::allocMemory(size_t count) { return allocLargePagesMemory(count); diff --git a/src/dataset.cpp b/src/dataset.cpp index 819e855..210589d 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -148,7 +148,7 @@ namespace randomx { rl[7] = rl[0] ^ superscalarAdd7; for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { mixBlock = getMixBlock(registerValue, cache->memory); - PREFETCHNTA(mixBlock); + rx_prefetch_nta(mixBlock); SuperscalarProgram& prog = cache->programs[i]; executeSuperscalar(rl, prog, &cache->reciprocalCache); diff --git a/src/instructions_portable.cpp b/src/instructions_portable.cpp index 32b6c4f..283aa1d 100644 --- a/src/instructions_portable.cpp +++ b/src/instructions_portable.cpp @@ -123,32 +123,32 @@ along with RandomX. If not, see. #define HAVE_SMULH #endif -void setRoundMode(uint32_t rcflag) { - switch (rcflag & 3) { - case RoundDown: - setRoundMode_(FE_DOWNWARD); - break; - case RoundUp: - setRoundMode_(FE_UPWARD); - break; - case RoundToZero: - setRoundMode_(FE_TOWARDZERO); - break; - case RoundToNearest: - setRoundMode_(FE_TONEAREST); - break; - default: - UNREACHABLE; +#ifdef RANDOMX_DEFAULT_FENV + +void rx_reset_float_state() { + setRoundMode_(FE_TONEAREST); +} + +void rx_set_rounding_mode(uint32_t mode) { + switch (mode & 3) { + case RoundDown: + setRoundMode_(FE_DOWNWARD); + break; + case RoundUp: + setRoundMode_(FE_UPWARD); + break; + case RoundToZero: + setRoundMode_(FE_TOWARDZERO); + break; + case RoundToNearest: + setRoundMode_(FE_TONEAREST); + break; + default: + UNREACHABLE; } } -void initFpu() { -#ifdef __SSE2__ - _mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled -#else - setRoundMode(FE_TONEAREST); #endif -} union double_ser_t { double f; diff --git a/src/intrin_portable.h b/src/intrin_portable.h index 4364610..265ef8b 100644 --- a/src/intrin_portable.h +++ b/src/intrin_portable.h @@ -20,6 +20,7 @@ along with RandomX. If not, see. #pragma once #include +#include "blake2/endian.h" constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) { return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x); @@ -33,6 +34,11 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) { return (-1 == ~0) ? (int64_t)(int32_t)(x) : (x > INT32_MAX ? (x | 0xffffffff00000000ULL) : (uint64_t)x); } +constexpr int RoundToNearest = 0; +constexpr int RoundDown = 1; +constexpr int RoundUp = 2; +constexpr int RoundToZero = 3; + #if defined(_MSC_VER) #if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2) #define __SSE2__ 1 @@ -46,185 +52,230 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) { #include #endif -#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) +typedef __m128i rx_vec_i128; +typedef __m128d rx_vec_f128; + +#define rx_aligned_alloc(a, b) _mm_malloc(a,b) +#define rx_aligned_free(a) _mm_free(a) +#define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) + +#define rx_load_vec_f128 _mm_load_pd +#define rx_store_vec_f128 _mm_store_pd +#define rx_shuffle_vec_f128 _mm_shuffle_pd +#define rx_add_vec_f128 _mm_add_pd +#define rx_sub_vec_f128 _mm_sub_pd +#define rx_mul_vec_f128 _mm_mul_pd +#define rx_div_vec_f128 _mm_div_pd +#define rx_sqrt_vec_f128 _mm_sqrt_pd +#define rx_set1_long_vec_i128 _mm_set1_epi64x +#define rx_vec_i128_vec_f128 _mm_castsi128_pd + +FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) { + return _mm_castsi128_pd(_mm_set_epi64x(x1, x0)); +} + +FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) { + return _mm_castsi128_pd(_mm_set1_epi64x(x)); +} + +#define rx_xor_vec_f128 _mm_xor_pd +#define rx_and_vec_f128 _mm_and_pd +#define rx_or_vec_f128 _mm_or_pd +#define rx_aesenc_vec_i128 _mm_aesenc_si128 +#define rx_aesdec_vec_i128 _mm_aesdec_si128 + +FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) { + return _mm_cvtsi128_si32(a); +} + +FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) { + return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); +} + +FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) { + return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xaa)); +} + +FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) { + return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xff)); +} + +#define rx_set_int_vec_i128 _mm_set_epi32 +#define rx_xor_vec_i128 _mm_xor_si128 +#define rx_load_vec_i128 _mm_load_si128 +#define rx_store_vec_i128 _mm_store_si128 + +FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { + __m128i ix = _mm_loadl_epi64((const __m128i*)addr); + return _mm_cvtepi32_pd(ix); +} + +constexpr uint32_t rx_mxcsr_default = 0x9FC0; //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled + +FORCE_INLINE void rx_reset_float_state() { + _mm_setcsr(rx_mxcsr_default); +} + +FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) { + _mm_setcsr(rx_mxcsr_default | (mode << 13)); +} #else #include #include #include #include -#include "blake2/endian.h" - -#define _mm_malloc(a,b) malloc(a) -#define _mm_free(a) free(a) -#define PREFETCHNTA(x) typedef union { uint64_t u64[2]; uint32_t u32[4]; uint16_t u16[8]; uint8_t u8[16]; -} __m128i; +} rx_vec_i128; typedef union { struct { double lo; double hi; }; - __m128i i; -} __m128d; + rx_vec_i128 i; +} rx_vec_f128; -inline __m128d _mm_load_pd(const double* pd) { - __m128d x; +#define rx_aligned_alloc(a, b) malloc(a) +#define rx_aligned_free(a) free(a) +#define rx_prefetch_nta(x) + +FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { + rx_vec_f128 x; x.i.u64[0] = load64(pd + 0); x.i.u64[1] = load64(pd + 1); return x; } -inline void _mm_store_pd(double* mem_addr, __m128d a) { +FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) { store64(mem_addr + 0, a.i.u64[0]); store64(mem_addr + 1, a.i.u64[1]); } -inline __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm8) { - __m128d x; +FORCE_INLINE rx_vec_f128 rx_shuffle_vec_f128(rx_vec_f128 a, rx_vec_f128 b, int imm8) { + rx_vec_f128 x; x.lo = (imm8 & 1) ? a.hi : a.lo; x.hi = (imm8 & 2) ? b.hi : b.lo; return x; } -inline __m128d _mm_add_pd(__m128d a, __m128d b) { - __m128d x; +FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + rx_vec_f128 x; x.lo = a.lo + b.lo; x.hi = a.hi + b.hi; return x; } -inline __m128d _mm_sub_pd(__m128d a, __m128d b) { - __m128d x; +FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + rx_vec_f128 x; x.lo = a.lo - b.lo; x.hi = a.hi - b.hi; return x; } -inline __m128d _mm_mul_pd(__m128d a, __m128d b) { - __m128d x; +FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + rx_vec_f128 x; x.lo = a.lo * b.lo; x.hi = a.hi * b.hi; return x; } -inline __m128d _mm_div_pd(__m128d a, __m128d b) { - __m128d x; +FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + rx_vec_f128 x; x.lo = a.lo / b.lo; x.hi = a.hi / b.hi; return x; } -inline __m128d _mm_sqrt_pd(__m128d a) { - __m128d x; +FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) { + rx_vec_f128 x; x.lo = sqrt(a.lo); x.hi = sqrt(a.hi); return x; } -inline __m128i _mm_set1_epi64x(uint64_t a) { - __m128i x; +FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) { + rx_vec_i128 x; x.u64[0] = a; x.u64[1] = a; return x; } -inline __m128d _mm_castsi128_pd(__m128i a) { - __m128d x; +FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) { + rx_vec_f128 x; x.i = a; return x; } -inline __m128d _mm_abs(__m128d xd) { - xd.lo = std::fabs(xd.lo); - xd.hi = std::fabs(xd.hi); - return xd; +FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) { + rx_vec_f128 v; + v.i.u64[0] = x0; + v.i.u64[1] = x1; + return v; } -inline __m128d _mm_xor_pd(__m128d a, __m128d b) { - __m128d x; +FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) { + rx_vec_f128 v; + v.i.u64[0] = x; + v.i.u64[1] = x; + return v; +} + + +FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + rx_vec_f128 x; x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0]; x.i.u64[1] = a.i.u64[1] ^ b.i.u64[1]; return x; } -inline __m128d _mm_and_pd(__m128d a, __m128d b) { - __m128d x; +FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + rx_vec_f128 x; x.i.u64[0] = a.i.u64[0] & b.i.u64[0]; x.i.u64[1] = a.i.u64[1] & b.i.u64[1]; return x; } -inline __m128d _mm_or_pd(__m128d a, __m128d b) { - __m128d x; +FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + rx_vec_f128 x; x.i.u64[0] = a.i.u64[0] | b.i.u64[0]; x.i.u64[1] = a.i.u64[1] | b.i.u64[1]; return x; } -inline __m128d _mm_set_pd(double e1, double e0) { - __m128d x; - x.lo = e0; - x.hi = e1; - return x; -} - -inline __m128d _mm_max_pd(__m128d a, __m128d b) { - __m128d x; - x.lo = a.lo > b.lo ? a.lo : b.lo; - x.hi = a.hi > b.hi ? a.hi : b.hi; - return x; -} - -inline __m128d _mm_cvtepi32_pd(__m128i a) { - __m128d x; - x.lo = (double)unsigned32ToSigned2sCompl(a.u32[0]); - x.hi = (double)unsigned32ToSigned2sCompl(a.u32[1]); - return x; -} - static const char* platformError = "Platform doesn't support hardware AES"; -inline __m128i _mm_aeskeygenassist_si128(__m128i key, uint8_t rcon) { +FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) { throw std::runtime_error(platformError); } -inline __m128i _mm_aesenc_si128(__m128i v, __m128i rkey) { +FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) { throw std::runtime_error(platformError); } -inline __m128i _mm_aesdec_si128(__m128i v, __m128i rkey) { - throw std::runtime_error(platformError); +FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) { + return a.u32[0]; } -inline int _mm_cvtsi128_si32(__m128i v) { - return v.u32[0]; +FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) { + return a.u32[1]; } -inline __m128i _mm_cvtsi32_si128(int si32) { - __m128i v; - v.u32[0] = si32; - v.u32[1] = 0; - v.u32[2] = 0; - v.u32[3] = 0; - return v; +FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) { + return a.u32[2]; } -inline __m128i _mm_set_epi64x(int64_t _I1, int64_t _I0) { - __m128i v; - v.u64[0] = _I0; - v.u64[1] = _I1; - return v; +FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) { + return a.u32[3]; } -inline __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0) { - __m128i v; +FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) { + rx_vec_i128 v; v.u32[0] = _I0; v.u32[1] = _I1; v.u32[2] = _I2; @@ -232,8 +283,8 @@ inline __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0) { return v; }; -inline __m128i _mm_xor_si128(__m128i _A, __m128i _B) { - __m128i c; +FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 _A, rx_vec_i128 _B) { + rx_vec_i128 c; c.u32[0] = _A.u32[0] ^ _B.u32[0]; c.u32[1] = _A.u32[1] ^ _B.u32[1]; c.u32[2] = _A.u32[2] ^ _B.u32[2]; @@ -241,21 +292,12 @@ inline __m128i _mm_xor_si128(__m128i _A, __m128i _B) { return c; } -inline __m128i _mm_shuffle_epi32(__m128i _A, int _Imm) { - __m128i c; - c.u32[0] = _A.u32[_Imm & 3]; - c.u32[1] = _A.u32[(_Imm >> 2) & 3]; - c.u32[2] = _A.u32[(_Imm >> 4) & 3]; - c.u32[3] = _A.u32[(_Imm >> 6) & 3]; - return c; -} - -inline __m128i _mm_load_si128(__m128i const*_P) { +FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const*_P) { #if defined(NATIVE_LITTLE_ENDIAN) return *_P; #else uint32_t* ptr = (uint32_t*)_P; - __m128i c; + rx_vec_i128 c; c.u32[0] = load32(ptr + 0); c.u32[1] = load32(ptr + 1); c.u32[2] = load32(ptr + 2); @@ -264,7 +306,7 @@ inline __m128i _mm_load_si128(__m128i const*_P) { #endif } -inline void _mm_store_si128(__m128i *_P, __m128i _B) { +FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *_P, rx_vec_i128 _B) { #if defined(NATIVE_LITTLE_ENDIAN) *_P = _B; #else @@ -276,46 +318,23 @@ inline void _mm_store_si128(__m128i *_P, __m128i _B) { #endif } -inline __m128i _mm_slli_si128(__m128i _A, int _Imm) { - _Imm &= 255; - if (_Imm > 15) { - _A.u64[0] = 0; - _A.u64[1] = 0; - } - else { - for (int i = 15; i >= _Imm; --i) { - _A.u8[i] = _A.u8[i - _Imm]; - } - for (int i = 0; i < _Imm; ++i) { - _A.u8[i] = 0; - } - } - return _A; -} - -inline __m128i _mm_loadl_epi64(__m128i const* mem_addr) { - __m128i x; - x.u32[0] = load32((uint8_t*)mem_addr + 0); - x.u32[1] = load32((uint8_t*)mem_addr + 4); +FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { + rx_vec_f128 x; + x.lo = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0)); + x.hi = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4)); return x; } +#define RANDOMX_DEFAULT_FENV + +void rx_reset_float_state(); + +void rx_set_rounding_mode(uint32_t mode); + #endif -constexpr int RoundToNearest = 0; -constexpr int RoundDown = 1; -constexpr int RoundUp = 2; -constexpr int RoundToZero = 3; - -inline __m128d load_cvt_i32x2(const void* addr) { - __m128i ix = _mm_loadl_epi64((const __m128i*)addr); - return _mm_cvtepi32_pd(ix); -} - double loadDoublePortable(const void* addr); uint64_t mulh(uint64_t, uint64_t); int64_t smulh(int64_t, int64_t); uint64_t rotl(uint64_t, int); uint64_t rotr(uint64_t, int); -void initFpu(); -void setRoundMode(uint32_t); diff --git a/src/soft_aes.cpp b/src/soft_aes.cpp index 81dc66b..decacb7 100644 --- a/src/soft_aes.cpp +++ b/src/soft_aes.cpp @@ -318,38 +318,38 @@ alignas(16) const uint32_t lutDec3[256] = { 0x397101a8, 0x08deb30c, 0xd89ce4b4, 0x6490c156, 0x7b6184cb, 0xd570b632, 0x48745c6c, 0xd04257b8, }; -__m128i soft_aesenc(__m128i in, __m128i key) { +rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key) { uint32_t s0, s1, s2, s3; - s0 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xff)); - s1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xaa)); - s2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55)); - s3 = _mm_cvtsi128_si32(in); + s0 = rx_vec_i128_w(in); + s1 = rx_vec_i128_z(in); + s2 = rx_vec_i128_y(in); + s3 = rx_vec_i128_x(in); - __m128i out = _mm_set_epi32( + rx_vec_i128 out = rx_set_int_vec_i128( (lutEnc0[s0 & 0xff] ^ lutEnc1[(s3 >> 8) & 0xff] ^ lutEnc2[(s2 >> 16) & 0xff] ^ lutEnc3[s1 >> 24]), (lutEnc0[s1 & 0xff] ^ lutEnc1[(s0 >> 8) & 0xff] ^ lutEnc2[(s3 >> 16) & 0xff] ^ lutEnc3[s2 >> 24]), (lutEnc0[s2 & 0xff] ^ lutEnc1[(s1 >> 8) & 0xff] ^ lutEnc2[(s0 >> 16) & 0xff] ^ lutEnc3[s3 >> 24]), (lutEnc0[s3 & 0xff] ^ lutEnc1[(s2 >> 8) & 0xff] ^ lutEnc2[(s1 >> 16) & 0xff] ^ lutEnc3[s0 >> 24]) ); - return _mm_xor_si128(out, key); + return rx_xor_vec_i128(out, key); } -__m128i soft_aesdec(__m128i in, __m128i key) { +rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key) { uint32_t s0, s1, s2, s3; - s0 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xff)); - s1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xaa)); - s2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55)); - s3 = _mm_cvtsi128_si32(in); + s0 = rx_vec_i128_w(in); + s1 = rx_vec_i128_z(in); + s2 = rx_vec_i128_y(in); + s3 = rx_vec_i128_x(in); - __m128i out = _mm_set_epi32( + rx_vec_i128 out = rx_set_int_vec_i128( (lutDec0[s0 & 0xff] ^ lutDec1[(s1 >> 8) & 0xff] ^ lutDec2[(s2 >> 16) & 0xff] ^ lutDec3[s3 >> 24]), (lutDec0[s1 & 0xff] ^ lutDec1[(s2 >> 8) & 0xff] ^ lutDec2[(s3 >> 16) & 0xff] ^ lutDec3[s0 >> 24]), (lutDec0[s2 & 0xff] ^ lutDec1[(s3 >> 8) & 0xff] ^ lutDec2[(s0 >> 16) & 0xff] ^ lutDec3[s1 >> 24]), (lutDec0[s3 & 0xff] ^ lutDec1[(s0 >> 8) & 0xff] ^ lutDec2[(s1 >> 16) & 0xff] ^ lutDec3[s2 >> 24]) ); - return _mm_xor_si128(out, key); + return rx_xor_vec_i128(out, key); } diff --git a/src/soft_aes.h b/src/soft_aes.h index 12e9ce4..213b975 100644 --- a/src/soft_aes.h +++ b/src/soft_aes.h @@ -22,16 +22,16 @@ along with RandomX. If not, see. #include #include "intrin_portable.h" -__m128i soft_aesenc(__m128i in, __m128i key); +rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key); -__m128i soft_aesdec(__m128i in, __m128i key); +rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key); template -inline __m128i aesenc(__m128i in, __m128i key) { - return soft ? soft_aesenc(in, key) : _mm_aesenc_si128(in, key); +inline rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key) { + return soft ? soft_aesenc(in, key) : rx_aesenc_vec_i128(in, key); } template -inline __m128i aesdec(__m128i in, __m128i key) { - return soft ? soft_aesdec(in, key) : _mm_aesdec_si128(in, key); +inline rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key) { + return soft ? soft_aesdec(in, key) : rx_aesdec_vec_i128(in, key); } \ No newline at end of file diff --git a/src/virtual_machine.cpp b/src/virtual_machine.cpp index 086d438..bb8e6f6 100644 --- a/src/virtual_machine.cpp +++ b/src/virtual_machine.cpp @@ -32,7 +32,7 @@ randomx_vm::~randomx_vm() { } void randomx_vm::resetRoundingMode() { - initFpu(); + rx_reset_float_state(); } namespace randomx { @@ -86,7 +86,7 @@ void randomx_vm::initialize() { namespace randomx { - alignas(16) volatile static __m128i aesDummy; + alignas(16) volatile static rx_vec_i128 aesDummy; template VmBase::~VmBase() { @@ -98,9 +98,9 @@ namespace randomx { if (datasetPtr == nullptr) throw std::invalid_argument("Cache/Dataset not set"); if (!softAes) { //if hardware AES is not supported, it's better to fail now than to return a ticking bomb - __m128i tmp = _mm_load_si128((const __m128i*)&aesDummy); - tmp = _mm_aesenc_si128(tmp, tmp); - _mm_store_si128((__m128i*)&aesDummy, tmp); + rx_vec_i128 tmp = rx_load_vec_i128((const rx_vec_i128*)&aesDummy); + tmp = rx_aesenc_vec_i128(tmp, tmp); + rx_store_vec_i128((rx_vec_i128*)&aesDummy, tmp); } scratchpad = (uint8_t*)Allocator::allocMemory(ScratchpadSize); } diff --git a/src/vm_interpreted.cpp b/src/vm_interpreted.cpp index ea85f60..9c9c4aa 100644 --- a/src/vm_interpreted.cpp +++ b/src/vm_interpreted.cpp @@ -46,7 +46,7 @@ namespace randomx { } template - void InterpretedVm::executeBytecode(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) { + void InterpretedVm::executeBytecode(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) { for (int pc = 0; pc < RANDOMX_PROGRAM_SIZE; ++pc) { executeBytecode(pc, r, f, e, a); } @@ -59,16 +59,16 @@ namespace randomx { } template - FORCE_INLINE __m128d InterpretedVm::maskRegisterExponentMantissa(__m128d x) { - const __m128d xmantissaMask = _mm_castsi128_pd(_mm_set_epi64x(dynamicMantissaMask, dynamicMantissaMask)); - const __m128d xexponentMask = _mm_load_pd((const double*)&config.eMask); - x = _mm_and_pd(x, xmantissaMask); - x = _mm_or_pd(x, xexponentMask); + FORCE_INLINE rx_vec_f128 InterpretedVm::maskRegisterExponentMantissa(rx_vec_f128 x) { + const rx_vec_f128 xmantissaMask = rx_set_vec_f128(dynamicMantissaMask, dynamicMantissaMask); + const rx_vec_f128 xexponentMask = rx_load_vec_f128((const double*)&config.eMask); + x = rx_and_vec_f128(x, xmantissaMask); + x = rx_or_vec_f128(x, xexponentMask); return x; } template - void InterpretedVm::executeBytecode(int& pc, int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) { + void InterpretedVm::executeBytecode(int& pc, int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) { auto& ibc = byteCode[pc]; switch (ibc.type) { @@ -139,43 +139,43 @@ namespace randomx { } break; case InstructionType::FSWAP_R: { - *ibc.fdst = _mm_shuffle_pd(*ibc.fdst, *ibc.fdst, 1); + *ibc.fdst = rx_shuffle_vec_f128(*ibc.fdst, *ibc.fdst, 1); } break; case InstructionType::FADD_R: { - *ibc.fdst = _mm_add_pd(*ibc.fdst, *ibc.fsrc); + *ibc.fdst = rx_add_vec_f128(*ibc.fdst, *ibc.fsrc); } break; case InstructionType::FADD_M: { - __m128d fsrc = load_cvt_i32x2(getScratchpadAddress(ibc)); - *ibc.fdst = _mm_add_pd(*ibc.fdst, fsrc); + rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc)); + *ibc.fdst = rx_add_vec_f128(*ibc.fdst, fsrc); } break; case InstructionType::FSUB_R: { - *ibc.fdst = _mm_sub_pd(*ibc.fdst, *ibc.fsrc); + *ibc.fdst = rx_sub_vec_f128(*ibc.fdst, *ibc.fsrc); } break; case InstructionType::FSUB_M: { - __m128d fsrc = load_cvt_i32x2(getScratchpadAddress(ibc)); - *ibc.fdst = _mm_sub_pd(*ibc.fdst, fsrc); + rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc)); + *ibc.fdst = rx_sub_vec_f128(*ibc.fdst, fsrc); } break; case InstructionType::FSCAL_R: { - const __m128d mask = _mm_castsi128_pd(_mm_set1_epi64x(0x81F0000000000000)); - *ibc.fdst = _mm_xor_pd(*ibc.fdst, mask); + const rx_vec_f128 mask = rx_set1_vec_f128(0x81F0000000000000); + *ibc.fdst = rx_xor_vec_f128(*ibc.fdst, mask); } break; case InstructionType::FMUL_R: { - *ibc.fdst = _mm_mul_pd(*ibc.fdst, *ibc.fsrc); + *ibc.fdst = rx_mul_vec_f128(*ibc.fdst, *ibc.fsrc); } break; case InstructionType::FDIV_M: { - __m128d fsrc = maskRegisterExponentMantissa(load_cvt_i32x2(getScratchpadAddress(ibc))); - *ibc.fdst = _mm_div_pd(*ibc.fdst, fsrc); + rx_vec_f128 fsrc = maskRegisterExponentMantissa(rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc))); + *ibc.fdst = rx_div_vec_f128(*ibc.fdst, fsrc); } break; case InstructionType::FSQRT_R: { - *ibc.fdst = _mm_sqrt_pd(*ibc.fdst); + *ibc.fdst = rx_sqrt_vec_f128(*ibc.fdst); } break; case InstructionType::CBRANCH: { @@ -186,7 +186,7 @@ namespace randomx { } break; case InstructionType::CFROUND: { - setRoundMode(rotr(*ibc.isrc, ibc.imm) % 4); + rx_set_rounding_mode(rotr(*ibc.isrc, ibc.imm) % 4); } break; case InstructionType::ISTORE: { @@ -205,12 +205,12 @@ namespace randomx { template void InterpretedVm::execute() { int_reg_t r[RegistersCount] = { 0 }; - __m128d f[RegisterCountFlt]; - __m128d e[RegisterCountFlt]; - __m128d a[RegisterCountFlt]; + rx_vec_f128 f[RegisterCountFlt]; + rx_vec_f128 e[RegisterCountFlt]; + rx_vec_f128 a[RegisterCountFlt]; for(unsigned i = 0; i < RegisterCountFlt; ++i) - a[i] = _mm_load_pd(®.a[i].lo); + a[i] = rx_load_vec_f128(®.a[i].lo); precompileProgram(r, f, e, a); @@ -228,10 +228,10 @@ namespace randomx { r[i] ^= load64(scratchpad + spAddr0 + 8 * i); for (unsigned i = 0; i < RegisterCountFlt; ++i) - f[i] = load_cvt_i32x2(scratchpad + spAddr1 + 8 * i); + f[i] = rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * i); for (unsigned i = 0; i < RegisterCountFlt; ++i) - e[i] = maskRegisterExponentMantissa(load_cvt_i32x2(scratchpad + spAddr1 + 8 * (RegisterCountFlt + i))); + e[i] = maskRegisterExponentMantissa(rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * (RegisterCountFlt + i))); executeBytecode(r, f, e, a); @@ -244,10 +244,10 @@ namespace randomx { store64(scratchpad + spAddr1 + 8 * i, r[i]); for (unsigned i = 0; i < RegisterCountFlt; ++i) - f[i] = _mm_xor_pd(f[i], e[i]); + f[i] = rx_xor_vec_f128(f[i], e[i]); for (unsigned i = 0; i < RegisterCountFlt; ++i) - _mm_store_pd((double*)(scratchpad + spAddr0 + 16 * i), f[i]); + rx_store_vec_f128((double*)(scratchpad + spAddr0 + 16 * i), f[i]); spAddr0 = 0; spAddr1 = 0; @@ -257,10 +257,10 @@ namespace randomx { store64(®.r[i], r[i]); for (unsigned i = 0; i < RegisterCountFlt; ++i) - _mm_store_pd(®.f[i].lo, f[i]); + rx_store_vec_f128(®.f[i].lo, f[i]); for (unsigned i = 0; i < RegisterCountFlt; ++i) - _mm_store_pd(®.e[i].lo, e[i]); + rx_store_vec_f128(®.e[i].lo, e[i]); } template @@ -273,7 +273,7 @@ namespace randomx { #include "instruction_weights.hpp" template - void InterpretedVm::precompileProgram(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) { + void InterpretedVm::precompileProgram(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) { RegisterUsage registerUsage[RegistersCount]; for (unsigned i = 0; i < RegistersCount; ++i) { registerUsage[i].lastUsed = -1; diff --git a/src/vm_interpreted.hpp b/src/vm_interpreted.hpp index eafb216..718b2c8 100644 --- a/src/vm_interpreted.hpp +++ b/src/vm_interpreted.hpp @@ -31,11 +31,11 @@ namespace randomx { struct InstructionByteCode { union { int_reg_t* idst; - __m128d* fdst; + rx_vec_f128* fdst; }; union { int_reg_t* isrc; - __m128d* fsrc; + rx_vec_f128* fsrc; }; union { uint64_t imm; @@ -74,11 +74,11 @@ namespace randomx { virtual void datasetRead(uint32_t blockNumber, int_reg_t(&r)[RegistersCount]); private: void execute(); - void precompileProgram(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]); - void executeBytecode(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]); - void executeBytecode(int& i, int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]); + void precompileProgram(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]); + void executeBytecode(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]); + void executeBytecode(int& i, int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]); void* getScratchpadAddress(InstructionByteCode& ibc); - __m128d maskRegisterExponentMantissa(__m128d); + rx_vec_f128 maskRegisterExponentMantissa(rx_vec_f128); InstructionByteCode byteCode[RANDOMX_PROGRAM_SIZE]; };