From bf34d27ecdcb1e387ca76938824715951c2e2bbd Mon Sep 17 00:00:00 2001 From: tevador Date: Mon, 18 Feb 2019 08:56:37 +0100 Subject: [PATCH 1/3] Portable SSE2 intrinsics --- src/intrinPortable.h | 140 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 122 insertions(+), 18 deletions(-) diff --git a/src/intrinPortable.h b/src/intrinPortable.h index 2c2e487..c3ee8c7 100644 --- a/src/intrinPortable.h +++ b/src/intrinPortable.h @@ -21,6 +21,18 @@ along with RandomX. If not, see. #include +constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) { + return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x); +} + +constexpr int64_t unsigned64ToSigned2sCompl(uint64_t x) { + return (-1 == ~0) ? (int64_t)x : (x > INT64_MAX ? (-(int64_t)(UINT64_MAX - x) - 1) : (int64_t)x); +} + +constexpr uint64_t signExtend2sCompl(uint32_t x) { + return (-1 == ~0) ? (int64_t)(int32_t)(x) : (x > INT32_MAX ? (x | 0xffffffff00000000ULL) : (uint64_t)x); +} + #if defined(_MSC_VER) #if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2) #define __SSE2__ 1 @@ -44,6 +56,7 @@ inline __m128d _mm_abs(__m128d xd) { #else #include #include +#include #define _mm_malloc(a,b) malloc(a) #define _mm_free(a) free(a) @@ -56,15 +69,112 @@ typedef union { uint8_t u8[16]; } __m128i; -typedef struct { - double lo; - double hi; +typedef union { + struct { + double lo; + double hi; + }; + __m128i i; } __m128d; inline __m128d _mm_load_pd(const double* pd) { __m128d x; - x.lo = *(pd + 0); - x.hi = *(pd + 1); + x.i.u64[0] = load64(pd + 0); + x.i.u64[1] = load64(pd + 1); + return x; +} + +inline void _mm_store_pd(double* mem_addr, __m128d a) { + store64(mem_addr + 0, a.i.u64[0]); + store64(mem_addr + 1, a.i.u64[1]); +} + +inline __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm8) { + __m128d x; + x.lo = (imm8 & 1) ? a.hi : a.lo; + x.hi = (imm8 & 2) ? b.hi : b.lo; + return x; +} + +inline __m128d _mm_add_pd(__m128d a, __m128d b) { + __m128d x; + x.lo = a.lo + b.lo; + x.hi = a.hi + b.hi; + return x; +} + +inline __m128d _mm_sub_pd(__m128d a, __m128d b) { + __m128d x; + x.lo = a.lo - b.lo; + x.hi = a.hi - b.hi; + return x; +} + +inline __m128d _mm_mul_pd(__m128d a, __m128d b) { + __m128d x; + x.lo = a.lo * b.lo; + x.hi = a.hi * b.hi; + return x; +} + +inline __m128d _mm_div_pd(__m128d a, __m128d b) { + __m128d x; + x.lo = a.lo / b.lo; + x.hi = a.hi / b.hi; + return x; +} + +inline __m128d _mm_sqrt_pd(__m128d a) { + __m128d x; + x.lo = sqrt(a.lo); + x.hi = sqrt(a.hi); + return x; +} + +inline __m128i _mm_set1_epi64x(uint64_t a) { + __m128i x; + x.u64[0] = a; + x.u64[1] = a; + return x; +} + +inline __m128d _mm_castsi128_pd(__m128i a) { + __m128d x; + x.i = a; + return x; +} + +inline __m128d _mm_abs(__m128d xd) { + xd.lo = std::abs(xd.lo); + xd.hi = std::abs(xd.hi); + return xd; +} + +inline __m128d _mm_xor_pd(__m128d a, __m128d b) { + __m128d x; + x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0]; + x.i.u64[1] = a.i.u64[1] ^ b.i.u64[1]; + return x; +} + +inline __m128d _mm_set_pd(double e1, double e0) { + __m128d x; + x.lo = e0; + x.hi = e1; + return x; +} + +inline __m128d _mm_max_pd(__m128d a, __m128d b) { + __m128d x; + x.lo = a.lo > b.lo ? a.lo : b.lo; + x.hi = a.hi > b.hi ? a.hi : b.hi; + return x; +} + +inline __m128d _mm_cvtepi32_pd(__m128i a) { + __m128d x; + x.lo = (double)unsigned32ToSigned2sCompl(a.u32[0]); + x.hi = (double)unsigned32ToSigned2sCompl(a.u32[1]); return x; } @@ -154,6 +264,12 @@ inline __m128i _mm_slli_si128(__m128i _A, int _Imm) { return _A; } +inline __m128i _mm_loadl_epi64(__m128i const* mem_addr) { + __m128i x; + x.u64[0] = load64(mem_addr); + return x; +} + #endif constexpr int RoundToNearest = 0; @@ -161,20 +277,8 @@ constexpr int RoundDown = 1; constexpr int RoundUp = 2; constexpr int RoundToZero = 3; -constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) { - return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x); -} - -constexpr int64_t unsigned64ToSigned2sCompl(uint64_t x) { - return (-1 == ~0) ? (int64_t)x : (x > INT64_MAX ? (-(int64_t)(UINT64_MAX - x) - 1) : (int64_t)x); -} - -constexpr uint64_t signExtend2sCompl(uint32_t x) { - return (-1 == ~0) ? (int64_t)(int32_t)(x) : (x > INT32_MAX ? (x | 0xffffffff00000000ULL) : (uint64_t)x); -} - inline __m128d load_cvt_i32x2(const void* addr) { - __m128i ix = _mm_load_si128((const __m128i*)addr); + __m128i ix = _mm_loadl_epi64((const __m128i*)addr); return _mm_cvtepi32_pd(ix); } From c5309fae9e0a7c650fcaf566ef6a37884967899f Mon Sep 17 00:00:00 2001 From: tevador Date: Mon, 18 Feb 2019 17:57:54 +0100 Subject: [PATCH 2/3] Fixed portable intrinsics compilation --- src/intrinPortable.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/intrinPortable.h b/src/intrinPortable.h index c3ee8c7..392482c 100644 --- a/src/intrinPortable.h +++ b/src/intrinPortable.h @@ -57,6 +57,8 @@ inline __m128d _mm_abs(__m128d xd) { #include #include #include +#include +#include "blake2/endian.h" #define _mm_malloc(a,b) malloc(a) #define _mm_free(a) free(a) @@ -145,8 +147,8 @@ inline __m128d _mm_castsi128_pd(__m128i a) { } inline __m128d _mm_abs(__m128d xd) { - xd.lo = std::abs(xd.lo); - xd.hi = std::abs(xd.hi); + xd.lo = std::fabs(xd.lo); + xd.hi = std::fabs(xd.hi); return xd; } From f930d5d4dc1df3b9102afab9c7c08d14340e3e6f Mon Sep 17 00:00:00 2001 From: tevador Date: Mon, 18 Feb 2019 22:09:20 +0100 Subject: [PATCH 3/3] Fixed a bug in FSWAP_R --- src/InterpretedVirtualMachine.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 50347f1..4872213 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -109,6 +109,7 @@ namespace RandomX { FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { auto& ibc = byteCode[i]; + if(trace) printState(r, f, e, a); switch (ibc.type) { case InstructionType::IADD_R: { @@ -268,7 +269,7 @@ namespace RandomX { UNREACHABLE; } if (trace) { - //std::cout << program(i); + std::cout << program(i); if(ibc.type < 20 || ibc.type == 31 || ibc.type == 32) print(*ibc.idst); else //if(ibc.type >= 20 && ibc.type <= 30) @@ -673,7 +674,10 @@ namespace RandomX { CASE_REP(FSWAP_R) { auto dst = instr.dst % RegistersCount; ibc.type = InstructionType::FSWAP_R; - ibc.fdst = &f[dst]; + if (dst < 4) + ibc.fdst = &f[dst]; + else + ibc.fdst = &e[dst - 4]; } break; CASE_REP(FADD_R) {