From afaeff6066cae1b6887d764838c2f3915c9f5ab5 Mon Sep 17 00:00:00 2001 From: tevador Date: Wed, 15 May 2019 13:23:50 +0200 Subject: [PATCH] Fixed incorrect sqrt results on 32bit targets using x87 math Cleaner implementation of FSWAP --- src/instructions_portable.cpp | 3 +++ src/intrin_portable.h | 41 +++++++++++++++++++++++++---------- src/vm_interpreted.cpp | 2 +- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/instructions_portable.cpp b/src/instructions_portable.cpp index 283aa1d..e8dfd62 100644 --- a/src/instructions_portable.cpp +++ b/src/instructions_portable.cpp @@ -127,6 +127,9 @@ along with RandomX. If not, see. void rx_reset_float_state() { setRoundMode_(FE_TONEAREST); +#ifdef RANDOMX_USE_X87 + _control87(_PC_53, _MCW_PC); //set x87 precision to 53 bits +#endif } void rx_set_rounding_mode(uint32_t mode) { diff --git a/src/intrin_portable.h b/src/intrin_portable.h index 265ef8b..7c972f5 100644 --- a/src/intrin_portable.h +++ b/src/intrin_portable.h @@ -39,10 +39,26 @@ constexpr int RoundDown = 1; constexpr int RoundUp = 2; constexpr int RoundToZero = 3; -#if defined(_MSC_VER) -#if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2) +//MSVC doesn't define __SSE2__, so we have to define it manually if SSE2 is available +#if !defined(__SSE2__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)) #define __SSE2__ 1 #endif + +//the library "sqrt" function provided by MSVC for x86 targets doesn't give +//the correct results, so we have to use inline assembly to call x87 fsqrt directly +#if defined(_M_IX86) && !defined(__SSE2__) +inline double __cdecl rx_sqrt(double x) { + __asm { + fld x + fsqrt + } +} +#define rx_sqrt rx_sqrt +#define RANDOMX_USE_X87 +#endif + +#if !defined(rx_sqrt) +#define rx_sqrt sqrt #endif #ifdef __SSE2__ @@ -61,14 +77,15 @@ typedef __m128d rx_vec_f128; #define rx_load_vec_f128 _mm_load_pd #define rx_store_vec_f128 _mm_store_pd -#define rx_shuffle_vec_f128 _mm_shuffle_pd #define rx_add_vec_f128 _mm_add_pd #define rx_sub_vec_f128 _mm_sub_pd #define rx_mul_vec_f128 _mm_mul_pd #define rx_div_vec_f128 _mm_div_pd #define rx_sqrt_vec_f128 _mm_sqrt_pd -#define rx_set1_long_vec_i128 _mm_set1_epi64x -#define rx_vec_i128_vec_f128 _mm_castsi128_pd + +FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) { + return _mm_shuffle_pd(a, a, 1); +} FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) { return _mm_castsi128_pd(_mm_set_epi64x(x1, x0)); @@ -157,11 +174,11 @@ FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) { store64(mem_addr + 1, a.i.u64[1]); } -FORCE_INLINE rx_vec_f128 rx_shuffle_vec_f128(rx_vec_f128 a, rx_vec_f128 b, int imm8) { - rx_vec_f128 x; - x.lo = (imm8 & 1) ? a.hi : a.lo; - x.hi = (imm8 & 2) ? b.hi : b.lo; - return x; +FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) { + double temp = a.hi; + a.hi = a.lo; + a.lo = temp; + return a; } FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { @@ -194,8 +211,8 @@ FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) { rx_vec_f128 x; - x.lo = sqrt(a.lo); - x.hi = sqrt(a.hi); + x.lo = rx_sqrt(a.lo); + x.hi = rx_sqrt(a.hi); return x; } diff --git a/src/vm_interpreted.cpp b/src/vm_interpreted.cpp index 9c9c4aa..fd7876f 100644 --- a/src/vm_interpreted.cpp +++ b/src/vm_interpreted.cpp @@ -139,7 +139,7 @@ namespace randomx { } break; case InstructionType::FSWAP_R: { - *ibc.fdst = rx_shuffle_vec_f128(*ibc.fdst, *ibc.fdst, 1); + *ibc.fdst = rx_swap_vec_f128(*ibc.fdst); } break; case InstructionType::FADD_R: {