Reworked conversion int -> float for register group E

This commit is contained in:
tevador 2019-02-24 14:48:07 +01:00
parent f3b114af88
commit 790b382eda
2 changed files with 66 additions and 17 deletions

View file

@ -29,6 +29,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include <cmath> #include <cmath>
#include <cfloat> #include <cfloat>
#include <thread> #include <thread>
#include <climits>
#include "intrinPortable.h" #include "intrinPortable.h"
#include "reciprocal.h" #include "reciprocal.h"
#ifdef STATS #ifdef STATS
@ -108,9 +109,13 @@ namespace RandomX {
} }
} }
static bool isDenormal(double x) {
return std::fpclassify(x) == FP_SUBNORMAL;
}
FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) {
auto& ibc = byteCode[i]; auto& ibc = byteCode[i];
if(trace) printState(r, f, e, a); //if(trace) printState(r, f, e, a);
switch (ibc.type) switch (ibc.type)
{ {
case InstructionType::IADD_R: { case InstructionType::IADD_R: {
@ -219,9 +224,8 @@ namespace RandomX {
} break; } break;
case InstructionType::FDIV_M: { case InstructionType::FDIV_M: {
__m128d fsrc = _mm_abs(load_cvt_i32x2(scratchpad + (*ibc.isrc & ibc.memMask))); __m128d fsrc = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + (*ibc.isrc & ibc.memMask)));
__m128d fdst = _mm_div_pd(*ibc.fdst, fsrc); *ibc.fdst = _mm_div_pd(*ibc.fdst, fsrc);
*ibc.fdst = _mm_max_pd(fdst, _mm_set_pd(DBL_MIN, DBL_MIN));
} break; } break;
case InstructionType::FSQRT_R: { case InstructionType::FSQRT_R: {
@ -258,6 +262,18 @@ namespace RandomX {
else //if(ibc.type >= 20 && ibc.type <= 30) else //if(ibc.type >= 20 && ibc.type <= 30)
print(0); print(0);
} }
#ifdef FPUCHECK
if (ibc.type >= 26 && ibc.type <= 30) {
double lo = *(((double*)ibc.fdst) + 0);
double hi = *(((double*)ibc.fdst) + 1);
if (lo <= 0 || hi <= 0) {
std::stringstream ss;
ss << "Underflow in operation " << ibc.type;
printState(r, f, e, a);
throw std::runtime_error(ss.str());
}
}
#endif
} }
void InterpretedVirtualMachine::execute() { void InterpretedVirtualMachine::execute() {
@ -304,10 +320,10 @@ namespace RandomX {
f[1] = load_cvt_i32x2(scratchpad + spAddr1 + 8); f[1] = load_cvt_i32x2(scratchpad + spAddr1 + 8);
f[2] = load_cvt_i32x2(scratchpad + spAddr1 + 16); f[2] = load_cvt_i32x2(scratchpad + spAddr1 + 16);
f[3] = load_cvt_i32x2(scratchpad + spAddr1 + 24); f[3] = load_cvt_i32x2(scratchpad + spAddr1 + 24);
e[0] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 32)); e[0] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 32));
e[1] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 40)); e[1] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 40));
e[2] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 48)); e[2] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 48));
e[3] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 56)); e[3] = ieee_set_exponent<-240>(load_cvt_i32x2(scratchpad + spAddr1 + 56));
if (trace) { if (trace) {
std::cout << "iteration " << std::dec << iter << std::endl; std::cout << "iteration " << std::dec << iter << std::endl;
@ -357,10 +373,22 @@ namespace RandomX {
store64(scratchpad + spAddr1 + 48, r[6]); store64(scratchpad + spAddr1 + 48, r[6]);
store64(scratchpad + spAddr1 + 56, r[7]); store64(scratchpad + spAddr1 + 56, r[7]);
f[0] = _mm_mul_pd(f[0], e[0]); f[0] = _mm_xor_pd(f[0], e[0]);
f[1] = _mm_mul_pd(f[1], e[1]); f[1] = _mm_xor_pd(f[1], e[1]);
f[2] = _mm_mul_pd(f[2], e[2]); f[2] = _mm_xor_pd(f[2], e[2]);
f[3] = _mm_mul_pd(f[3], e[3]); f[3] = _mm_xor_pd(f[3], e[3]);
#ifdef FPUCHECK
for(int i = 0; i < 4; ++i) {
double lo = *(((double*)&f[i]) + 0);
double hi = *(((double*)&f[i]) + 1);
if (isDenormal(lo) || isDenormal(hi)) {
std::stringstream ss;
ss << "Denormal f" << i;
throw std::runtime_error(ss.str());
}
}
#endif
_mm_store_pd((double*)(scratchpad + spAddr0 + 0), f[0]); _mm_store_pd((double*)(scratchpad + spAddr0 + 0), f[0]);
_mm_store_pd((double*)(scratchpad + spAddr0 + 16), f[1]); _mm_store_pd((double*)(scratchpad + spAddr0 + 16), f[1]);

View file

@ -46,11 +46,6 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) {
#include <intrin.h> #include <intrin.h>
#endif #endif
inline __m128d _mm_abs(__m128d xd) {
const __m128d absmask = _mm_castsi128_pd(_mm_set1_epi64x(~(1LL << 63)));
return _mm_and_pd(xd, absmask);
}
#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) #define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
#else #else
@ -159,6 +154,20 @@ inline __m128d _mm_xor_pd(__m128d a, __m128d b) {
return x; return x;
} }
inline __m128d _mm_and_pd(__m128d a, __m128d b) {
__m128d x;
x.i.u64[0] = a.i.u64[0] & b.i.u64[0];
x.i.u64[1] = a.i.u64[1] & b.i.u64[1];
return x;
}
inline __m128d _mm_or_pd(__m128d a, __m128d b) {
__m128d x;
x.i.u64[0] = a.i.u64[0] | b.i.u64[0];
x.i.u64[1] = a.i.u64[1] | b.i.u64[1];
return x;
}
inline __m128d _mm_set_pd(double e1, double e0) { inline __m128d _mm_set_pd(double e1, double e0) {
__m128d x; __m128d x;
x.lo = e0; x.lo = e0;
@ -302,6 +311,18 @@ inline __m128d load_cvt_i32x2(const void* addr) {
return _mm_cvtepi32_pd(ix); return _mm_cvtepi32_pd(ix);
} }
template<int E>
__m128d ieee_set_exponent(__m128d x) {
static_assert(E > -1023, "Invalid exponent value");
constexpr uint64_t mantissaMask64 = (1ULL << 52) - 1;
const __m128d mantissaMask = _mm_castsi128_pd(_mm_set_epi64x(mantissaMask64, mantissaMask64));
constexpr uint64_t exponent64 = (uint64_t)(E + 1023U) << 52;
const __m128d exponentMask = _mm_castsi128_pd(_mm_set_epi64x(exponent64, exponent64));
x = _mm_and_pd(x, mantissaMask);
x = _mm_or_pd(x, exponentMask);
return x;
}
double loadDoublePortable(const void* addr); double loadDoublePortable(const void* addr);
uint64_t mulh(uint64_t, uint64_t); uint64_t mulh(uint64_t, uint64_t);