Abstracted away from x86 intrinsics

2024-08-15 00:23:14 +00:00 · 2019-05-14 09:13:38 +02:00 · 2019-05-14 09:13:38 +02:00 · 1aa7865619
commit 1aa7865619
parent 3dd21ea93d
10 changed files with 267 additions and 249 deletions
--- a/src/aes_hash.cpp
+++ b/src/aes_hash.cpp
@ -36,21 +36,21 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
 	const uint8_t* inptr = (uint8_t*)input;
 	const uint8_t* inputEnd = inptr + inputSize;

-	__m128i state0, state1, state2, state3;
-	__m128i in0, in1, in2, in3;
+	rx_vec_i128 state0, state1, state2, state3;
+	rx_vec_i128 in0, in1, in2, in3;

 	//initial state
-	state0 = _mm_set_epi32(0x8d3126fd, 0x1146d167, 0x887af5ab, 0xc4778e00);
-	state1 = _mm_set_epi32(0x19fe9fa1, 0x58da632b, 0x1b95af89, 0xb834ef4b);
-	state2 = _mm_set_epi32(0x1bb2cd74, 0xc35ad744, 0xab283a00, 0x7742dd3a);
-	state3 = _mm_set_epi32(0xbb30a58a, 0x49593c57, 0xdc5d97cc, 0xe18b449a);
+	state0 = rx_set_int_vec_i128(0x8d3126fd, 0x1146d167, 0x887af5ab, 0xc4778e00);
+	state1 = rx_set_int_vec_i128(0x19fe9fa1, 0x58da632b, 0x1b95af89, 0xb834ef4b);
+	state2 = rx_set_int_vec_i128(0x1bb2cd74, 0xc35ad744, 0xab283a00, 0x7742dd3a);
+	state3 = rx_set_int_vec_i128(0xbb30a58a, 0x49593c57, 0xdc5d97cc, 0xe18b449a);

 	//process 64 bytes at a time in 4 lanes
 	while (inptr < inputEnd) {
-		in0 = _mm_load_si128((__m128i*)inptr + 0);
-		in1 = _mm_load_si128((__m128i*)inptr + 1);
-		in2 = _mm_load_si128((__m128i*)inptr + 2);
-		in3 = _mm_load_si128((__m128i*)inptr + 3);
+		in0 = rx_load_vec_i128((rx_vec_i128*)inptr + 0);
+		in1 = rx_load_vec_i128((rx_vec_i128*)inptr + 1);
+		in2 = rx_load_vec_i128((rx_vec_i128*)inptr + 2);
+		in3 = rx_load_vec_i128((rx_vec_i128*)inptr + 3);

 		state0 = aesenc<softAes>(state0, in0);
 		state1 = aesdec<softAes>(state1, in1);
@ -61,8 +61,8 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
 	}

 	//two extra rounds to achieve full diffusion
-	__m128i xkey0 = _mm_set_epi32(0x83951283, 0xe4c5593d, 0x2a5a929c, 0x11cbf247);
-	__m128i xkey1 = _mm_set_epi32(0xff215bb2, 0xabbc2523, 0x477bef0b, 0xce816c95);
+	rx_vec_i128 xkey0 = rx_set_int_vec_i128(0x83951283, 0xe4c5593d, 0x2a5a929c, 0x11cbf247);
+	rx_vec_i128 xkey1 = rx_set_int_vec_i128(0xff215bb2, 0xabbc2523, 0x477bef0b, 0xce816c95);

 	state0 = aesenc<softAes>(state0, xkey0);
 	state1 = aesdec<softAes>(state1, xkey0);
@ -75,10 +75,10 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
 	state3 = aesdec<softAes>(state3, xkey1);

 	//output hash
-	_mm_store_si128((__m128i*)hash + 0, state0);
-	_mm_store_si128((__m128i*)hash + 1, state1);
-	_mm_store_si128((__m128i*)hash + 2, state2);
-	_mm_store_si128((__m128i*)hash + 3, state3);
+	rx_store_vec_i128((rx_vec_i128*)hash + 0, state0);
+	rx_store_vec_i128((rx_vec_i128*)hash + 1, state1);
+	rx_store_vec_i128((rx_vec_i128*)hash + 2, state2);
+	rx_store_vec_i128((rx_vec_i128*)hash + 3, state3);
 }

 template void hashAes1Rx4<false>(const void *input, size_t inputSize, void *hash);
@ -99,18 +99,18 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) {
 	const uint8_t* outptr = (uint8_t*)buffer;
 	const uint8_t* outputEnd = outptr + outputSize;

-	__m128i state0, state1, state2, state3;
-	__m128i key0, key1, key2, key3;
+	rx_vec_i128 state0, state1, state2, state3;
+	rx_vec_i128 key0, key1, key2, key3;

-	key0 = _mm_set_epi32(0xdf20a2e3, 0xca329132, 0x454ff6d5, 0x84eeec2d);
-	key1 = _mm_set_epi32(0x1deb5971, 0xfed0387f, 0xf10fc578, 0x017b63d0);
-	key2 = _mm_set_epi32(0xdfc926b3, 0xa517ceb4, 0x2f2c70a1, 0x327d7a52);
-	key3 = _mm_set_epi32(0x341cf31c, 0xa0ece0a9, 0x3d17da5e, 0x5c8d77d3);
+	key0 = rx_set_int_vec_i128(0xdf20a2e3, 0xca329132, 0x454ff6d5, 0x84eeec2d);
+	key1 = rx_set_int_vec_i128(0x1deb5971, 0xfed0387f, 0xf10fc578, 0x017b63d0);
+	key2 = rx_set_int_vec_i128(0xdfc926b3, 0xa517ceb4, 0x2f2c70a1, 0x327d7a52);
+	key3 = rx_set_int_vec_i128(0x341cf31c, 0xa0ece0a9, 0x3d17da5e, 0x5c8d77d3);

-	state0 = _mm_load_si128((__m128i*)state + 0);
-	state1 = _mm_load_si128((__m128i*)state + 1);
-	state2 = _mm_load_si128((__m128i*)state + 2);
-	state3 = _mm_load_si128((__m128i*)state + 3);
+	state0 = rx_load_vec_i128((rx_vec_i128*)state + 0);
+	state1 = rx_load_vec_i128((rx_vec_i128*)state + 1);
+	state2 = rx_load_vec_i128((rx_vec_i128*)state + 2);
+	state3 = rx_load_vec_i128((rx_vec_i128*)state + 3);

 	while (outptr < outputEnd) {
 		state0 = aesdec<softAes>(state0, key0);
@ -118,18 +118,18 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) {
 		state2 = aesdec<softAes>(state2, key2);
 		state3 = aesenc<softAes>(state3, key3);

-		_mm_store_si128((__m128i*)outptr + 0, state0);
-		_mm_store_si128((__m128i*)outptr + 1, state1);
-		_mm_store_si128((__m128i*)outptr + 2, state2);
-		_mm_store_si128((__m128i*)outptr + 3, state3);
+		rx_store_vec_i128((rx_vec_i128*)outptr + 0, state0);
+		rx_store_vec_i128((rx_vec_i128*)outptr + 1, state1);
+		rx_store_vec_i128((rx_vec_i128*)outptr + 2, state2);
+		rx_store_vec_i128((rx_vec_i128*)outptr + 3, state3);

 		outptr += 64;
 	}

-	_mm_store_si128((__m128i*)state + 0, state0);
-	_mm_store_si128((__m128i*)state + 1, state1);
-	_mm_store_si128((__m128i*)state + 2, state2);
-	_mm_store_si128((__m128i*)state + 3, state3);
+	rx_store_vec_i128((rx_vec_i128*)state + 0, state0);
+	rx_store_vec_i128((rx_vec_i128*)state + 1, state1);
+	rx_store_vec_i128((rx_vec_i128*)state + 2, state2);
+	rx_store_vec_i128((rx_vec_i128*)state + 3, state3);
 }

 template void fillAes1Rx4<true>(void *state, size_t outputSize, void *buffer);
--- a/src/allocator.cpp
+++ b/src/allocator.cpp
@ -27,7 +27,7 @@ namespace randomx {

 	template<size_t alignment>
 	void* AlignedAllocator<alignment>::allocMemory(size_t count) {
-		void *mem = _mm_malloc(count, alignment);
+		void *mem = rx_aligned_alloc(count, alignment);
 		if (mem == nullptr)
 			throw std::bad_alloc();
 		return mem;
@ -35,11 +35,10 @@ namespace randomx {

 	template<size_t alignment>
 	void AlignedAllocator<alignment>::freeMemory(void* ptr, size_t count) { // releases ptr through rx_aligned_free; count is not used in this body
-		_mm_free(ptr);
+		rx_aligned_free(ptr); // must pair with the rx_aligned_alloc call made in allocMemory
 	}

 	template class AlignedAllocator<CacheLineSize>;
-	template class AlignedAllocator<sizeof(__m128i)>;;

 	void* LargePageAllocator::allocMemory(size_t count) {
 		return allocLargePagesMemory(count);
--- a/src/dataset.cpp
+++ b/src/dataset.cpp
@ -148,7 +148,7 @@ namespace randomx {
 		rl[7] = rl[0] ^ superscalarAdd7;
 		for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
 			mixBlock = getMixBlock(registerValue, cache->memory);
-			PREFETCHNTA(mixBlock);
+			rx_prefetch_nta(mixBlock);
 			SuperscalarProgram& prog = cache->programs[i];

 			executeSuperscalar(rl, prog, &cache->reciprocalCache);
--- a/src/instructions_portable.cpp
+++ b/src/instructions_portable.cpp
@ -123,8 +123,14 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 	#define HAVE_SMULH
 #endif

-void setRoundMode(uint32_t rcflag) {
-	switch (rcflag & 3) {
+#ifdef RANDOMX_DEFAULT_FENV
+
+void rx_reset_float_state() { // fenv-based fallback, compiled only when RANDOMX_DEFAULT_FENV is defined (the #else branch of intrin_portable.h)
+	setRoundMode_(FE_TONEAREST); // restores round-to-nearest; setRoundMode_ is defined outside the lines shown here
+}
+
+void rx_set_rounding_mode(uint32_t mode) {
+	switch (mode & 3) {
 	case RoundDown:
 		setRoundMode_(FE_DOWNWARD);
 		break;
@ -142,13 +148,7 @@ void setRoundMode(uint32_t rcflag) {
 	}
 }

-void initFpu() {
-#ifdef __SSE2__
-	_mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
-#else
-	setRoundMode(FE_TONEAREST);
 #endif
-}

 union double_ser_t {
 	double f;
--- a/src/intrin_portable.h
+++ b/src/intrin_portable.h
@ -20,6 +20,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #pragma once

 #include <cstdint>
+#include "blake2/endian.h"

 constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) {
 	return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x);
@ -33,6 +34,11 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) {
 	return (-1 == ~0) ? (int64_t)(int32_t)(x) : (x > INT32_MAX ? (x | 0xffffffff00000000ULL) : (uint64_t)x);
 }

+constexpr int RoundToNearest = 0; // rounding-mode selector values; the fenv fallback of rx_set_rounding_mode switches on (mode & 3) against these
+constexpr int RoundDown = 1; // handled in that fallback by setRoundMode_(FE_DOWNWARD)
+constexpr int RoundUp = 2;
+constexpr int RoundToZero = 3;
+
 #if defined(_MSC_VER)
 #if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)
 #define __SSE2__ 1
@ -46,185 +52,230 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) {
 #include <intrin.h>
 #endif

-#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
+typedef __m128i rx_vec_i128; // 128-bit integer vector; in this branch an alias of the compiler's __m128i
+typedef __m128d rx_vec_f128; // 128-bit vector of two doubles; in this branch an alias of the compiler's __m128d
+
+#define rx_aligned_alloc(a, b) _mm_malloc(a,b) // a = byte count, b = alignment (the order AlignedAllocator::allocMemory passes them)
+#define rx_aligned_free(a) _mm_free(a) // frees memory obtained from rx_aligned_alloc
+#define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) // prefetch request issued with the _MM_HINT_NTA hint
+
+#define rx_load_vec_f128 _mm_load_pd // this group maps each rx_* name straight onto an intrinsic; the #else branch of this header defines functions with the same names
+#define rx_store_vec_f128 _mm_store_pd
+#define rx_shuffle_vec_f128 _mm_shuffle_pd
+#define rx_add_vec_f128 _mm_add_pd
+#define rx_sub_vec_f128 _mm_sub_pd
+#define rx_mul_vec_f128 _mm_mul_pd
+#define rx_div_vec_f128 _mm_div_pd
+#define rx_sqrt_vec_f128 _mm_sqrt_pd
+#define rx_set1_long_vec_i128 _mm_set1_epi64x // same 64-bit value in both lanes of an integer vector
+#define rx_vec_i128_vec_f128 _mm_castsi128_pd // reinterprets an integer vector as a double vector (bit cast, no numeric conversion)
+
+FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) { // builds a double vector from two raw 64-bit bit patterns (no numeric conversion)
+	return _mm_castsi128_pd(_mm_set_epi64x(x1, x0)); // x0 is the low lane and x1 the high lane, matching the fallback that writes u64[0] = x0, u64[1] = x1
+}
+
+FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) { // double vector whose two lanes both hold the raw bit pattern x
+	return _mm_castsi128_pd(_mm_set1_epi64x(x)); // bit cast only; x is not converted to a floating-point value
+}
+
+#define rx_xor_vec_f128 _mm_xor_pd // bitwise operations on double vectors, aliased directly to intrinsics
+#define rx_and_vec_f128 _mm_and_pd
+#define rx_or_vec_f128 _mm_or_pd
+#define rx_aesenc_vec_i128 _mm_aesenc_si128 // hardware AES path chosen by aesenc<false> in soft_aes.h
+#define rx_aesdec_vec_i128 _mm_aesdec_si128 // hardware AES path chosen by aesdec<false> in soft_aes.h
+
+FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) { // returns 32-bit lane 0 of a (the fallback returns a.u32[0])
+	return _mm_cvtsi128_si32(a);
+}
+
+FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) { // returns 32-bit lane 1 of a (the fallback returns a.u32[1])
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); // 0x55 selects lane 1 for every output position, then the low lane is read
+}
+
+FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) { // returns 32-bit lane 2 of a (the fallback returns a.u32[2])
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xaa)); // 0xaa selects lane 2 for every output position, then the low lane is read
+}
+
+FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) { // returns 32-bit lane 3 of a (the fallback returns a.u32[3])
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xff)); // 0xff selects lane 3 for every output position, then the low lane is read
+}
+
+#define rx_set_int_vec_i128 _mm_set_epi32 // arguments run from the highest 32-bit lane to the lowest, as in the fallback (_I3, _I2, _I1, _I0)
+#define rx_xor_vec_i128 _mm_xor_si128
+#define rx_load_vec_i128 _mm_load_si128
+#define rx_store_vec_i128 _mm_store_si128
+
+FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { // reads 8 bytes at addr as two signed 32-bit integers and converts them to two doubles
+	__m128i ix = _mm_loadl_epi64((const __m128i*)addr); // loads only the low 64 bits of the vector
+	return _mm_cvtepi32_pd(ix); // converts the two low 32-bit lanes to doubles
+}
+
+constexpr uint32_t rx_mxcsr_default = 0x9FC0; //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
+
+FORCE_INLINE void rx_reset_float_state() { // writes rx_mxcsr_default (0x9FC0) to the MXCSR control register
+	_mm_setcsr(rx_mxcsr_default);
+}
+
+FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) { // sets the rounding mode by ORing mode, shifted to bit 13, into the default MXCSR value
+	_mm_setcsr(rx_mxcsr_default | (mode << 13)); // NOTE(review): mode is not masked here (the fenv fallback uses mode & 3); the interpreter call site passes a value reduced with % 4
+}

 #else
 #include <cstdint>
 #include <stdexcept>
 #include <cstdlib>
 #include <cmath>
-#include "blake2/endian.h"
-
-#define _mm_malloc(a,b) malloc(a)
-#define _mm_free(a) free(a)
-#define PREFETCHNTA(x)

 typedef union { // fallback 128-bit integer vector: the same 16 bytes viewed as 2x64-, 4x32-, 8x16- or 16x8-bit lanes
 	uint64_t u64[2];
 	uint32_t u32[4];
 	uint16_t u16[8];
 	uint8_t u8[16];
-} __m128i;
+} rx_vec_i128;

 typedef union { // fallback 128-bit double vector: two doubles overlaid on an rx_vec_i128 for bit-level access
 	struct {
 		double lo;
 		double hi;
 	};
-	__m128i i;
-} __m128d;
+	rx_vec_i128 i; // raw bit view used by the load/store and bitwise helpers in this branch
+} rx_vec_f128;

-inline __m128d _mm_load_pd(const double* pd) {
-	__m128d x;
+#define rx_aligned_alloc(a, b) malloc(a) // NOTE(review): the alignment argument b is ignored in this fallback; only malloc's default alignment applies
+#define rx_aligned_free(a) free(a) // pairs with the malloc-based rx_aligned_alloc
+#define rx_prefetch_nta(x) // expands to nothing in this branch
+
+FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { // fallback for _mm_load_pd: fills both lanes from pd[0] and pd[1]
+	rx_vec_f128 x;
 	x.i.u64[0] = load64(pd + 0); // goes through the raw 64-bit view; load64 is presumably from blake2/endian.h - confirm
 	x.i.u64[1] = load64(pd + 1);
 	return x;
 }

-inline void _mm_store_pd(double* mem_addr, __m128d a) {
+FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) { // fallback for _mm_store_pd: writes both lanes to mem_addr[0] and mem_addr[1]
 	store64(mem_addr + 0, a.i.u64[0]); // stores the raw 64-bit pattern of the low lane
 	store64(mem_addr + 1, a.i.u64[1]); // stores the raw 64-bit pattern of the high lane
 }

-inline __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm8) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_shuffle_vec_f128(rx_vec_f128 a, rx_vec_f128 b, int imm8) { // fallback for _mm_shuffle_pd
+	rx_vec_f128 x;
 	x.lo = (imm8 & 1) ? a.hi : a.lo; // bit 0 of imm8 picks which lane of a becomes the low result
 	x.hi = (imm8 & 2) ? b.hi : b.lo; // bit 1 of imm8 picks which lane of b becomes the high result
 	return x;
 }

-inline __m128d _mm_add_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { // lane-wise a + b on the two doubles
+	rx_vec_f128 x;
 	x.lo = a.lo + b.lo;
 	x.hi = a.hi + b.hi;
 	return x;
 }

-inline __m128d _mm_sub_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { // lane-wise a - b on the two doubles
+	rx_vec_f128 x;
 	x.lo = a.lo - b.lo;
 	x.hi = a.hi - b.hi;
 	return x;
 }

-inline __m128d _mm_mul_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { // lane-wise a * b on the two doubles
+	rx_vec_f128 x;
 	x.lo = a.lo * b.lo;
 	x.hi = a.hi * b.hi;
 	return x;
 }

-inline __m128d _mm_div_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { // lane-wise a / b on the two doubles
+	rx_vec_f128 x;
 	x.lo = a.lo / b.lo;
 	x.hi = a.hi / b.hi;
 	return x;
 }

-inline __m128d _mm_sqrt_pd(__m128d a) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) { // lane-wise square root of the two doubles
+	rx_vec_f128 x;
 	x.lo = sqrt(a.lo);
 	x.hi = sqrt(a.hi);
 	return x;
 }

-inline __m128i _mm_set1_epi64x(uint64_t a) {
-	__m128i x;
+FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) { // integer vector with the same 64-bit value in both lanes
+	rx_vec_i128 x;
 	x.u64[0] = a;
 	x.u64[1] = a;
 	return x;
 }

-inline __m128d _mm_castsi128_pd(__m128i a) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) { // reinterprets the 128 bits of a as a double vector
+	rx_vec_f128 x;
 	x.i = a; // copies through the union's integer view, so the bits are unchanged
 	return x;
 }

-inline __m128d _mm_abs(__m128d xd) {
-	xd.lo = std::fabs(xd.lo);
-	xd.hi = std::fabs(xd.hi);
-	return xd;
+FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) { // builds a double vector from two raw 64-bit bit patterns
+	rx_vec_f128 v;
+	v.i.u64[0] = x0; // second argument becomes the low lane
+	v.i.u64[1] = x1; // first argument becomes the high lane
+	return v;
 }

-inline __m128d _mm_xor_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) { // double vector whose two lanes both hold the raw bit pattern x
+	rx_vec_f128 v;
+	v.i.u64[0] = x;
+	v.i.u64[1] = x;
+	return v;
+}
+
+
+FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { // bitwise XOR of the raw 128 bits of a and b
+	rx_vec_f128 x;
 	x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0];
 	x.i.u64[1] = a.i.u64[1] ^ b.i.u64[1];
 	return x;
 }

-inline __m128d _mm_and_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { // bitwise AND of the raw 128 bits of a and b
+	rx_vec_f128 x;
 	x.i.u64[0] = a.i.u64[0] & b.i.u64[0];
 	x.i.u64[1] = a.i.u64[1] & b.i.u64[1];
 	return x;
 }

-inline __m128d _mm_or_pd(__m128d a, __m128d b) {
-	__m128d x;
+FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { // bitwise OR of the raw 128 bits of a and b
+	rx_vec_f128 x;
 	x.i.u64[0] = a.i.u64[0] | b.i.u64[0];
 	x.i.u64[1] = a.i.u64[1] | b.i.u64[1];
 	return x;
 }

-inline __m128d _mm_set_pd(double e1, double e0) {
-	__m128d x;
-	x.lo = e0;
-	x.hi = e1;
-	return x;
-}
-
-inline __m128d _mm_max_pd(__m128d a, __m128d b) {
-	__m128d x;
-	x.lo = a.lo > b.lo ? a.lo : b.lo;
-	x.hi = a.hi > b.hi ? a.hi : b.hi;
-	return x;
-}
-
-inline __m128d _mm_cvtepi32_pd(__m128i a) {
-	__m128d x;
-	x.lo = (double)unsigned32ToSigned2sCompl(a.u32[0]);
-	x.hi = (double)unsigned32ToSigned2sCompl(a.u32[1]);
-	return x;
-}
-
 static const char* platformError = "Platform doesn't support hardware AES";

-inline __m128i _mm_aeskeygenassist_si128(__m128i key, uint8_t rcon) {
+FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) { // stand-in for the hardware AES call in this branch: never returns a value
 	throw std::runtime_error(platformError); // always throws; both arguments are unused
 }

-inline __m128i _mm_aesenc_si128(__m128i v, __m128i rkey) {
+FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) { // stand-in for the hardware AES call in this branch: never returns a value
 	throw std::runtime_error(platformError); // always throws; both arguments are unused
 }

-inline __m128i _mm_aesdec_si128(__m128i v, __m128i rkey) {
-	throw std::runtime_error(platformError);
+FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) { // returns 32-bit lane 0 of a
+	return a.u32[0]; // uint32_t lane value converted to the int return type
 }

-inline int _mm_cvtsi128_si32(__m128i v) {
-	return v.u32[0];
+FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) { // returns 32-bit lane 1 of a
+	return a.u32[1]; // uint32_t lane value converted to the int return type
 }

-inline __m128i _mm_cvtsi32_si128(int si32) {
-	__m128i v;
-	v.u32[0] = si32;
-	v.u32[1] = 0;
-	v.u32[2] = 0;
-	v.u32[3] = 0;
-	return v;
+FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) { // returns 32-bit lane 2 of a
+	return a.u32[2]; // uint32_t lane value converted to the int return type
 }

-inline  __m128i _mm_set_epi64x(int64_t _I1, int64_t _I0) {
-	__m128i v;
-	v.u64[0] = _I0;
-	v.u64[1] = _I1;
-	return v;
+FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) { // returns 32-bit lane 3 of a
+	return a.u32[3]; // uint32_t lane value converted to the int return type
 }

-inline __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0) {
-	__m128i v;
+FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
+	rx_vec_i128 v;
 	v.u32[0] = _I0;
 	v.u32[1] = _I1;
 	v.u32[2] = _I2;
@ -232,8 +283,8 @@ inline __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0) {
 	return v;
 };

-inline __m128i _mm_xor_si128(__m128i _A, __m128i _B) {
-	__m128i c;
+FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 _A, rx_vec_i128 _B) {
+	rx_vec_i128 c;
 	c.u32[0] = _A.u32[0] ^ _B.u32[0];
 	c.u32[1] = _A.u32[1] ^ _B.u32[1];
 	c.u32[2] = _A.u32[2] ^ _B.u32[2];
@ -241,21 +292,12 @@ inline __m128i _mm_xor_si128(__m128i _A, __m128i _B) {
 	return c;
 }

-inline __m128i _mm_shuffle_epi32(__m128i _A, int _Imm) {
-	__m128i c;
-	c.u32[0] = _A.u32[_Imm & 3];
-	c.u32[1] = _A.u32[(_Imm >> 2) & 3];
-	c.u32[2] = _A.u32[(_Imm >> 4) & 3];
-	c.u32[3] = _A.u32[(_Imm >> 6) & 3];
-	return c;
-}
-
-inline __m128i _mm_load_si128(__m128i const*_P) {
+FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const*_P) {
 #if defined(NATIVE_LITTLE_ENDIAN)
 	return *_P;
 #else
 	uint32_t* ptr = (uint32_t*)_P;
-	__m128i c;
+	rx_vec_i128 c;
 	c.u32[0] = load32(ptr + 0);
 	c.u32[1] = load32(ptr + 1);
 	c.u32[2] = load32(ptr + 2);
@ -264,7 +306,7 @@ inline __m128i _mm_load_si128(__m128i const*_P) {
 #endif
 }

-inline void _mm_store_si128(__m128i *_P, __m128i _B) {
+FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *_P, rx_vec_i128 _B) {
 #if defined(NATIVE_LITTLE_ENDIAN)
 	*_P = _B;
 #else
@ -276,46 +318,23 @@ inline void _mm_store_si128(__m128i *_P, __m128i _B) {
 #endif
 }

-inline __m128i _mm_slli_si128(__m128i _A, int _Imm) {
-	_Imm &= 255;
-	if (_Imm > 15) {
-		_A.u64[0] = 0;
-		_A.u64[1] = 0;
-	}
-	else {
-		for (int i = 15; i >= _Imm; --i) {
-			_A.u8[i] = _A.u8[i - _Imm];
-		}
-		for (int i = 0; i < _Imm; ++i) {
-			_A.u8[i] = 0;
-		}
-	}
-	return _A;
-}
-
-inline __m128i _mm_loadl_epi64(__m128i const* mem_addr) {
-	__m128i x;
-	x.u32[0] = load32((uint8_t*)mem_addr + 0);
-	x.u32[1] = load32((uint8_t*)mem_addr + 4);
+FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { // reads two 32-bit values at addr and addr + 4 and converts each, as a signed integer, to double
+	rx_vec_f128 x;
+	x.lo = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0)); // low lane comes from the first 4 bytes
+	x.hi = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4)); // high lane comes from the next 4 bytes
 	return x;
 }

+#define RANDOMX_DEFAULT_FENV // enables the out-of-line definitions guarded by #ifdef RANDOMX_DEFAULT_FENV in instructions_portable.cpp
+
+void rx_reset_float_state(); // fallback definition calls setRoundMode_(FE_TONEAREST)
+
+void rx_set_rounding_mode(uint32_t mode); // fallback definition switches on (mode & 3)
+
 #endif

-constexpr int RoundToNearest = 0;
-constexpr int RoundDown = 1;
-constexpr int RoundUp = 2;
-constexpr int RoundToZero = 3;
-
-inline __m128d load_cvt_i32x2(const void* addr) {
-	__m128i ix = _mm_loadl_epi64((const __m128i*)addr);
-	return _mm_cvtepi32_pd(ix);
-}
-
 double loadDoublePortable(const void* addr);
 uint64_t mulh(uint64_t, uint64_t);
 int64_t smulh(int64_t, int64_t);
 uint64_t rotl(uint64_t, int);
 uint64_t rotr(uint64_t, int);
-void initFpu();
-void setRoundMode(uint32_t);
--- a/src/soft_aes.cpp
+++ b/src/soft_aes.cpp
@ -318,38 +318,38 @@ alignas(16) const uint32_t lutDec3[256] = {
 	0x397101a8, 0x08deb30c, 0xd89ce4b4, 0x6490c156, 0x7b6184cb, 0xd570b632, 0x48745c6c, 0xd04257b8,
 };

-__m128i soft_aesenc(__m128i in, __m128i key) {
+rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key) { // table-driven software path used by aesenc<true>: combines lutEnc0..lutEnc3 lookups, then XORs with key
 	uint32_t s0, s1, s2, s3;

-	s0 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xff));
-	s1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xaa));
-	s2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
-	s3 = _mm_cvtsi128_si32(in);
+	s0 = rx_vec_i128_w(in); // s0..s3 hold 32-bit lanes 3, 2, 1, 0 of the input, in that order
+	s1 = rx_vec_i128_z(in);
+	s2 = rx_vec_i128_y(in);
+	s3 = rx_vec_i128_x(in);

-	__m128i out = _mm_set_epi32(
+	rx_vec_i128 out = rx_set_int_vec_i128( // arguments run from the highest lane to the lowest; each word XORs four lookups, one byte taken from each of s0..s3
 		(lutEnc0[s0 & 0xff] ^ lutEnc1[(s3 >> 8) & 0xff] ^ lutEnc2[(s2 >> 16) & 0xff] ^ lutEnc3[s1 >> 24]),
 		(lutEnc0[s1 & 0xff] ^ lutEnc1[(s0 >> 8) & 0xff] ^ lutEnc2[(s3 >> 16) & 0xff] ^ lutEnc3[s2 >> 24]),
 		(lutEnc0[s2 & 0xff] ^ lutEnc1[(s1 >> 8) & 0xff] ^ lutEnc2[(s0 >> 16) & 0xff] ^ lutEnc3[s3 >> 24]),
 		(lutEnc0[s3 & 0xff] ^ lutEnc1[(s2 >> 8) & 0xff] ^ lutEnc2[(s1 >> 16) & 0xff] ^ lutEnc3[s0 >> 24])
 	);

-	return _mm_xor_si128(out, key);
+	return rx_xor_vec_i128(out, key); // final step: XOR the combined table output with the key
 }

-__m128i soft_aesdec(__m128i in, __m128i key) {
+rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key) { // table-driven software path used by aesdec<true>: combines lutDec0..lutDec3 lookups, then XORs with key
 	uint32_t s0, s1, s2, s3;

-	s0 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xff));
-	s1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xaa));
-	s2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
-	s3 = _mm_cvtsi128_si32(in);
+	s0 = rx_vec_i128_w(in); // s0..s3 hold 32-bit lanes 3, 2, 1, 0 of the input, in that order
+	s1 = rx_vec_i128_z(in);
+	s2 = rx_vec_i128_y(in);
+	s3 = rx_vec_i128_x(in);

-	__m128i out = _mm_set_epi32(
+	rx_vec_i128 out = rx_set_int_vec_i128( // arguments run from the highest lane to the lowest; the byte-to-word pairing differs from soft_aesenc
 		(lutDec0[s0 & 0xff] ^ lutDec1[(s1 >> 8) & 0xff] ^ lutDec2[(s2 >> 16) & 0xff] ^ lutDec3[s3 >> 24]),
 		(lutDec0[s1 & 0xff] ^ lutDec1[(s2 >> 8) & 0xff] ^ lutDec2[(s3 >> 16) & 0xff] ^ lutDec3[s0 >> 24]),
 		(lutDec0[s2 & 0xff] ^ lutDec1[(s3 >> 8) & 0xff] ^ lutDec2[(s0 >> 16) & 0xff] ^ lutDec3[s1 >> 24]),
 		(lutDec0[s3 & 0xff] ^ lutDec1[(s0 >> 8) & 0xff] ^ lutDec2[(s1 >> 16) & 0xff] ^ lutDec3[s2 >> 24])
 	);

-	return _mm_xor_si128(out, key);
+	return rx_xor_vec_i128(out, key); // final step: XOR the combined table output with the key
 }
--- a/src/soft_aes.h
+++ b/src/soft_aes.h
@ -22,16 +22,16 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <stdint.h>
 #include "intrin_portable.h"

-__m128i soft_aesenc(__m128i in, __m128i key);
+rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key);

-__m128i soft_aesdec(__m128i in, __m128i key);
+rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key);

 template<bool soft>
-inline __m128i aesenc(__m128i in, __m128i key) {
-	return soft ? soft_aesenc(in, key) : _mm_aesenc_si128(in, key);
+inline rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key) { // compile-time switch between the software and hardware encrypt paths
+	return soft ? soft_aesenc(in, key) : rx_aesenc_vec_i128(in, key); // soft == true uses the lookup-table version; otherwise the rx_aesenc_vec_i128 abstraction
 }

 template<bool soft>
-inline __m128i aesdec(__m128i in, __m128i key) {
-	return soft ? soft_aesdec(in, key) : _mm_aesdec_si128(in, key);
+inline rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key) { // compile-time switch between the software and hardware decrypt paths
+	return soft ? soft_aesdec(in, key) : rx_aesdec_vec_i128(in, key); // soft == true uses the lookup-table version; otherwise the rx_aesdec_vec_i128 abstraction
 }
--- a/src/virtual_machine.cpp
+++ b/src/virtual_machine.cpp
@ -32,7 +32,7 @@ randomx_vm::~randomx_vm() {
 }

 void randomx_vm::resetRoundingMode() { // resets the floating-point environment through the platform abstraction
-	initFpu();
+	rx_reset_float_state(); // sets MXCSR to rx_mxcsr_default in the intrinsic branch, or round-to-nearest via setRoundMode_ in the fenv fallback
 }

 namespace randomx {
@ -86,7 +86,7 @@ void randomx_vm::initialize() {

 namespace randomx {

-	alignas(16) volatile static __m128i aesDummy;
+	alignas(16) volatile static rx_vec_i128 aesDummy;

 	template<class Allocator, bool softAes>
 	VmBase<Allocator, softAes>::~VmBase() {
@ -98,9 +98,9 @@ namespace randomx {
 		if (datasetPtr == nullptr)
 			throw std::invalid_argument("Cache/Dataset not set");
 		if (!softAes) { //if hardware AES is not supported, it's better to fail now than to return a ticking bomb
-			__m128i tmp = _mm_load_si128((const __m128i*)&aesDummy);
-			tmp = _mm_aesenc_si128(tmp, tmp);
-			_mm_store_si128((__m128i*)&aesDummy, tmp);
+			rx_vec_i128 tmp = rx_load_vec_i128((const rx_vec_i128*)&aesDummy);
+			tmp = rx_aesenc_vec_i128(tmp, tmp);
+			rx_store_vec_i128((rx_vec_i128*)&aesDummy, tmp);
 		}
 		scratchpad = (uint8_t*)Allocator::allocMemory(ScratchpadSize);
 	}
--- a/src/vm_interpreted.cpp
+++ b/src/vm_interpreted.cpp
@ -46,7 +46,7 @@ namespace randomx {
 	}

 	template<class Allocator, bool softAes>
-	void InterpretedVm<Allocator, softAes>::executeBytecode(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) {
+	void InterpretedVm<Allocator, softAes>::executeBytecode(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) {
 		for (int pc = 0; pc < RANDOMX_PROGRAM_SIZE; ++pc) {
 			executeBytecode(pc, r, f, e, a);
 		}
@ -59,16 +59,16 @@ namespace randomx {
 	}

 	template<class Allocator, bool softAes>
-	FORCE_INLINE __m128d InterpretedVm<Allocator, softAes>::maskRegisterExponentMantissa(__m128d x) {
-		const __m128d xmantissaMask = _mm_castsi128_pd(_mm_set_epi64x(dynamicMantissaMask, dynamicMantissaMask));
-		const __m128d xexponentMask = _mm_load_pd((const double*)&config.eMask);
-		x = _mm_and_pd(x, xmantissaMask);
-		x = _mm_or_pd(x, xexponentMask);
+	FORCE_INLINE rx_vec_f128 InterpretedVm<Allocator, softAes>::maskRegisterExponentMantissa(rx_vec_f128 x) { // applies a bitwise AND mask and then a bitwise OR mask to both lanes of x
+		const rx_vec_f128 xmantissaMask = rx_set_vec_f128(dynamicMantissaMask, dynamicMantissaMask); // same raw bit mask in both lanes
+		const rx_vec_f128 xexponentMask = rx_load_vec_f128((const double*)&config.eMask); // loads two 64-bit values starting at config.eMask
+		x = rx_and_vec_f128(x, xmantissaMask); // keeps only the bits set in dynamicMantissaMask
+		x = rx_or_vec_f128(x, xexponentMask); // then forces on the bits from config.eMask
 		return x;
 	}

 	template<class Allocator, bool softAes>
-	void InterpretedVm<Allocator, softAes>::executeBytecode(int& pc, int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) {
+	void InterpretedVm<Allocator, softAes>::executeBytecode(int& pc, int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) {
 		auto& ibc = byteCode[pc];
 		switch (ibc.type)
 		{
@ -139,43 +139,43 @@ namespace randomx {
 			} break;

 			case InstructionType::FSWAP_R: {
-				*ibc.fdst = _mm_shuffle_pd(*ibc.fdst, *ibc.fdst, 1);
+				*ibc.fdst = rx_shuffle_vec_f128(*ibc.fdst, *ibc.fdst, 1);
 			} break;

 			case InstructionType::FADD_R: {
-				*ibc.fdst = _mm_add_pd(*ibc.fdst, *ibc.fsrc);
+				*ibc.fdst = rx_add_vec_f128(*ibc.fdst, *ibc.fsrc);
 			} break;

 			case InstructionType::FADD_M: {
-				__m128d fsrc = load_cvt_i32x2(getScratchpadAddress(ibc));
-				*ibc.fdst = _mm_add_pd(*ibc.fdst, fsrc);
+				rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc));
+				*ibc.fdst = rx_add_vec_f128(*ibc.fdst, fsrc);
 			} break;

 			case InstructionType::FSUB_R: {
-				*ibc.fdst = _mm_sub_pd(*ibc.fdst, *ibc.fsrc);
+				*ibc.fdst = rx_sub_vec_f128(*ibc.fdst, *ibc.fsrc);
 			} break;

 			case InstructionType::FSUB_M: {
-				__m128d fsrc = load_cvt_i32x2(getScratchpadAddress(ibc));
-				*ibc.fdst = _mm_sub_pd(*ibc.fdst, fsrc);
+				rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc));
+				*ibc.fdst = rx_sub_vec_f128(*ibc.fdst, fsrc);
 			} break;

 			case InstructionType::FSCAL_R: {
-				const __m128d mask = _mm_castsi128_pd(_mm_set1_epi64x(0x81F0000000000000));
-				*ibc.fdst = _mm_xor_pd(*ibc.fdst, mask);
+				const rx_vec_f128 mask = rx_set1_vec_f128(0x81F0000000000000);
+				*ibc.fdst = rx_xor_vec_f128(*ibc.fdst, mask);
 			} break;

 			case InstructionType::FMUL_R: {
-				*ibc.fdst = _mm_mul_pd(*ibc.fdst, *ibc.fsrc);
+				*ibc.fdst = rx_mul_vec_f128(*ibc.fdst, *ibc.fsrc);
 			} break;

 			case InstructionType::FDIV_M: {
-				__m128d fsrc = maskRegisterExponentMantissa(load_cvt_i32x2(getScratchpadAddress(ibc)));
-				*ibc.fdst = _mm_div_pd(*ibc.fdst, fsrc);
+				rx_vec_f128 fsrc = maskRegisterExponentMantissa(rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc)));
+				*ibc.fdst = rx_div_vec_f128(*ibc.fdst, fsrc);
 			} break;

 			case InstructionType::FSQRT_R: {
-				*ibc.fdst = _mm_sqrt_pd(*ibc.fdst);
+				*ibc.fdst = rx_sqrt_vec_f128(*ibc.fdst);
 			} break;

 			case InstructionType::CBRANCH: {
@ -186,7 +186,7 @@ namespace randomx {
 			} break;

 			case InstructionType::CFROUND: {
-				setRoundMode(rotr(*ibc.isrc, ibc.imm) % 4);
+				rx_set_rounding_mode(rotr(*ibc.isrc, ibc.imm) % 4);
 			} break;

 			case InstructionType::ISTORE: {
@ -205,12 +205,12 @@ namespace randomx {
 	template<class Allocator, bool softAes>
 	void InterpretedVm<Allocator, softAes>::execute() {
 		int_reg_t r[RegistersCount] = { 0 };
-		__m128d f[RegisterCountFlt];
-		__m128d e[RegisterCountFlt];
-		__m128d a[RegisterCountFlt];
+		rx_vec_f128 f[RegisterCountFlt];
+		rx_vec_f128 e[RegisterCountFlt];
+		rx_vec_f128 a[RegisterCountFlt];

 		for(unsigned i = 0; i < RegisterCountFlt; ++i)
-			a[i] = _mm_load_pd(&reg.a[i].lo);
+			a[i] = rx_load_vec_f128(&reg.a[i].lo);

 		precompileProgram(r, f, e, a);

@ -228,10 +228,10 @@ namespace randomx {
 				r[i] ^= load64(scratchpad + spAddr0 + 8 * i);

 			for (unsigned i = 0; i < RegisterCountFlt; ++i)
-				f[i] = load_cvt_i32x2(scratchpad + spAddr1 + 8 * i);
+				f[i] = rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * i);

 			for (unsigned i = 0; i < RegisterCountFlt; ++i)
-				e[i] = maskRegisterExponentMantissa(load_cvt_i32x2(scratchpad + spAddr1 + 8 * (RegisterCountFlt + i)));
+				e[i] = maskRegisterExponentMantissa(rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * (RegisterCountFlt + i)));

 			executeBytecode(r, f, e, a);

@ -244,10 +244,10 @@ namespace randomx {
 				store64(scratchpad + spAddr1 + 8 * i, r[i]);

 			for (unsigned i = 0; i < RegisterCountFlt; ++i)
-				f[i] = _mm_xor_pd(f[i], e[i]);
+				f[i] = rx_xor_vec_f128(f[i], e[i]);

 			for (unsigned i = 0; i < RegisterCountFlt; ++i)
-				_mm_store_pd((double*)(scratchpad + spAddr0 + 16 * i), f[i]);
+				rx_store_vec_f128((double*)(scratchpad + spAddr0 + 16 * i), f[i]);

 			spAddr0 = 0;
 			spAddr1 = 0;
@ -257,10 +257,10 @@ namespace randomx {
 			store64(&reg.r[i], r[i]);

 		for (unsigned i = 0; i < RegisterCountFlt; ++i)
-			_mm_store_pd(&reg.f[i].lo, f[i]);
+			rx_store_vec_f128(&reg.f[i].lo, f[i]);

 		for (unsigned i = 0; i < RegisterCountFlt; ++i)
-			_mm_store_pd(&reg.e[i].lo, e[i]);
+			rx_store_vec_f128(&reg.e[i].lo, e[i]);
 	}

 	template<class Allocator, bool softAes>
@ -273,7 +273,7 @@ namespace randomx {
 #include "instruction_weights.hpp"

 	template<class Allocator, bool softAes>
-	void InterpretedVm<Allocator, softAes>::precompileProgram(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) {
+	void InterpretedVm<Allocator, softAes>::precompileProgram(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) {
 		RegisterUsage registerUsage[RegistersCount];
 		for (unsigned i = 0; i < RegistersCount; ++i) {
 			registerUsage[i].lastUsed = -1;
--- a/src/vm_interpreted.hpp
+++ b/src/vm_interpreted.hpp
@ -31,11 +31,11 @@ namespace randomx {
 	struct InstructionByteCode {
 		union {
 			int_reg_t* idst;
-			__m128d* fdst;
+			rx_vec_f128* fdst;
 		};
 		union {
 			int_reg_t* isrc;
-			__m128d* fsrc;
+			rx_vec_f128* fsrc;
 		};
 		union {
 			uint64_t imm;
@ -74,11 +74,11 @@ namespace randomx {
 		virtual void datasetRead(uint32_t blockNumber, int_reg_t(&r)[RegistersCount]);
 	private:
 		void execute();
-		void precompileProgram(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]);
-		void executeBytecode(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]);
-		void executeBytecode(int& i, int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]);
+		void precompileProgram(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]);
+		void executeBytecode(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]);
+		void executeBytecode(int& i, int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]);
 		void* getScratchpadAddress(InstructionByteCode& ibc);
-		__m128d maskRegisterExponentMantissa(__m128d);
+		rx_vec_f128 maskRegisterExponentMantissa(rx_vec_f128);

 		InstructionByteCode byteCode[RANDOMX_PROGRAM_SIZE];
 	};