mirror of
				https://git.wownero.com/wownero/RandomWOW.git
				synced 2024-08-15 00:23:14 +00:00 
			
		
		
		
	Abstracted away from x86 intrinsics
This commit is contained in:
		
							parent
							
								
									3dd21ea93d
								
							
						
					
					
						commit
						1aa7865619
					
				
					 10 changed files with 267 additions and 249 deletions
				
			
		|  | @ -36,21 +36,21 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) { | |||
| 	const uint8_t* inptr = (uint8_t*)input; | ||||
| 	const uint8_t* inputEnd = inptr + inputSize; | ||||
| 
 | ||||
| 	__m128i state0, state1, state2, state3; | ||||
| 	__m128i in0, in1, in2, in3; | ||||
| 	rx_vec_i128 state0, state1, state2, state3; | ||||
| 	rx_vec_i128 in0, in1, in2, in3; | ||||
| 
 | ||||
| 	//initial state
 | ||||
| 	state0 = _mm_set_epi32(0x8d3126fd, 0x1146d167, 0x887af5ab, 0xc4778e00); | ||||
| 	state1 = _mm_set_epi32(0x19fe9fa1, 0x58da632b, 0x1b95af89, 0xb834ef4b); | ||||
| 	state2 = _mm_set_epi32(0x1bb2cd74, 0xc35ad744, 0xab283a00, 0x7742dd3a); | ||||
| 	state3 = _mm_set_epi32(0xbb30a58a, 0x49593c57, 0xdc5d97cc, 0xe18b449a); | ||||
| 	state0 = rx_set_int_vec_i128(0x8d3126fd, 0x1146d167, 0x887af5ab, 0xc4778e00); | ||||
| 	state1 = rx_set_int_vec_i128(0x19fe9fa1, 0x58da632b, 0x1b95af89, 0xb834ef4b); | ||||
| 	state2 = rx_set_int_vec_i128(0x1bb2cd74, 0xc35ad744, 0xab283a00, 0x7742dd3a); | ||||
| 	state3 = rx_set_int_vec_i128(0xbb30a58a, 0x49593c57, 0xdc5d97cc, 0xe18b449a); | ||||
| 
 | ||||
| 	//process 64 bytes at a time in 4 lanes
 | ||||
| 	while (inptr < inputEnd) { | ||||
| 		in0 = _mm_load_si128((__m128i*)inptr + 0); | ||||
| 		in1 = _mm_load_si128((__m128i*)inptr + 1); | ||||
| 		in2 = _mm_load_si128((__m128i*)inptr + 2); | ||||
| 		in3 = _mm_load_si128((__m128i*)inptr + 3); | ||||
| 		in0 = rx_load_vec_i128((rx_vec_i128*)inptr + 0); | ||||
| 		in1 = rx_load_vec_i128((rx_vec_i128*)inptr + 1); | ||||
| 		in2 = rx_load_vec_i128((rx_vec_i128*)inptr + 2); | ||||
| 		in3 = rx_load_vec_i128((rx_vec_i128*)inptr + 3); | ||||
| 
 | ||||
| 		state0 = aesenc<softAes>(state0, in0); | ||||
| 		state1 = aesdec<softAes>(state1, in1); | ||||
|  | @ -61,8 +61,8 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) { | |||
| 	} | ||||
| 
 | ||||
| 	//two extra rounds to achieve full diffusion
 | ||||
| 	__m128i xkey0 = _mm_set_epi32(0x83951283, 0xe4c5593d, 0x2a5a929c, 0x11cbf247); | ||||
| 	__m128i xkey1 = _mm_set_epi32(0xff215bb2, 0xabbc2523, 0x477bef0b, 0xce816c95); | ||||
| 	rx_vec_i128 xkey0 = rx_set_int_vec_i128(0x83951283, 0xe4c5593d, 0x2a5a929c, 0x11cbf247); | ||||
| 	rx_vec_i128 xkey1 = rx_set_int_vec_i128(0xff215bb2, 0xabbc2523, 0x477bef0b, 0xce816c95); | ||||
| 
 | ||||
| 	state0 = aesenc<softAes>(state0, xkey0); | ||||
| 	state1 = aesdec<softAes>(state1, xkey0); | ||||
|  | @ -75,10 +75,10 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) { | |||
| 	state3 = aesdec<softAes>(state3, xkey1); | ||||
| 
 | ||||
| 	//output hash
 | ||||
| 	_mm_store_si128((__m128i*)hash + 0, state0); | ||||
| 	_mm_store_si128((__m128i*)hash + 1, state1); | ||||
| 	_mm_store_si128((__m128i*)hash + 2, state2); | ||||
| 	_mm_store_si128((__m128i*)hash + 3, state3); | ||||
| 	rx_store_vec_i128((rx_vec_i128*)hash + 0, state0); | ||||
| 	rx_store_vec_i128((rx_vec_i128*)hash + 1, state1); | ||||
| 	rx_store_vec_i128((rx_vec_i128*)hash + 2, state2); | ||||
| 	rx_store_vec_i128((rx_vec_i128*)hash + 3, state3); | ||||
| } | ||||
| 
 | ||||
| template void hashAes1Rx4<false>(const void *input, size_t inputSize, void *hash); | ||||
|  | @ -99,18 +99,18 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) { | |||
| 	const uint8_t* outptr = (uint8_t*)buffer; | ||||
| 	const uint8_t* outputEnd = outptr + outputSize; | ||||
| 
 | ||||
| 	__m128i state0, state1, state2, state3; | ||||
| 	__m128i key0, key1, key2, key3; | ||||
| 	rx_vec_i128 state0, state1, state2, state3; | ||||
| 	rx_vec_i128 key0, key1, key2, key3; | ||||
| 
 | ||||
| 	key0 = _mm_set_epi32(0xdf20a2e3, 0xca329132, 0x454ff6d5, 0x84eeec2d); | ||||
| 	key1 = _mm_set_epi32(0x1deb5971, 0xfed0387f, 0xf10fc578, 0x017b63d0); | ||||
| 	key2 = _mm_set_epi32(0xdfc926b3, 0xa517ceb4, 0x2f2c70a1, 0x327d7a52); | ||||
| 	key3 = _mm_set_epi32(0x341cf31c, 0xa0ece0a9, 0x3d17da5e, 0x5c8d77d3); | ||||
| 	key0 = rx_set_int_vec_i128(0xdf20a2e3, 0xca329132, 0x454ff6d5, 0x84eeec2d); | ||||
| 	key1 = rx_set_int_vec_i128(0x1deb5971, 0xfed0387f, 0xf10fc578, 0x017b63d0); | ||||
| 	key2 = rx_set_int_vec_i128(0xdfc926b3, 0xa517ceb4, 0x2f2c70a1, 0x327d7a52); | ||||
| 	key3 = rx_set_int_vec_i128(0x341cf31c, 0xa0ece0a9, 0x3d17da5e, 0x5c8d77d3); | ||||
| 
 | ||||
| 	state0 = _mm_load_si128((__m128i*)state + 0); | ||||
| 	state1 = _mm_load_si128((__m128i*)state + 1); | ||||
| 	state2 = _mm_load_si128((__m128i*)state + 2); | ||||
| 	state3 = _mm_load_si128((__m128i*)state + 3); | ||||
| 	state0 = rx_load_vec_i128((rx_vec_i128*)state + 0); | ||||
| 	state1 = rx_load_vec_i128((rx_vec_i128*)state + 1); | ||||
| 	state2 = rx_load_vec_i128((rx_vec_i128*)state + 2); | ||||
| 	state3 = rx_load_vec_i128((rx_vec_i128*)state + 3); | ||||
| 
 | ||||
| 	while (outptr < outputEnd) { | ||||
| 		state0 = aesdec<softAes>(state0, key0); | ||||
|  | @ -118,18 +118,18 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) { | |||
| 		state2 = aesdec<softAes>(state2, key2); | ||||
| 		state3 = aesenc<softAes>(state3, key3); | ||||
| 
 | ||||
| 		_mm_store_si128((__m128i*)outptr + 0, state0); | ||||
| 		_mm_store_si128((__m128i*)outptr + 1, state1); | ||||
| 		_mm_store_si128((__m128i*)outptr + 2, state2); | ||||
| 		_mm_store_si128((__m128i*)outptr + 3, state3); | ||||
| 		rx_store_vec_i128((rx_vec_i128*)outptr + 0, state0); | ||||
| 		rx_store_vec_i128((rx_vec_i128*)outptr + 1, state1); | ||||
| 		rx_store_vec_i128((rx_vec_i128*)outptr + 2, state2); | ||||
| 		rx_store_vec_i128((rx_vec_i128*)outptr + 3, state3); | ||||
| 
 | ||||
| 		outptr += 64; | ||||
| 	} | ||||
| 
 | ||||
| 	_mm_store_si128((__m128i*)state + 0, state0); | ||||
| 	_mm_store_si128((__m128i*)state + 1, state1); | ||||
| 	_mm_store_si128((__m128i*)state + 2, state2); | ||||
| 	_mm_store_si128((__m128i*)state + 3, state3); | ||||
| 	rx_store_vec_i128((rx_vec_i128*)state + 0, state0); | ||||
| 	rx_store_vec_i128((rx_vec_i128*)state + 1, state1); | ||||
| 	rx_store_vec_i128((rx_vec_i128*)state + 2, state2); | ||||
| 	rx_store_vec_i128((rx_vec_i128*)state + 3, state3); | ||||
| } | ||||
| 
 | ||||
| template void fillAes1Rx4<true>(void *state, size_t outputSize, void *buffer); | ||||
|  |  | |||
|  | @ -27,7 +27,7 @@ namespace randomx { | |||
| 
 | ||||
| 	template<size_t alignment> | ||||
| 	void* AlignedAllocator<alignment>::allocMemory(size_t count) { | ||||
| 		void *mem = _mm_malloc(count, alignment); | ||||
| 		void *mem = rx_aligned_alloc(count, alignment); | ||||
| 		if (mem == nullptr) | ||||
| 			throw std::bad_alloc(); | ||||
| 		return mem; | ||||
|  | @ -35,11 +35,10 @@ namespace randomx { | |||
| 
 | ||||
| 	template<size_t alignment> | ||||
| 	void AlignedAllocator<alignment>::freeMemory(void* ptr, size_t count) { | ||||
| 		_mm_free(ptr); | ||||
| 		rx_aligned_free(ptr); | ||||
| 	} | ||||
| 
 | ||||
| 	template class AlignedAllocator<CacheLineSize>; | ||||
| 	template class AlignedAllocator<sizeof(__m128i)>;; | ||||
| 
 | ||||
| 	void* LargePageAllocator::allocMemory(size_t count) { | ||||
| 		return allocLargePagesMemory(count); | ||||
|  |  | |||
|  | @ -148,7 +148,7 @@ namespace randomx { | |||
| 		rl[7] = rl[0] ^ superscalarAdd7; | ||||
| 		for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { | ||||
| 			mixBlock = getMixBlock(registerValue, cache->memory); | ||||
| 			PREFETCHNTA(mixBlock); | ||||
| 			rx_prefetch_nta(mixBlock); | ||||
| 			SuperscalarProgram& prog = cache->programs[i]; | ||||
| 
 | ||||
| 			executeSuperscalar(rl, prog, &cache->reciprocalCache); | ||||
|  |  | |||
|  | @ -123,32 +123,32 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>. | |||
| 	#define HAVE_SMULH | ||||
| #endif | ||||
| 
 | ||||
| void setRoundMode(uint32_t rcflag) { | ||||
| 	switch (rcflag & 3) { | ||||
| 		case RoundDown: | ||||
| 			setRoundMode_(FE_DOWNWARD); | ||||
| 			break; | ||||
| 		case RoundUp: | ||||
| 			setRoundMode_(FE_UPWARD); | ||||
| 			break; | ||||
| 		case RoundToZero: | ||||
| 			setRoundMode_(FE_TOWARDZERO); | ||||
| 			break; | ||||
| 		case RoundToNearest: | ||||
| 			setRoundMode_(FE_TONEAREST); | ||||
| 			break; | ||||
| 		default: | ||||
| 			UNREACHABLE; | ||||
| #ifdef RANDOMX_DEFAULT_FENV | ||||
| 
 | ||||
| void rx_reset_float_state() { | ||||
| 	setRoundMode_(FE_TONEAREST); | ||||
| } | ||||
| 
 | ||||
| void rx_set_rounding_mode(uint32_t mode) { | ||||
| 	switch (mode & 3) { | ||||
| 	case RoundDown: | ||||
| 		setRoundMode_(FE_DOWNWARD); | ||||
| 		break; | ||||
| 	case RoundUp: | ||||
| 		setRoundMode_(FE_UPWARD); | ||||
| 		break; | ||||
| 	case RoundToZero: | ||||
| 		setRoundMode_(FE_TOWARDZERO); | ||||
| 		break; | ||||
| 	case RoundToNearest: | ||||
| 		setRoundMode_(FE_TONEAREST); | ||||
| 		break; | ||||
| 	default: | ||||
| 		UNREACHABLE; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| void initFpu() { | ||||
| #ifdef __SSE2__ | ||||
| 	_mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
 | ||||
| #else | ||||
| 	setRoundMode(FE_TONEAREST); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| union double_ser_t { | ||||
| 	double f; | ||||
|  |  | |||
|  | @ -20,6 +20,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>. | |||
| #pragma once | ||||
| 
 | ||||
| #include <cstdint> | ||||
| #include "blake2/endian.h" | ||||
| 
 | ||||
| constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) { | ||||
| 	return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x); | ||||
|  | @ -33,6 +34,11 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) { | |||
| 	return (-1 == ~0) ? (int64_t)(int32_t)(x) : (x > INT32_MAX ? (x | 0xffffffff00000000ULL) : (uint64_t)x); | ||||
| } | ||||
| 
 | ||||
| constexpr int RoundToNearest = 0; | ||||
| constexpr int RoundDown = 1; | ||||
| constexpr int RoundUp = 2; | ||||
| constexpr int RoundToZero = 3; | ||||
| 
 | ||||
| #if defined(_MSC_VER) | ||||
| #if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2) | ||||
| #define __SSE2__ 1 | ||||
|  | @ -46,185 +52,230 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) { | |||
| #include <intrin.h> | ||||
| #endif | ||||
| 
 | ||||
| #define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) | ||||
| typedef __m128i rx_vec_i128; | ||||
| typedef __m128d rx_vec_f128; | ||||
| 
 | ||||
| #define rx_aligned_alloc(a, b) _mm_malloc(a,b) | ||||
| #define rx_aligned_free(a) _mm_free(a) | ||||
| #define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) | ||||
| 
 | ||||
| #define rx_load_vec_f128 _mm_load_pd | ||||
| #define rx_store_vec_f128 _mm_store_pd | ||||
| #define rx_shuffle_vec_f128 _mm_shuffle_pd | ||||
| #define rx_add_vec_f128 _mm_add_pd | ||||
| #define rx_sub_vec_f128 _mm_sub_pd | ||||
| #define rx_mul_vec_f128 _mm_mul_pd | ||||
| #define rx_div_vec_f128 _mm_div_pd | ||||
| #define rx_sqrt_vec_f128 _mm_sqrt_pd | ||||
| #define rx_set1_long_vec_i128 _mm_set1_epi64x | ||||
| #define rx_vec_i128_vec_f128 _mm_castsi128_pd | ||||
| 
 | ||||
| FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) { | ||||
| 	return _mm_castsi128_pd(_mm_set_epi64x(x1, x0)); | ||||
| } | ||||
| 
 | ||||
| FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) { | ||||
| 	return _mm_castsi128_pd(_mm_set1_epi64x(x)); | ||||
| } | ||||
| 
 | ||||
| #define rx_xor_vec_f128 _mm_xor_pd | ||||
| #define rx_and_vec_f128 _mm_and_pd | ||||
| #define rx_or_vec_f128 _mm_or_pd | ||||
| #define rx_aesenc_vec_i128 _mm_aesenc_si128 | ||||
| #define rx_aesdec_vec_i128 _mm_aesdec_si128 | ||||
| 
 | ||||
| FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) { | ||||
| 	return _mm_cvtsi128_si32(a); | ||||
| } | ||||
| 
 | ||||
| FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) { | ||||
| 	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); | ||||
| } | ||||
| 
 | ||||
| FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) { | ||||
| 	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xaa)); | ||||
| } | ||||
| 
 | ||||
| FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) { | ||||
| 	return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xff)); | ||||
| } | ||||
| 
 | ||||
| #define rx_set_int_vec_i128 _mm_set_epi32 | ||||
| #define rx_xor_vec_i128 _mm_xor_si128 | ||||
| #define rx_load_vec_i128 _mm_load_si128 | ||||
| #define rx_store_vec_i128 _mm_store_si128 | ||||
| 
 | ||||
| FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { | ||||
| 	__m128i ix = _mm_loadl_epi64((const __m128i*)addr); | ||||
| 	return _mm_cvtepi32_pd(ix); | ||||
| } | ||||
| 
 | ||||
| constexpr uint32_t rx_mxcsr_default = 0x9FC0; //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
 | ||||
| 
 | ||||
| FORCE_INLINE void rx_reset_float_state() { | ||||
| 	_mm_setcsr(rx_mxcsr_default); | ||||
| } | ||||
| 
 | ||||
| FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) { | ||||
| 	_mm_setcsr(rx_mxcsr_default | (mode << 13)); | ||||
| } | ||||
| 
 | ||||
| #else | ||||
| #include <cstdint> | ||||
| #include <stdexcept> | ||||
| #include <cstdlib> | ||||
| #include <cmath> | ||||
| #include "blake2/endian.h" | ||||
| 
 | ||||
| #define _mm_malloc(a,b) malloc(a) | ||||
| #define _mm_free(a) free(a) | ||||
| #define PREFETCHNTA(x) | ||||
| 
 | ||||
| typedef union { | ||||
| 	uint64_t u64[2]; | ||||
| 	uint32_t u32[4]; | ||||
| 	uint16_t u16[8]; | ||||
| 	uint8_t u8[16]; | ||||
| } __m128i; | ||||
| } rx_vec_i128; | ||||
| 
 | ||||
| typedef union { | ||||
| 	struct { | ||||
| 		double lo; | ||||
| 		double hi; | ||||
| 	}; | ||||
| 	__m128i i; | ||||
| } __m128d; | ||||
| 	rx_vec_i128 i; | ||||
| } rx_vec_f128; | ||||
| 
 | ||||
| inline __m128d _mm_load_pd(const double* pd) { | ||||
| 	__m128d x; | ||||
| #define rx_aligned_alloc(a, b) malloc(a) | ||||
| #define rx_aligned_free(a) free(a) | ||||
| #define rx_prefetch_nta(x) | ||||
| 
 | ||||
| FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { | ||||
| 	rx_vec_f128 x; | ||||
| 	x.i.u64[0] = load64(pd + 0); | ||||
| 	x.i.u64[1] = load64(pd + 1); | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| inline void _mm_store_pd(double* mem_addr, __m128d a) { | ||||
| FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) { | ||||
| 	store64(mem_addr + 0, a.i.u64[0]); | ||||
| 	store64(mem_addr + 1, a.i.u64[1]); | ||||
| } | ||||
| 
 | ||||
| inline __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm8) { | ||||
| 	__m128d x; | ||||
| FORCE_INLINE rx_vec_f128 rx_shuffle_vec_f128(rx_vec_f128 a, rx_vec_f128 b, int imm8) { | ||||
| 	rx_vec_f128 x; | ||||
| 	x.lo = (imm8 & 1) ? a.hi : a.lo; | ||||
| 	x.hi = (imm8 & 2) ? b.hi : b.lo; | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| inline __m128d _mm_add_pd(__m128d a, __m128d b) { | ||||
| 	__m128d x; | ||||
| FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { | ||||
| 	rx_vec_f128 x; | ||||
| 	x.lo = a.lo + b.lo; | ||||
| 	x.hi = a.hi + b.hi; | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| inline __m128d _mm_sub_pd(__m128d a, __m128d b) { | ||||
| 	__m128d x; | ||||
| FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { | ||||
| 	rx_vec_f128 x; | ||||
| 	x.lo = a.lo - b.lo; | ||||
| 	x.hi = a.hi - b.hi; | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| inline __m128d _mm_mul_pd(__m128d a, __m128d b) { | ||||
| 	__m128d x; | ||||
| FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { | ||||
| 	rx_vec_f128 x; | ||||
| 	x.lo = a.lo * b.lo; | ||||
| 	x.hi = a.hi * b.hi; | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| inline __m128d _mm_div_pd(__m128d a, __m128d b) { | ||||
| 	__m128d x; | ||||
| FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { | ||||
| 	rx_vec_f128 x; | ||||
| 	x.lo = a.lo / b.lo; | ||||
| 	x.hi = a.hi / b.hi; | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| inline __m128d _mm_sqrt_pd(__m128d a) { | ||||
| 	__m128d x; | ||||
| FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) { | ||||
| 	rx_vec_f128 x; | ||||
| 	x.lo = sqrt(a.lo); | ||||
| 	x.hi = sqrt(a.hi); | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| inline __m128i _mm_set1_epi64x(uint64_t a) { | ||||
| 	__m128i x; | ||||
| FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) { | ||||
| 	rx_vec_i128 x; | ||||
| 	x.u64[0] = a; | ||||
| 	x.u64[1] = a; | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| inline __m128d _mm_castsi128_pd(__m128i a) { | ||||
| 	__m128d x; | ||||
| FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) { | ||||
| 	rx_vec_f128 x; | ||||
| 	x.i = a; | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| inline __m128d _mm_abs(__m128d xd) { | ||||
| 	xd.lo = std::fabs(xd.lo); | ||||
| 	xd.hi = std::fabs(xd.hi); | ||||
| 	return xd; | ||||
| FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) { | ||||
| 	rx_vec_f128 v; | ||||
| 	v.i.u64[0] = x0; | ||||
| 	v.i.u64[1] = x1; | ||||
| 	return v; | ||||
| } | ||||
| 
 | ||||
| inline __m128d _mm_xor_pd(__m128d a, __m128d b) { | ||||
| 	__m128d x; | ||||
| FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) { | ||||
| 	rx_vec_f128 v; | ||||
| 	v.i.u64[0] = x; | ||||
| 	v.i.u64[1] = x; | ||||
| 	return v; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { | ||||
| 	rx_vec_f128 x; | ||||
| 	x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0]; | ||||
| 	x.i.u64[1] = a.i.u64[1] ^ b.i.u64[1]; | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| inline __m128d _mm_and_pd(__m128d a, __m128d b) { | ||||
| 	__m128d x; | ||||
| FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { | ||||
| 	rx_vec_f128 x; | ||||
| 	x.i.u64[0] = a.i.u64[0] & b.i.u64[0]; | ||||
| 	x.i.u64[1] = a.i.u64[1] & b.i.u64[1]; | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| inline __m128d _mm_or_pd(__m128d a, __m128d b) { | ||||
| 	__m128d x; | ||||
| FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { | ||||
| 	rx_vec_f128 x; | ||||
| 	x.i.u64[0] = a.i.u64[0] | b.i.u64[0]; | ||||
| 	x.i.u64[1] = a.i.u64[1] | b.i.u64[1]; | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| inline __m128d _mm_set_pd(double e1, double e0) { | ||||
| 	__m128d x; | ||||
| 	x.lo = e0; | ||||
| 	x.hi = e1; | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| inline __m128d _mm_max_pd(__m128d a, __m128d b) { | ||||
| 	__m128d x; | ||||
| 	x.lo = a.lo > b.lo ? a.lo : b.lo; | ||||
| 	x.hi = a.hi > b.hi ? a.hi : b.hi; | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| inline __m128d _mm_cvtepi32_pd(__m128i a) { | ||||
| 	__m128d x; | ||||
| 	x.lo = (double)unsigned32ToSigned2sCompl(a.u32[0]); | ||||
| 	x.hi = (double)unsigned32ToSigned2sCompl(a.u32[1]); | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| static const char* platformError = "Platform doesn't support hardware AES"; | ||||
| 
 | ||||
| inline __m128i _mm_aeskeygenassist_si128(__m128i key, uint8_t rcon) { | ||||
| FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) { | ||||
| 	throw std::runtime_error(platformError); | ||||
| } | ||||
| 
 | ||||
| inline __m128i _mm_aesenc_si128(__m128i v, __m128i rkey) { | ||||
| FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) { | ||||
| 	throw std::runtime_error(platformError); | ||||
| } | ||||
| 
 | ||||
| inline __m128i _mm_aesdec_si128(__m128i v, __m128i rkey) { | ||||
| 	throw std::runtime_error(platformError); | ||||
| FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) { | ||||
| 	return a.u32[0]; | ||||
| } | ||||
| 
 | ||||
| inline int _mm_cvtsi128_si32(__m128i v) { | ||||
| 	return v.u32[0]; | ||||
| FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) { | ||||
| 	return a.u32[1]; | ||||
| } | ||||
| 
 | ||||
| inline __m128i _mm_cvtsi32_si128(int si32) { | ||||
| 	__m128i v; | ||||
| 	v.u32[0] = si32; | ||||
| 	v.u32[1] = 0; | ||||
| 	v.u32[2] = 0; | ||||
| 	v.u32[3] = 0; | ||||
| 	return v; | ||||
| FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) { | ||||
| 	return a.u32[2]; | ||||
| } | ||||
| 
 | ||||
| inline  __m128i _mm_set_epi64x(int64_t _I1, int64_t _I0) { | ||||
| 	__m128i v; | ||||
| 	v.u64[0] = _I0; | ||||
| 	v.u64[1] = _I1; | ||||
| 	return v; | ||||
| FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) { | ||||
| 	return a.u32[3]; | ||||
| } | ||||
| 
 | ||||
| inline __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0) { | ||||
| 	__m128i v; | ||||
| FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) { | ||||
| 	rx_vec_i128 v; | ||||
| 	v.u32[0] = _I0; | ||||
| 	v.u32[1] = _I1; | ||||
| 	v.u32[2] = _I2; | ||||
|  | @ -232,8 +283,8 @@ inline __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0) { | |||
| 	return v; | ||||
| }; | ||||
| 
 | ||||
| inline __m128i _mm_xor_si128(__m128i _A, __m128i _B) { | ||||
| 	__m128i c; | ||||
| FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 _A, rx_vec_i128 _B) { | ||||
| 	rx_vec_i128 c; | ||||
| 	c.u32[0] = _A.u32[0] ^ _B.u32[0]; | ||||
| 	c.u32[1] = _A.u32[1] ^ _B.u32[1]; | ||||
| 	c.u32[2] = _A.u32[2] ^ _B.u32[2]; | ||||
|  | @ -241,21 +292,12 @@ inline __m128i _mm_xor_si128(__m128i _A, __m128i _B) { | |||
| 	return c; | ||||
| } | ||||
| 
 | ||||
| inline __m128i _mm_shuffle_epi32(__m128i _A, int _Imm) { | ||||
| 	__m128i c; | ||||
| 	c.u32[0] = _A.u32[_Imm & 3]; | ||||
| 	c.u32[1] = _A.u32[(_Imm >> 2) & 3]; | ||||
| 	c.u32[2] = _A.u32[(_Imm >> 4) & 3]; | ||||
| 	c.u32[3] = _A.u32[(_Imm >> 6) & 3]; | ||||
| 	return c; | ||||
| } | ||||
| 
 | ||||
| inline __m128i _mm_load_si128(__m128i const*_P) { | ||||
| FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const*_P) { | ||||
| #if defined(NATIVE_LITTLE_ENDIAN) | ||||
| 	return *_P; | ||||
| #else | ||||
| 	uint32_t* ptr = (uint32_t*)_P; | ||||
| 	__m128i c; | ||||
| 	rx_vec_i128 c; | ||||
| 	c.u32[0] = load32(ptr + 0); | ||||
| 	c.u32[1] = load32(ptr + 1); | ||||
| 	c.u32[2] = load32(ptr + 2); | ||||
|  | @ -264,7 +306,7 @@ inline __m128i _mm_load_si128(__m128i const*_P) { | |||
| #endif | ||||
| } | ||||
| 
 | ||||
| inline void _mm_store_si128(__m128i *_P, __m128i _B) { | ||||
| FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *_P, rx_vec_i128 _B) { | ||||
| #if defined(NATIVE_LITTLE_ENDIAN) | ||||
| 	*_P = _B; | ||||
| #else | ||||
|  | @ -276,46 +318,23 @@ inline void _mm_store_si128(__m128i *_P, __m128i _B) { | |||
| #endif | ||||
| } | ||||
| 
 | ||||
| inline __m128i _mm_slli_si128(__m128i _A, int _Imm) { | ||||
| 	_Imm &= 255; | ||||
| 	if (_Imm > 15) { | ||||
| 		_A.u64[0] = 0; | ||||
| 		_A.u64[1] = 0; | ||||
| 	} | ||||
| 	else { | ||||
| 		for (int i = 15; i >= _Imm; --i) { | ||||
| 			_A.u8[i] = _A.u8[i - _Imm]; | ||||
| 		} | ||||
| 		for (int i = 0; i < _Imm; ++i) { | ||||
| 			_A.u8[i] = 0; | ||||
| 		} | ||||
| 	} | ||||
| 	return _A; | ||||
| } | ||||
| 
 | ||||
| inline __m128i _mm_loadl_epi64(__m128i const* mem_addr) { | ||||
| 	__m128i x; | ||||
| 	x.u32[0] = load32((uint8_t*)mem_addr + 0); | ||||
| 	x.u32[1] = load32((uint8_t*)mem_addr + 4); | ||||
| FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { | ||||
| 	rx_vec_f128 x; | ||||
| 	x.lo = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0)); | ||||
| 	x.hi = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4)); | ||||
| 	return x; | ||||
| } | ||||
| 
 | ||||
| #define RANDOMX_DEFAULT_FENV | ||||
| 
 | ||||
| void rx_reset_float_state(); | ||||
| 
 | ||||
| void rx_set_rounding_mode(uint32_t mode); | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| constexpr int RoundToNearest = 0; | ||||
| constexpr int RoundDown = 1; | ||||
| constexpr int RoundUp = 2; | ||||
| constexpr int RoundToZero = 3; | ||||
| 
 | ||||
| inline __m128d load_cvt_i32x2(const void* addr) { | ||||
| 	__m128i ix = _mm_loadl_epi64((const __m128i*)addr); | ||||
| 	return _mm_cvtepi32_pd(ix); | ||||
| } | ||||
| 
 | ||||
| double loadDoublePortable(const void* addr); | ||||
| uint64_t mulh(uint64_t, uint64_t); | ||||
| int64_t smulh(int64_t, int64_t); | ||||
| uint64_t rotl(uint64_t, int); | ||||
| uint64_t rotr(uint64_t, int); | ||||
| void initFpu(); | ||||
| void setRoundMode(uint32_t); | ||||
|  |  | |||
|  | @ -318,38 +318,38 @@ alignas(16) const uint32_t lutDec3[256] = { | |||
| 	0x397101a8, 0x08deb30c, 0xd89ce4b4, 0x6490c156, 0x7b6184cb, 0xd570b632, 0x48745c6c, 0xd04257b8, | ||||
| }; | ||||
| 
 | ||||
| __m128i soft_aesenc(__m128i in, __m128i key) { | ||||
| rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key) { | ||||
| 	uint32_t s0, s1, s2, s3; | ||||
| 
 | ||||
| 	s0 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xff)); | ||||
| 	s1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xaa)); | ||||
| 	s2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55)); | ||||
| 	s3 = _mm_cvtsi128_si32(in); | ||||
| 	s0 = rx_vec_i128_w(in); | ||||
| 	s1 = rx_vec_i128_z(in); | ||||
| 	s2 = rx_vec_i128_y(in); | ||||
| 	s3 = rx_vec_i128_x(in); | ||||
| 
 | ||||
| 	__m128i out = _mm_set_epi32( | ||||
| 	rx_vec_i128 out = rx_set_int_vec_i128( | ||||
| 		(lutEnc0[s0 & 0xff] ^ lutEnc1[(s3 >> 8) & 0xff] ^ lutEnc2[(s2 >> 16) & 0xff] ^ lutEnc3[s1 >> 24]), | ||||
| 		(lutEnc0[s1 & 0xff] ^ lutEnc1[(s0 >> 8) & 0xff] ^ lutEnc2[(s3 >> 16) & 0xff] ^ lutEnc3[s2 >> 24]), | ||||
| 		(lutEnc0[s2 & 0xff] ^ lutEnc1[(s1 >> 8) & 0xff] ^ lutEnc2[(s0 >> 16) & 0xff] ^ lutEnc3[s3 >> 24]), | ||||
| 		(lutEnc0[s3 & 0xff] ^ lutEnc1[(s2 >> 8) & 0xff] ^ lutEnc2[(s1 >> 16) & 0xff] ^ lutEnc3[s0 >> 24]) | ||||
| 	); | ||||
| 
 | ||||
| 	return _mm_xor_si128(out, key); | ||||
| 	return rx_xor_vec_i128(out, key); | ||||
| } | ||||
| 
 | ||||
| __m128i soft_aesdec(__m128i in, __m128i key) { | ||||
| rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key) { | ||||
| 	uint32_t s0, s1, s2, s3; | ||||
| 
 | ||||
| 	s0 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xff)); | ||||
| 	s1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xaa)); | ||||
| 	s2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55)); | ||||
| 	s3 = _mm_cvtsi128_si32(in); | ||||
| 	s0 = rx_vec_i128_w(in); | ||||
| 	s1 = rx_vec_i128_z(in); | ||||
| 	s2 = rx_vec_i128_y(in); | ||||
| 	s3 = rx_vec_i128_x(in); | ||||
| 
 | ||||
| 	__m128i out = _mm_set_epi32( | ||||
| 	rx_vec_i128 out = rx_set_int_vec_i128( | ||||
| 		(lutDec0[s0 & 0xff] ^ lutDec1[(s1 >> 8) & 0xff] ^ lutDec2[(s2 >> 16) & 0xff] ^ lutDec3[s3 >> 24]), | ||||
| 		(lutDec0[s1 & 0xff] ^ lutDec1[(s2 >> 8) & 0xff] ^ lutDec2[(s3 >> 16) & 0xff] ^ lutDec3[s0 >> 24]), | ||||
| 		(lutDec0[s2 & 0xff] ^ lutDec1[(s3 >> 8) & 0xff] ^ lutDec2[(s0 >> 16) & 0xff] ^ lutDec3[s1 >> 24]), | ||||
| 		(lutDec0[s3 & 0xff] ^ lutDec1[(s0 >> 8) & 0xff] ^ lutDec2[(s1 >> 16) & 0xff] ^ lutDec3[s2 >> 24]) | ||||
| 	); | ||||
| 
 | ||||
| 	return _mm_xor_si128(out, key); | ||||
| 	return rx_xor_vec_i128(out, key); | ||||
| } | ||||
|  |  | |||
|  | @ -22,16 +22,16 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>. | |||
| #include <stdint.h> | ||||
| #include "intrin_portable.h" | ||||
| 
 | ||||
| __m128i soft_aesenc(__m128i in, __m128i key); | ||||
| rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key); | ||||
| 
 | ||||
| __m128i soft_aesdec(__m128i in, __m128i key); | ||||
| rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key); | ||||
| 
 | ||||
| template<bool soft> | ||||
| inline __m128i aesenc(__m128i in, __m128i key) { | ||||
| 	return soft ? soft_aesenc(in, key) : _mm_aesenc_si128(in, key); | ||||
| inline rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key) { | ||||
| 	return soft ? soft_aesenc(in, key) : rx_aesenc_vec_i128(in, key); | ||||
| } | ||||
| 
 | ||||
| template<bool soft> | ||||
| inline __m128i aesdec(__m128i in, __m128i key) { | ||||
| 	return soft ? soft_aesdec(in, key) : _mm_aesdec_si128(in, key); | ||||
| inline rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key) { | ||||
| 	return soft ? soft_aesdec(in, key) : rx_aesdec_vec_i128(in, key); | ||||
| } | ||||
|  | @ -32,7 +32,7 @@ randomx_vm::~randomx_vm() { | |||
| } | ||||
| 
 | ||||
| void randomx_vm::resetRoundingMode() { | ||||
| 	initFpu(); | ||||
| 	rx_reset_float_state(); | ||||
| } | ||||
| 
 | ||||
| namespace randomx { | ||||
|  | @ -86,7 +86,7 @@ void randomx_vm::initialize() { | |||
| 
 | ||||
| namespace randomx { | ||||
| 
 | ||||
| 	alignas(16) volatile static __m128i aesDummy; | ||||
| 	alignas(16) volatile static rx_vec_i128 aesDummy; | ||||
| 
 | ||||
| 	template<class Allocator, bool softAes> | ||||
| 	VmBase<Allocator, softAes>::~VmBase() { | ||||
|  | @ -98,9 +98,9 @@ namespace randomx { | |||
| 		if (datasetPtr == nullptr) | ||||
| 			throw std::invalid_argument("Cache/Dataset not set"); | ||||
| 		if (!softAes) { //if hardware AES is not supported, it's better to fail now than to return a ticking bomb
 | ||||
| 			__m128i tmp = _mm_load_si128((const __m128i*)&aesDummy); | ||||
| 			tmp = _mm_aesenc_si128(tmp, tmp); | ||||
| 			_mm_store_si128((__m128i*)&aesDummy, tmp); | ||||
| 			rx_vec_i128 tmp = rx_load_vec_i128((const rx_vec_i128*)&aesDummy); | ||||
| 			tmp = rx_aesenc_vec_i128(tmp, tmp); | ||||
| 			rx_store_vec_i128((rx_vec_i128*)&aesDummy, tmp); | ||||
| 		} | ||||
| 		scratchpad = (uint8_t*)Allocator::allocMemory(ScratchpadSize); | ||||
| 	} | ||||
|  |  | |||
|  | @ -46,7 +46,7 @@ namespace randomx { | |||
| 	} | ||||
| 
 | ||||
| 	template<class Allocator, bool softAes> | ||||
| 	void InterpretedVm<Allocator, softAes>::executeBytecode(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) { | ||||
| 	void InterpretedVm<Allocator, softAes>::executeBytecode(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) { | ||||
| 		for (int pc = 0; pc < RANDOMX_PROGRAM_SIZE; ++pc) { | ||||
| 			executeBytecode(pc, r, f, e, a); | ||||
| 		} | ||||
|  | @ -59,16 +59,16 @@ namespace randomx { | |||
| 	} | ||||
| 
 | ||||
| 	template<class Allocator, bool softAes> | ||||
| 	FORCE_INLINE __m128d InterpretedVm<Allocator, softAes>::maskRegisterExponentMantissa(__m128d x) { | ||||
| 		const __m128d xmantissaMask = _mm_castsi128_pd(_mm_set_epi64x(dynamicMantissaMask, dynamicMantissaMask)); | ||||
| 		const __m128d xexponentMask = _mm_load_pd((const double*)&config.eMask); | ||||
| 		x = _mm_and_pd(x, xmantissaMask); | ||||
| 		x = _mm_or_pd(x, xexponentMask); | ||||
| 	FORCE_INLINE rx_vec_f128 InterpretedVm<Allocator, softAes>::maskRegisterExponentMantissa(rx_vec_f128 x) { | ||||
| 		const rx_vec_f128 xmantissaMask = rx_set_vec_f128(dynamicMantissaMask, dynamicMantissaMask); | ||||
| 		const rx_vec_f128 xexponentMask = rx_load_vec_f128((const double*)&config.eMask); | ||||
| 		x = rx_and_vec_f128(x, xmantissaMask); | ||||
| 		x = rx_or_vec_f128(x, xexponentMask); | ||||
| 		return x; | ||||
| 	} | ||||
| 
 | ||||
| 	template<class Allocator, bool softAes> | ||||
| 	void InterpretedVm<Allocator, softAes>::executeBytecode(int& pc, int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) { | ||||
| 	void InterpretedVm<Allocator, softAes>::executeBytecode(int& pc, int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) { | ||||
| 		auto& ibc = byteCode[pc]; | ||||
| 		switch (ibc.type) | ||||
| 		{ | ||||
|  | @ -139,43 +139,43 @@ namespace randomx { | |||
| 			} break; | ||||
| 
 | ||||
| 			case InstructionType::FSWAP_R: { | ||||
| 				*ibc.fdst = _mm_shuffle_pd(*ibc.fdst, *ibc.fdst, 1); | ||||
| 				*ibc.fdst = rx_shuffle_vec_f128(*ibc.fdst, *ibc.fdst, 1); | ||||
| 			} break; | ||||
| 
 | ||||
| 			case InstructionType::FADD_R: { | ||||
| 				*ibc.fdst = _mm_add_pd(*ibc.fdst, *ibc.fsrc); | ||||
| 				*ibc.fdst = rx_add_vec_f128(*ibc.fdst, *ibc.fsrc); | ||||
| 			} break; | ||||
| 
 | ||||
| 			case InstructionType::FADD_M: { | ||||
| 				__m128d fsrc = load_cvt_i32x2(getScratchpadAddress(ibc)); | ||||
| 				*ibc.fdst = _mm_add_pd(*ibc.fdst, fsrc); | ||||
| 				rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc)); | ||||
| 				*ibc.fdst = rx_add_vec_f128(*ibc.fdst, fsrc); | ||||
| 			} break; | ||||
| 
 | ||||
| 			case InstructionType::FSUB_R: { | ||||
| 				*ibc.fdst = _mm_sub_pd(*ibc.fdst, *ibc.fsrc); | ||||
| 				*ibc.fdst = rx_sub_vec_f128(*ibc.fdst, *ibc.fsrc); | ||||
| 			} break; | ||||
| 
 | ||||
| 			case InstructionType::FSUB_M: { | ||||
| 				__m128d fsrc = load_cvt_i32x2(getScratchpadAddress(ibc)); | ||||
| 				*ibc.fdst = _mm_sub_pd(*ibc.fdst, fsrc); | ||||
| 				rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc)); | ||||
| 				*ibc.fdst = rx_sub_vec_f128(*ibc.fdst, fsrc); | ||||
| 			} break; | ||||
| 
 | ||||
| 			case InstructionType::FSCAL_R: { | ||||
| 				const __m128d mask = _mm_castsi128_pd(_mm_set1_epi64x(0x81F0000000000000)); | ||||
| 				*ibc.fdst = _mm_xor_pd(*ibc.fdst, mask); | ||||
| 				const rx_vec_f128 mask = rx_set1_vec_f128(0x81F0000000000000); | ||||
| 				*ibc.fdst = rx_xor_vec_f128(*ibc.fdst, mask); | ||||
| 			} break; | ||||
| 
 | ||||
| 			case InstructionType::FMUL_R: { | ||||
| 				*ibc.fdst = _mm_mul_pd(*ibc.fdst, *ibc.fsrc); | ||||
| 				*ibc.fdst = rx_mul_vec_f128(*ibc.fdst, *ibc.fsrc); | ||||
| 			} break; | ||||
| 
 | ||||
| 			case InstructionType::FDIV_M: { | ||||
| 				__m128d fsrc = maskRegisterExponentMantissa(load_cvt_i32x2(getScratchpadAddress(ibc))); | ||||
| 				*ibc.fdst = _mm_div_pd(*ibc.fdst, fsrc); | ||||
| 				rx_vec_f128 fsrc = maskRegisterExponentMantissa(rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc))); | ||||
| 				*ibc.fdst = rx_div_vec_f128(*ibc.fdst, fsrc); | ||||
| 			} break; | ||||
| 
 | ||||
| 			case InstructionType::FSQRT_R: { | ||||
| 				*ibc.fdst = _mm_sqrt_pd(*ibc.fdst); | ||||
| 				*ibc.fdst = rx_sqrt_vec_f128(*ibc.fdst); | ||||
| 			} break; | ||||
| 
 | ||||
| 			case InstructionType::CBRANCH: { | ||||
|  | @ -186,7 +186,7 @@ namespace randomx { | |||
| 			} break; | ||||
| 
 | ||||
| 			case InstructionType::CFROUND: { | ||||
| 				setRoundMode(rotr(*ibc.isrc, ibc.imm) % 4); | ||||
| 				rx_set_rounding_mode(rotr(*ibc.isrc, ibc.imm) % 4); | ||||
| 			} break; | ||||
| 
 | ||||
| 			case InstructionType::ISTORE: { | ||||
|  | @ -205,12 +205,12 @@ namespace randomx { | |||
| 	template<class Allocator, bool softAes> | ||||
| 	void InterpretedVm<Allocator, softAes>::execute() { | ||||
| 		int_reg_t r[RegistersCount] = { 0 }; | ||||
| 		__m128d f[RegisterCountFlt]; | ||||
| 		__m128d e[RegisterCountFlt]; | ||||
| 		__m128d a[RegisterCountFlt]; | ||||
| 		rx_vec_f128 f[RegisterCountFlt]; | ||||
| 		rx_vec_f128 e[RegisterCountFlt]; | ||||
| 		rx_vec_f128 a[RegisterCountFlt]; | ||||
| 
 | ||||
| 		for(unsigned i = 0; i < RegisterCountFlt; ++i) | ||||
| 			a[i] = _mm_load_pd(&reg.a[i].lo); | ||||
| 			a[i] = rx_load_vec_f128(&reg.a[i].lo); | ||||
| 
 | ||||
| 		precompileProgram(r, f, e, a); | ||||
| 
 | ||||
|  | @ -228,10 +228,10 @@ namespace randomx { | |||
| 				r[i] ^= load64(scratchpad + spAddr0 + 8 * i); | ||||
| 
 | ||||
| 			for (unsigned i = 0; i < RegisterCountFlt; ++i) | ||||
| 				f[i] = load_cvt_i32x2(scratchpad + spAddr1 + 8 * i); | ||||
| 				f[i] = rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * i); | ||||
| 
 | ||||
| 			for (unsigned i = 0; i < RegisterCountFlt; ++i) | ||||
| 				e[i] = maskRegisterExponentMantissa(load_cvt_i32x2(scratchpad + spAddr1 + 8 * (RegisterCountFlt + i))); | ||||
| 				e[i] = maskRegisterExponentMantissa(rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * (RegisterCountFlt + i))); | ||||
| 
 | ||||
| 			executeBytecode(r, f, e, a); | ||||
| 
 | ||||
|  | @ -244,10 +244,10 @@ namespace randomx { | |||
| 				store64(scratchpad + spAddr1 + 8 * i, r[i]); | ||||
| 
 | ||||
| 			for (unsigned i = 0; i < RegisterCountFlt; ++i) | ||||
| 				f[i] = _mm_xor_pd(f[i], e[i]); | ||||
| 				f[i] = rx_xor_vec_f128(f[i], e[i]); | ||||
| 
 | ||||
| 			for (unsigned i = 0; i < RegisterCountFlt; ++i) | ||||
| 				_mm_store_pd((double*)(scratchpad + spAddr0 + 16 * i), f[i]); | ||||
| 				rx_store_vec_f128((double*)(scratchpad + spAddr0 + 16 * i), f[i]); | ||||
| 
 | ||||
| 			spAddr0 = 0; | ||||
| 			spAddr1 = 0; | ||||
|  | @ -257,10 +257,10 @@ namespace randomx { | |||
| 			store64(&reg.r[i], r[i]); | ||||
| 
 | ||||
| 		for (unsigned i = 0; i < RegisterCountFlt; ++i) | ||||
| 			_mm_store_pd(&reg.f[i].lo, f[i]); | ||||
| 			rx_store_vec_f128(&reg.f[i].lo, f[i]); | ||||
| 
 | ||||
| 		for (unsigned i = 0; i < RegisterCountFlt; ++i) | ||||
| 			_mm_store_pd(&reg.e[i].lo, e[i]); | ||||
| 			rx_store_vec_f128(&reg.e[i].lo, e[i]); | ||||
| 	} | ||||
| 
 | ||||
| 	template<class Allocator, bool softAes> | ||||
|  | @ -273,7 +273,7 @@ namespace randomx { | |||
| #include "instruction_weights.hpp" | ||||
| 
 | ||||
| 	template<class Allocator, bool softAes> | ||||
| 	void InterpretedVm<Allocator, softAes>::precompileProgram(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) { | ||||
| 	void InterpretedVm<Allocator, softAes>::precompileProgram(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) { | ||||
| 		RegisterUsage registerUsage[RegistersCount]; | ||||
| 		for (unsigned i = 0; i < RegistersCount; ++i) { | ||||
| 			registerUsage[i].lastUsed = -1; | ||||
|  |  | |||
|  | @ -31,11 +31,11 @@ namespace randomx { | |||
| 	struct InstructionByteCode { | ||||
| 		union { | ||||
| 			int_reg_t* idst; | ||||
| 			__m128d* fdst; | ||||
| 			rx_vec_f128* fdst; | ||||
| 		}; | ||||
| 		union { | ||||
| 			int_reg_t* isrc; | ||||
| 			__m128d* fsrc; | ||||
| 			rx_vec_f128* fsrc; | ||||
| 		}; | ||||
| 		union { | ||||
| 			uint64_t imm; | ||||
|  | @ -74,11 +74,11 @@ namespace randomx { | |||
| 		virtual void datasetRead(uint32_t blockNumber, int_reg_t(&r)[RegistersCount]); | ||||
| 	private: | ||||
| 		void execute(); | ||||
| 		void precompileProgram(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]); | ||||
| 		void executeBytecode(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]); | ||||
| 		void executeBytecode(int& i, int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]); | ||||
| 		void precompileProgram(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]); | ||||
| 		void executeBytecode(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]); | ||||
| 		void executeBytecode(int& i, int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]); | ||||
| 		void* getScratchpadAddress(InstructionByteCode& ibc); | ||||
| 		__m128d maskRegisterExponentMantissa(__m128d); | ||||
| 		rx_vec_f128 maskRegisterExponentMantissa(rx_vec_f128); | ||||
| 
 | ||||
| 		InstructionByteCode byteCode[RANDOMX_PROGRAM_SIZE]; | ||||
| 	}; | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue