Mirror of https://git.wownero.com/wownero/RandomWOW.git
	POWER7+ VSX support plus AES hardware support for POWER8 and newer. (#41)
parent 8ff1bf027a
commit 776723dd40
2 changed files with 226 additions and 0 deletions
							
								
								
									
makefile | 8
@@ -22,6 +22,14 @@ ifeq ($(PLATFORM),x86_64)
    CXXFLAGS += -maes
endif

ifeq ($(PLATFORM),ppc64)
    CXXFLAGS += -mcpu=native
endif

ifeq ($(PLATFORM),ppc64le)
    CXXFLAGS += -mcpu=native
endif

release: CXXFLAGS += -O3 -flto
release: CCFLAGS += -O3 -flto
release: LDFLAGS += -flto
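For ppc64 and ppc64le the makefile now relies on -mcpu=native: with GCC (and typically Clang) this predefines __ALTIVEC__ and __VSX__ on POWER7 and newer, and additionally __CRYPTO__ on POWER8 and newer, which are exactly the macros the header changes below test for. As a rough, hypothetical sanity check only (not part of this commit), a build could assert that the flags took effect:

// Hypothetical compile-time check, not part of this commit; assumes the
// GCC/Clang predefined macros __PPC64__, __ALTIVEC__, __VSX__ and __CRYPTO__.
#if defined(__PPC64__)
  #if !defined(__ALTIVEC__) || !defined(__VSX__)
    #error "POWER7 or newer (AltiVec + VSX) is required for the SIMD path"
  #endif
  #if !defined(__CRYPTO__)
    // Without the POWER8 crypto facility, rx_aesenc_vec_i128/rx_aesdec_vec_i128
    // compile to stubs that throw at runtime.
  #endif
#endif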
@@ -160,7 +160,225 @@ FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) {
	_mm_setcsr(rx_mxcsr_default | (mode << 13));
}

#elif defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__) //sadly, only POWER7 and newer can use SIMD acceleration; earlier processors can't handle doubles or 64-bit integers in vector registers

#include <cstdint>
#include <stdexcept>
#include <cstdlib>
#include <altivec.h>
#undef vector
#undef pixel
#undef bool

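/* SSE-style type names mapped onto VSX vector types so the helpers below can
   mirror the x86 signatures: __m128i holds 16 bytes, __m128l/__m128li hold
   four 32-bit lanes, __m128ll two 64-bit lanes, and __m128d two doubles. */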
typedef __vector uint8_t  __m128i;
typedef __vector uint32_t __m128l;
typedef __vector int      __m128li;
typedef __vector uint64_t __m128ll;
typedef __vector double   __m128d;

typedef __m128i rx_vec_i128;
typedef __m128d rx_vec_f128;
typedef union {
	rx_vec_i128 i;
	rx_vec_f128 d;
	uint64_t u64[2];
	double   d64[2];
	uint32_t u32[4];
	int      i32[4];
} vec_u;

#define rx_aligned_alloc(a, b) malloc(a)
#define rx_aligned_free(a) free(a)
#define rx_prefetch_nta(x)


/* Splat 64-bit long long to 2 64-bit long longs */
FORCE_INLINE __m128i vec_splat2sd (int64_t scalar)
{ return (__m128i) vec_splats (scalar); }

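/* On little-endian builds the load/store helpers below use a single unaligned
   VSX access (vec_vsx_ld / vec_vsx_st); on big-endian builds they move the
   data element by element through load64/store64 so the in-memory byte layout
   stays the same as on x86. */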
FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
#if defined(NATIVE_LITTLE_ENDIAN)
	return (rx_vec_f128)vec_vsx_ld(0,pd);
#else
	vec_u t;
	t.u64[0] = load64(pd + 0);
	t.u64[1] = load64(pd + 1);
	return (rx_vec_f128)t.d;
#endif
}

FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) {
#if defined(NATIVE_LITTLE_ENDIAN)
	vec_vsx_st(a,0,(rx_vec_f128*)mem_addr);
#else
	vec_u _a;
	_a.d = a;
	store64(mem_addr + 0, _a.u64[0]);
	store64(mem_addr + 1, _a.u64[1]);
#endif
}

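/* Swap the two 64-bit lanes: the permute pattern writes source bytes 8..15
   into the first half of the result and bytes 0..7 into the second half. */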
FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
	return (rx_vec_f128)vec_perm((__m128i)a,(__m128i)a,(__m128i){8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7});
}

FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
	return (rx_vec_f128)vec_add(a,b);
}

FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
	return (rx_vec_f128)vec_sub(a,b);
}

FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
	return (rx_vec_f128)vec_mul(a,b);
}

FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
	return (rx_vec_f128)vec_div(a,b);
}

FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) {
	return (rx_vec_f128)vec_sqrt(a);
}

FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) {
	return (rx_vec_i128)vec_splat2sd(a);
}

FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) {
	return (rx_vec_f128)a;
}

FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
	return (rx_vec_f128)(__m128ll){x0,x1};
}

FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
	return (rx_vec_f128)vec_splat2sd(x);
}

FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
	return (rx_vec_f128)vec_xor(a,b);
}

FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
	return (rx_vec_f128)vec_and(a,b);
}

FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
	return (rx_vec_f128)vec_or(a,b);
}
#if defined(__CRYPTO__)

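/* The vcipher/vncipher built-ins interpret the 128-bit AES state with a
   different byte order than the x86 AES-NI path, so vrev() reorders the bytes
   of each block on the way into and out of the hardware AES instructions. */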
FORCE_INLINE __m128ll vrev(__m128i v){
#if defined(NATIVE_LITTLE_ENDIAN)
	return (__m128ll)vec_perm((__m128i)v,(__m128i){0},(__m128i){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0});
#else
	return (__m128ll)vec_perm((__m128i)v,(__m128i){0},(__m128i){3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12});
#endif
}

FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
	__m128ll _v = vrev(v);
	__m128ll _rkey = vrev(rkey);
	__m128ll result = vrev((__m128i)__builtin_crypto_vcipher(_v,_rkey));
	return (rx_vec_i128)result;
}

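/* AES-NI's aesdec XORs the round key after InvMixColumns, while vncipher
   applies its key before that step; running vncipher with an all-zero key and
   XORing rkey afterwards reproduces the aesdec ordering. */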
FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
	__m128ll _v = vrev(v);
	__m128ll zero = (__m128ll){0};
	__m128ll out = vrev((__m128i)__builtin_crypto_vncipher(_v,zero));
	return (rx_vec_i128)vec_xor((__m128i)out,rkey);
}
#else
static const char* platformError = "Platform doesn't support hardware AES";

FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
	throw std::runtime_error(platformError);
}

FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
	throw std::runtime_error(platformError);
}
#endif


FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
	vec_u _a;
	_a.i = a;
	return _a.i32[0];
}

FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
	vec_u _a;
	_a.i = a;
	return _a.i32[1];
}

FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
	vec_u _a;
	_a.i = a;
	return _a.i32[2];
}

FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
	vec_u _a;
	_a.i = a;
	return _a.i32[3];
}

FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
	return (rx_vec_i128)((__m128li){_I0,_I1,_I2,_I3});
}

FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 _A, rx_vec_i128 _B) {
	return (rx_vec_i128)vec_xor(_A,_B);
}

FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const *_P) {
#if defined(NATIVE_LITTLE_ENDIAN)
	return *_P;
#else
	uint32_t* ptr = (uint32_t*)_P;
	vec_u c;
	c.u32[0] = load32(ptr + 0);
	c.u32[1] = load32(ptr + 1);
	c.u32[2] = load32(ptr + 2);
	c.u32[3] = load32(ptr + 3);
	return (rx_vec_i128)c.i;
#endif
}

FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *_P, rx_vec_i128 _B) {
#if defined(NATIVE_LITTLE_ENDIAN)
	*_P = _B;
#else
	uint32_t* ptr = (uint32_t*)_P;
	vec_u B;
	B.i = _B;
	store32(ptr + 0, B.u32[0]);
	store32(ptr + 1, B.u32[1]);
	store32(ptr + 2, B.u32[2]);
	store32(ptr + 3, B.u32[3]);
#endif
}

FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
	vec_u x;
	x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
	x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
	return (rx_vec_f128)x.d;
}

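/* No MXCSR-style control register to poke here: RANDOMX_DEFAULT_FENV selects
   the generic fenv-based fallback, so the two rounding-mode functions are only
   declared below rather than defined inline. */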
#define RANDOMX_DEFAULT_FENV

void rx_reset_float_state();

void rx_set_rounding_mode(uint32_t mode);

#else //end altivec

#include <cstdint>
#include <stdexcept>
#include <cstdlib>
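rx_set_rounding_mode has no inline body in the POWER section because RANDOMX_DEFAULT_FENV hands rounding control to the C floating-point environment. As a minimal sketch only — the real implementation is outside this diff — a fenv-based fallback consistent with the MXCSR rounding-control encoding used in the x86 path above (0 = nearest, 1 = down, 2 = up, 3 = toward zero) could look like this:

#include <cfenv>
#include <cstdint>

// Sketch: map the 2-bit RandomX rounding mode onto the C fenv rounding modes.
void rx_set_rounding_mode(uint32_t mode) {
	switch (mode & 3) {
	case 0: std::fesetround(FE_TONEAREST);  break; // round to nearest (default)
	case 1: std::fesetround(FE_DOWNWARD);   break; // round toward -infinity
	case 2: std::fesetround(FE_UPWARD);     break; // round toward +infinity
	case 3: std::fesetround(FE_TOWARDZERO); break; // round toward zero
	}
}

// Sketch: restore the default rounding mode.
void rx_reset_float_state() {
	std::fesetround(FE_TONEAREST);
}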