POWER7+ VSX support plus AES hardware support for POWER8 and newer. (#41)

2024-08-15 00:23:14 +00:00 · 2019-06-22 09:54:02 -04:00 · 2019-06-22 09:54:02 -04:00 · 776723dd40
commit 776723dd40
parent 8ff1bf027a
2 changed files with 226 additions and 0 deletions
--- a/8
+++ b/8
@ -22,6 +22,14 @@ ifeq ($(PLATFORM),x86_64)
    CXXFLAGS += -maes
 endif
 ifeq ($(PLATFORM),ppc64)
    CXXFLAGS += -mcpu=native
 endif
 ifeq ($(PLATFORM),ppc64le)
    CXXFLAGS += -mcpu=native
 endif
 release: CXXFLAGS += -O3 -flto
 release: CCFLAGS += -O3 -flto
 release: LDFLAGS += -flto
--- a/src/intrin_portable.h
+++ b/src/intrin_portable.h
@ -160,7 +160,225 @@ FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) {
 	_mm_setcsr(rx_mxcsr_default | (mode << 13));
 }
 #elif defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__) //sadly only POWER7 and newer will be able to use SIMD acceleration. Earlier processors cant use doubles or 64 bit integers with SIMD
 #include <cstdint>
 #include <stdexcept>
 #include <cstdlib>
 #include<altivec.h>
 #undef vector
 #undef pixel
 #undef bool
 typedef __vector uint8_t __m128i;
 typedef __vector uint32_t __m128l;
 typedef __vector int      __m128li;
 typedef __vector uint64_t __m128ll;
 typedef __vector double __m128d;
 typedef __m128i rx_vec_i128;
 typedef __m128d rx_vec_f128;
 typedef union{
 	rx_vec_i128 i;
  rx_vec_f128 d;
  uint64_t u64[2];
  double   d64[2];
  uint32_t u32[4];
 	int i32[4];
 } vec_u;
 #define rx_aligned_alloc(a, b) malloc(a)
 #define rx_aligned_free(a) free(a)
 #define rx_prefetch_nta(x)
 /* Splat 64-bit long long to 2 64-bit long longs */
 FORCE_INLINE __m128i vec_splat2sd (int64_t scalar)
 { return (__m128i) vec_splats (scalar); }
 FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
 #if defined(NATIVE_LITTLE_ENDIAN)
 	return (rx_vec_f128)vec_vsx_ld(0,pd);
 #else
 	vec_u t;
 	t.u64[0] = load64(pd + 0);
 	t.u64[1] = load64(pd + 1);
 	return (rx_vec_f128)t.d;
 #endif
 }
 FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) {
 #if defined(NATIVE_LITTLE_ENDIAN)
 	vec_vsx_st(a,0,(rx_vec_f128*)mem_addr);
 #else
 	vec_u _a;
 	_a.d = a;
 	store64(mem_addr + 0, _a.u64[0]);
 	store64(mem_addr + 1, _a.u64[1]);
 #endif
 }
 FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
 	return (rx_vec_f128)vec_perm((__m128i)a,(__m128i)a,(__m128i){8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7});
 }
 FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
 	return (rx_vec_f128)vec_add(a,b);
 }
 FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
 	return (rx_vec_f128)vec_sub(a,b);
 }
 FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
 	return (rx_vec_f128)vec_mul(a,b);
 }
 FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
 	return (rx_vec_f128)vec_div(a,b);
 }
 FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) {
 	return (rx_vec_f128)vec_sqrt(a);
 }
 FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) {
 	return (rx_vec_i128)vec_splat2sd(a);
 }
 FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) {
 	return (rx_vec_f128)a;
 }
 FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
 	return (rx_vec_f128)(__m128ll){x0,x1};
 }
 FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
 	return (rx_vec_f128)vec_splat2sd(x);
 }
 FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
 	return (rx_vec_f128)vec_xor(a,b);
 }
 FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
 	return (rx_vec_f128)vec_and(a,b);
 }
 FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
 	return (rx_vec_f128)vec_or(a,b);
 }
 #if defined(__CRYPTO__)
 FORCE_INLINE __m128ll vrev(__m128i v){
 #if defined(NATIVE_LITTLE_ENDIAN)
 	return (__m128ll)vec_perm((__m128i)v,(__m128i){0},(__m128i){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0});
 #else
 	return (__m128ll)vec_perm((__m128i)v,(__m128i){0},(__m128i){3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12});
 #endif
 }
 FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
 	__m128ll _v = vrev(v);
 	__m128ll _rkey = vrev(rkey);
 	__m128ll result = vrev((__m128i)__builtin_crypto_vcipher(_v,_rkey));
 	return (rx_vec_i128)result;
 }
 FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
 	__m128ll _v = vrev(v);
 	__m128ll zero = (__m128ll){0};
 	__m128ll out = vrev((__m128i)__builtin_crypto_vncipher(_v,zero));
 	return (rx_vec_i128)vec_xor((__m128i)out,rkey);
 }
 #else
 static const char* platformError = "Platform doesn't support hardware AES";
 FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
 	throw std::runtime_error(platformError);
 }
 FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
 	throw std::runtime_error(platformError);
 }
 #endif
 FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
 	vec_u _a;
 	_a.i = a;
  return _a.i32[0];
 }
 FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
 	vec_u _a;
 	_a.i = a;
 	return _a.i32[1];
 }
 FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
 	vec_u _a;
 	_a.i = a;
 	return _a.i32[2];
 }
 FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
 	vec_u _a;
 	_a.i = a;
 	return _a.i32[3];
 }
 FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
 	return (rx_vec_i128)((__m128li){_I0,_I1,_I2,_I3});
 };
 FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 _A, rx_vec_i128 _B) {
 	return (rx_vec_i128)vec_xor(_A,_B);
 }
 FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const *_P) {
 #if defined(NATIVE_LITTLE_ENDIAN)
 	return *_P;
 #else
 	uint32_t* ptr = (uint32_t*)_P;
 	vec_u c;
 	c.u32[0] = load32(ptr + 0);
 	c.u32[1] = load32(ptr + 1);
 	c.u32[2] = load32(ptr + 2);
 	c.u32[3] = load32(ptr + 3);
 	return (rx_vec_i128)c.i;
 #endif
 }
 FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *_P, rx_vec_i128 _B) {
 #if defined(NATIVE_LITTLE_ENDIAN)
 	*_P = _B;
 #else
 	uint32_t* ptr = (uint32_t*)_P;
 	vec_u B;
 	B.i = _B;
 	store32(ptr + 0, B.u32[0]);
 	store32(ptr + 1, B.u32[1]);
 	store32(ptr + 2, B.u32[2]);
 	store32(ptr + 3, B.u32[3]);
 #endif
 }
 FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
 	vec_u x;
 	x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
 	x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
 	return (rx_vec_f128)x.d;
 }
 #define RANDOMX_DEFAULT_FENV
 void rx_reset_float_state();
 void rx_set_rounding_mode(uint32_t mode);
 #else //end altivec
 #include <cstdint>
 #include <stdexcept>
 #include <cstdlib>