RandomWOW/src/intrinPortable.h

/*
Copyright (c) 2018 tevador

This file is part of RandomX.

RandomX is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

RandomX is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with RandomX.  If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include <cstdint>

// Reinterprets a 32-bit unsigned value as a two's complement signed value.
// When the host itself is two's complement (~0 == -1) a plain cast is used;
// otherwise the negative range is reconstructed arithmetically.
constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) {
	return (~0 == -1)
		? static_cast<int32_t>(x)
		: ((x <= INT32_MAX)
			? static_cast<int32_t>(x)
			: (-static_cast<int32_t>(UINT32_MAX - x) - 1));
}

// Reinterprets a 64-bit unsigned value as a two's complement signed value.
// When the host itself is two's complement (~0 == -1) a plain cast is used;
// otherwise the negative range is reconstructed arithmetically.
constexpr int64_t unsigned64ToSigned2sCompl(uint64_t x) {
	return (~0 == -1)
		? static_cast<int64_t>(x)
		: ((x <= INT64_MAX)
			? static_cast<int64_t>(x)
			: (-static_cast<int64_t>(UINT64_MAX - x) - 1));
}

// Sign-extends a 32-bit two's complement bit pattern to 64 bits.
// On a two's complement host this is a chain of casts; otherwise the upper
// 32 bits are filled with ones whenever bit 31 of x is set.
constexpr uint64_t signExtend2sCompl(uint32_t x) {
	return (~0 == -1)
		? static_cast<uint64_t>(static_cast<int64_t>(static_cast<int32_t>(x)))
		: ((x <= INT32_MAX)
			? static_cast<uint64_t>(x)
			: (x | 0xffffffff00000000ULL));
}

// Under MSVC, define __SSE2__ ourselves when targeting x64 or 32-bit x86 with
// _M_IX86_FP == 2, so that the single "#ifdef __SSE2__" check below selects the
// hardware intrinsics for every supported compiler.
#if defined(_MSC_VER)
#if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)
#define __SSE2__ 1
#endif
#endif

#ifdef __SSE2__
#ifdef __GNUC__
#include <x86intrin.h>
#else
#include <intrin.h>
#endif

// Returns the absolute value of both packed doubles by clearing the sign bit
// (bit 63) of each 64-bit lane.
inline __m128d _mm_abs(__m128d xd) {
	// Mask with every bit set except the sign bit. Spelled as a literal instead of
	// ~(1LL << 63), which shifts a one into the sign bit of a signed type.
	const __m128d absmask = _mm_castsi128_pd(_mm_set1_epi64x(0x7fffffffffffffffLL));
	return _mm_and_pd(xd, absmask);
}

// Issues a prefetch for address x with the non-temporal hint (_MM_HINT_NTA).
#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)

#else
#include <cstdint>
#include <stdexcept>
#include <cstdlib>
#include <cmath>
#include "blake2/endian.h"

// Fallbacks used when the SSE2 intrinsics headers are not available.
// NOTE: the second argument of _mm_malloc (b) is discarded, so the returned
// memory is only as aligned as plain malloc guarantees.
#define _mm_malloc(a,b) malloc(a)
#define _mm_free(a) free(a)
// Prefetching expands to nothing in the portable build.
#define PREFETCHNTA(x)

// Portable stand-in for the 128-bit integer vector type: the same 16 bytes
// viewed as 2 x 64-bit, 4 x 32-bit, 8 x 16-bit or 16 x 8-bit unsigned lanes.
typedef union {
	uint64_t u64[2];
	uint32_t u32[4];
	uint16_t u16[8];
	uint8_t u8[16];
} __m128i;

// Portable stand-in for the packed-double vector type.
typedef union {
	// Named access to the two double lanes (lo = lane 0, hi = lane 1).
	struct {
		double lo;
		double hi;
	};
	// Integer view of the same storage, used by the bitwise operations and by
	// the load64/store64 based loads and stores below.
	__m128i i;
} __m128d;

// Loads two consecutive doubles starting at pd. Each one is read through
// load64 and its 64-bit pattern is stored into the matching integer lane.
inline __m128d _mm_load_pd(const double* pd) {
	__m128d loaded;
	for (int lane = 0; lane < 2; ++lane) {
		loaded.i.u64[lane] = load64(pd + lane);
	}
	return loaded;
}

// Stores both lanes of a to two consecutive doubles starting at mem_addr.
// Each lane's 64-bit pattern is written through store64.
inline void _mm_store_pd(double* mem_addr, __m128d a) {
	for (int lane = 0; lane < 2; ++lane) {
		store64(mem_addr + lane, a.i.u64[lane]);
	}
}

// Builds a vector whose low lane is picked from a and whose high lane is picked
// from b. Bit 0 of imm8 selects a.hi over a.lo; bit 1 selects b.hi over b.lo.
inline __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm8) {
	const bool highOfA = (imm8 & 1) != 0;
	const bool highOfB = (imm8 & 2) != 0;
	__m128d picked;
	picked.lo = highOfA ? a.hi : a.lo;
	picked.hi = highOfB ? b.hi : b.lo;
	return picked;
}

// Lane-wise addition: returns { a.lo + b.lo, a.hi + b.hi }.
inline __m128d _mm_add_pd(__m128d a, __m128d b) {
	// a is a by-value copy, so it doubles as the accumulator.
	a.lo += b.lo;
	a.hi += b.hi;
	return a;
}

// Lane-wise subtraction: returns { a.lo - b.lo, a.hi - b.hi }.
inline __m128d _mm_sub_pd(__m128d a, __m128d b) {
	// a is a by-value copy, so it doubles as the accumulator.
	a.lo -= b.lo;
	a.hi -= b.hi;
	return a;
}

// Lane-wise multiplication: returns { a.lo * b.lo, a.hi * b.hi }.
inline __m128d _mm_mul_pd(__m128d a, __m128d b) {
	// a is a by-value copy, so it doubles as the accumulator.
	a.lo *= b.lo;
	a.hi *= b.hi;
	return a;
}

// Lane-wise division: returns { a.lo / b.lo, a.hi / b.hi }.
inline __m128d _mm_div_pd(__m128d a, __m128d b) {
	// a is a by-value copy, so it doubles as the accumulator.
	a.lo /= b.lo;
	a.hi /= b.hi;
	return a;
}

// Lane-wise square root: returns { sqrt(a.lo), sqrt(a.hi) }.
inline __m128d _mm_sqrt_pd(__m128d a) {
	__m128d x;
	// Qualified with std:: because <cmath> only guarantees the std:: overloads;
	// this also matches the std::fabs calls in _mm_abs.
	x.lo = std::sqrt(a.lo);
	x.hi = std::sqrt(a.hi);
	return x;
}

// Broadcasts the 64-bit value a into both 64-bit lanes.
inline __m128i _mm_set1_epi64x(uint64_t a) {
	__m128i broadcast;
	for (uint64_t& lane : broadcast.u64) {
		lane = a;
	}
	return broadcast;
}

// Reinterprets the 128 bits of an integer vector as a packed-double vector.
// No value conversion takes place; the bits are copied into the integer view.
inline __m128d _mm_castsi128_pd(__m128i a) {
	__m128d reinterpreted;
	reinterpreted.i = a;
	return reinterpreted;
}

// Lane-wise absolute value: returns { |xd.lo|, |xd.hi| }.
inline __m128d _mm_abs(__m128d xd) {
	__m128d magnitude;
	magnitude.lo = std::fabs(xd.lo);
	magnitude.hi = std::fabs(xd.hi);
	return magnitude;
}

// Bitwise XOR of two packed-double vectors, done on their 64-bit integer views.
inline __m128d _mm_xor_pd(__m128d a, __m128d b) {
	__m128d mixed;
	for (int lane = 0; lane < 2; ++lane) {
		mixed.i.u64[lane] = a.i.u64[lane] ^ b.i.u64[lane];
	}
	return mixed;
}

// Packs two doubles into a vector. The arguments are given high lane first:
// e1 becomes the high lane and e0 the low lane.
inline __m128d _mm_set_pd(double e1, double e0) {
	__m128d packed;
	packed.hi = e1;
	packed.lo = e0;
	return packed;
}

// Lane-wise maximum. A lane of a is chosen only when it compares strictly
// greater than the matching lane of b; in every other case (equal values, or a
// comparison that is false because of NaN) the lane of b is returned.
inline __m128d _mm_max_pd(__m128d a, __m128d b) {
	__m128d larger;
	if (a.lo > b.lo) {
		larger.lo = a.lo;
	}
	else {
		larger.lo = b.lo;
	}
	if (a.hi > b.hi) {
		larger.hi = a.hi;
	}
	else {
		larger.hi = b.hi;
	}
	return larger;
}

// Converts the two lowest 32-bit lanes of a, interpreted as signed two's
// complement integers, to doubles. u32[0] becomes lo and u32[1] becomes hi.
inline __m128d _mm_cvtepi32_pd(__m128i a) {
	const int32_t lowInt = unsigned32ToSigned2sCompl(a.u32[0]);
	const int32_t highInt = unsigned32ToSigned2sCompl(a.u32[1]);
	__m128d converted;
	converted.lo = static_cast<double>(lowInt);
	converted.hi = static_cast<double>(highInt);
	return converted;
}

// Message carried by the std::runtime_error that the AES stubs below throw.
static const char* platformError = "Platform doesn't support hardware AES";

// Stub for the AES key generation intrinsic: the portable build has no
// implementation, so every call throws std::runtime_error.
inline __m128i _mm_aeskeygenassist_si128(__m128i key, uint8_t rcon) {
	throw std::runtime_error{ platformError };
}

// Stub for the AES encryption round intrinsic: the portable build has no
// implementation, so every call throws std::runtime_error.
inline __m128i _mm_aesenc_si128(__m128i v, __m128i rkey) {
	throw std::runtime_error{ platformError };
}

// Stub for the AES decryption round intrinsic: the portable build has no
// implementation, so every call throws std::runtime_error.
inline __m128i _mm_aesdec_si128(__m128i v, __m128i rkey) {
	throw std::runtime_error{ platformError };
}

// Returns the lowest 32-bit lane of v as a signed integer.
inline int _mm_cvtsi128_si32(__m128i v) {
	// Convert explicitly through the two's complement helper instead of relying on
	// the implicit unsigned-to-int narrowing, which is implementation-defined for
	// values above INT_MAX. _mm_cvtepi32_pd uses the same helper.
	return unsigned32ToSigned2sCompl(v.u32[0]);
}

// Places si32 in the lowest 32-bit lane and zeroes the remaining three lanes.
inline __m128i _mm_cvtsi32_si128(int si32) {
	__m128i widened;
	widened.u32[0] = static_cast<uint32_t>(si32);
	for (int lane = 1; lane < 4; ++lane) {
		widened.u32[lane] = 0;
	}
	return widened;
}

// Packs two 64-bit integers into a vector. The arguments are given high lane
// first: _I1 becomes lane 1 and _I0 becomes lane 0.
inline __m128i _mm_set_epi64x(int64_t _I1, int64_t _I0) {
	__m128i packed;
	packed.u64[1] = static_cast<uint64_t>(_I1);
	packed.u64[0] = static_cast<uint64_t>(_I0);
	return packed;
}

// Packs four 32-bit integers into a vector. The arguments are given highest
// lane first: _I3 becomes lane 3 and _I0 becomes lane 0.
inline __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0) {
	__m128i v;
	v.u32[0] = static_cast<uint32_t>(_I0);
	v.u32[1] = static_cast<uint32_t>(_I1);
	v.u32[2] = static_cast<uint32_t>(_I2);
	v.u32[3] = static_cast<uint32_t>(_I3);
	return v;
}

// Bitwise XOR of two 128-bit integer vectors, computed per 32-bit lane.
inline __m128i _mm_xor_si128(__m128i _A, __m128i _B) {
	__m128i mixed;
	for (int lane = 0; lane < 4; ++lane) {
		mixed.u32[lane] = _A.u32[lane] ^ _B.u32[lane];
	}
	return mixed;
}

// Rearranges the 32-bit lanes of _A. Output lane k is taken from the source
// lane whose index is stored in bits [2k+1 : 2k] of _Imm.
inline __m128i _mm_shuffle_epi32(__m128i _A, int _Imm) {
	__m128i shuffled;
	for (int lane = 0; lane < 4; ++lane) {
		const int sourceLane = (_Imm >> (2 * lane)) & 3;
		shuffled.u32[lane] = _A.u32[sourceLane];
	}
	return shuffled;
}

// Loads a 128-bit integer vector by copying the object _P points to.
inline __m128i _mm_load_si128(__m128i const*_P) {
	const __m128i loaded = *_P;
	return loaded;
}

// Stores the 128-bit integer vector _B into the object _P points to.
inline void _mm_store_si128(__m128i *_P, __m128i _B) {
	__m128i& destination = *_P;
	destination = _B;
}

// Shifts the 16 bytes of _A towards higher byte indices by the low 8 bits of
// _Imm, filling the vacated low bytes with zero. A shift count above 15 yields
// an all-zero vector.
inline __m128i _mm_slli_si128(__m128i _A, int _Imm) {
	const int byteShift = _Imm & 255;
	if (byteShift > 15) {
		_A.u64[0] = 0;
		_A.u64[1] = 0;
		return _A;
	}
	// Walk from the top byte downwards so no source byte is overwritten before it is read.
	int index = 15;
	while (index >= byteShift) {
		_A.u8[index] = _A.u8[index - byteShift];
		--index;
	}
	// The remaining low bytes are the ones shifted in; clear them.
	while (index >= 0) {
		_A.u8[index] = 0;
		--index;
	}
	return _A;
}

// Loads 64 bits from mem_addr (through load64) into the low lane and zeroes the
// high lane.
inline __m128i _mm_loadl_epi64(__m128i const* mem_addr) {
	__m128i x;
	x.u64[0] = load64(mem_addr);
	// The hardware intrinsic clears the upper 64 bits; do the same here so the
	// returned vector never carries an uninitialized lane.
	x.u64[1] = 0;
	return x;
}

#endif

// Rounding mode identifiers.
// NOTE(review): presumably the values accepted by setRoundMode() — confirm against its definition.
constexpr int RoundToNearest = 0;
constexpr int RoundDown = 1;
constexpr int RoundUp = 2;
constexpr int RoundToZero = 3;

// Loads 64 bits from addr with _mm_loadl_epi64 and converts the two 32-bit
// integers they contain to a pair of doubles with _mm_cvtepi32_pd.
inline __m128d load_cvt_i32x2(const void* addr) {
	const __m128i* source = static_cast<const __m128i*>(addr);
	return _mm_cvtepi32_pd(_mm_loadl_epi64(source));
}

// The functions below are only declared in this header; their definitions are
// not visible here, so the notes are assumptions to be confirmed.

// NOTE(review): presumably reads a double from addr independently of host byte order — confirm.
double loadDoublePortable(const void* addr);

// NOTE(review): names suggest the high 64 bits of the unsigned / signed 64x64-bit product — confirm.
uint64_t mulh(uint64_t, uint64_t);
int64_t smulh(int64_t, int64_t);
// NOTE(review): presumably 64-bit rotate left / right by the given bit count — confirm.
uint64_t rotl(uint64_t, int);
uint64_t rotr(uint64_t, int);
// NOTE(review): presumably sets up the floating point environment — confirm.
void initFpu();
// NOTE(review): presumably takes one of the Round* constants defined above — confirm.
void setRoundMode(uint32_t);
// NOTE(review): meaning of the three arguments is not visible from this header — confirm.
bool condition(uint32_t, uint32_t, uint32_t);
RandomX portable interpreter 2018-12-11 20:00:30 +00:00			`/*`
			`Copyright (c) 2018 tevador`

			`This file is part of RandomX.`

			`RandomX is free software: you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation, either version 3 of the License, or`
			`(at your option) any later version.`

			`RandomX is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with RandomX. If not, see<http://www.gnu.org/licenses/>.`
			`*/`

			`#pragma once`

Added ISWAP instruction Scratchpad -> 2 MiB New scratchpad initialization New dataset initialization 2019-02-04 16:07:00 +00:00			`#include <cstdint>`

Portable SSE2 intrinsics 2019-02-18 07:56:37 +00:00			`constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) {`
			`return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x);`
			`}`

			`constexpr int64_t unsigned64ToSigned2sCompl(uint64_t x) {`
			`return (-1 == ~0) ? (int64_t)x : (x > INT64_MAX ? (-(int64_t)(UINT64_MAX - x) - 1) : (int64_t)x);`
			`}`

			`constexpr uint64_t signExtend2sCompl(uint32_t x) {`
			`return (-1 == ~0) ? (int64_t)(int32_t)(x) : (x > INT32_MAX ? (x \| 0xffffffff00000000ULL) : (uint64_t)x);`
			`}`

RandomX portable interpreter 2018-12-11 20:00:30 +00:00			`#if defined(_MSC_VER)`
			`#if defined(_M_X64) \|\| (defined(_M_IX86_FP) && _M_IX86_FP == 2)`
			`#define __SSE2__ 1`
			`#endif`
			`#endif`

			`#ifdef __SSE2__`
			`#ifdef __GNUC__`
			`#include <x86intrin.h>`
			`#else`
			`#include <intrin.h>`
			`#endif`
Interpreter with bytecode Fixed some undefined behavior with signed types Fixed different results on big endian systems Removed unused code files Restored FNEG_R instructions Updated documentation 2019-02-09 14:45:26 +00:00
			`inline __m128d _mm_abs(__m128d xd) {`
			`const __m128d absmask = _mm_castsi128_pd(_mm_set1_epi64x(~(1LL << 63)));`
			`return _mm_and_pd(xd, absmask);`
			`}`

			`#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)`

RandomX portable interpreter 2018-12-11 20:00:30 +00:00			`#else`
			`#include <cstdint>`
			`#include <stdexcept>`
Portable SSE2 intrinsics 2019-02-18 07:56:37 +00:00			`#include <cstdlib>`
Fixed portable intrinsics compilation 2019-02-18 16:57:54 +00:00			`#include <cmath>`
			`#include "blake2/endian.h"`
RandomX portable interpreter 2018-12-11 20:00:30 +00:00
			`#define _mm_malloc(a,b) malloc(a)`
			`#define _mm_free(a) free(a)`
Interpreter with bytecode Fixed some undefined behavior with signed types Fixed different results on big endian systems Removed unused code files Restored FNEG_R instructions Updated documentation 2019-02-09 14:45:26 +00:00			`#define PREFETCHNTA(x)`
RandomX portable interpreter 2018-12-11 20:00:30 +00:00
			`typedef union {`
			`uint64_t u64[2];`
			`uint32_t u32[4];`
			`uint16_t u16[8];`
			`uint8_t u8[16];`
			`} __m128i;`

Portable SSE2 intrinsics 2019-02-18 07:56:37 +00:00			`typedef union {`
			`struct {`
			`double lo;`
			`double hi;`
			`};`
			`__m128i i;`
Added ISWAP instruction Scratchpad -> 2 MiB New scratchpad initialization New dataset initialization 2019-02-04 16:07:00 +00:00			`} __m128d;`

			`inline __m128d _mm_load_pd(const double* pd) {`
			`__m128d x;`
Portable SSE2 intrinsics 2019-02-18 07:56:37 +00:00			`x.i.u64[0] = load64(pd + 0);`
			`x.i.u64[1] = load64(pd + 1);`
			`return x;`
			`}`

			`inline void _mm_store_pd(double* mem_addr, __m128d a) {`
			`store64(mem_addr + 0, a.i.u64[0]);`
			`store64(mem_addr + 1, a.i.u64[1]);`
			`}`

			`inline __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm8) {`
			`__m128d x;`
			`x.lo = (imm8 & 1) ? a.hi : a.lo;`
			`x.hi = (imm8 & 2) ? b.hi : b.lo;`
			`return x;`
			`}`

			`inline __m128d _mm_add_pd(__m128d a, __m128d b) {`
			`__m128d x;`
			`x.lo = a.lo + b.lo;`
			`x.hi = a.hi + b.hi;`
			`return x;`
			`}`

			`inline __m128d _mm_sub_pd(__m128d a, __m128d b) {`
			`__m128d x;`
			`x.lo = a.lo - b.lo;`
			`x.hi = a.hi - b.hi;`
			`return x;`
			`}`

			`inline __m128d _mm_mul_pd(__m128d a, __m128d b) {`
			`__m128d x;`
			`x.lo = a.lo * b.lo;`
			`x.hi = a.hi * b.hi;`
			`return x;`
			`}`

			`inline __m128d _mm_div_pd(__m128d a, __m128d b) {`
			`__m128d x;`
			`x.lo = a.lo / b.lo;`
			`x.hi = a.hi / b.hi;`
			`return x;`
			`}`

			`inline __m128d _mm_sqrt_pd(__m128d a) {`
			`__m128d x;`
			`x.lo = sqrt(a.lo);`
			`x.hi = sqrt(a.hi);`
			`return x;`
			`}`

			`inline __m128i _mm_set1_epi64x(uint64_t a) {`
			`__m128i x;`
			`x.u64[0] = a;`
			`x.u64[1] = a;`
			`return x;`
			`}`

			`inline __m128d _mm_castsi128_pd(__m128i a) {`
			`__m128d x;`
			`x.i = a;`
			`return x;`
			`}`

			`inline __m128d _mm_abs(__m128d xd) {`
Fixed portable intrinsics compilation 2019-02-18 16:57:54 +00:00			`xd.lo = std::fabs(xd.lo);`
			`xd.hi = std::fabs(xd.hi);`
Portable SSE2 intrinsics 2019-02-18 07:56:37 +00:00			`return xd;`
			`}`

			`inline __m128d _mm_xor_pd(__m128d a, __m128d b) {`
			`__m128d x;`
			`x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0];`
			`x.i.u64[1] = a.i.u64[1] ^ b.i.u64[1];`
			`return x;`
			`}`

			`inline __m128d _mm_set_pd(double e1, double e0) {`
			`__m128d x;`
			`x.lo = e0;`
			`x.hi = e1;`
			`return x;`
			`}`

			`inline __m128d _mm_max_pd(__m128d a, __m128d b) {`
			`__m128d x;`
			`x.lo = a.lo > b.lo ? a.lo : b.lo;`
			`x.hi = a.hi > b.hi ? a.hi : b.hi;`
			`return x;`
			`}`

			`inline __m128d _mm_cvtepi32_pd(__m128i a) {`
			`__m128d x;`
			`x.lo = (double)unsigned32ToSigned2sCompl(a.u32[0]);`
			`x.hi = (double)unsigned32ToSigned2sCompl(a.u32[1]);`
Added ISWAP instruction Scratchpad -> 2 MiB New scratchpad initialization New dataset initialization 2019-02-04 16:07:00 +00:00			`return x;`
			`}`

RandomX portable interpreter 2018-12-11 20:00:30 +00:00			`static const char* platformError = "Platform doesn't support hardware AES";`

			`inline __m128i _mm_aeskeygenassist_si128(__m128i key, uint8_t rcon) {`
			`throw std::runtime_error(platformError);`
			`}`

			`inline __m128i _mm_aesenc_si128(__m128i v, __m128i rkey) {`
			`throw std::runtime_error(platformError);`
			`}`

			`inline __m128i _mm_aesdec_si128(__m128i v, __m128i rkey) {`
			`throw std::runtime_error(platformError);`
			`}`

			`inline int _mm_cvtsi128_si32(__m128i v) {`
			`return v.u32[0];`
			`}`

			`inline __m128i _mm_cvtsi32_si128(int si32) {`
			`__m128i v;`
			`v.u32[0] = si32;`
			`v.u32[1] = 0;`
			`v.u32[2] = 0;`
			`v.u32[3] = 0;`
			`return v;`
			`}`

			`inline __m128i _mm_set_epi64x(int64_t _I1, int64_t _I0) {`
			`__m128i v;`
			`v.u64[0] = _I0;`
			`v.u64[1] = _I1;`
			`return v;`
			`}`

			`inline __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0) {`
			`__m128i v;`
			`v.u32[0] = _I0;`
			`v.u32[1] = _I1;`
			`v.u32[2] = _I2;`
			`v.u32[3] = _I3;`
			`return v;`
			`};`

			`inline __m128i _mm_xor_si128(__m128i _A, __m128i _B) {`
			`__m128i c;`
			`c.u32[0] = _A.u32[0] ^ _B.u32[0];`
			`c.u32[1] = _A.u32[1] ^ _B.u32[1];`
			`c.u32[2] = _A.u32[2] ^ _B.u32[2];`
			`c.u32[3] = _A.u32[3] ^ _B.u32[3];`
			`return c;`
			`}`

			`inline __m128i _mm_shuffle_epi32(__m128i _A, int _Imm) {`
			`__m128i c;`
			`c.u32[0] = _A.u32[_Imm & 3];`
			`c.u32[1] = _A.u32[(_Imm >> 2) & 3];`
			`c.u32[2] = _A.u32[(_Imm >> 4) & 3];`
			`c.u32[3] = _A.u32[(_Imm >> 6) & 3];`
			`return c;`
			`}`

			`inline __m128i _mm_load_si128(__m128i const*_P) {`
			`return *_P;`
			`}`

			`inline void _mm_store_si128(__m128i *_P, __m128i _B) {`
			`*_P = _B;`
			`}`

			`inline __m128i _mm_slli_si128(__m128i _A, int _Imm) {`
			`_Imm &= 255;`
			`if (_Imm > 15) {`
			`_A.u64[0] = 0;`
			`_A.u64[1] = 0;`
			`}`
			`else {`
			`for (int i = 15; i >= _Imm; --i) {`
			`_A.u8[i] = _A.u8[i - _Imm];`
			`}`
			`for (int i = 0; i < _Imm; ++i) {`
			`_A.u8[i] = 0;`
			`}`
			`}`
			`return _A;`
			`}`

Portable SSE2 intrinsics 2019-02-18 07:56:37 +00:00			`inline __m128i _mm_loadl_epi64(__m128i const* mem_addr) {`
			`__m128i x;`
			`x.u64[0] = load64(mem_addr);`
			`return x;`
			`}`

Added ISWAP instruction Scratchpad -> 2 MiB New scratchpad initialization New dataset initialization 2019-02-04 16:07:00 +00:00			`#endif`

			`constexpr int RoundToNearest = 0;`
			`constexpr int RoundDown = 1;`
			`constexpr int RoundUp = 2;`
			`constexpr int RoundToZero = 3;`

Interpreter with bytecode Fixed some undefined behavior with signed types Fixed different results on big endian systems Removed unused code files Restored FNEG_R instructions Updated documentation 2019-02-09 14:45:26 +00:00			`inline __m128d load_cvt_i32x2(const void* addr) {`
Portable SSE2 intrinsics 2019-02-18 07:56:37 +00:00			`__m128i ix = _mm_loadl_epi64((const __m128i*)addr);`
Interpreter with bytecode Fixed some undefined behavior with signed types Fixed different results on big endian systems Removed unused code files Restored FNEG_R instructions Updated documentation 2019-02-09 14:45:26 +00:00			`return _mm_cvtepi32_pd(ix);`
			`}`

			`double loadDoublePortable(const void* addr);`

Added ISWAP instruction Scratchpad -> 2 MiB New scratchpad initialization New dataset initialization 2019-02-04 16:07:00 +00:00			`uint64_t mulh(uint64_t, uint64_t);`
			`int64_t smulh(int64_t, int64_t);`
			`uint64_t rotl(uint64_t, int);`
			`uint64_t rotr(uint64_t, int);`
			`void initFpu();`
			`void setRoundMode(uint32_t);`
Interpreter with bytecode Fixed some undefined behavior with signed types Fixed different results on big endian systems Removed unused code files Restored FNEG_R instructions Updated documentation 2019-02-09 14:45:26 +00:00			`bool condition(uint32_t, uint32_t, uint32_t);`