Added ISWAP instruction

Scratchpad -> 2 MiB
New scratchpad initialization
New dataset initialization
This commit is contained in:
tevador 2019-02-04 17:07:00 +01:00
parent 20eb549725
commit 1ee94bef2a
23 changed files with 528 additions and 290 deletions

View file

@ -13,7 +13,7 @@ LDFLAGS=-lpthread
TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o AddressTransform.o hashAes1Rx4.o)
ifeq ($(PLATFORM),x86_64)
ROBJS += $(OBJDIR)/JitCompilerX86-static.o
ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o
endif
all: release test
@ -77,6 +77,9 @@ $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompile
$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read.inc)) | $(OBJDIR)
$(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@
$(OBJDIR)/squareHash.o: $(addprefix $(SRCDIR)/,squareHash.S $(addprefix asm/, squareHash.inc)) | $(OBJDIR)
$(CXX) -x assembler-with-cpp -c $(SRCDIR)/squareHash.S -o $@
$(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/instructionsPortable.cpp -o $@

View file

@ -72,16 +72,16 @@ namespace RandomX {
void AssemblyGeneratorX86::genAddressReg(Instruction& instr, const char* reg = "eax") {
asmCode << "\tmov " << reg << ", " << regR32[instr.src] << std::endl;
asmCode << "\tand " << reg << ", " << ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl;
asmCode << "\tand " << reg << ", " << ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl;
}
void AssemblyGeneratorX86::genAddressRegDst(Instruction& instr, int maskAlign = 8) {
asmCode << "\tmov eax" << ", " << regR32[instr.dst] << std::endl;
asmCode << "\tand eax" << ", " << ((instr.alt % 4) ? (ScratchpadL1Mask & (-maskAlign)) : (ScratchpadL2Mask & (-maskAlign))) << std::endl;
asmCode << "\tand eax" << ", " << ((instr.mod % 4) ? (ScratchpadL1Mask & (-maskAlign)) : (ScratchpadL2Mask & (-maskAlign))) << std::endl;
}
int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) {
return instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
return instr.imm32 & ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
}
//1 uOP
@ -348,6 +348,13 @@ namespace RandomX {
}
}
//2 uOPs
// Writes an "xchg" line that swaps the destination and source integer registers.
// When both operands name the same register, nothing is written.
void AssemblyGeneratorX86::h_ISWAP_R(Instruction& instr, int i) {
	if (instr.src == instr.dst)
		return;
	const auto& dstName = regR[instr.dst];
	const auto& srcName = regR[instr.src];
	asmCode << "\txchg " << dstName << ", " << srcName << std::endl;
}
//1 uOPs
void AssemblyGeneratorX86::h_FPSWAP_R(Instruction& instr, int i) {
asmCode << "\tshufpd " << regFE[instr.dst] << ", " << regFE[instr.dst] << ", 1" << std::endl;
@ -431,7 +438,7 @@ namespace RandomX {
//6 uOPs
void AssemblyGeneratorX86::h_CFROUND(Instruction& instr, int i) {
asmCode << "\tmov rax, " << regR[instr.src] << std::endl;
int rotate = (13 - (instr.alt & 63)) & 63;
int rotate = (13 - (instr.imm32 & 63)) & 63;
if (rotate != 0)
asmCode << "\trol rax, " << rotate << std::endl;
asmCode << "\tand eax, 24576" << std::endl;
@ -441,7 +448,7 @@ namespace RandomX {
}
static inline const char* condition(Instruction& instr, bool invert = false) {
switch (((instr.alt >> 2) & 7) ^ invert)
switch (((instr.mod >> 2) & 7) ^ invert)
{
case 0:
return "be";
@ -519,6 +526,7 @@ namespace RandomX {
INST_HANDLE(IXOR_M)
INST_HANDLE(IROR_R)
INST_HANDLE(IROL_R)
INST_HANDLE(ISWAP_R)
//Common floating point
INST_HANDLE(FPSWAP_R)

View file

@ -63,6 +63,7 @@ namespace RandomX {
void h_IXOR_M(Instruction&, int);
void h_IROR_R(Instruction&, int);
void h_IROL_R(Instruction&, int);
void h_ISWAP_R(Instruction&, int);
void h_FPSWAP_R(Instruction&, int);
void h_FPADD_R(Instruction&, int);
void h_FPADD_M(Instruction&, int);

View file

@ -57,7 +57,7 @@ namespace RandomX {
for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) {
*(((uint32_t*)&reg) + i) = gen();
}
FPINIT();
initFpu();
/*for (int i = 0; i < RegistersCount / 2; ++i) {
reg.f[i].lo.f64 = (double)reg.f[i].lo.i64;
reg.f[i].hi.f64 = (double)reg.f[i].hi.i64;

View file

@ -29,15 +29,15 @@ namespace RandomX {
}
void Instruction::genAddressReg(std::ostream& os) const {
os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)src << "]";
os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)src << "]";
}
void Instruction::genAddressRegDst(std::ostream& os) const {
os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)dst << "]";
os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)dst << "]";
}
void Instruction::genAddressImm(std::ostream& os) const {
os << ((alt % 4) ? "L1" : "L2") << "[" << (imm32 & ((alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]";
os << ((mod % 4) ? "L1" : "L2") << "[" << (imm32 & ((mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]";
}
void Instruction::h_IADD_R(std::ostream& os) const {
@ -211,6 +211,10 @@ namespace RandomX {
os << "r" << (int)dst << ", " << imm32 << std::endl;
}
// Prints the ISWAP_R operand list: destination register index, then source register index.
void Instruction::h_ISWAP_R(std::ostream& os) const {
	const int dstIndex = dst;
	const int srcIndex = src;
	os << "r" << dstIndex << ", r" << srcIndex << std::endl;
}
void Instruction::h_FPSWAP_R(std::ostream& os) const {
const char reg = (dst >= 4) ? 'e' : 'f';
auto dstIndex = dst % 4;
@ -280,7 +284,7 @@ namespace RandomX {
}
void Instruction::h_CFROUND(std::ostream& os) const {
os << "r" << (int)src << ", " << (alt & 63) << std::endl;
os << "r" << (int)src << ", " << (imm32 & 63) << std::endl;
}
static inline const char* condition(int index) {
@ -306,11 +310,11 @@ namespace RandomX {
}
void Instruction::h_COND_R(std::ostream& os) const {
os << "r" << (int)dst << ", " << condition((alt >> 2) & 7) << "(r" << (int)src << ", " << imm32 << ")" << std::endl;
os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << imm32 << ")" << std::endl;
}
void Instruction::h_COND_M(std::ostream& os) const {
os << "r" << (int)dst << ", " << condition((alt >> 2) & 7) << "(";
os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(";
genAddressReg(os);
os << ", " << imm32 << ")" << std::endl;
}
@ -356,6 +360,7 @@ namespace RandomX {
INST_NAME(IXOR_M)
INST_NAME(IROR_R)
INST_NAME(IROL_R)
INST_NAME(ISWAP_R)
//Common floating point
INST_NAME(FPSWAP_R)
@ -406,6 +411,7 @@ namespace RandomX {
INST_HANDLE(IXOR_M)
INST_HANDLE(IROR_R)
INST_HANDLE(IROL_R)
INST_HANDLE(ISWAP_R)
//Common floating point
INST_HANDLE(FPSWAP_R)

View file

@ -28,12 +28,52 @@ namespace RandomX {
typedef void(Instruction::*InstructionVisualizer)(std::ostream&) const;
// Numeric tags, one per instruction kind, numbered consecutively from 0.
// The interpreter's execute loop switches on InstructionByteCode::type using
// these values (e.g. case InstructionType::CFROUND).
namespace InstructionType {
// Integer instructions
constexpr int IADD_R = 0;
constexpr int IADD_M = 1;
constexpr int IADD_RC = 2;
constexpr int ISUB_R = 3;
constexpr int ISUB_M = 4;
constexpr int IMUL_9C = 5;
constexpr int IMUL_R = 6;
constexpr int IMUL_M = 7;
constexpr int IMULH_R = 8;
constexpr int IMULH_M = 9;
constexpr int ISMULH_R = 10;
constexpr int ISMULH_M = 11;
constexpr int IDIV_C = 12;
constexpr int ISDIV_C = 13;
constexpr int INEG_R = 14;
constexpr int IXOR_R = 15;
constexpr int IXOR_M = 16;
constexpr int IROR_R = 17;
constexpr int IROL_R = 18;
constexpr int ISWAP_R = 19;
// Floating point instructions
constexpr int FPSWAP_R = 20;
constexpr int FPADD_R = 21;
constexpr int FPADD_M = 22;
constexpr int FPSUB_R = 23;
constexpr int FPSUB_M = 24;
constexpr int FPNEG_R = 25;
constexpr int FPMUL_R = 26;
constexpr int FPMUL_M = 27;
constexpr int FPDIV_R = 28;
constexpr int FPDIV_M = 29;
constexpr int FPSQRT_R = 30;
// Conditions, rounding control, stores and no-op
constexpr int COND_R = 31;
constexpr int COND_M = 32;
constexpr int CFROUND = 33;
constexpr int ISTORE = 34;
constexpr int FSTORE = 35;
constexpr int NOP = 36;
}
class Instruction {
public:
uint8_t opcode;
uint8_t dst;
uint8_t src;
uint8_t alt;
uint8_t mod;
int32_t imm32;
const char* getName() const {
return names[opcode];
@ -70,6 +110,7 @@ namespace RandomX {
void h_IXOR_M(std::ostream&) const;
void h_IROR_R(std::ostream&) const;
void h_IROL_R(std::ostream&) const;
void h_ISWAP_R(std::ostream&) const;
void h_FPSWAP_R(std::ostream&) const;
void h_FPADD_R(std::ostream&) const;
void h_FPADD_M(std::ostream&) const;

View file

@ -30,6 +30,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include <sstream>
#include <cmath>
#include <thread>
#include "intrinPortable.h"
#ifdef STATS
#include <algorithm>
#endif
@ -98,7 +99,7 @@ namespace RandomX {
for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) {
*(((uint32_t*)&reg) + i) = gen();
}
FPINIT();
initFpu();
for (int i = 0; i < RegistersCount; ++i) {
reg.f[i].lo.f64 = (double)reg.f[i].lo.i64;
reg.f[i].hi.f64 = (double)reg.f[i].hi.i64;
@ -114,24 +115,32 @@ namespace RandomX {
}
void InterpretedVirtualMachine::execute() {
while (ic > 0) {
#ifdef STATS
count_instructions[pc]++;
#endif
auto& inst = p(pc);
if(trace) std::cout << inst.getName() << " (" << std::dec << pc << ")" << std::endl;
pc = (pc + 1) % ProgramLength;
auto handler = engine[inst.opcode];
(this->*handler)(inst);
ic--;
for(int i = 0; i < InstructionCount; ++i) {
for (int j = 0; j < ProgramLength; ++j) {
auto& ibc = byteCode[j];
switch (ibc.type)
{
case InstructionType::CFROUND: {
uint64_t rcFlag = rotr(ibc.isrc->u64, ibc.imm.i32);
setRoundMode(rcFlag);
}
break;
}
}
}
#ifdef STATS
count_endstack += stack.size();
#endif
}
#include "instructionWeights.hpp"
#define INST_HANDLE(x) REPN(&InterpretedVirtualMachine::h_##x, WT(x))
// Dispatches on instr.opcode.
// NOTE(review): only the IADD_R label is present and it does nothing before the
// break, so this looks like a work-in-progress stub. CASE_REP is defined outside
// this view - confirm which case labels it expands to.
void InterpretedVirtualMachine::executeInstruction(Instruction& instr) {
switch (instr.opcode)
{
CASE_REP(IADD_R)
break;
}
}
InstructionHandler InterpretedVirtualMachine::engine[256] = {

View file

@ -33,10 +33,24 @@ namespace RandomX {
virtual std::ostream& printCxx(std::ostream&) const = 0;
};
struct InstructionByteCode;
class InterpretedVirtualMachine;
typedef void(InterpretedVirtualMachine::*InstructionHandler)(Instruction&);
// Per-slot record read by InterpretedVirtualMachine::execute; the VM holds one
// entry per program position (byteCode[ProgramLength]).
// NOTE(review): the 64-byte alignment presumably gives each entry its own cache
// line - confirm the intent.
struct alignas(64) InstructionByteCode {
convertible_t* idst; // presumably the integer destination operand - TODO confirm
convertible_t* isrc; // integer source operand; CFROUND reads isrc->u64
convertible_t imm; // immediate; CFROUND uses imm.i32 as the rotation count
fpu_reg_t* fdst; // presumably the floating point destination operand - TODO confirm
fpu_reg_t* fsrc; // presumably the floating point source operand - TODO confirm
uint32_t condition; // not used by the code visible here - TODO document
uint32_t memMask; // not used by the code visible here - TODO document
uint32_t type; // compared against InstructionType::* values in execute()
};
// NOTE(review): this identifier looks like a leftover debugging aid for inspecting
// sizeof(InstructionByteCode); nothing visible here reads it. Confirm, and consider
// replacing it with a static_assert on the expected size.
constexpr int asedwfagdewsa = sizeof(InstructionByteCode);
class InterpretedVirtualMachine : public VirtualMachine {
public:
InterpretedVirtualMachine(bool soft, bool async) : softAes(soft), asyncWorker(async) {}
@ -53,6 +67,7 @@ namespace RandomX {
static const ITransform* addressTransformations[TransformationCount];
bool softAes, asyncWorker;
Program p;
InstructionByteCode byteCode[ProgramLength];
std::vector<convertible_t> stack;
uint64_t pc, ic;
const ITransform* currentTransform;
@ -106,7 +121,7 @@ namespace RandomX {
int count_FPMUL_nop2 = 0;
int datasetAccess[256] = { 0 };
#endif
void executeInstruction(Instruction&);
convertible_t loada(Instruction&);
convertible_t loadbiashift(Instruction&);
convertible_t loadbiadiv(Instruction&);

View file

@ -176,6 +176,7 @@ namespace RandomX {
static const uint8_t JNZ[] = { 0x0f, 0x85 };
static const uint8_t JMP = 0xe9;
static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
static const uint8_t REX_XCHG[] = { 0x4d, 0x87 };
size_t JitCompilerX86::getCodeSize() {
return codePos - prologueSize;
@ -248,7 +249,7 @@ namespace RandomX {
emitByte(AND_EAX_I);
else
emit(AND_ECX_I);
emit32((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
emit32((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
}
void JitCompilerX86::genAddressRegDst(Instruction& instr, bool align16 = false) {
@ -257,11 +258,11 @@ namespace RandomX {
emitByte(AND_EAX_I);
int32_t maskL1 = align16 ? ScratchpadL1Mask16 : ScratchpadL1Mask;
int32_t maskL2 = align16 ? ScratchpadL2Mask16 : ScratchpadL2Mask;
emit32((instr.alt % 4) ? maskL1 : maskL2);
emit32((instr.mod % 4) ? maskL1 : maskL2);
}
void JitCompilerX86::genAddressImm(Instruction& instr) {
emit32(instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask));
emit32(instr.imm32 & ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask));
}
void JitCompilerX86::h_IADD_R(Instruction& instr) {
@ -595,6 +596,13 @@ namespace RandomX {
}
}
// Emits the REX_XCHG opcode bytes followed by a register-to-register ModRM byte
// that swaps the destination and source integer registers.
// When both operands name the same register, no code is emitted.
void JitCompilerX86::h_ISWAP_R(Instruction& instr) {
	if (instr.src == instr.dst)
		return;
	emit(REX_XCHG);
	// 0xc0 selects register-direct addressing; dst goes in the low three bits
	// (r/m field) and src in the next three bits (reg field).
	const int modrm = 0xc0 + instr.dst + 8 * instr.src;
	emitByte(modrm);
}
void JitCompilerX86::h_FPSWAP_R(Instruction& instr) {
emit(SHUFPD);
emitByte(0xc0 + 9 * instr.dst);
@ -682,7 +690,7 @@ namespace RandomX {
void JitCompilerX86::h_CFROUND(Instruction& instr) {
emit(REX_MOV_RR64);
emitByte(0xc0 + instr.src);
int rotate = (13 - (instr.alt & 63)) & 63;
int rotate = (13 - (instr.imm32 & 63)) & 63;
if (rotate != 0) {
emit(ROL_RAX);
emitByte(rotate);
@ -691,7 +699,7 @@ namespace RandomX {
}
static inline uint8_t condition(Instruction& instr, bool invert = false) {
switch ((instr.alt & 7) ^ invert)
switch ((instr.mod & 7) ^ invert)
{
case 0:
return 0x96; //setbe
@ -777,6 +785,7 @@ namespace RandomX {
INST_HANDLE(IXOR_M)
INST_HANDLE(IROR_R)
INST_HANDLE(IROL_R)
INST_HANDLE(ISWAP_R)
INST_HANDLE(FPSWAP_R)
INST_HANDLE(FPADD_R)
INST_HANDLE(FPADD_M)

View file

@ -109,6 +109,7 @@ namespace RandomX {
void h_IXOR_M(Instruction&);
void h_IROR_R(Instruction&);
void h_IROL_R(Instruction&);
void h_ISWAP_R(Instruction&);
void h_FPSWAP_R(Instruction&);
void h_FPADD_R(Instruction&);
void h_FPADD_M(Instruction&);

View file

@ -1,5 +1,5 @@
mov rdx, rax
and eax, 1048512
and eax, 2097088
lea rcx, [rsi+rax]
push rcx
xor r8, qword ptr [rcx+0]
@ -11,7 +11,7 @@
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
ror rdx, 32
and edx, 1048512
and edx, 2097088
lea rcx, [rsi+rdx]
push rcx
cvtdq2pd xmm0, qword ptr [rcx+0]

87
src/asm/squareHash.inc Normal file
View file

@ -0,0 +1,87 @@
;# squareHash body: input in rcx, result in rax, rdx is overwritten.
;# The input is added to a fixed 64-bit constant, then 42 identical rounds follow.
;# Each round: "mul rax" computes rdx:rax = rax * rax (unsigned 128-bit product),
;# then the high half (rdx) is subtracted from the low half (rax).
;# The trailing number on each "sub" line is the round counter.
	mov rax, 1613783669344650115
	add rax, rcx
	mul rax
	sub rax, rdx ;# 1
	mul rax
	sub rax, rdx ;# 2
	mul rax
	sub rax, rdx ;# 3
	mul rax
	sub rax, rdx ;# 4
	mul rax
	sub rax, rdx ;# 5
	mul rax
	sub rax, rdx ;# 6
	mul rax
	sub rax, rdx ;# 7
	mul rax
	sub rax, rdx ;# 8
	mul rax
	sub rax, rdx ;# 9
	mul rax
	sub rax, rdx ;# 10
	mul rax
	sub rax, rdx ;# 11
	mul rax
	sub rax, rdx ;# 12
	mul rax
	sub rax, rdx ;# 13
	mul rax
	sub rax, rdx ;# 14
	mul rax
	sub rax, rdx ;# 15
	mul rax
	sub rax, rdx ;# 16
	mul rax
	sub rax, rdx ;# 17
	mul rax
	sub rax, rdx ;# 18
	mul rax
	sub rax, rdx ;# 19
	mul rax
	sub rax, rdx ;# 20
	mul rax
	sub rax, rdx ;# 21
	mul rax
	sub rax, rdx ;# 22
	mul rax
	sub rax, rdx ;# 23
	mul rax
	sub rax, rdx ;# 24
	mul rax
	sub rax, rdx ;# 25
	mul rax
	sub rax, rdx ;# 26
	mul rax
	sub rax, rdx ;# 27
	mul rax
	sub rax, rdx ;# 28
	mul rax
	sub rax, rdx ;# 29
	mul rax
	sub rax, rdx ;# 30
	mul rax
	sub rax, rdx ;# 31
	mul rax
	sub rax, rdx ;# 32
	mul rax
	sub rax, rdx ;# 33
	mul rax
	sub rax, rdx ;# 34
	mul rax
	sub rax, rdx ;# 35
	mul rax
	sub rax, rdx ;# 36
	mul rax
	sub rax, rdx ;# 37
	mul rax
	sub rax, rdx ;# 38
	mul rax
	sub rax, rdx ;# 39
	mul rax
	sub rax, rdx ;# 40
	mul rax
	sub rax, rdx ;# 41
	mul rax
	sub rax, rdx ;# 42
	ret

View file

@ -26,11 +26,6 @@ namespace RandomX {
using addr_t = uint32_t;
constexpr int RoundToNearest = 0;
constexpr int RoundDown = 1;
constexpr int RoundUp = 2;
constexpr int RoundToZero = 3;
constexpr int SeedSize = 32;
constexpr int ResultSize = 32;
@ -46,7 +41,7 @@ namespace RandomX {
constexpr int CacheBlockCount = CacheSize / CacheLineSize;
constexpr int BlockExpansionRatio = DatasetSize / CacheSize;
constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount;
constexpr int DatasetIterations = 3;
constexpr int DatasetIterations = 10;
#ifdef TRACE
@ -72,12 +67,12 @@ namespace RandomX {
convertible_t hi;
};
constexpr int ProgramLength = 128;
constexpr int ProgramLength = 256;
constexpr uint32_t InstructionCount = 1024;
constexpr uint32_t ScratchpadSize = 1024 * 1024;
constexpr uint32_t ScratchpadSize = 2 * 1024 * 1024;
constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t);
constexpr uint32_t ScratchpadL1 = ScratchpadSize / 64 / sizeof(convertible_t);
constexpr uint32_t ScratchpadL2 = ScratchpadSize / 4 / sizeof(convertible_t);
constexpr uint32_t ScratchpadL1 = ScratchpadSize / 128 / sizeof(convertible_t);
constexpr uint32_t ScratchpadL2 = ScratchpadSize / 8 / sizeof(convertible_t);
constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(convertible_t);
constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8;
constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8;
@ -133,6 +128,8 @@ namespace RandomX {
typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t);
typedef bool(*Condition)(convertible_t&, convertible_t&);
extern "C" {
void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t);
}

View file

@ -28,10 +28,11 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include "Cache.hpp"
#include "virtualMemory.hpp"
#include "softAes.h"
#include "squareHash.h"
#if defined(__SSE2__)
#include <wmmintrin.h>
#define PREFETCH(memory) _mm_prefetch((const char *)((memory).ds.dataset + (memory).ma), _MM_HINT_NTA)
#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
#else
#define PREFETCH(memory)
#endif
@ -49,42 +50,37 @@ namespace RandomX {
template<bool soft>
void initBlock(const uint8_t* intermediate, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) {
__m128i x0, x1, x2, x3;
uint64_t r0, r1, r2, r3, r4, r5, r6, r7;
__m128i* xit = (__m128i*)intermediate;
__m128i* xout = (__m128i*)out;
r0 = 4ULL * blockNumber;
r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0;
x0 = _mm_cvtsi32_si128(blockNumber);
constexpr int mask = (CacheSize / CacheLineSize) - 1;
constexpr int mask = (CacheSize - 1) & -64;
for (auto i = 0; i < DatasetIterations; ++i) {
x0 = aesenc<soft>(x0, keys[0]);
//x0 = aesenc<soft>(x0, keys[1]);
x1 = aesenc<soft>(x0, keys[2]);
//x1 = aesenc<soft>(x1, keys[3]);
x2 = aesenc<soft>(x1, keys[4]);
//x2 = aesenc<soft>(x2, keys[5]);
x3 = aesenc<soft>(x2, keys[6]);
//x3 = aesenc<soft>(x3, keys[7]);
int index = _mm_cvtsi128_si32(x3);
index &= mask;
__m128i t0 = _mm_load_si128(xit + 4 * index + 0);
__m128i t1 = _mm_load_si128(xit + 4 * index + 1);
__m128i t2 = _mm_load_si128(xit + 4 * index + 2);
__m128i t3 = _mm_load_si128(xit + 4 * index + 3);
x0 = _mm_xor_si128(x0, t0);
x1 = _mm_xor_si128(x1, t1);
x2 = _mm_xor_si128(x2, t2);
x3 = _mm_xor_si128(x3, t3);
uint64_t* mix = (uint64_t*)(intermediate + (r0 & mask));
PREFETCHNTA(mix);
r0 = squareHash(r0);
r0 ^= mix[0];
r1 ^= mix[1];
r2 ^= mix[2];
r3 ^= mix[3];
r4 ^= mix[4];
r5 ^= mix[5];
r6 ^= mix[6];
r7 ^= mix[7];
}
_mm_store_si128(xout + 0, x0);
_mm_store_si128(xout + 1, x1);
_mm_store_si128(xout + 2, x2);
_mm_store_si128(xout + 3, x3);
uint64_t* out64 = (uint64_t*)out;
out64[0] = r0;
out64[1] = r1;
out64[2] = r2;
out64[3] = r3;
out64[4] = r4;
out64[5] = r5;
out64[6] = r6;
out64[7] = r7;
}
template
@ -98,7 +94,7 @@ namespace RandomX {
memory.mx ^= addr;
memory.mx &= -64; //align to cache line
std::swap(memory.mx, memory.ma);
PREFETCH(memory);
PREFETCHNTA(memory.ds.dataset + memory.ma);
for (int i = 0; i < RegistersCount; ++i)
reg.r[i].u64 ^= datasetLine[i];
}

View file

@ -71,3 +71,44 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
template void hashAes1Rx4<false>(const void *input, size_t inputSize, void *hash);
template void hashAes1Rx4<true>(const void *input, size_t inputSize, void *hash);
template<bool softAes>
void fillAes1Rx4(void *state, size_t outputSize, void *buffer) {
const uint8_t* outptr = (uint8_t*)buffer;
const uint8_t* outputEnd = outptr + outputSize;
__m128i state0, state1, state2, state3;
__m128i key0, key1, key2, key3;
key0 = _mm_set_epi32(0x9274f206, 0x79498d2f, 0x7d2de6ab, 0x67a04d26);
key1 = _mm_set_epi32(0xe1f7af05, 0x2a3a6f1d, 0x86658a15, 0x4f719812);
key2 = _mm_set_epi32(0xd1b1f791, 0x9e2ec914, 0x14c77bce, 0xba90750e);
key3 = _mm_set_epi32(0x179d0fd9, 0x6e57883c, 0xa53bbe4f, 0xaa07621f);
state0 = _mm_load_si128((__m128i*)state + 0);
state1 = _mm_load_si128((__m128i*)state + 1);
state2 = _mm_load_si128((__m128i*)state + 2);
state3 = _mm_load_si128((__m128i*)state + 3);
while (outptr < outputEnd) {
state0 = aesdec<softAes>(state0, key0);
state1 = aesenc<softAes>(state1, key1);
state2 = aesdec<softAes>(state2, key2);
state3 = aesenc<softAes>(state3, key3);
_mm_store_si128((__m128i*)outptr + 0, state0);
_mm_store_si128((__m128i*)outptr + 1, state1);
_mm_store_si128((__m128i*)outptr + 2, state2);
_mm_store_si128((__m128i*)outptr + 3, state3);
outptr += 64;
}
_mm_store_si128((__m128i*)state + 0, state0);
_mm_store_si128((__m128i*)state + 1, state1);
_mm_store_si128((__m128i*)state + 2, state2);
_mm_store_si128((__m128i*)state + 3, state3);
}
template void fillAes1Rx4<true>(void *state, size_t outputSize, void *buffer);
template void fillAes1Rx4<false>(void *state, size_t outputSize, void *buffer);

View file

@ -20,4 +20,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include "softAes.h"
template<bool softAes>
void hashAes1Rx4(const void *input, size_t inputSize, void *hash);
void hashAes1Rx4(const void *input, size_t inputSize, void *hash);
template<bool softAes>
void fillAes1Rx4(void *state, size_t outputSize, void *buffer);

View file

@ -37,8 +37,9 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#define WT_INEG_R 2
#define WT_IXOR_R 12
#define WT_IXOR_M 3
#define WT_IROR_R 12
#define WT_IROL_R 12
#define WT_IROR_R 10
#define WT_IROL_R 10
#define WT_ISWAP_R 4
//Common floating point
#define WT_FPSWAP_R 8
@ -72,7 +73,7 @@ constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \
WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \
WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \
WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \
WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \
WT_ISWAP_R + WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \
WT_FPNEG_R + WT_FPMUL_R + WT_FPMUL_M + WT_FPDIV_R + WT_FPDIV_M + \
WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP;

View file

@ -17,7 +17,6 @@ You should have received a copy of the GNU General Public License
along with RandomX. If not, see<http://www.gnu.org/licenses/>.
*/
//#define DEBUG
#include "instructions.hpp"
#include "intrinPortable.h"
#pragma STDC FENV_ACCESS on
#include <cfenv>
@ -29,14 +28,14 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#if defined(__SIZEOF_INT128__)
typedef unsigned __int128 uint128_t;
typedef __int128 int128_t;
static inline uint64_t __umulhi64(uint64_t a, uint64_t b) {
uint64_t mulh(uint64_t a, uint64_t b) {
return ((uint128_t)a * b) >> 64;
}
static inline uint64_t __imulhi64(int64_t a, int64_t b) {
int64_t smulh(int64_t a, int64_t b) {
return ((int128_t)a * b) >> 64;
}
#define umulhi64 __umulhi64
#define imulhi64 __imulhi64
#define HAVE_MULH
#define HAVE_SMULH
#endif
#if defined(_MSC_VER)
@ -44,62 +43,62 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#define EVAL_DEFINE(X) HAS_VALUE(X)
#include <intrin.h>
#include <stdlib.h>
#define ror64 _rotr64
#define rol64 _rotl64
uint64_t rotl(uint64_t x, int c) {
return _rotl64(x, c);
}
uint64_t rotr(uint64_t x , int c) {
return _rotr64(x, c);
}
#define HAVE_ROTL
#define HAVE_ROTR
#if EVAL_DEFINE(__MACHINEARM64_X64(1))
#define umulhi64 __umulh
uint64_t mulh(uint64_t a, uint64_t b) {
return __umulh(a, b);
}
#define HAVE_MULH
#endif
#if EVAL_DEFINE(__MACHINEX64(1))
static inline uint64_t __imulhi64(int64_t a, int64_t b) {
int64_t smulh(int64_t a, int64_t b) {
int64_t hi;
_mul128(a, b, &hi);
return hi;
}
#define imulhi64 __imulhi64
#define HAVE_SMULH
#endif
static inline uint32_t _setRoundMode(uint32_t mode) {
return _controlfp(mode, _MCW_RC);
static void setRoundMode__(uint32_t mode) {
_controlfp(mode, _MCW_RC);
}
#define setRoundMode _setRoundMode
#define HAVE_SETROUNDMODE_IMPL
#endif
#ifndef setRoundMode
#define setRoundMode fesetround
#ifndef HAVE_SETROUNDMODE_IMPL
static void setRoundMode__(uint32_t mode) {
fesetround(mode);
}
#endif
#ifndef ror64
static inline uint64_t __ror64(uint64_t a, int b) {
#ifndef HAVE_ROTR
// Portable fallback: rotates the 64-bit value `a` right by `b` bit positions.
// The count is reduced modulo 64 so that b == 0 (and any count outside 0..63)
// never produces a shift by 64 or more, which is undefined for a 64-bit operand.
uint64_t rotr(uint64_t a, int b) {
	const unsigned shift = static_cast<unsigned>(b) & 63u;
	return (a >> shift) | (a << ((64u - shift) & 63u));
}
#define ror64 __ror64
#define HAS_ROTR
#endif
#ifndef rol64
static inline uint64_t __rol64(uint64_t a, int b) {
#ifndef HAVE_ROTL
// Portable fallback: rotates the 64-bit value `a` left by `b` bit positions.
// The count is reduced modulo 64 so that b == 0 (and any count outside 0..63)
// never produces a shift by 64 or more, which is undefined for a 64-bit operand.
uint64_t rotl(uint64_t a, int b) {
	const unsigned shift = static_cast<unsigned>(b) & 63u;
	return (a << shift) | (a >> ((64u - shift) & 63u));
}
#define rol64 __rol64
#define HAS_ROTL
#endif
#ifndef sar64
#include <type_traits>
constexpr int64_t builtintShr64(int64_t value, int shift) noexcept {
return value >> shift;
}
struct UsesArithmeticShift : std::integral_constant<bool, builtintShr64(-1LL, 1) == -1LL> {
};
static inline int64_t __sar64(int64_t a, int b) {
return UsesArithmeticShift::value ? builtintShr64(a, b) : (a < 0 ? ~(~a >> b) : a >> b);
}
#define sar64 __sar64
#endif
#ifndef umulhi64
#ifndef HAVE_MULH
#define LO(x) ((x)&0xffffffff)
#define HI(x) ((x)>>32)
static inline uint64_t __umulhi64(uint64_t a, uint64_t b) {
uint64_t mulh(uint64_t a, uint64_t b) {
uint64_t ah = HI(a), al = LO(a);
uint64_t bh = HI(b), bl = LO(b);
uint64_t x00 = al * bl;
@ -112,17 +111,17 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
return (m3 << 32) + LO(m2);
}
#define umulhi64 __umulhi64
#define HAVE_MULH
#endif
#ifndef imulhi64
static inline int64_t __imulhi64(int64_t a, int64_t b) {
int64_t hi = umulhi64(a, b);
#ifndef HAVE_SMULH
// Portable fallback: high 64 bits of the signed 128-bit product a * b.
// Starts from the unsigned high word and corrects it for negative operands:
// reading a negative operand as unsigned adds 2^64 to it, which adds the other
// operand to the high word of the product.
// The corrections are done in unsigned arithmetic because the intermediate value
// can leave the int64_t range (e.g. a == b == INT64_MIN), which would be signed
// overflow if computed in int64_t.
int64_t smulh(int64_t a, int64_t b) {
	uint64_t hi = mulh((uint64_t)a, (uint64_t)b);
	if (a < 0LL) hi -= (uint64_t)b;
	if (b < 0LL) hi -= (uint64_t)a;
	return (int64_t)hi;
}
#define imulhi64 __imulhi64
#define HAVE_SMULH
#endif
// avoid undefined behavior of signed overflow
@ -137,20 +136,20 @@ static inline int32_t safeSub(int32_t a, int32_t b) {
#if defined(__has_builtin)
#if __has_builtin(__builtin_sub_overflow)
static inline bool __subOverflow(int32_t a, int32_t b) {
static inline bool subOverflow__(int32_t a, int32_t b) {
int32_t temp;
return __builtin_sub_overflow(a, b, &temp);
}
#define subOverflow __subOverflow
#define HAVE_SUB_OVERFLOW
#endif
#endif
#ifndef subOverflow
static inline bool __subOverflow(int32_t a, int32_t b) {
#ifndef HAVE_SUB_OVERFLOW
static inline bool subOverflow__(int32_t a, int32_t b) {
auto c = safeSub(a, b);
return (c < a) != (b > 0);
}
#define subOverflow __subOverflow
#define HAVE_SUB_OVERFLOW
#endif
static inline double FlushDenormalNaN(double x) {
@ -165,47 +164,57 @@ static inline double FlushNaN(double x) {
return x != x ? 0.0 : x;
}
// Selects the floating point rounding mode from the two low bits of `rcflag`.
// RoundDown, RoundUp and RoundToZero map to FE_DOWNWARD, FE_UPWARD and
// FE_TOWARDZERO; every other value selects FE_TONEAREST.
void setRoundMode(uint32_t rcflag) {
	const uint32_t selector = rcflag & 3;
	if (selector == RoundDown) {
		setRoundMode__(FE_DOWNWARD);
	}
	else if (selector == RoundUp) {
		setRoundMode__(FE_UPWARD);
	}
	else if (selector == RoundToZero) {
		setRoundMode__(FE_TOWARDZERO);
	}
	else {
		setRoundMode__(FE_TONEAREST);
	}
}
// Evaluates one of eight predicates comparing `value` with `imm32`.
// Only the low three bits of `type` select the predicate:
//   0: value <= imm32 (unsigned)        1: value >  imm32 (unsigned)
//   2: safeSub(value, imm32) <  0       3: safeSub(value, imm32) >= 0
//   4: value - imm32 overflows int32    5: value - imm32 does not overflow
//   6: value <  imm32 (signed)          7: value >= imm32 (signed)
// The last case doubles as the default so every path through the function
// returns a value.
bool condition(uint32_t type, int32_t value, int32_t imm32) {
	switch (type & 7)
	{
	case 0:
		return (uint32_t)value <= (uint32_t)imm32;
	case 1:
		return (uint32_t)value > (uint32_t)imm32;
	case 2:
		return safeSub(value, imm32) < 0;
	case 3:
		return safeSub(value, imm32) >= 0;
	case 4:
		return subOverflow__(value, imm32);
	case 5:
		return !subOverflow__(value, imm32);
	case 6:
		return value < imm32;
	case 7:
	default:
		return value >= imm32;
	}
}
// Puts the floating point unit into its initial state.
// With SSE2 this writes 0x9FC0 to MXCSR (see the inline note for the bit meaning).
// Without SSE2 only the rounding mode is reset to round-to-nearest.
void initFpu() {
#ifdef __SSE2__
	_mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
#else
	// setRoundMode() takes a Round* flag and masks it with & 3; passing the <cfenv>
	// macro FE_TONEAREST only worked where that macro happens to have its low bits clear.
	setRoundMode(RoundToNearest);
#endif
}
namespace RandomX {
extern "C" {
void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 + b.u64;
}
void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u32 + b.u32;
}
void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 - b.u64;
}
void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u32 - b.u32;
}
void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 * b.u64;
}
void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = umulhi64(a.u64, b.u64);
}
void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = (uint64_t)a.u32 * b.u32;
}
void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
c.i64 = (int64_t)a.i32 * b.i32;
}
void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.i64 = imulhi64(a.i64, b.i64);
}
void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
/*void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 / (b.u32 != 0 ? b.u32 : 1U);
}
@ -216,80 +225,6 @@ namespace RandomX {
c.i64 = a.i64 / (b.i32 != 0 ? b.i32 : 1);
}
void AND_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 & b.u64;
}
void AND_32(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u32 & b.u32;
}
void OR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 | b.u64;
}
void OR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u32 | b.u32;
}
void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 ^ b.u64;
}
void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u32 ^ b.u32;
}
void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 << (b.u64 & 63);
}
void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = a.u64 >> (b.u64 & 63);
}
void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = sar64(a.i64, b.u64 & 63);
}
void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = rol64(a.u64, (b.u64 & 63));
}
void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
c.u64 = ror64(a.u64, (b.u64 & 63));
}
bool JMP_COND(uint8_t type, convertible_t& regb, int32_t imm32) {
switch (type & 7)
{
case 0:
return regb.u32 <= (uint32_t)imm32;
case 1:
return regb.u32 > (uint32_t)imm32;
case 2:
return safeSub(regb.i32, imm32) < 0;
case 3:
return safeSub(regb.i32, imm32) >= 0;
case 4:
return subOverflow(regb.i32, imm32);
case 5:
return !subOverflow(regb.i32, imm32);
case 6:
return regb.i32 < imm32;
case 7:
return regb.i32 >= imm32;
}
}
void FPINIT() {
#ifdef __SSE2__
_mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
#else
setRoundMode(FE_TONEAREST);
#endif
}
void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) {
#ifdef __SSE2__
__m128i ai = _mm_loadl_epi64((const __m128i*)&a);
@ -368,48 +303,8 @@ namespace RandomX {
c.lo.f64 = sqrt(std::abs(alo));
c.hi.f64 = sqrt(std::abs(ahi));
#endif
}
}*/
void FPROUND(convertible_t a, uint8_t rot) {
uint64_t flag = ror64(a.u64, rot);
switch (flag & 3) {
case RoundDown:
#ifdef DEBUG
std::cout << "Round FE_DOWNWARD (" << FE_DOWNWARD << ") = " <<
#endif
setRoundMode(FE_DOWNWARD);
#ifdef DEBUG
std::cout << std::endl;
#endif
break;
case RoundUp:
#ifdef DEBUG
std::cout << "Round FE_UPWARD (" << FE_UPWARD << ") = " <<
#endif
setRoundMode(FE_UPWARD);
#ifdef DEBUG
std::cout << std::endl;
#endif
break;
case RoundToZero:
#ifdef DEBUG
std::cout << "Round FE_TOWARDZERO (" << FE_TOWARDZERO << ") = " <<
#endif
setRoundMode(FE_TOWARDZERO);
#ifdef DEBUG
std::cout << std::endl;
#endif
break;
default:
#ifdef DEBUG
std::cout << "Round FE_TONEAREST (" << FE_TONEAREST << ") = " <<
#endif
setRoundMode(FE_TONEAREST);
#ifdef DEBUG
std::cout << std::endl;
#endif
break;
}
}
}
}

View file

@ -19,6 +19,8 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#pragma once
#include <cstdint>
#if defined(_MSC_VER)
#if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)
#define __SSE2__ 1
@ -45,6 +47,18 @@ typedef union {
uint8_t u8[16];
} __m128i;
// Software definition of the packed-double vector type: two doubles, low lane first.
typedef struct {
	double lo;
	double hi;
} __m128d;

// Software version of _mm_load_pd: reads two consecutive doubles, pd[0] into the
// low lane and pd[1] into the high lane.
inline __m128d _mm_load_pd(const double* pd) {
	__m128d loaded = { pd[0], pd[1] };
	return loaded;
}
static const char* platformError = "Platform doesn't support hardware AES";
inline __m128i _mm_aeskeygenassist_si128(__m128i key, uint8_t rcon) {
@ -131,4 +145,17 @@ inline __m128i _mm_slli_si128(__m128i _A, int _Imm) {
return _A;
}
#endif
#endif
// Rounding-mode selectors; setRoundMode() compares (rcflag & 3) against these.
constexpr int RoundToNearest = 0;
constexpr int RoundDown = 1;
constexpr int RoundUp = 2;
constexpr int RoundToZero = 3;
// High 64 bits of the unsigned 128-bit product of the two arguments.
uint64_t mulh(uint64_t, uint64_t);
// High 64 bits of the signed 128-bit product of the two arguments.
int64_t smulh(int64_t, int64_t);
// Rotate the 64-bit value left by the given number of bits.
uint64_t rotl(uint64_t, int);
// Rotate the 64-bit value right by the given number of bits.
uint64_t rotr(uint64_t, int);
// Puts the floating point unit into its initial state.
void initFpu();
// Selects the rounding mode from the two low bits of the argument (see Round* above).
void setRoundMode(uint32_t);
// Evaluates the predicate selected by the low three bits of the first argument
// on the two signed 32-bit operands.
bool condition(uint32_t, int32_t, int32_t);

View file

@ -35,6 +35,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include "dataset.hpp"
#include "Cache.hpp"
#include "Pcg32.hpp"
#include "hashAes1Rx4.hpp"
// Fixed 32-byte seed value.
// NOTE(review): how the seed is consumed is not visible from here - confirm against its users.
const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 };
@ -153,7 +154,7 @@ void generateNative(int nonce) {
}
void mine(RandomX::VirtualMachine* vm, std::atomic<int>& atomicNonce, AtomicHash& result, int noncesCount, int thread, uint8_t* scratchpad) {
uint64_t hash[4];
alignas(16) uint64_t hash[8];
unsigned char blockTemplate[] = {
0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14,
0x5a, 0xc5, 0xfa, 0xd3, 0xaa, 0x3a, 0xf6, 0xea, 0x44, 0xc1, 0x18, 0x69, 0xdc, 0x4f, 0x85, 0x3f, 0x00, 0x2b, 0x2e,
@ -167,8 +168,8 @@ void mine(RandomX::VirtualMachine* vm, std::atomic<int>& atomicNonce, AtomicHash
//std::cout << "Thread " << thread << " nonce " << nonce << std::endl;
*noncePtr = nonce;
blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0);
int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8);
vm->initializeScratchpad(scratchpad, spIndex);
fillAes1Rx4<false>((void*)hash, RandomX::ScratchpadSize, scratchpad);
//vm->initializeScratchpad(scratchpad, spIndex);
vm->setScratchpad(scratchpad);
//dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt");
for (int chain = 0; chain < 16; ++chain) {
@ -309,7 +310,7 @@ int main(int argc, char** argv) {
}
uint8_t* scratchpadMem;
if (largePages) {
scratchpadMem = (uint8_t*)allocLargePagesMemory(RandomX::ScratchpadSize * (threadCount + 1) / 2);
scratchpadMem = (uint8_t*)allocLargePagesMemory(threadCount * RandomX::ScratchpadSize);
}
else {
scratchpadMem = (uint8_t*)_mm_malloc(threadCount * RandomX::ScratchpadSize, RandomX::CacheLineSize);

17
src/squareHash.S Normal file
View file

@ -0,0 +1,17 @@
/*
 * squareHash entry point for GNU-style assemblers (run through the C
 * preprocessor). The routine body itself comes from asm/squareHash.inc,
 * which is shared with the MASM build; that build enters it with the input
 * in rcx, the first-argument register of the Windows x64 convention.
 */
.intel_syntax noprefix
#if defined(__APPLE__)
.text
#else
.section .text
#endif
/*
 * This file contains x86-64 code only. Among the supported targets only
 * Mach-O (Apple) prefixes C symbols with an underscore; 64-bit Windows does
 * not, so decorating the name there would leave C callers unresolved.
 */
#if defined(__APPLE__)
#define DECL(x) _##x
#else
#define DECL(x) x
#endif

.global DECL(squareHash)

DECL(squareHash):
#if !defined(__WIN32__) && !defined(__CYGWIN__)
	/* System V AMD64 ABI: the first integer argument arrives in rdi (rsi is
	   the second argument), so copy it to the register the body reads. */
	mov rcx, rdi
#endif
	/* Windows x64 ABI: the argument is already in rcx and must not be overwritten. */
	#include "asm/squareHash.inc"

9
src/squareHash.asm Normal file
View file

@ -0,0 +1,9 @@
; squareHash entry point for MASM builds.
; The symbol is exported with PUBLIC and the body of the procedure is pulled
; in from asm/squareHash.inc, so this file only provides the PROC/ENDP frame.
PUBLIC squareHash
.code
squareHash PROC
; routine body (kept in a separate include file)
include asm/squareHash.inc
squareHash ENDP
END

71
src/squareHash.h Normal file
View file

@ -0,0 +1,71 @@
/*
Copyright (c) 2019 tevador
This file is part of RandomX.
RandomX is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
RandomX is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with RandomX. If not, see<http://www.gnu.org/licenses/>.
*/
#include <stdint.h>
#if !defined(_M_X64) && !defined(__x86_64__)
/* Unsigned 128-bit value stored as two 64-bit halves. */
typedef struct {
	uint64_t lo;	/* least significant 64 bits */
	uint64_t hi;	/* most significant 64 bits */
} uint128_t;

/* Lower / upper 32 bits of a 64-bit value; only needed by square128. */
#define LO(x) ((x)&0xffffffff)
#define HI(x) ((x)>>32)

/*
 * Returns the full 128-bit square of x using only 64-bit arithmetic.
 * x is split into 32-bit halves (xh, xl) and the partial products are summed
 * column by column; m1, m2 and m3 accumulate the columns weighted 2^32, 2^64
 * and 2^96, each including the carry out of the column below it.
 */
static inline uint128_t square128(uint64_t x) {
	uint64_t xh = HI(x), xl = LO(x);
	uint64_t xll = xl * xl;
	uint64_t xlh = xl * xh;
	uint64_t xhh = xh * xh;
	/* the cross term xl*xh occurs twice in the square, hence the factor 2 */
	uint64_t m1 = 2 * LO(xlh) + HI(xll);
	uint64_t m2 = 2 * HI(xlh) + LO(xhh) + HI(m1);
	uint64_t m3 = HI(xhh) + HI(m2);
	uint128_t x2;
	x2.lo = (m1 << 32) + LO(xll);
	x2.hi = (m3 << 32) + LO(m2);
	return x2;
}

/* #undef takes a bare macro name; a parameter list after the name is not valid. */
#undef LO
#undef HI

/*
 * Portable implementation of squareHash, used when not compiling for x86-64.
 * Adds a fixed constant to x, then repeats 42 times: square the value to
 * 128 bits and replace it with (low half - high half) modulo 2^64.
 *
 * Declared static inline like square128 so that the header can be included
 * from several translation units, in C as well as C++, without requiring an
 * external definition.
 */
static inline uint64_t squareHash(uint64_t x) {
	x += 1613783669344650115ULL;
	for (int i = 0; i < 42; ++i) {
		uint128_t x2 = square128(x);
		x = x2.lo - x2.hi;
	}
	return x;
}
#else
/*
 * x86-64 builds: only the declaration is provided here; the definition is
 * supplied separately (squareHash.S and squareHash.asm each assemble a
 * routine with this name). The extern "C" wrapper gives the function C
 * linkage when the header is included from C++.
 */
#if defined(__cplusplus)
extern "C" {
#endif
uint64_t squareHash(uint64_t);
#if defined(__cplusplus)
}
#endif
#endif