From 1ee94bef2a3f6f57c1d77e6fd953061a332b2e44 Mon Sep 17 00:00:00 2001 From: tevador Date: Mon, 4 Feb 2019 17:07:00 +0100 Subject: [PATCH] Added ISWAP instruction Scratchpad -> 2 MiB New scratchpad initialization New dataset initialization --- makefile | 5 +- src/AssemblyGeneratorX86.cpp | 18 +- src/AssemblyGeneratorX86.hpp | 1 + src/CompiledVirtualMachine.cpp | 2 +- src/Instruction.cpp | 18 +- src/Instruction.hpp | 43 ++++- src/InterpretedVirtualMachine.cpp | 39 ++-- src/InterpretedVirtualMachine.hpp | 17 +- src/JitCompilerX86.cpp | 19 +- src/JitCompilerX86.hpp | 1 + src/asm/program_loop_load.inc | 4 +- src/asm/squareHash.inc | 87 +++++++++ src/common.hpp | 17 +- src/dataset.cpp | 60 +++--- src/hashAes1Rx4.cpp | 41 ++++ src/hashAes1Rx4.hpp | 5 +- src/instructionWeights.hpp | 7 +- src/instructionsPortable.cpp | 299 ++++++++++-------------------- src/intrinPortable.h | 29 ++- src/main.cpp | 9 +- src/squareHash.S | 17 ++ src/squareHash.asm | 9 + src/squareHash.h | 71 +++++++ 23 files changed, 528 insertions(+), 290 deletions(-) create mode 100644 src/asm/squareHash.inc create mode 100644 src/squareHash.S create mode 100644 src/squareHash.asm create mode 100644 src/squareHash.h diff --git a/makefile b/makefile index f805724..87fef86 100644 --- a/makefile +++ b/makefile @@ -13,7 +13,7 @@ LDFLAGS=-lpthread TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o AddressTransform.o hashAes1Rx4.o) ifeq ($(PLATFORM),x86_64) - ROBJS += $(OBJDIR)/JitCompilerX86-static.o + ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o endif all: release test @@ -77,6 +77,9 @@ $(OBJDIR)/JitCompilerX86.o: $(addprefix 
$(SRCDIR)/,JitCompilerX86.cpp JitCompile $(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read.inc)) | $(OBJDIR) $(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@ +$(OBJDIR)/squareHash.o: $(addprefix $(SRCDIR)/,squareHash.S $(addprefix asm/, squareHash.inc)) | $(OBJDIR) + $(CXX) -x assembler-with-cpp -c $(SRCDIR)/squareHash.S -o $@ + $(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/instructionsPortable.cpp -o $@ diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index a46fe5d..3092e4d 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -72,16 +72,16 @@ namespace RandomX { void AssemblyGeneratorX86::genAddressReg(Instruction& instr, const char* reg = "eax") { asmCode << "\tmov " << reg << ", " << regR32[instr.src] << std::endl; - asmCode << "\tand " << reg << ", " << ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl; + asmCode << "\tand " << reg << ", " << ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl; } void AssemblyGeneratorX86::genAddressRegDst(Instruction& instr, int maskAlign = 8) { asmCode << "\tmov eax" << ", " << regR32[instr.dst] << std::endl; - asmCode << "\tand eax" << ", " << ((instr.alt % 4) ? (ScratchpadL1Mask & (-maskAlign)) : (ScratchpadL2Mask & (-maskAlign))) << std::endl; + asmCode << "\tand eax" << ", " << ((instr.mod % 4) ? (ScratchpadL1Mask & (-maskAlign)) : (ScratchpadL2Mask & (-maskAlign))) << std::endl; } int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) { - return instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + return instr.imm32 & ((instr.mod % 4) ? 
ScratchpadL1Mask : ScratchpadL2Mask); } //1 uOP @@ -348,6 +348,13 @@ namespace RandomX { } } + //2 uOPs + void AssemblyGeneratorX86::h_ISWAP_R(Instruction& instr, int i) { + if (instr.src != instr.dst) { + asmCode << "\txchg " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + } + } + //1 uOPs void AssemblyGeneratorX86::h_FPSWAP_R(Instruction& instr, int i) { asmCode << "\tshufpd " << regFE[instr.dst] << ", " << regFE[instr.dst] << ", 1" << std::endl; @@ -431,7 +438,7 @@ namespace RandomX { //6 uOPs void AssemblyGeneratorX86::h_CFROUND(Instruction& instr, int i) { asmCode << "\tmov rax, " << regR[instr.src] << std::endl; - int rotate = (13 - (instr.alt & 63)) & 63; + int rotate = (13 - (instr.imm32 & 63)) & 63; if (rotate != 0) asmCode << "\trol rax, " << rotate << std::endl; asmCode << "\tand eax, 24576" << std::endl; @@ -441,7 +448,7 @@ namespace RandomX { } static inline const char* condition(Instruction& instr, bool invert = false) { - switch (((instr.alt >> 2) & 7) ^ invert) + switch (((instr.mod >> 2) & 7) ^ invert) { case 0: return "be"; @@ -519,6 +526,7 @@ namespace RandomX { INST_HANDLE(IXOR_M) INST_HANDLE(IROR_R) INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) //Common floating point INST_HANDLE(FPSWAP_R) diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 6b0c505..a8e062c 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -63,6 +63,7 @@ namespace RandomX { void h_IXOR_M(Instruction&, int); void h_IROR_R(Instruction&, int); void h_IROL_R(Instruction&, int); + void h_ISWAP_R(Instruction&, int); void h_FPSWAP_R(Instruction&, int); void h_FPADD_R(Instruction&, int); void h_FPADD_M(Instruction&, int); diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index f5d33d0..ebacf42 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -57,7 +57,7 @@ namespace RandomX { for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) { 
*(((uint32_t*)&reg) + i) = gen(); } - FPINIT(); + initFpu(); /*for (int i = 0; i < RegistersCount / 2; ++i) { reg.f[i].lo.f64 = (double)reg.f[i].lo.i64; reg.f[i].hi.f64 = (double)reg.f[i].hi.i64; diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 0aa0289..ce75f43 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -29,15 +29,15 @@ namespace RandomX { } void Instruction::genAddressReg(std::ostream& os) const { - os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)src << "]"; + os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)src << "]"; } void Instruction::genAddressRegDst(std::ostream& os) const { - os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)dst << "]"; + os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)dst << "]"; } void Instruction::genAddressImm(std::ostream& os) const { - os << ((alt % 4) ? "L1" : "L2") << "[" << (imm32 & ((alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]"; + os << ((mod % 4) ? "L1" : "L2") << "[" << (imm32 & ((mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]"; } void Instruction::h_IADD_R(std::ostream& os) const { @@ -211,6 +211,10 @@ namespace RandomX { os << "r" << (int)dst << ", " << imm32 << std::endl; } + void Instruction::h_ISWAP_R(std::ostream& os) const { + os << "r" << (int)dst << ", r" << (int)src << std::endl; + } + void Instruction::h_FPSWAP_R(std::ostream& os) const { const char reg = (dst >= 4) ? 
'e' : 'f'; auto dstIndex = dst % 4; @@ -280,7 +284,7 @@ namespace RandomX { } void Instruction::h_CFROUND(std::ostream& os) const { - os << "r" << (int)src << ", " << (alt & 63) << std::endl; + os << "r" << (int)src << ", " << (imm32 & 63) << std::endl; } static inline const char* condition(int index) { @@ -306,11 +310,11 @@ namespace RandomX { } void Instruction::h_COND_R(std::ostream& os) const { - os << "r" << (int)dst << ", " << condition((alt >> 2) & 7) << "(r" << (int)src << ", " << imm32 << ")" << std::endl; + os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << imm32 << ")" << std::endl; } void Instruction::h_COND_M(std::ostream& os) const { - os << "r" << (int)dst << ", " << condition((alt >> 2) & 7) << "("; + os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "("; genAddressReg(os); os << ", " << imm32 << ")" << std::endl; } @@ -356,6 +360,7 @@ namespace RandomX { INST_NAME(IXOR_M) INST_NAME(IROR_R) INST_NAME(IROL_R) + INST_NAME(ISWAP_R) //Common floating point INST_NAME(FPSWAP_R) @@ -406,6 +411,7 @@ namespace RandomX { INST_HANDLE(IXOR_M) INST_HANDLE(IROR_R) INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) //Common floating point INST_HANDLE(FPSWAP_R) diff --git a/src/Instruction.hpp b/src/Instruction.hpp index ffa3880..987f326 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -28,12 +28,52 @@ namespace RandomX { typedef void(Instruction::*InstructionVisualizer)(std::ostream&) const; + namespace InstructionType { + constexpr int IADD_R = 0; + constexpr int IADD_M = 1; + constexpr int IADD_RC = 2; + constexpr int ISUB_R = 3; + constexpr int ISUB_M = 4; + constexpr int IMUL_9C = 5; + constexpr int IMUL_R = 6; + constexpr int IMUL_M = 7; + constexpr int IMULH_R = 8; + constexpr int IMULH_M = 9; + constexpr int ISMULH_R = 10; + constexpr int ISMULH_M = 11; + constexpr int IDIV_C = 12; + constexpr int ISDIV_C = 13; + constexpr int INEG_R = 14; + constexpr int IXOR_R = 15; + constexpr int IXOR_M = 16; + 
constexpr int IROR_R = 17; + constexpr int IROL_R = 18; + constexpr int ISWAP_R = 19; + constexpr int FPSWAP_R = 20; + constexpr int FPADD_R = 21; + constexpr int FPADD_M = 22; + constexpr int FPSUB_R = 23; + constexpr int FPSUB_M = 24; + constexpr int FPNEG_R = 25; + constexpr int FPMUL_R = 26; + constexpr int FPMUL_M = 27; + constexpr int FPDIV_R = 28; + constexpr int FPDIV_M = 29; + constexpr int FPSQRT_R = 30; + constexpr int COND_R = 31; + constexpr int COND_M = 32; + constexpr int CFROUND = 33; + constexpr int ISTORE = 34; + constexpr int FSTORE = 35; + constexpr int NOP = 36; + } + class Instruction { public: uint8_t opcode; uint8_t dst; uint8_t src; - uint8_t alt; + uint8_t mod; int32_t imm32; const char* getName() const { return names[opcode]; @@ -70,6 +110,7 @@ namespace RandomX { void h_IXOR_M(std::ostream&) const; void h_IROR_R(std::ostream&) const; void h_IROL_R(std::ostream&) const; + void h_ISWAP_R(std::ostream&) const; void h_FPSWAP_R(std::ostream&) const; void h_FPADD_R(std::ostream&) const; void h_FPADD_M(std::ostream&) const; diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index d145e78..af01183 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -30,6 +30,7 @@ along with RandomX. If not, see. 
#include #include #include +#include "intrinPortable.h" #ifdef STATS #include #endif @@ -98,7 +99,7 @@ namespace RandomX { for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) { *(((uint32_t*)&reg) + i) = gen(); } - FPINIT(); + initFpu(); for (int i = 0; i < RegistersCount; ++i) { reg.f[i].lo.f64 = (double)reg.f[i].lo.i64; reg.f[i].hi.f64 = (double)reg.f[i].hi.i64; @@ -114,24 +115,32 @@ namespace RandomX { } void InterpretedVirtualMachine::execute() { - while (ic > 0) { -#ifdef STATS - count_instructions[pc]++; -#endif - auto& inst = p(pc); - if(trace) std::cout << inst.getName() << " (" << std::dec << pc << ")" << std::endl; - pc = (pc + 1) % ProgramLength; - auto handler = engine[inst.opcode]; - (this->*handler)(inst); - ic--; + for(int i = 0; i < InstructionCount; ++i) { + for (int j = 0; j < ProgramLength; ++j) { + auto& ibc = byteCode[j]; + switch (ibc.type) + { + case InstructionType::CFROUND: { + uint64_t rcFlag = rotr(ibc.isrc->u64, ibc.imm.i32); + setRoundMode(rcFlag); + } + break; + } + } } -#ifdef STATS - count_endstack += stack.size(); -#endif + } #include "instructionWeights.hpp" -#define INST_HANDLE(x) REPN(&InterpretedVirtualMachine::h_##x, WT(x)) + + void InterpretedVirtualMachine::executeInstruction(Instruction& instr) { + switch (instr.opcode) + { + CASE_REP(IADD_R) + + break; + } + } InstructionHandler InterpretedVirtualMachine::engine[256] = { diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index fba081a..2eee73d 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -33,10 +33,24 @@ namespace RandomX { virtual std::ostream& printCxx(std::ostream&) const = 0; }; + struct InstructionByteCode; class InterpretedVirtualMachine; typedef void(InterpretedVirtualMachine::*InstructionHandler)(Instruction&); + struct alignas(64) InstructionByteCode { + convertible_t* idst; + convertible_t* isrc; + convertible_t imm; + fpu_reg_t* fdst; + fpu_reg_t* fsrc; + uint32_t 
condition; + uint32_t memMask; + uint32_t type; + }; + + constexpr int asedwfagdewsa = sizeof(InstructionByteCode); + class InterpretedVirtualMachine : public VirtualMachine { public: InterpretedVirtualMachine(bool soft, bool async) : softAes(soft), asyncWorker(async) {} @@ -53,6 +67,7 @@ namespace RandomX { static const ITransform* addressTransformations[TransformationCount]; bool softAes, asyncWorker; Program p; + InstructionByteCode byteCode[ProgramLength]; std::vector stack; uint64_t pc, ic; const ITransform* currentTransform; @@ -106,7 +121,7 @@ namespace RandomX { int count_FPMUL_nop2 = 0; int datasetAccess[256] = { 0 }; #endif - + void executeInstruction(Instruction&); convertible_t loada(Instruction&); convertible_t loadbiashift(Instruction&); convertible_t loadbiadiv(Instruction&); diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index cf50582..d8e7a42 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -176,6 +176,7 @@ namespace RandomX { static const uint8_t JNZ[] = { 0x0f, 0x85 }; static const uint8_t JMP = 0xe9; static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 }; + static const uint8_t REX_XCHG[] = { 0x4d, 0x87 }; size_t JitCompilerX86::getCodeSize() { return codePos - prologueSize; @@ -248,7 +249,7 @@ namespace RandomX { emitByte(AND_EAX_I); else emit(AND_ECX_I); - emit32((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + emit32((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); } void JitCompilerX86::genAddressRegDst(Instruction& instr, bool align16 = false) { @@ -257,11 +258,11 @@ namespace RandomX { emitByte(AND_EAX_I); int32_t maskL1 = align16 ? ScratchpadL1Mask16 : ScratchpadL1Mask; int32_t maskL2 = align16 ? ScratchpadL2Mask16 : ScratchpadL2Mask; - emit32((instr.alt % 4) ? maskL1 : maskL2); + emit32((instr.mod % 4) ? maskL1 : maskL2); } void JitCompilerX86::genAddressImm(Instruction& instr) { - emit32(instr.imm32 & ((instr.alt % 4) ? 
ScratchpadL1Mask : ScratchpadL2Mask)); + emit32(instr.imm32 & ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)); } void JitCompilerX86::h_IADD_R(Instruction& instr) { @@ -595,6 +596,13 @@ namespace RandomX { } } + void JitCompilerX86::h_ISWAP_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_XCHG); + emitByte(0xc0 + instr.dst + 8 * instr.src); + } + } + void JitCompilerX86::h_FPSWAP_R(Instruction& instr) { emit(SHUFPD); emitByte(0xc0 + 9 * instr.dst); @@ -682,7 +690,7 @@ namespace RandomX { void JitCompilerX86::h_CFROUND(Instruction& instr) { emit(REX_MOV_RR64); emitByte(0xc0 + instr.src); - int rotate = (13 - (instr.alt & 63)) & 63; + int rotate = (13 - (instr.imm32 & 63)) & 63; if (rotate != 0) { emit(ROL_RAX); emitByte(rotate); @@ -691,7 +699,7 @@ namespace RandomX { } static inline uint8_t condition(Instruction& instr, bool invert = false) { - switch ((instr.alt & 7) ^ invert) + switch ((instr.mod & 7) ^ invert) { case 0: return 0x96; //setbe @@ -777,6 +785,7 @@ namespace RandomX { INST_HANDLE(IXOR_M) INST_HANDLE(IROR_R) INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) INST_HANDLE(FPSWAP_R) INST_HANDLE(FPADD_R) INST_HANDLE(FPADD_M) diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index 0aef990..9c85667 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -109,6 +109,7 @@ namespace RandomX { void h_IXOR_M(Instruction&); void h_IROR_R(Instruction&); void h_IROL_R(Instruction&); + void h_ISWAP_R(Instruction&); void h_FPSWAP_R(Instruction&); void h_FPADD_R(Instruction&); void h_FPADD_M(Instruction&); diff --git a/src/asm/program_loop_load.inc b/src/asm/program_loop_load.inc index c4c1fed..76b8f3d 100644 --- a/src/asm/program_loop_load.inc +++ b/src/asm/program_loop_load.inc @@ -1,5 +1,5 @@ mov rdx, rax - and eax, 1048512 + and eax, 2097088 lea rcx, [rsi+rax] push rcx xor r8, qword ptr [rcx+0] @@ -11,7 +11,7 @@ xor r14, qword ptr [rcx+48] xor r15, qword ptr [rcx+56] ror rdx, 32 - and edx, 1048512 + and edx, 2097088 lea 
rcx, [rsi+rdx] push rcx cvtdq2pd xmm0, qword ptr [rcx+0] diff --git a/src/asm/squareHash.inc b/src/asm/squareHash.inc new file mode 100644 index 0000000..b62dc9e --- /dev/null +++ b/src/asm/squareHash.inc @@ -0,0 +1,87 @@ + mov rax, 1613783669344650115 + add rax, rcx + mul rax + sub rax, rdx ;# 1 + mul rax + sub rax, rdx ;# 2 + mul rax + sub rax, rdx ;# 3 + mul rax + sub rax, rdx ;# 4 + mul rax + sub rax, rdx ;# 5 + mul rax + sub rax, rdx ;# 6 + mul rax + sub rax, rdx ;# 7 + mul rax + sub rax, rdx ;# 8 + mul rax + sub rax, rdx ;# 9 + mul rax + sub rax, rdx ;# 10 + mul rax + sub rax, rdx ;# 11 + mul rax + sub rax, rdx ;# 12 + mul rax + sub rax, rdx ;# 13 + mul rax + sub rax, rdx ;# 14 + mul rax + sub rax, rdx ;# 15 + mul rax + sub rax, rdx ;# 16 + mul rax + sub rax, rdx ;# 17 + mul rax + sub rax, rdx ;# 18 + mul rax + sub rax, rdx ;# 19 + mul rax + sub rax, rdx ;# 20 + mul rax + sub rax, rdx ;# 21 + mul rax + sub rax, rdx ;# 22 + mul rax + sub rax, rdx ;# 23 + mul rax + sub rax, rdx ;# 24 + mul rax + sub rax, rdx ;# 25 + mul rax + sub rax, rdx ;# 26 + mul rax + sub rax, rdx ;# 27 + mul rax + sub rax, rdx ;# 28 + mul rax + sub rax, rdx ;# 29 + mul rax + sub rax, rdx ;# 30 + mul rax + sub rax, rdx ;# 31 + mul rax + sub rax, rdx ;# 32 + mul rax + sub rax, rdx ;# 33 + mul rax + sub rax, rdx ;# 34 + mul rax + sub rax, rdx ;# 35 + mul rax + sub rax, rdx ;# 36 + mul rax + sub rax, rdx ;# 37 + mul rax + sub rax, rdx ;# 38 + mul rax + sub rax, rdx ;# 39 + mul rax + sub rax, rdx ;# 40 + mul rax + sub rax, rdx ;# 41 + mul rax + sub rax, rdx ;# 42 + ret \ No newline at end of file diff --git a/src/common.hpp b/src/common.hpp index bbd5a2b..e52dbc2 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -26,11 +26,6 @@ namespace RandomX { using addr_t = uint32_t; - constexpr int RoundToNearest = 0; - constexpr int RoundDown = 1; - constexpr int RoundUp = 2; - constexpr int RoundToZero = 3; - constexpr int SeedSize = 32; constexpr int ResultSize = 32; @@ -46,7 +41,7 @@ namespace 
RandomX { constexpr int CacheBlockCount = CacheSize / CacheLineSize; constexpr int BlockExpansionRatio = DatasetSize / CacheSize; constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount; - constexpr int DatasetIterations = 3; + constexpr int DatasetIterations = 10; #ifdef TRACE @@ -72,12 +67,12 @@ namespace RandomX { convertible_t hi; }; - constexpr int ProgramLength = 128; + constexpr int ProgramLength = 256; constexpr uint32_t InstructionCount = 1024; - constexpr uint32_t ScratchpadSize = 1024 * 1024; + constexpr uint32_t ScratchpadSize = 2 * 1024 * 1024; constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t); - constexpr uint32_t ScratchpadL1 = ScratchpadSize / 64 / sizeof(convertible_t); - constexpr uint32_t ScratchpadL2 = ScratchpadSize / 4 / sizeof(convertible_t); + constexpr uint32_t ScratchpadL1 = ScratchpadSize / 128 / sizeof(convertible_t); + constexpr uint32_t ScratchpadL2 = ScratchpadSize / 8 / sizeof(convertible_t); constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(convertible_t); constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8; constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8; @@ -133,6 +128,8 @@ namespace RandomX { typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t); + typedef bool(*Condition)(convertible_t&, convertible_t&); + extern "C" { void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t); } diff --git a/src/dataset.cpp b/src/dataset.cpp index 6029611..b941a75 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -28,10 +28,11 @@ along with RandomX. If not, see. 
#include "Cache.hpp" #include "virtualMemory.hpp" #include "softAes.h" +#include "squareHash.h" #if defined(__SSE2__) #include -#define PREFETCH(memory) _mm_prefetch((const char *)((memory).ds.dataset + (memory).ma), _MM_HINT_NTA) +#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) #else #define PREFETCH(memory) #endif @@ -49,42 +50,37 @@ namespace RandomX { template void initBlock(const uint8_t* intermediate, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) { - __m128i x0, x1, x2, x3; + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; - __m128i* xit = (__m128i*)intermediate; - __m128i* xout = (__m128i*)out; + r0 = 4ULL * blockNumber; + r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0; - x0 = _mm_cvtsi32_si128(blockNumber); - constexpr int mask = (CacheSize / CacheLineSize) - 1; + constexpr int mask = (CacheSize - 1) & -64; for (auto i = 0; i < DatasetIterations; ++i) { - x0 = aesenc(x0, keys[0]); - //x0 = aesenc(x0, keys[1]); - x1 = aesenc(x0, keys[2]); - //x1 = aesenc(x1, keys[3]); - x2 = aesenc(x1, keys[4]); - //x2 = aesenc(x2, keys[5]); - x3 = aesenc(x2, keys[6]); - //x3 = aesenc(x3, keys[7]); - - int index = _mm_cvtsi128_si32(x3); - index &= mask; - - __m128i t0 = _mm_load_si128(xit + 4 * index + 0); - __m128i t1 = _mm_load_si128(xit + 4 * index + 1); - __m128i t2 = _mm_load_si128(xit + 4 * index + 2); - __m128i t3 = _mm_load_si128(xit + 4 * index + 3); - - x0 = _mm_xor_si128(x0, t0); - x1 = _mm_xor_si128(x1, t1); - x2 = _mm_xor_si128(x2, t2); - x3 = _mm_xor_si128(x3, t3); + uint64_t* mix = (uint64_t*)(intermediate + (r0 & mask)); + PREFETCHNTA(mix); + r0 = squareHash(r0); + r0 ^= mix[0]; + r1 ^= mix[1]; + r2 ^= mix[2]; + r3 ^= mix[3]; + r4 ^= mix[4]; + r5 ^= mix[5]; + r6 ^= mix[6]; + r7 ^= mix[7]; } - _mm_store_si128(xout + 0, x0); - _mm_store_si128(xout + 1, x1); - _mm_store_si128(xout + 2, x2); - _mm_store_si128(xout + 3, x3); + uint64_t* out64 = (uint64_t*)out; + + out64[0] = r0; + out64[1] = r1; + out64[2] = r2; + out64[3] = r3; + out64[4] = 
r4; + out64[5] = r5; + out64[6] = r6; + out64[7] = r7; } template @@ -98,7 +94,7 @@ namespace RandomX { memory.mx ^= addr; memory.mx &= -64; //align to cache line std::swap(memory.mx, memory.ma); - PREFETCH(memory); + PREFETCHNTA(memory.ds.dataset + memory.ma); for (int i = 0; i < RegistersCount; ++i) reg.r[i].u64 ^= datasetLine[i]; } diff --git a/src/hashAes1Rx4.cpp b/src/hashAes1Rx4.cpp index 1f25335..623d4b6 100644 --- a/src/hashAes1Rx4.cpp +++ b/src/hashAes1Rx4.cpp @@ -71,3 +71,44 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) { template void hashAes1Rx4(const void *input, size_t inputSize, void *hash); template void hashAes1Rx4(const void *input, size_t inputSize, void *hash); + +template +void fillAes1Rx4(void *state, size_t outputSize, void *buffer) { + const uint8_t* outptr = (uint8_t*)buffer; + const uint8_t* outputEnd = outptr + outputSize; + + __m128i state0, state1, state2, state3; + __m128i key0, key1, key2, key3; + + key0 = _mm_set_epi32(0x9274f206, 0x79498d2f, 0x7d2de6ab, 0x67a04d26); + key1 = _mm_set_epi32(0xe1f7af05, 0x2a3a6f1d, 0x86658a15, 0x4f719812); + key2 = _mm_set_epi32(0xd1b1f791, 0x9e2ec914, 0x14c77bce, 0xba90750e); + key3 = _mm_set_epi32(0x179d0fd9, 0x6e57883c, 0xa53bbe4f, 0xaa07621f); + + state0 = _mm_load_si128((__m128i*)state + 0); + state1 = _mm_load_si128((__m128i*)state + 1); + state2 = _mm_load_si128((__m128i*)state + 2); + state3 = _mm_load_si128((__m128i*)state + 3); + + while (outptr < outputEnd) { + state0 = aesdec(state0, key0); + state1 = aesenc(state1, key1); + state2 = aesdec(state2, key2); + state3 = aesenc(state3, key3); + + _mm_store_si128((__m128i*)outptr + 0, state0); + _mm_store_si128((__m128i*)outptr + 1, state1); + _mm_store_si128((__m128i*)outptr + 2, state2); + _mm_store_si128((__m128i*)outptr + 3, state3); + + outptr += 64; + } + + _mm_store_si128((__m128i*)state + 0, state0); + _mm_store_si128((__m128i*)state + 1, state1); + _mm_store_si128((__m128i*)state + 2, state2); + 
_mm_store_si128((__m128i*)state + 3, state3); +} + +template void fillAes1Rx4(void *state, size_t outputSize, void *buffer); +template void fillAes1Rx4(void *state, size_t outputSize, void *buffer); diff --git a/src/hashAes1Rx4.hpp b/src/hashAes1Rx4.hpp index a9af1fc..8c0c156 100644 --- a/src/hashAes1Rx4.hpp +++ b/src/hashAes1Rx4.hpp @@ -20,4 +20,7 @@ along with RandomX. If not, see. #include "softAes.h" template -void hashAes1Rx4(const void *input, size_t inputSize, void *hash); \ No newline at end of file +void hashAes1Rx4(const void *input, size_t inputSize, void *hash); + +template +void fillAes1Rx4(void *state, size_t outputSize, void *buffer); diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 55c9b79..d24800e 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -37,8 +37,9 @@ along with RandomX. If not, see. #define WT_INEG_R 2 #define WT_IXOR_R 12 #define WT_IXOR_M 3 -#define WT_IROR_R 12 -#define WT_IROL_R 12 +#define WT_IROR_R 10 +#define WT_IROL_R 10 +#define WT_ISWAP_R 4 //Common floating point #define WT_FPSWAP_R 8 @@ -72,7 +73,7 @@ constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \ WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \ WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \ WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \ -WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \ +WT_ISWAP_R + WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \ WT_FPNEG_R + WT_FPMUL_R + WT_FPMUL_M + WT_FPDIV_R + WT_FPDIV_M + \ WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP; diff --git a/src/instructionsPortable.cpp b/src/instructionsPortable.cpp index 78bdb6f..9e1eff1 100644 --- a/src/instructionsPortable.cpp +++ b/src/instructionsPortable.cpp @@ -17,7 +17,6 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. 
*/ //#define DEBUG -#include "instructions.hpp" #include "intrinPortable.h" #pragma STDC FENV_ACCESS on #include @@ -29,14 +28,14 @@ along with RandomX. If not, see. #if defined(__SIZEOF_INT128__) typedef unsigned __int128 uint128_t; typedef __int128 int128_t; - static inline uint64_t __umulhi64(uint64_t a, uint64_t b) { + uint64_t mulh(uint64_t a, uint64_t b) { return ((uint128_t)a * b) >> 64; } - static inline uint64_t __imulhi64(int64_t a, int64_t b) { + int64_t smulh(int64_t a, int64_t b) { return ((int128_t)a * b) >> 64; } - #define umulhi64 __umulhi64 - #define imulhi64 __imulhi64 + #define HAVE_MULH + #define HAVE_SMULH #endif #if defined(_MSC_VER) @@ -44,62 +43,62 @@ along with RandomX. If not, see. #define EVAL_DEFINE(X) HAS_VALUE(X) #include #include - #define ror64 _rotr64 - #define rol64 _rotl64 + + uint64_t rotl(uint64_t x, int c) { + return _rotl64(x, c); + } + uint64_t rotr(uint64_t x , int c) { + return _rotr64(x, c); + } + #define HAVE_ROTL + #define HAVE_ROTR + #if EVAL_DEFINE(__MACHINEARM64_X64(1)) - #define umulhi64 __umulh + uint64_t mulh(uint64_t a, uint64_t b) { + return __umulh(a, b); + } + #define HAVE_MULH #endif + #if EVAL_DEFINE(__MACHINEX64(1)) - static inline uint64_t __imulhi64(int64_t a, int64_t b) { + int64_t smulh(int64_t a, int64_t b) { int64_t hi; _mul128(a, b, &hi); return hi; } - #define imulhi64 __imulhi64 + #define HAVE_SMULH #endif - static inline uint32_t _setRoundMode(uint32_t mode) { - return _controlfp(mode, _MCW_RC); + + static void setRoundMode__(uint32_t mode) { + _controlfp(mode, _MCW_RC); } - #define setRoundMode _setRoundMode + #define HAVE_SETROUNDMODE_IMPL #endif -#ifndef setRoundMode - #define setRoundMode fesetround +#ifndef HAVE_SETROUNDMODE_IMPL + static void setRoundMode__(uint32_t mode) { + fesetround(mode); + } #endif -#ifndef ror64 - static inline uint64_t __ror64(uint64_t a, int b) { +#ifndef HAVE_ROTR + uint64_t rotr(uint64_t a, int b) { return (a >> b) | (a << (64 - b)); } - #define ror64 __ror64 + 
#define HAS_ROTR #endif -#ifndef rol64 - static inline uint64_t __rol64(uint64_t a, int b) { +#ifndef HAVE_ROTL + uint64_t rotl(uint64_t a, int b) { return (a << b) | (a >> (64 - b)); } - #define rol64 __rol64 + #define HAS_ROTL #endif -#ifndef sar64 - #include - constexpr int64_t builtintShr64(int64_t value, int shift) noexcept { - return value >> shift; - } - - struct UsesArithmeticShift : std::integral_constant { - }; - - static inline int64_t __sar64(int64_t a, int b) { - return UsesArithmeticShift::value ? builtintShr64(a, b) : (a < 0 ? ~(~a >> b) : a >> b); - } - #define sar64 __sar64 -#endif - -#ifndef umulhi64 +#ifndef HAVE_MULH #define LO(x) ((x)&0xffffffff) #define HI(x) ((x)>>32) - static inline uint64_t __umulhi64(uint64_t a, uint64_t b) { + uint64_t mulh(uint64_t a, uint64_t b) { uint64_t ah = HI(a), al = LO(a); uint64_t bh = HI(b), bl = LO(b); uint64_t x00 = al * bl; @@ -112,17 +111,17 @@ along with RandomX. If not, see. return (m3 << 32) + LO(m2); } - #define umulhi64 __umulhi64 + #define HAVE_MULH #endif -#ifndef imulhi64 - static inline int64_t __imulhi64(int64_t a, int64_t b) { - int64_t hi = umulhi64(a, b); +#ifndef HAVE_SMULH + int64_t smulh(int64_t a, int64_t b) { + int64_t hi = mulh(a, b); if (a < 0LL) hi -= b; if (b < 0LL) hi -= a; return hi; } - #define imulhi64 __imulhi64 + #define HAVE_SMULH #endif // avoid undefined behavior of signed overflow @@ -137,20 +136,20 @@ static inline int32_t safeSub(int32_t a, int32_t b) { #if defined(__has_builtin) #if __has_builtin(__builtin_sub_overflow) - static inline bool __subOverflow(int32_t a, int32_t b) { + static inline bool subOverflow__(int32_t a, int32_t b) { int32_t temp; return __builtin_sub_overflow(a, b, &temp); } - #define subOverflow __subOverflow + #define HAVE_SUB_OVERFLOW #endif #endif -#ifndef subOverflow - static inline bool __subOverflow(int32_t a, int32_t b) { +#ifndef HAVE_SUB_OVERFLOW + static inline bool subOverflow__(int32_t a, int32_t b) { auto c = safeSub(a, b); return (c < a) 
!= (b > 0); } - #define subOverflow __subOverflow + #define HAVE_SUB_OVERFLOW #endif static inline double FlushDenormalNaN(double x) { @@ -165,47 +164,57 @@ static inline double FlushNaN(double x) { return x != x ? 0.0 : x; } +void setRoundMode(uint32_t rcflag) { + switch (rcflag & 3) { + case RoundDown: + setRoundMode__(FE_DOWNWARD); + break; + case RoundUp: + setRoundMode__(FE_UPWARD); + break; + case RoundToZero: + setRoundMode__(FE_TOWARDZERO); + break; + default: + setRoundMode__(FE_TONEAREST); + break; + } +} + +bool condition(uint32_t type, int32_t value, int32_t imm32) { + switch (type & 7) + { + case 0: + return (uint32_t)value <= (uint32_t)imm32; + case 1: + return (uint32_t)value > (uint32_t)imm32; + case 2: + return safeSub(value, imm32) < 0; + case 3: + return safeSub(value, imm32) >= 0; + case 4: + return subOverflow__(value, imm32); + case 5: + return !subOverflow__(value, imm32); + case 6: + return value < imm32; + case 7: + return value >= imm32; + } +} + +void initFpu() { +#ifdef __SSE2__ + _mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled +#else + setRoundMode(FE_TONEAREST); +#endif +} + namespace RandomX { extern "C" { - - void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 + b.u64; - } - - void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u32 + b.u32; - } - - void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 - b.u64; - } - - void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u32 - b.u32; - } - - void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 * b.u64; - } - - void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = umulhi64(a.u64, b.u64); - } - - void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = (uint64_t)a.u32 * b.u32; - } - - void IMUL_32(convertible_t& a, convertible_t& b, 
convertible_t& c) { - c.i64 = (int64_t)a.i32 * b.i32; - } - - void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.i64 = imulhi64(a.i64, b.i64); - } - - void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) { + /*void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) { c.u64 = a.u64 / (b.u32 != 0 ? b.u32 : 1U); } @@ -216,80 +225,6 @@ namespace RandomX { c.i64 = a.i64 / (b.i32 != 0 ? b.i32 : 1); } - void AND_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 & b.u64; - } - - void AND_32(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u32 & b.u32; - } - - void OR_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 | b.u64; - } - - void OR_32(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u32 | b.u32; - } - - void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 ^ b.u64; - } - - void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u32 ^ b.u32; - } - - void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 << (b.u64 & 63); - } - - void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 >> (b.u64 & 63); - } - - void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = sar64(a.i64, b.u64 & 63); - } - - void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = rol64(a.u64, (b.u64 & 63)); - } - - void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = ror64(a.u64, (b.u64 & 63)); - } - - bool JMP_COND(uint8_t type, convertible_t& regb, int32_t imm32) { - switch (type & 7) - { - case 0: - return regb.u32 <= (uint32_t)imm32; - case 1: - return regb.u32 > (uint32_t)imm32; - case 2: - return safeSub(regb.i32, imm32) < 0; - case 3: - return safeSub(regb.i32, imm32) >= 0; - case 4: - return subOverflow(regb.i32, imm32); - case 5: - return !subOverflow(regb.i32, imm32); - case 
6: - return regb.i32 < imm32; - case 7: - return regb.i32 >= imm32; - } - } - - void FPINIT() { -#ifdef __SSE2__ - _mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled -#else - setRoundMode(FE_TONEAREST); -#endif - } - void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { #ifdef __SSE2__ __m128i ai = _mm_loadl_epi64((const __m128i*)&a); @@ -368,48 +303,8 @@ namespace RandomX { c.lo.f64 = sqrt(std::abs(alo)); c.hi.f64 = sqrt(std::abs(ahi)); #endif - } + }*/ + - void FPROUND(convertible_t a, uint8_t rot) { - uint64_t flag = ror64(a.u64, rot); - switch (flag & 3) { - case RoundDown: -#ifdef DEBUG - std::cout << "Round FE_DOWNWARD (" << FE_DOWNWARD << ") = " << -#endif - setRoundMode(FE_DOWNWARD); -#ifdef DEBUG - std::cout << std::endl; -#endif - break; - case RoundUp: -#ifdef DEBUG - std::cout << "Round FE_UPWARD (" << FE_UPWARD << ") = " << -#endif - setRoundMode(FE_UPWARD); -#ifdef DEBUG - std::cout << std::endl; -#endif - break; - case RoundToZero: -#ifdef DEBUG - std::cout << "Round FE_TOWARDZERO (" << FE_TOWARDZERO << ") = " << -#endif - setRoundMode(FE_TOWARDZERO); -#ifdef DEBUG - std::cout << std::endl; -#endif - break; - default: -#ifdef DEBUG - std::cout << "Round FE_TONEAREST (" << FE_TONEAREST << ") = " << -#endif - setRoundMode(FE_TONEAREST); -#ifdef DEBUG - std::cout << std::endl; -#endif - break; - } - } } } \ No newline at end of file diff --git a/src/intrinPortable.h b/src/intrinPortable.h index 3a473a2..3d2136c 100644 --- a/src/intrinPortable.h +++ b/src/intrinPortable.h @@ -19,6 +19,8 @@ along with RandomX. If not, see. 
#pragma once +#include + #if defined(_MSC_VER) #if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2) #define __SSE2__ 1 @@ -45,6 +47,18 @@ typedef union { uint8_t u8[16]; } __m128i; +typedef struct { + double lo; + double hi; +} __m128d; + +inline __m128d _mm_load_pd(const double* pd) { + __m128d x; + x.lo = *(pd + 0); + x.hi = *(pd + 1); + return x; +} + static const char* platformError = "Platform doesn't support hardware AES"; inline __m128i _mm_aeskeygenassist_si128(__m128i key, uint8_t rcon) { @@ -131,4 +145,17 @@ inline __m128i _mm_slli_si128(__m128i _A, int _Imm) { return _A; } -#endif \ No newline at end of file +#endif + +constexpr int RoundToNearest = 0; +constexpr int RoundDown = 1; +constexpr int RoundUp = 2; +constexpr int RoundToZero = 3; + +uint64_t mulh(uint64_t, uint64_t); +int64_t smulh(int64_t, int64_t); +uint64_t rotl(uint64_t, int); +uint64_t rotr(uint64_t, int); +void initFpu(); +void setRoundMode(uint32_t); +bool condition(uint32_t, int32_t, int32_t); diff --git a/src/main.cpp b/src/main.cpp index 4f5a021..c761b97 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -35,6 +35,7 @@ along with RandomX. If not, see. 
#include "dataset.hpp" #include "Cache.hpp" #include "Pcg32.hpp" +#include "hashAes1Rx4.hpp" const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; @@ -153,7 +154,7 @@ void generateNative(int nonce) { } void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash& result, int noncesCount, int thread, uint8_t* scratchpad) { - uint64_t hash[4]; + alignas(16) uint64_t hash[8]; unsigned char blockTemplate[] = { 0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14, 0x5a, 0xc5, 0xfa, 0xd3, 0xaa, 0x3a, 0xf6, 0xea, 0x44, 0xc1, 0x18, 0x69, 0xdc, 0x4f, 0x85, 0x3f, 0x00, 0x2b, 0x2e, @@ -167,8 +168,8 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash //std::cout << "Thread " << thread << " nonce " << nonce << std::endl; *noncePtr = nonce; blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); - int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8); - vm->initializeScratchpad(scratchpad, spIndex); + fillAes1Rx4((void*)hash, RandomX::ScratchpadSize, scratchpad); + //vm->initializeScratchpad(scratchpad, spIndex); vm->setScratchpad(scratchpad); //dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt"); for (int chain = 0; chain < 16; ++chain) { @@ -309,7 +310,7 @@ int main(int argc, char** argv) { } uint8_t* scratchpadMem; if (largePages) { - scratchpadMem = (uint8_t*)allocLargePagesMemory(RandomX::ScratchpadSize * (threadCount + 1) / 2); + scratchpadMem = (uint8_t*)allocLargePagesMemory(threadCount * RandomX::ScratchpadSize); } else { scratchpadMem = (uint8_t*)_mm_malloc(threadCount * RandomX::ScratchpadSize, RandomX::CacheLineSize); diff --git a/src/squareHash.S b/src/squareHash.S new file mode 100644 index 0000000..4cd3b54 --- /dev/null +++ b/src/squareHash.S @@ 
-0,0 +1,17 @@ +.intel_syntax noprefix +#if defined(__APPLE__) +.text +#else +.section .text +#endif +#if defined(__WIN32__) || defined(__APPLE__) +#define DECL(x) _##x +#else +#define DECL(x) x +#endif + +.global DECL(squareHash) + +DECL(squareHash): + mov rcx, rsi + #include "asm/squareHash.inc" diff --git a/src/squareHash.asm b/src/squareHash.asm new file mode 100644 index 0000000..4433719 --- /dev/null +++ b/src/squareHash.asm @@ -0,0 +1,9 @@ +PUBLIC squareHash + +.code + +squareHash PROC + include asm/squareHash.inc +squareHash ENDP + +END \ No newline at end of file diff --git a/src/squareHash.h b/src/squareHash.h new file mode 100644 index 0000000..f80b492 --- /dev/null +++ b/src/squareHash.h @@ -0,0 +1,71 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. 
+*/ + +#include + +#if !defined(_M_X64) && !defined(__x86_64__) + +typedef struct { + uint64_t lo; + uint64_t hi; +} uint128_t; + +#define LO(x) ((x)&0xffffffff) +#define HI(x) ((x)>>32) +static inline uint128_t square128(uint64_t x) { + uint64_t xh = HI(x), xl = LO(x); + uint64_t xll = xl * xl; + uint64_t xlh = xl * xh; + uint64_t xhh = xh * xh; + uint64_t m1 = 2 * LO(xlh) + HI(xll); + uint64_t m2 = 2 * HI(xlh) + LO(xhh) + HI(m1); + uint64_t m3 = HI(xhh) + HI(m2); + + uint128_t x2; + + x2.lo = (m1 << 32) + LO(xll); + x2.hi = (m3 << 32) + LO(m2); + + return x2; +} +#undef LO(x) +#undef HI(x) + +inline uint64_t squareHash(uint64_t x) { + x += 1613783669344650115; + for (int i = 0; i < 42; ++i) { + uint128_t x2 = square128(x); + x = x2.lo - x2.hi; + } + return x; +} + +#else + +#if defined(__cplusplus) +extern "C" { +#endif + +uint64_t squareHash(uint64_t); + +#if defined(__cplusplus) +} +#endif + +#endif \ No newline at end of file