From 2132e5fef5a47b3870aa4e655750ef46ce9d5f82 Mon Sep 17 00:00:00 2001 From: tevador Date: Thu, 11 Apr 2019 00:01:22 +0200 Subject: [PATCH] SuperscalarHash interpreter Linux assembly code --- makefile | 5 +- src/Instruction.hpp | 2 +- src/InterpretedVirtualMachine.cpp | 173 ++++++++++++++++--- src/InterpretedVirtualMachine.hpp | 26 ++- src/JitCompilerX86-static.S | 88 ++++++++++ src/JitCompilerX86-static.asm | 28 +-- src/LightProgramGenerator.cpp | 5 +- src/asm/program_read_dataset_sshash_fin.inc | 10 ++ src/asm/program_read_dataset_sshash_init.inc | 16 ++ src/asm/program_sshash_constants.inc | 24 ++- src/main.cpp | 7 +- 11 files changed, 310 insertions(+), 74 deletions(-) create mode 100644 src/asm/program_read_dataset_sshash_fin.inc create mode 100644 src/asm/program_read_dataset_sshash_init.inc diff --git a/makefile b/makefile index cd49f88..7dde5ae 100644 --- a/makefile +++ b/makefile @@ -9,7 +9,7 @@ OBJDIR=obj LDFLAGS=-lpthread CPPSRC=src/argon2_core.c src/Cache.cpp src/divideByConstantCodegen.c src/Instruction.cpp src/JitCompilerX86.cpp src/Program.cpp src/VirtualMachine.cpp src/argon2_ref.c src/CompiledVirtualMachine.cpp src/executeProgram-linux.cpp src/instructionsPortable.cpp src/LightClientAsyncWorker.cpp src/softAes.cpp src/virtualMemory.cpp src/AssemblyGeneratorX86.cpp src/dataset.cpp src/hashAes1Rx4.cpp src/InterpretedVirtualMachine.cpp src/main.cpp src/TestAluFpu.cpp src/blake2/blake2b.c TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) -ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o CompiledLightVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o Cache.o virtualMemory.o reciprocal.o LightClientAsyncWorker.o hashAes1Rx4.o) +ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o CompiledLightVirtualMachine.o 
dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o softAes.o VirtualMachine.o Cache.o virtualMemory.o reciprocal.o LightClientAsyncWorker.o hashAes1Rx4.o LightProgramGenerator.o) ifeq ($(PLATFORM),amd64) ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o CXXFLAGS += -maes @@ -99,6 +99,9 @@ $(OBJDIR)/InterpretedVirtualMachine.o: $(addprefix $(SRCDIR)/,InterpretedVirtual $(OBJDIR)/LightClientAsyncWorker.o: $(addprefix $(SRCDIR)/,LightClientAsyncWorker.cpp LightClientAsyncWorker.hpp common.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/LightClientAsyncWorker.cpp -o $@ + +$(OBJDIR)/LightProgramGenerator.o: $(addprefix $(SRCDIR)/,LightProgramGenerator.cpp LightProgramGenerator.hpp Program.hpp blake2/blake2.h blake2/endian.h configuration.h) | $(OBJDIR) + $(CXX) $(CXXFLAGS) -c $(SRCDIR)/LightProgramGenerator.cpp -o $@ $(OBJDIR)/main.o: $(addprefix $(SRCDIR)/,main.cpp InterpretedVirtualMachine.hpp Stopwatch.hpp blake2/blake2.h VirtualMachine.hpp common.hpp blake2/endian.h Program.hpp Instruction.hpp intrinPortable.h CompiledVirtualMachine.hpp JitCompilerX86.hpp AssemblyGeneratorX86.hpp dataset.hpp Cache.hpp virtualMemory.hpp hashAes1Rx4.hpp softAes.h configuration.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/main.cpp -o $@ diff --git a/src/Instruction.hpp b/src/Instruction.hpp index 65d1c8a..9baf8ce 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -30,7 +30,7 @@ namespace RandomX { typedef void(Instruction::*InstructionVisualizer)(std::ostream&) const; namespace InstructionType { - constexpr int IADD_R = 0; + /* replaces IADD_R; the interpreter executes this type as dst += (src << shift) + imm */ constexpr int IADD_RS = 0; constexpr int IADD_M = 1; constexpr int IADD_RC = 2; constexpr int ISUB_R = 3; diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 636b95b..7ee00ba 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -36,6 +36,7 @@ along with RandomX. If not, see. 
#ifdef STATS #include #endif +#include "LightProgramGenerator.hpp" #ifdef FPUCHECK constexpr bool fpuCheck = true; @@ -45,17 +46,20 @@ constexpr bool fpuCheck = false; namespace RandomX { - InterpretedVirtualMachine::~InterpretedVirtualMachine() { - - } - - void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { + template + void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { mem.ds = ds; readDataset = &datasetReadLight; datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize; + /* superscalar mode: copy the programs and precompute the IMUL_RCP reciprocals */ if(superscalar) + precompileSuperscalar(programs); } - void InterpretedVirtualMachine::initialize() { + template void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + template void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + + template + void InterpretedVirtualMachine::initialize() { VirtualMachine::initialize(); for (unsigned i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { program(i).src %= RegistersCount; @@ -63,12 +67,19 @@ namespace RandomX { } } - void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + template void InterpretedVirtualMachine::initialize(); + template void InterpretedVirtualMachine::initialize(); + + template + void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { for (int ic = 0; ic < RANDOMX_PROGRAM_SIZE; ++ic) { executeBytecode(ic, r, f, e, a); } } + template void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + template void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + static void print(int_reg_t r) { std::cout << 
std::hex << std::setw(16) << std::setfill('0') << r << std::endl; } @@ -98,14 +109,15 @@ namespace RandomX { return std::fpclassify(x) == FP_SUBNORMAL; } - FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int& ic, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + template + FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int& ic, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { auto& ibc = byteCode[ic]; if (trace) std::cout << std::dec << std::setw(3) << ic << " " << program(ic); //if(trace) printState(r, f, e, a); switch (ibc.type) { - case InstructionType::IADD_R: { - *ibc.idst += *ibc.isrc; + case InstructionType::IADD_RS: { + *ibc.idst += (*ibc.isrc << ibc.shift) + ibc.imm; } break; case InstructionType::IADD_M: { @@ -289,7 +301,8 @@ namespace RandomX { #endif } - void InterpretedVirtualMachine::execute() { + template + void InterpretedVirtualMachine::execute() { int_reg_t r[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; __m128d f[4]; __m128d e[4]; @@ -350,11 +363,16 @@ namespace RandomX { mem.mx ^= r[readReg2] ^ r[readReg3]; mem.mx &= CacheLineAlignMask; - Cache& cache = mem.ds.cache; - uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; - initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8); - for (int i = 0; i < RegistersCount; ++i) - r[i] ^= datasetLine[i]; + /* superscalar path: executeSuperscalar XORs the computed dataset item into r */ if (superscalar) { + executeSuperscalar(datasetBase + mem.ma / CacheLineSize, r); + } + else { + Cache& cache = mem.ds.cache; + uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; + initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8); + for (int i = 0; i < RegistersCount; ++i) + r[i] ^= datasetLine[i]; + } std::swap(mem.mx, mem.ma); if (trace) { @@ -419,6 +437,9 @@ namespace RandomX { _mm_store_pd(®.e[3].lo, e[3]); } + template void InterpretedVirtualMachine::execute(); + template void InterpretedVirtualMachine::execute(); + static 
int getConditionRegister(int(®isterUsage)[8]) { int min = INT_MAX; int minIndex; @@ -431,9 +452,118 @@ namespace RandomX { return minIndex; } + /* Seed constants for executeSuperscalar: rl[0] = (blockNumber + 1) * superscalarMul0 and rl[k] = rl[0] ^ superscalarAddk. NOTE: despite the Add names, these values are XORed, not added. The same numbers are listed in asm/program_sshash_constants.inc. */ constexpr uint64_t superscalarMul0 = 6364136223846793005ULL; + constexpr uint64_t superscalarAdd1 = 9298410992540426048ULL; + constexpr uint64_t superscalarAdd2 = 12065312585734608966ULL; + constexpr uint64_t superscalarAdd3 = 9306329213124610396ULL; + constexpr uint64_t superscalarAdd4 = 5281919268842080866ULL; + constexpr uint64_t superscalarAdd5 = 10536153434571861004ULL; + constexpr uint64_t superscalarAdd6 = 3398623926847679864ULL; + constexpr uint64_t superscalarAdd7 = 9549104520008361294ULL; + + /* Returns the address of the CacheLineSize-byte block of cache.memory selected by registerValue: masked with a compile-time mask when RANDOMX_ARGON_GROWTH == 0, otherwise reduced modulo cache.size / CacheLineSize. */ static uint8_t* getMixBlock(uint64_t registerValue, Cache& cache) { + uint8_t* mixBlock; + if (RANDOMX_ARGON_GROWTH == 0) { + constexpr uint32_t mask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1); + mixBlock = cache.memory + (registerValue & mask) * CacheLineSize; + } + else { + const uint32_t modulus = cache.size / CacheLineSize; + mixBlock = cache.memory + (registerValue % modulus) * CacheLineSize; + } + return mixBlock; + } + + /* Interprets the RANDOMX_CACHE_ACCESSES programs in superScalarPrograms for the given blockNumber. Eight local registers rl are seeded from blockNumber. In each iteration getMixBlock picks a cache block, the program is run on rl, rl is XORed with eight values read by load64 at 8-byte steps from that block, and rl[prog.getAddressRegister()] selects the block for the next iteration. The final rl values are XORed into r. */ template + void InterpretedVirtualMachine::executeSuperscalar(uint32_t blockNumber, int_reg_t(&r)[8]) { + int_reg_t rl[8]; + uint8_t* mixBlock; + uint64_t registerValue = blockNumber; + rl[0] = (blockNumber + 1) * superscalarMul0; + rl[1] = rl[0] ^ superscalarAdd1; + rl[2] = rl[0] ^ superscalarAdd2; + rl[3] = rl[0] ^ superscalarAdd3; + rl[4] = rl[0] ^ superscalarAdd4; + rl[5] = rl[0] ^ superscalarAdd5; + rl[6] = rl[0] ^ superscalarAdd6; + rl[7] = rl[0] ^ superscalarAdd7; + Cache& cache = mem.ds.cache; + for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { + mixBlock = getMixBlock(registerValue, cache); + LightProgram& prog = superScalarPrograms[i]; + for (unsigned j = 0; j < prog.getSize(); ++j) { + Instruction& instr = prog(j); + switch (instr.opcode) + { + case RandomX::LightInstructionType::ISUB_R: + rl[instr.dst] -= rl[instr.src]; + break; + case 
RandomX::LightInstructionType::IXOR_R: + rl[instr.dst] ^= rl[instr.src]; + break; + case RandomX::LightInstructionType::IADD_RS: + rl[instr.dst] += rl[instr.src] << (instr.mod % 4); + break; + case RandomX::LightInstructionType::IMUL_R: + rl[instr.dst] *= rl[instr.src]; + break; + case RandomX::LightInstructionType::IROR_C: + rl[instr.dst] = rotr(rl[instr.dst], instr.getImm32()); + break; + case RandomX::LightInstructionType::IADD_C7: + case RandomX::LightInstructionType::IADD_C8: + case RandomX::LightInstructionType::IADD_C9: + rl[instr.dst] += signExtend2sCompl(instr.getImm32()); + break; + case RandomX::LightInstructionType::IXOR_C7: + case RandomX::LightInstructionType::IXOR_C8: + case RandomX::LightInstructionType::IXOR_C9: + rl[instr.dst] ^= signExtend2sCompl(instr.getImm32()); + break; + case RandomX::LightInstructionType::IMULH_R: + rl[instr.dst] = mulh(rl[instr.dst], rl[instr.src]); + break; + case RandomX::LightInstructionType::ISMULH_R: + rl[instr.dst] = smulh(rl[instr.dst], rl[instr.src]); + break; + /* imm32 is an index into reciprocals here; precompileSuperscalar rewrote it */ case RandomX::LightInstructionType::IMUL_RCP: + rl[instr.dst] *= reciprocals[instr.getImm32()]; + break; + default: + UNREACHABLE; + } + } + + for(unsigned q = 0; q < 8; ++q) + rl[q] ^= load64(mixBlock + 8 * q); + + registerValue = rl[prog.getAddressRegister()]; + } + + for (unsigned q = 0; q < 8; ++q) + r[q] ^= rl[q]; + } + + /* Copies the programs into superScalarPrograms with memcpy, then for every IMUL_RCP instruction stores reciprocal(imm32) in reciprocals and replaces imm32 with the index of that entry. NOTE(review): the memcpy assumes LightProgram is trivially copyable - confirm. */ template + void InterpretedVirtualMachine::precompileSuperscalar(LightProgram* programs) { + memcpy(superScalarPrograms, programs, sizeof(superScalarPrograms)); + reciprocals.clear(); + for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { + for (unsigned j = 0; j < superScalarPrograms[i].getSize(); ++j) { + Instruction& instr = superScalarPrograms[i](j); + if (instr.opcode == LightInstructionType::IMUL_RCP) { + auto rcp = reciprocal(instr.getImm32()); + instr.setImm32(reciprocals.size()); + reciprocals.push_back(rcp); + } + } + } + } + + #include "instructionWeights.hpp" - void 
InterpretedVirtualMachine::precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + template + void InterpretedVirtualMachine::precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { int registerUsage[8]; for (unsigned i = 0; i < 8; ++i) { registerUsage[i] = -1; @@ -445,14 +575,17 @@ namespace RandomX { CASE_REP(IADD_RS) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; - ibc.type = InstructionType::IADD_R; + ibc.type = InstructionType::IADD_RS; ibc.idst = &r[dst]; - if (src != dst) { + /* NOTE(review): both branches set isrc and shift the same way; only imm differs (0 here, sign-extended imm32 when dst == 5). Why register index 5 is special is not visible in this hunk. */ if (dst != 5) { ibc.isrc = &r[src]; + ibc.shift = instr.mod % 4; + ibc.imm = 0; } else { + ibc.isrc = &r[src]; + ibc.shift = instr.mod % 4; ibc.imm = signExtend2sCompl(instr.getImm32()); - ibc.isrc = &ibc.imm; } registerUsage[instr.dst] = i; } break; diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index 49178bc..24bb9c6 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -23,23 +23,17 @@ along with RandomX. If not, see. 
#include "VirtualMachine.hpp" #include "Program.hpp" #include "intrinPortable.h" +#include namespace RandomX { - class ITransform { - public: - virtual int32_t apply(int32_t) const = 0; - virtual const char* getName() const = 0; - virtual std::ostream& printAsm(std::ostream&) const = 0; - virtual std::ostream& printCxx(std::ostream&) const = 0; - }; - struct InstructionByteCode; - class InterpretedVirtualMachine; + template class InterpretedVirtualMachine; - typedef void(InterpretedVirtualMachine::*InstructionHandler)(Instruction&); + template + using InstructionHandler = void(InterpretedVirtualMachine::*)(Instruction&); - struct alignas(8) InstructionByteCode { + struct InstructionByteCode { union { int_reg_t* idst; __m128d* fdst; @@ -62,6 +56,7 @@ namespace RandomX { constexpr int asedwfagdewsa = sizeof(InstructionByteCode); + template class InterpretedVirtualMachine : public VirtualMachine { public: void* operator new(size_t size) { @@ -74,16 +69,17 @@ namespace RandomX { _mm_free(ptr); } InterpretedVirtualMachine(bool soft) : softAes(soft) {} - ~InterpretedVirtualMachine(); + ~InterpretedVirtualMachine() {} void setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; void initialize() override; void execute() override; private: - static InstructionHandler engine[256]; + static InstructionHandler engine[256]; DatasetReadFunc readDataset; bool softAes; InstructionByteCode byteCode[RANDOMX_PROGRAM_SIZE]; - + /* reciprocal table filled by precompileSuperscalar; IMUL_RCP instructions index it through their rewritten imm32 */ std::vector reciprocals; + /* private copy of the programs passed to setDataset, made by precompileSuperscalar */ alignas(64) LightProgram superScalarPrograms[RANDOMX_CACHE_ACCESSES]; #ifdef STATS int count_ADD_64 = 0; int count_ADD_32 = 0; @@ -131,7 +127,9 @@ namespace RandomX { int datasetAccess[256] = { 0 }; #endif void precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + /* copies the programs and builds the reciprocals table */ void precompileSuperscalar(LightProgram*); void executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); void executeBytecode(int& i, int_reg_t(&r)[8], __m128d 
(&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + /* runs the stored programs for blockNumber and XORs the result into r */ void executeSuperscalar(uint32_t blockNumber, int_reg_t(&r)[8]); }; } \ No newline at end of file diff --git a/src/JitCompilerX86-static.S b/src/JitCompilerX86-static.S index 9ccdb16..e78dbe7 100644 --- a/src/JitCompilerX86-static.S +++ b/src/JitCompilerX86-static.S @@ -32,10 +32,18 @@ .global DECL(randomx_program_start) .global DECL(randomx_program_read_dataset) .global DECL(randomx_program_read_dataset_light) +.global DECL(randomx_program_read_dataset_sshash_init) +.global DECL(randomx_program_read_dataset_sshash_fin) +.global DECL(randomx_program_read_dataset_light_sub) /* NOTE(review): this symbol is declared .global again a few lines below; one of the two directives is redundant */ +.global DECL(randomx_dataset_init) .global DECL(randomx_program_loop_store) .global DECL(randomx_program_loop_end) .global DECL(randomx_program_read_dataset_light_sub) .global DECL(randomx_program_epilogue) +.global DECL(randomx_sshash_load) +.global DECL(randomx_sshash_prefetch) +.global DECL(randomx_sshash_end) +.global DECL(randomx_sshash_init) .global DECL(randomx_program_end) #define db .byte @@ -63,6 +71,12 @@ DECL(randomx_program_read_dataset): DECL(randomx_program_read_dataset_light): #include "asm/program_read_dataset_light.inc" +DECL(randomx_program_read_dataset_sshash_init): + #include "asm/program_read_dataset_sshash_init.inc" + +DECL(randomx_program_read_dataset_sshash_fin): + #include "asm/program_read_dataset_sshash_fin.inc" + DECL(randomx_program_loop_store): #include "asm/program_loop_store.inc" @@ -75,10 +89,84 @@ DECL(randomx_program_read_dataset_light_sub): squareHashSub: #include "asm/squareHash.inc" +.balign 64 +DECL(randomx_dataset_init): /* Dataset fill loop. Register use per the comments below: rdi = cache, rsi = dataset, rdx = starting block index, rcx = max. block index. Each iteration copies the block index to rbx, calls the code located 32768 bytes after this label (hand-encoded E8 call) and stores r8-r15 to the 64 bytes at rsi. The bound is tested at the end, so the body runs at least once. The call target is not defined in this file; presumably generated code placed there at run time - confirm. */ + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + ;# cache in rdi + ;# dataset in rsi + mov rbp, rdx ;# block index + push rcx ;# max. 
block index +init_block_loop: + prefetchw byte ptr [rsi] + mov rbx, rbp + .byte 232 ;# 0xE8 = call + ;# .set CALL_LOC, + .int 32768 - (call_offset - DECL(randomx_dataset_init)) +call_offset: + mov qword ptr [rsi+0], r8 + mov qword ptr [rsi+8], r9 + mov qword ptr [rsi+16], r10 + mov qword ptr [rsi+24], r11 + mov qword ptr [rsi+32], r12 + mov qword ptr [rsi+40], r13 + mov qword ptr [rsi+48], r14 + mov qword ptr [rsi+56], r15 + add rbp, 1 + add rsi, 64 + cmp rbp, qword ptr [rsp] + jb init_block_loop + pop rcx + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + ret + .balign 64 DECL(randomx_program_epilogue): #include "asm/program_epilogue_linux.inc" +.balign 64 +DECL(randomx_sshash_load): + #include "asm/program_sshash_load.inc" + +DECL(randomx_sshash_prefetch): + #include "asm/program_sshash_prefetch.inc" + +DECL(randomx_sshash_end): + nop + +.balign 64 +DECL(randomx_sshash_init): /* Seeds r8-r15 from the block number in rbx: r8 = (rbx + 1) * r0_mul, then rN = rN_add ^ r8 for r9-r15 (assuming the included prefetch snippet leaves r8 intact - confirm). Mirrors the register seeding in InterpretedVirtualMachine::executeSuperscalar. */ + lea r8, [rbx+1] + #include "asm/program_sshash_prefetch.inc" + imul r8, qword ptr r0_mul[rip] + mov r9, qword ptr r1_add[rip] + xor r9, r8 + mov r10, qword ptr r2_add[rip] + xor r10, r8 + mov r11, qword ptr r3_add[rip] + xor r11, r8 + mov r12, qword ptr r4_add[rip] + xor r12, r8 + mov r13, qword ptr r5_add[rip] + xor r13, r8 + mov r14, qword ptr r6_add[rip] + xor r14, r8 + mov r15, qword ptr r7_add[rip] + xor r15, r8 + jmp DECL(randomx_program_end) + +.balign 64 + #include "asm/program_sshash_constants.inc" + .balign 64 DECL(randomx_program_end): nop diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index f149655..ab29312 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -68,35 +68,11 @@ randomx_program_read_dataset_light PROC randomx_program_read_dataset_light ENDP randomx_program_read_dataset_sshash_init PROC - sub rsp, 72 - mov qword ptr [rsp+64], rbx - mov qword ptr [rsp+56], r8 - mov qword ptr [rsp+48], r9 - mov qword ptr [rsp+40], r10 - mov qword ptr [rsp+32], r11 - mov qword ptr [rsp+24], r12 - mov qword ptr [rsp+16], r13 - mov qword ptr [rsp+8], r14 - mov qword ptr 
[rsp+16], r13 - mov qword ptr [rsp+8], r14 - mov qword ptr [rsp+0], r15 - xor rbp, rax ;# modify "mx" - ror rbp, 32 ;# swap "ma" and "mx" - mov ebx, ebp ;# ecx = ma - and ebx, 2147483584 ;# align "ma" to the start of a cache line - shr ebx, 6 ;# ebx = Dataset block number - ;# call 32768 + include asm/program_read_dataset_sshash_init.inc randomx_program_read_dataset_sshash_init ENDP randomx_program_read_dataset_sshash_fin PROC - mov rbx, qword ptr [rsp+64] - xor r8, qword ptr [rsp+56] - xor r9, qword ptr [rsp+48] - xor r10, qword ptr [rsp+40] - xor r11, qword ptr [rsp+32] - xor r12, qword ptr [rsp+24] - xor r13, qword ptr [rsp+16] - xor r14, qword ptr [rsp+8] - xor r15, qword ptr [rsp+0] - add rsp, 72 + include asm/program_read_dataset_sshash_fin.inc randomx_program_read_dataset_sshash_fin ENDP randomx_program_loop_store PROC diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index eeb09de..97fbb91 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -17,10 +17,11 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. 
*/ +#include #include "blake2/blake2.h" #include "configuration.h" #include "Program.hpp" -#include "blake2/endian.h"; +#include "blake2/endian.h" #include #include #include @@ -793,7 +794,7 @@ namespace RandomX { mop.setCycle(scheduleCycle); if (scheduleCycle < 0) { if (TRACE) std::cout << "; Failed at cycle " << cycle << std::endl; - return DBL_MIN; + return 0; } if (instrIndex == currentInstruction.getInfo().getSrcOp()) { diff --git a/src/asm/program_read_dataset_sshash_fin.inc b/src/asm/program_read_dataset_sshash_fin.inc new file mode 100644 index 0000000..f5a067d --- /dev/null +++ b/src/asm/program_read_dataset_sshash_fin.inc @@ -0,0 +1,10 @@ + mov rbx, qword ptr [rsp+64] ;# offsets mirror the save area laid out in program_read_dataset_sshash_init.inc + xor r8, qword ptr [rsp+56] ;# r8-r15 are XORed with the values stored at [rsp+56]..[rsp+0] + xor r9, qword ptr [rsp+48] + xor r10, qword ptr [rsp+40] + xor r11, qword ptr [rsp+32] + xor r12, qword ptr [rsp+24] + xor r13, qword ptr [rsp+16] + xor r14, qword ptr [rsp+8] + xor r15, qword ptr [rsp+0] + add rsp, 72 ;# release the 72-byte save area \ No newline at end of file diff --git a/src/asm/program_read_dataset_sshash_init.inc b/src/asm/program_read_dataset_sshash_init.inc new file mode 100644 index 0000000..a186d2e --- /dev/null +++ b/src/asm/program_read_dataset_sshash_init.inc @@ -0,0 +1,16 @@ + sub rsp, 72 ;# 72 bytes = 9 qword slots: rbx and r8-r15 + mov qword ptr [rsp+64], rbx + mov qword ptr [rsp+56], r8 + mov qword ptr [rsp+48], r9 + mov qword ptr [rsp+40], r10 + mov qword ptr [rsp+32], r11 + mov qword ptr [rsp+24], r12 + mov qword ptr [rsp+16], r13 + mov qword ptr [rsp+8], r14 + mov qword ptr [rsp+0], r15 + xor rbp, rax ;# modify "mx" + ror rbp, 32 ;# swap "ma" and "mx" + mov ebx, ebp ;# ebx = ma + and ebx, 2147483584 ;# align "ma" to the start of a cache line + shr ebx, 6 ;# ebx = Dataset block number + ;# call 32768 \ No newline at end of file diff --git a/src/asm/program_sshash_constants.inc b/src/asm/program_sshash_constants.inc index a25a90e..77b4ecd 100644 --- a/src/asm/program_sshash_constants.inc +++ b/src/asm/program_sshash_constants.inc @@ -1,16 +1,24 @@ -r0_mul: ;# 6364136223846793005 +r0_mul: + ;#/ 
6364136223846793005 (0x5851F42D4C957F2D, bytes below are least-significant first) db 45, 127, 149, 76, 45, 244, 81, 88 -r1_add: ;# 9298410992540426048 +r1_add: + ;#/ 9298410992540426048 db 64, 159, 245, 89, 136, 151, 10, 129 -r2_add: ;# 12065312585734608966 +r2_add: + ;#/ 12065312585734608966 db 70, 216, 194, 56, 223, 153, 112, 167 -r3_add: ;# 9306329213124610396 +r3_add: + ;#/ 9306329213124610396 db 92, 9, 34, 191, 28, 185, 38, 129 -r4_add: ;# 5281919268842080866 +r4_add: + ;#/ 5281919268842080866 db 98, 138, 159, 23, 151, 37, 77, 73 -r5_add: ;# 10536153434571861004 +r5_add: + ;#/ 10536153434571861004 db 12, 236, 170, 206, 185, 239, 55, 146 -r6_add: ;# 3398623926847679864 +r6_add: + ;#/ 3398623926847679864 db 120, 45, 230, 108, 116, 86, 42, 47 -r7_add: ;# 9549104520008361294 +r7_add: + ;#/ 9549104520008361294 db 78, 229, 44, 182, 247, 59, 133, 132 \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 9410881..36cd800 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -301,6 +301,7 @@ int main(int argc, char** argv) { RandomX::JitCompilerX86 jit86; jit86.generateSuperScalarHash(programs); jit86.getDatasetInitFunc()(cache.memory, dataset.dataset.memory, 0, datasetBlockCount); + //dump((const char*)dataset.dataset.memory, RANDOMX_DATASET_SIZE, "dataset.dat"); } else { if (initThreadCount > 1) { @@ -331,10 +332,12 @@ int main(int argc, char** argv) { else { if (jit && useSuperscalar) vm = new RandomX::CompiledLightVirtualMachine(); - else if(jit) + else if (jit) vm = new RandomX::CompiledLightVirtualMachine(); + else if (useSuperscalar) + vm = new RandomX::InterpretedVirtualMachine(softAes); else - vm = new RandomX::InterpretedVirtualMachine(softAes); + vm = new RandomX::InterpretedVirtualMachine(softAes); } vm->setDataset(dataset, datasetSize, programs); vms.push_back(vm);