From 007f8599b9e3c32539b9b89df6cb0845e8e86ba2 Mon Sep 17 00:00:00 2001 From: tevador Date: Wed, 20 Mar 2019 23:38:37 +0100 Subject: [PATCH] Implemented branches in the interpreter Fixed x86 immediate encoding --- src/Instruction.cpp | 4 +- src/InterpretedVirtualMachine.cpp | 145 ++++++++++++++++++++---------- src/InterpretedVirtualMachine.hpp | 18 ++-- src/JitCompilerX86.cpp | 5 +- src/main.cpp | 4 +- src/program.inc | 32 +++---- 6 files changed, 128 insertions(+), 80 deletions(-) diff --git a/src/Instruction.cpp b/src/Instruction.cpp index f8d8507..7069926 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -302,13 +302,13 @@ namespace RandomX { } void Instruction::h_COND_R(std::ostream& os) const { - os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << (int32_t)getImm32() << ")" << std::endl; + os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << (int32_t)getImm32() << "), " << (int)(mod >> 5) << std::endl; } void Instruction::h_COND_M(std::ostream& os) const { os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "("; genAddressReg(os); - os << ", " << (int32_t)getImm32() << ")" << std::endl; + os << ", " << (int32_t)getImm32() << "), " << (int)(mod >> 5) << std::endl; } void Instruction::h_ISTORE(std::ostream& os) const { diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 1fbe825..54dd7be 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -18,6 +18,7 @@ along with RandomX. If not, see. 
*/ //#define TRACE //#define FPUCHECK +#define RANDOMX_JUMP #include "InterpretedVirtualMachine.hpp" #include "dataset.hpp" #include "Cache.hpp" @@ -45,25 +46,12 @@ constexpr bool fpuCheck = false; namespace RandomX { InterpretedVirtualMachine::~InterpretedVirtualMachine() { - if (asyncWorker) { - delete mem.ds.asyncWorker; - } + } void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size) { - if (asyncWorker) { - if (softAes) { - mem.ds.asyncWorker = new LightClientAsyncWorker(ds.cache); - } - else { - mem.ds.asyncWorker = new LightClientAsyncWorker(ds.cache); - } - readDataset = &datasetReadLightAsync; - } - else { - mem.ds = ds; - readDataset = &datasetReadLight; - } + mem.ds = ds; + readDataset = &datasetReadLight; datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize; } @@ -75,14 +63,10 @@ namespace RandomX { } } - template void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { - executeBytecode(N, r, f, e, a); - executeBytecode(r, f, e, a); - } - - template<> - void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + for (int ic = 0; ic < RANDOMX_PROGRAM_SIZE; ++ic) { + executeBytecode(ic, r, f, e, a); + } } static void print(int_reg_t r) { @@ -114,8 +98,9 @@ namespace RandomX { return std::fpclassify(x) == FP_SUBNORMAL; } - FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { - auto& ibc = byteCode[i]; + FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int& ic, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + auto& ibc = byteCode[ic]; + if (trace) std::cout << std::dec << std::setw(3) << ic << " " << program(ic); //if(trace) printState(r, f, e, a); switch (ibc.type) { @@ -234,10 +219,38 @@ namespace RandomX { } break; case InstructionType::COND_R: { +#ifdef RANDOMX_JUMP + 
*ibc.creg += (1 << ibc.shift); + const uint64_t conditionMask = ((1ULL << RANDOMX_CONDITION_BITS) - 1) << ibc.shift; + if ((*ibc.creg & conditionMask) == 0) { +#ifdef STATS + count_JUMP_taken++; +#endif + ic = ibc.target; + break; + } +#ifdef STATS + count_JUMP_not_taken++; +#endif +#endif *ibc.idst += condition(ibc.condition, *ibc.isrc, ibc.imm) ? 1 : 0; } break; case InstructionType::COND_M: { +#ifdef RANDOMX_JUMP + *ibc.creg += (1uLL << ibc.shift); + const uint64_t conditionMask = ((1ULL << RANDOMX_CONDITION_BITS) - 1) << ibc.shift; + if ((*ibc.creg & conditionMask) == 0) { +#ifdef STATS + count_JUMP_taken++; +#endif + ic = ibc.target; + break; + } +#ifdef STATS + count_JUMP_not_taken++; +#endif +#endif *ibc.idst += condition(ibc.condition, load64(scratchpad + (*ibc.isrc & ibc.memMask)), ibc.imm) ? 1 : 0; } break; @@ -257,7 +270,6 @@ namespace RandomX { UNREACHABLE; } if (trace) { - std::cout << program(i); if(ibc.type < 20 || ibc.type == 31 || ibc.type == 32) print(*ibc.idst); else //if(ibc.type >= 20 && ibc.type <= 30) @@ -334,28 +346,15 @@ namespace RandomX { std::cout << "-----------------------------------" << std::endl; } - executeBytecode<0>(r, f, e, a); + executeBytecode(r, f, e, a); - if (asyncWorker) { - ILightClientAsyncWorker* aw = mem.ds.asyncWorker; - const uint64_t* datasetLine = aw->getBlock(datasetBase + mem.ma); - for (int i = 0; i < RegistersCount; ++i) - r[i] ^= datasetLine[i]; - mem.mx ^= r[readReg2] ^ r[readReg3]; - mem.mx &= CacheLineAlignMask; //align to cache line - std::swap(mem.mx, mem.ma); - aw->prepareBlock(datasetBase + mem.ma); - } - else { - mem.mx ^= r[readReg2] ^ r[readReg3]; - //mem.mx &= CacheLineAlignMask; - Cache& cache = mem.ds.cache; - uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; - initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8); - for (int i = 0; i < RegistersCount; ++i) - r[i] ^= datasetLine[i]; - std::swap(mem.mx, mem.ma); - } + mem.mx ^= r[readReg2] ^ 
r[readReg3]; + Cache& cache = mem.ds.cache; + uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; + initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8); + for (int i = 0; i < RegistersCount; ++i) + r[i] ^= datasetLine[i]; + std::swap(mem.mx, mem.ma); if (trace) { std::cout << "iteration " << std::dec << ic << std::endl; @@ -419,9 +418,25 @@ namespace RandomX { _mm_store_pd(&reg.e[3].lo, e[3]); } + static int getConditionRegister(int(&registerUsage)[8]) { + int min = INT_MAX; + int minIndex; + for (unsigned i = 0; i < 8; ++i) { + if (registerUsage[i] < min) { + min = registerUsage[i]; + minIndex = i; + } + } + return minIndex; + } + #include "instructionWeights.hpp" void InterpretedVirtualMachine::precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + int registerUsage[8]; + for (unsigned i = 0; i < 8; ++i) { + registerUsage[i] = -1; + } for (unsigned i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { auto& instr = program(i); auto& ibc = byteCode[i]; @@ -438,6 +453,7 @@ namespace RandomX { ibc.imm = signExtend2sCompl(instr.getImm32()); ibc.isrc = &ibc.imm; } + registerUsage[instr.dst] = i; } break; CASE_REP(IADD_M) { @@ -454,6 +470,7 @@ namespace RandomX { ibc.isrc = &ibc.imm; ibc.memMask = ScratchpadL3Mask; } + registerUsage[instr.dst] = i; } break; CASE_REP(IADD_RC) { @@ -463,6 +480,7 @@ namespace RandomX { ibc.idst = &r[dst]; ibc.isrc = &r[src]; ibc.imm = signExtend2sCompl(instr.getImm32()); + registerUsage[instr.dst] = i; } break; CASE_REP(ISUB_R) { @@ -477,6 +495,7 @@ namespace RandomX { ibc.imm = signExtend2sCompl(instr.getImm32()); ibc.isrc = &ibc.imm; } + registerUsage[instr.dst] = i; } break; CASE_REP(ISUB_M) { @@ -493,6 +512,7 @@ namespace RandomX { ibc.isrc = &ibc.imm; ibc.memMask = ScratchpadL3Mask; } + registerUsage[instr.dst] = i; } break; CASE_REP(IMUL_9C) { @@ -500,6 +520,7 @@ namespace RandomX { ibc.type = InstructionType::IMUL_9C; ibc.idst = &r[dst]; ibc.imm = 
signExtend2sCompl(instr.getImm32()); + registerUsage[instr.dst] = i; } break; CASE_REP(IMUL_R) { @@ -514,6 +535,7 @@ namespace RandomX { ibc.imm = signExtend2sCompl(instr.getImm32()); ibc.isrc = &ibc.imm; } + registerUsage[instr.dst] = i; } break; CASE_REP(IMUL_M) { @@ -530,6 +552,7 @@ namespace RandomX { ibc.isrc = &ibc.imm; ibc.memMask = ScratchpadL3Mask; } + registerUsage[instr.dst] = i; } break; CASE_REP(IMULH_R) { @@ -538,6 +561,7 @@ namespace RandomX { ibc.type = InstructionType::IMULH_R; ibc.idst = &r[dst]; ibc.isrc = &r[src]; + registerUsage[instr.dst] = i; } break; CASE_REP(IMULH_M) { @@ -554,6 +578,7 @@ namespace RandomX { ibc.isrc = &ibc.imm; ibc.memMask = ScratchpadL3Mask; } + registerUsage[instr.dst] = i; } break; CASE_REP(ISMULH_R) { @@ -562,6 +587,7 @@ namespace RandomX { ibc.type = InstructionType::ISMULH_R; ibc.idst = &r[dst]; ibc.isrc = &r[src]; + registerUsage[instr.dst] = i; } break; CASE_REP(ISMULH_M) { @@ -578,6 +604,7 @@ namespace RandomX { ibc.isrc = &ibc.imm; ibc.memMask = ScratchpadL3Mask; } + registerUsage[instr.dst] = i; } break; CASE_REP(IMUL_RCP) { @@ -588,6 +615,7 @@ namespace RandomX { ibc.idst = &r[dst]; ibc.imm = reciprocal(divisor); ibc.isrc = &ibc.imm; + registerUsage[instr.dst] = i; } else { ibc.type = InstructionType::NOP; @@ -598,6 +626,7 @@ namespace RandomX { auto dst = instr.dst % RegistersCount; ibc.type = InstructionType::INEG_R; ibc.idst = &r[dst]; + registerUsage[instr.dst] = i; } break; CASE_REP(IXOR_R) { @@ -612,6 +641,7 @@ namespace RandomX { ibc.imm = signExtend2sCompl(instr.getImm32()); ibc.isrc = &ibc.imm; } + registerUsage[instr.dst] = i; } break; CASE_REP(IXOR_M) { @@ -628,6 +658,7 @@ namespace RandomX { ibc.isrc = &ibc.imm; ibc.memMask = ScratchpadL3Mask; } + registerUsage[instr.dst] = i; } break; CASE_REP(IROR_R) { @@ -642,6 +673,7 @@ namespace RandomX { ibc.imm = instr.getImm32(); ibc.isrc = &ibc.imm; } + registerUsage[instr.dst] = i; } break; CASE_REP(IROL_R) { @@ -656,6 +688,7 @@ namespace RandomX { ibc.imm 
= instr.getImm32(); ibc.isrc = &ibc.imm; } + registerUsage[instr.dst] = i; } break; CASE_REP(ISWAP_R) { @@ -665,6 +698,8 @@ namespace RandomX { ibc.idst = &r[dst]; ibc.isrc = &r[src]; ibc.type = InstructionType::ISWAP_R; + registerUsage[instr.dst] = i; + registerUsage[instr.src] = i; } else { ibc.type = InstructionType::NOP; @@ -751,6 +786,14 @@ namespace RandomX { ibc.isrc = &r[src]; ibc.condition = (instr.mod >> 2) & 7; ibc.imm = instr.getImm32(); + //jump condition + int reg = getConditionRegister(registerUsage); + ibc.target = registerUsage[reg]; + ibc.shift = (instr.mod >> 5); + ibc.creg = &r[reg]; + for (unsigned j = 0; j < 8; ++j) { //mark all registers as used + registerUsage[j] = i; + } } break; CASE_REP(COND_M) { @@ -762,6 +805,14 @@ namespace RandomX { ibc.condition = (instr.mod >> 2) & 7; ibc.imm = instr.getImm32(); ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + //jump condition + int reg = getConditionRegister(registerUsage); + ibc.target = registerUsage[reg]; + ibc.shift = (instr.mod >> 5); + ibc.creg = &r[reg]; + for (unsigned j = 0; j < 8; ++j) { //mark all registers as used + registerUsage[j] = i; + } } break; CASE_REP(CFROUND) { diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index b3c7f80..3801187 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -52,9 +52,12 @@ namespace RandomX { uint64_t imm; int64_t simm; }; - uint32_t condition; + int_reg_t* creg; + uint16_t condition; + int16_t target; uint32_t memMask; - uint32_t type; + uint16_t type; + uint16_t shift; }; constexpr int asedwfagdewsa = sizeof(InstructionByteCode); @@ -70,7 +73,7 @@ namespace RandomX { void operator delete(void* ptr) { _mm_free(ptr); } - InterpretedVirtualMachine(bool soft, bool async) : softAes(soft), asyncWorker(async) {} + InterpretedVirtualMachine(bool soft) : softAes(soft) {} ~InterpretedVirtualMachine(); void setDataset(dataset_t ds, uint64_t size) override; void 
initialize() override; @@ -78,7 +81,7 @@ namespace RandomX { private: static InstructionHandler engine[256]; DatasetReadFunc readDataset; - bool softAes, asyncWorker; + bool softAes; InstructionByteCode byteCode[RANDOMX_PROGRAM_SIZE]; #ifdef STATS @@ -112,10 +115,6 @@ namespace RandomX { int count_FPROUND = 0; int count_JUMP_taken = 0; int count_JUMP_not_taken = 0; - int count_CALL_taken = 0; - int count_CALL_not_taken = 0; - int count_RET_stack_empty = 0; - int count_RET_taken = 0; int count_jump_taken[8] = { 0 }; int count_jump_not_taken[8] = { 0 }; int count_max_stack = 0; @@ -132,8 +131,7 @@ namespace RandomX { int datasetAccess[256] = { 0 }; #endif void precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); - template void executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); - void executeBytecode(int i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + void executeBytecode(int& i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); }; } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index fc307a3..1f18a2a 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -171,7 +171,7 @@ namespace RandomX { static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6 }; static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f }; static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 }; - static const uint8_t REX_ADD_I[] = { 0x49, 0x83 }; + static const uint8_t REX_ADD_I[] = { 0x49, 0x81 }; static const uint8_t REX_TEST[] = { 0x49, 0xF7 }; static const uint8_t JZ[] = { 0x0f, 0x84 }; @@ -673,10 +673,9 @@ namespace RandomX { const int conditionMask = ((1 << RANDOMX_CONDITION_BITS) - 1) << shift; int reg = getConditionRegister(); int target = registerUsage[reg] + 1; - registerUsage[reg] = i; emit(REX_ADD_I); emitByte(0xc0 + reg); - emitByte(1 << shift); + emit32(1 << shift); 
emit(REX_TEST); emitByte(0xc0 + reg); emit32(conditionMask); diff --git a/src/main.cpp b/src/main.cpp index 7de622b..88c0b16 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -299,7 +299,7 @@ int main(int argc, char** argv) { vm = new RandomX::CompiledVirtualMachine(); } else { - vm = new RandomX::InterpretedVirtualMachine(softAes, async); + vm = new RandomX::InterpretedVirtualMachine(softAes); } vm->setDataset(dataset, datasetSize); vms.push_back(vm); @@ -336,7 +336,7 @@ int main(int argc, char** argv) { std::cout << "Calculated result: "; result.print(std::cout); if(programCount == 1000) - std::cout << "Reference result: 84f37cc43cb21eabf1d5b9def462060cd24218290678dd80a8ea2f663892629e" << std::endl; + std::cout << "Reference result: 9e636a04a2517f37d8ed40b67a7051e02a7577e878fbba5c4352996b2c653f90" << std::endl; if (!miningMode) { std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per hash" << std::endl; } diff --git a/src/program.inc b/src/program.inc index b12dc13..46d8093 100644 --- a/src/program.inc +++ b/src/program.inc @@ -76,9 +76,9 @@ randomx_isn_19: ; FMUL_R e1, a2 mulpd xmm5, xmm10 randomx_isn_20: - ; COND_R r6, of(r3, 1593588996) + ; COND_R r6, of(r3, 1593588996), 1 add r8, 2 - test r8, 2 + test r8, 254 jz randomx_isn_0 xor ecx, ecx cmp r11d, 1593588996 @@ -98,9 +98,9 @@ randomx_isn_23: ; FMUL_R e2, a0 mulpd xmm6, xmm8 randomx_isn_24: - ; COND_R r6, no(r0, 149087159) + ; COND_R r6, no(r0, 149087159), 6 add r8, 64 - test r8, 64 + test r8, 8128 jz randomx_isn_21 xor ecx, ecx cmp r8d, 149087159 @@ -197,9 +197,9 @@ randomx_isn_50: ; FSUB_R f3, a0 subpd xmm3, xmm8 randomx_isn_51: - ; COND_R r2, be(r3, -1975981803) + ; COND_R r2, be(r3, -1975981803), 7 add r12, 128 - test r12, 128 + test r12, 16256 jz randomx_isn_25 xor ecx, ecx cmp r11d, -1975981803 @@ -212,9 +212,9 @@ randomx_isn_53: ; FSUB_R f2, a0 subpd xmm2, xmm8 randomx_isn_54: - ; COND_R r5, ns(r1, 1917049931) + ; COND_R r5, ns(r1, 1917049931), 6 add r8, 64 - test r8, 64 + test r8, 
8128 jz randomx_isn_52 xor ecx, ecx cmp r9d, 1917049931 @@ -288,9 +288,9 @@ randomx_isn_73: ; FMUL_R e0, a0 mulpd xmm4, xmm8 randomx_isn_74: - ; COND_R r6, ns(r3, -1200328848) + ; COND_R r6, ns(r3, -1200328848), 2 add r9, 4 - test r9, 4 + test r9, 508 jz randomx_isn_55 xor ecx, ecx cmp r11d, -1200328848 @@ -350,9 +350,9 @@ randomx_isn_88: ; IMUL_R r1, r3 imul r9, r11 randomx_isn_89: - ; COND_M r2, no(L1[r0], -122257389) + ; COND_M r2, no(L1[r0], -122257389), 6 add r8, 64 - test r8, 64 + test r8, 8128 jz randomx_isn_75 xor ecx, ecx mov eax, r8d @@ -562,9 +562,9 @@ randomx_isn_142: ; FADD_R f1, a0 addpd xmm1, xmm8 randomx_isn_143: - ; COND_R r5, ge(r1, 880467599) + ; COND_R r5, ge(r1, 880467599), 2 add r14, 4 - test r14, 4 + test r14, 508 jz randomx_isn_110 xor ecx, ecx cmp r9d, 880467599 @@ -962,9 +962,9 @@ randomx_isn_246: ; IMUL_9C r7, 1938400676 lea r15, [r15+r15*8+1938400676] randomx_isn_247: - ; COND_M r2, be(L1[r5], -8545330) + ; COND_M r2, be(L1[r5], -8545330), 2 add r9, 4 - test r9, 4 + test r9, 508 jz randomx_isn_223 xor ecx, ecx mov eax, r13d