From 77dbe14658dfca2f7cc4aa803e6090d275a518f7 Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 6 Apr 2019 12:00:56 +0200 Subject: [PATCH] SuperscalarHash JIT compiler (unfinished) --- src/AssemblyGeneratorX86.cpp | 14 +-- src/AssemblyGeneratorX86.hpp | 2 +- src/Instruction.cpp | 12 +- src/Instruction.hpp | 2 +- src/InterpretedVirtualMachine.cpp | 2 +- src/JitCompilerX86-static.asm | 87 +++++++++++++ src/JitCompilerX86-static.hpp | 5 + src/JitCompilerX86.cpp | 176 ++++++++++++++++++++++++--- src/JitCompilerX86.hpp | 22 +++- src/LightProgramGenerator.cpp | 156 +++++++++++++----------- src/LightProgramGenerator.hpp | 16 ++- src/Program.hpp | 7 ++ src/asm/program_sshash_constants.inc | 16 +++ src/asm/program_sshash_load.inc | 8 ++ src/asm/program_sshash_prefetch.inc | 4 + src/common.hpp | 3 +- src/configuration.h | 10 +- src/main.cpp | 46 ++++--- 18 files changed, 453 insertions(+), 135 deletions(-) create mode 100644 src/asm/program_sshash_constants.inc create mode 100644 src/asm/program_sshash_load.inc create mode 100644 src/asm/program_sshash_prefetch.inc diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 8b5dbcf..dc4cea2 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -97,14 +97,12 @@ namespace RandomX { } //1 uOP - void AssemblyGeneratorX86::h_IADD_R(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_IADD_RS(Instruction& instr, int i) { registerUsage[instr.dst] = i; - if (instr.src != instr.dst) { - asmCode << "\tadd " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; - } - else { - asmCode << "\tadd " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; - } + if(instr.dst == 5) + asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << std::showpos << (int32_t)instr.getImm32() << std::noshowpos << "]" << std::endl; + else + asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" 
<< regR[instr.src] << "*" << (1 << (instr.mod % 4)) << "]" << std::endl; traceint(instr); } @@ -517,7 +515,7 @@ namespace RandomX { InstructionGenerator AssemblyGeneratorX86::engine[256] = { //Integer - INST_HANDLE(IADD_R) + INST_HANDLE(IADD_RS) INST_HANDLE(IADD_M) INST_HANDLE(IADD_RC) INST_HANDLE(ISUB_R) diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 8ab638b..601d278 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -68,7 +68,7 @@ namespace RandomX { void traceflt(Instruction&); void tracenop(Instruction&); - void h_IADD_R(Instruction&, int); + void h_IADD_RS(Instruction&, int); void h_IADD_M(Instruction&, int); void h_IADD_RC(Instruction&, int); void h_ISUB_R(Instruction&, int); diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 7069926..e8ddc64 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -40,9 +40,9 @@ namespace RandomX { os << "L3" << "[" << (getImm32() & ScratchpadL3Mask) << "]"; } - void Instruction::h_IADD_R(std::ostream& os) const { + void Instruction::h_IADD_RS(std::ostream& os) const { if (src != dst) { - os << "r" << (int)dst << ", r" << (int)src << std::endl; + os << "r" << (int)dst << ", r" << (int)src << ", LSH " << (int)(mod % 4) << std::endl; } else { os << "r" << (int)dst << ", " << (int32_t)getImm32() << std::endl; @@ -302,13 +302,13 @@ namespace RandomX { } void Instruction::h_COND_R(std::ostream& os) const { - os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << (int32_t)getImm32() << "), " << (int)(mod >> 5) << std::endl; + os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << (int32_t)getImm32() << "), LSH " << (int)(mod >> 5) << std::endl; } void Instruction::h_COND_M(std::ostream& os) const { os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "("; genAddressReg(os); - os << ", " << (int32_t)getImm32() << "), " << (int)(mod >> 5) << std::endl; + os << ", " << 
(int32_t)getImm32() << "), LSH " << (int)(mod >> 5) << std::endl; } void Instruction::h_ISTORE(std::ostream& os) const { @@ -333,7 +333,7 @@ namespace RandomX { const char* Instruction::names[256] = { //Integer - INST_NAME(IADD_R) + INST_NAME(IADD_RS) INST_NAME(IADD_M) INST_NAME(IADD_RC) INST_NAME(ISUB_R) @@ -379,7 +379,7 @@ namespace RandomX { InstructionVisualizer Instruction::engine[256] = { //Integer - INST_HANDLE(IADD_R) + INST_HANDLE(IADD_RS) INST_HANDLE(IADD_M) INST_HANDLE(IADD_RC) INST_HANDLE(ISUB_R) diff --git a/src/Instruction.hpp b/src/Instruction.hpp index d10575f..65d1c8a 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -98,7 +98,7 @@ namespace RandomX { void genAddressImm(std::ostream& os) const; void genAddressRegDst(std::ostream&) const; - void h_IADD_R(std::ostream&) const; + void h_IADD_RS(std::ostream&) const; void h_IADD_M(std::ostream&) const; void h_IADD_RC(std::ostream&) const; void h_ISUB_R(std::ostream&) const; diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 15a5049..ebb3571 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -442,7 +442,7 @@ namespace RandomX { auto& instr = program(i); auto& ibc = byteCode[i]; switch (instr.opcode) { - CASE_REP(IADD_R) { + CASE_REP(IADD_RS) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::IADD_R; diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index ffac80c..d16cab7 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -26,9 +26,14 @@ PUBLIC randomx_program_start PUBLIC randomx_program_read_dataset PUBLIC randomx_program_read_dataset_light PUBLIC randomx_program_read_dataset_light_sub +PUBLIC randomx_dataset_init PUBLIC randomx_program_loop_store PUBLIC randomx_program_loop_end PUBLIC randomx_program_epilogue +PUBLIC randomx_sshash_load +PUBLIC randomx_sshash_prefetch +PUBLIC randomx_sshash_end +PUBLIC 
randomx_sshash_init PUBLIC randomx_program_end ALIGN 64 @@ -75,11 +80,93 @@ randomx_program_read_dataset_light_sub PROC include asm/squareHash.inc randomx_program_read_dataset_light_sub ENDP +ALIGN 64 +randomx_dataset_init PROC + push rbx + push rbp + push rdi + push rsi + push r12 + push r13 + push r14 + push r15 + mov rdi, rcx ;# cache + mov rsi, rdx ;# dataset + mov rbp, r8 ;# block index + push r9 ;# max. block index +init_block_loop: + prefetchw byte ptr [rsi] + mov rbx, rbp + db 232 ;# 0xE8 = call + dd 32768 - distance + distance equ $ - offset randomx_dataset_init + mov qword ptr [rsi+0], r8 + mov qword ptr [rsi+8], r9 + mov qword ptr [rsi+16], r10 + mov qword ptr [rsi+24], r11 + mov qword ptr [rsi+32], r12 + mov qword ptr [rsi+40], r13 + mov qword ptr [rsi+48], r14 + mov qword ptr [rsi+56], r15 + add rbp, 1 + add rsi, 64 + cmp rbp, qword ptr [rsp] + jb init_block_loop + pop r9 + pop r15 + pop r14 + pop r13 + pop r12 + pop rsi + pop rdi + pop rbp + pop rbx + ret +randomx_dataset_init ENDP + ALIGN 64 randomx_program_epilogue PROC include asm/program_epilogue_win64.inc randomx_program_epilogue ENDP +ALIGN 64 +randomx_sshash_load PROC + include asm/program_sshash_load.inc +randomx_sshash_load ENDP + +randomx_sshash_prefetch PROC + include asm/program_sshash_prefetch.inc +randomx_sshash_prefetch ENDP + +randomx_sshash_end PROC + nop +randomx_sshash_end ENDP + +ALIGN 64 +randomx_sshash_init PROC + lea r8, [rbx+1] + include asm/program_sshash_prefetch.inc + imul r8, qword ptr [r0_mul] + mov r9, qword ptr [r1_add] + xor r9, r8 + mov r10, qword ptr [r2_add] + xor r10, r8 + mov r11, qword ptr [r3_add] + xor r11, r8 + mov r12, qword ptr [r4_add] + xor r12, r8 + mov r13, qword ptr [r5_add] + xor r13, r8 + mov r14, qword ptr [r6_add] + xor r14, r8 + mov r15, qword ptr [r7_add] + xor r15, r8 + jmp randomx_program_end +randomx_sshash_init ENDP + +ALIGN 64 + include asm/program_sshash_constants.inc + ALIGN 64 randomx_program_end PROC nop diff --git 
a/src/JitCompilerX86-static.hpp b/src/JitCompilerX86-static.hpp index 3d835b6..cf250c2 100644 --- a/src/JitCompilerX86-static.hpp +++ b/src/JitCompilerX86-static.hpp @@ -27,6 +27,11 @@ extern "C" { void randomx_program_loop_store(); void randomx_program_loop_end(); void randomx_program_read_dataset_light_sub(); + void randomx_dataset_init(); void randomx_program_epilogue(); + void randomx_sshash_load(); + void randomx_sshash_prefetch(); + void randomx_sshash_end(); + void randomx_sshash_init(); void randomx_program_end(); } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 6c58a88..8c49326 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -88,29 +88,40 @@ namespace RandomX { #include "JitCompilerX86-static.hpp" +#define NOP_TEST true + const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin; const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load; const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset; const uint8_t* codeReadDatasetLight = (uint8_t*)&randomx_program_read_dataset_light; + const uint8_t* codeDatasetInit = (uint8_t*)&randomx_dataset_init; const uint8_t* codeLoopStore = (uint8_t*)&randomx_program_loop_store; const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end; const uint8_t* codeReadDatasetLightSub = (uint8_t*)&randomx_program_read_dataset_light_sub; const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue; const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; + const uint8_t* codeShhLoad = (uint8_t*)&randomx_sshash_load; + const uint8_t* codeShhPrefetch = (uint8_t*)&randomx_sshash_prefetch; + const uint8_t* codeShhEnd = (uint8_t*)&randomx_sshash_end; + const uint8_t* codeShhInit = (uint8_t*)&randomx_sshash_init; const int32_t prologueSize = codeLoopBegin - codePrologue; - const int32_t 
epilogueSize = codeProgramEnd - codeEpilogue; - const int32_t loopLoadSize = codeProgamStart - codeLoopLoad; const int32_t readDatasetSize = codeReadDatasetLight - codeReadDataset; const int32_t readDatasetLightSize = codeLoopStore - codeReadDatasetLight; const int32_t loopStoreSize = codeLoopEnd - codeLoopStore; - const int32_t readDatasetLightSubSize = codeEpilogue - codeReadDatasetLightSub; + const int32_t readDatasetLightSubSize = codeDatasetInit - codeReadDatasetLightSub; + const int32_t datasetInitSize = codeEpilogue - codeDatasetInit; + const int32_t epilogueSize = codeShhLoad - codeEpilogue; + const int32_t codeSshLoadSize = codeShhPrefetch - codeShhLoad; + const int32_t codeSshPrefetchSize = codeShhEnd - codeShhPrefetch; + const int32_t codeSshInitSize = codeProgramEnd - codeShhInit; const int32_t epilogueOffset = CodeSize - epilogueSize; const int32_t readDatasetLightSubOffset = epilogueOffset - readDatasetLightSubSize; + constexpr int32_t superScalarHashOffset = 32768; static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 }; static const uint8_t REX_ADD_RM[] = { 0x4c, 0x03 }; @@ -166,7 +177,7 @@ namespace RandomX { static const uint8_t SQRTPD[] = { 0x66, 0x0f, 0x51 }; static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x89, 0x44, 0x24, 0xF8, 0x0F, 0xAE, 0x54, 0x24, 0xF8 }; static const uint8_t ROL_RAX[] = { 0x48, 0xc1, 0xc0 }; - static const uint8_t XOR_ECX_ECX[] = { 0x33, 0xC9 }; + static const uint8_t XOR_RCX_RCX[] = { 0x48, 0x33, 0xC9 }; static const uint8_t REX_CMP_R32I[] = { 0x41, 0x81 }; static const uint8_t REX_CMP_M32I[] = { 0x81, 0x3c, 0x06 }; static const uint8_t MOVAPD[] = { 0x66, 0x0f, 0x29 }; @@ -184,6 +195,18 @@ namespace RandomX { static const uint8_t REX_ADD_I[] = { 0x49, 0x81 }; static const uint8_t REX_TEST[] = { 0x49, 0xF7 }; static const uint8_t JZ[] = { 0x0f, 0x84 }; + static const uint8_t RET = 0xc3; + + static const uint8_t NOP1[] = { 0x90 }; + static const uint8_t NOP2[] = { 0x66, 
0x90 }; + static const uint8_t NOP3[] = { 0x0F, 0x1F, 0x00 }; + static const uint8_t NOP4[] = { 0x0F, 0x1F, 0x40, 0x00 }; + static const uint8_t NOP5[] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 }; + static const uint8_t NOP6[] = { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 }; + static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 }; + static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }; + + static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 }; size_t JitCompilerX86::getCodeSize() { return codePos - prologueSize; @@ -196,6 +219,10 @@ namespace RandomX { memcpy(code + readDatasetLightSubOffset, codeReadDatasetLightSub, readDatasetLightSubSize); } + JitCompilerX86::~JitCompilerX86() { + freePagedMemory(code, CodeSize); + } + void JitCompilerX86::generateProgram(Program& prog) { generateProgramPrologue(prog); memcpy(code + codePos, codeReadDataset, readDatasetSize); @@ -216,6 +243,42 @@ namespace RandomX { generateProgramEpilogue(prog); } + template + void JitCompilerX86::generateSuperScalarHash(LightProgram(&programs)[N]) { + memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize); + codePos = superScalarHashOffset + codeSshInitSize; + for (unsigned j = 0; j < N; ++j) { + LightProgram& prog = programs[j]; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction& instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + generateCode(instr, i); + } + emit(codeShhLoad, codeSshLoadSize); + if (j < N - 1) { + emit(REX_MOV_RR64); + emitByte(0xd8 + prog.getAddressRegister()); + emit(codeShhPrefetch, codeSshPrefetchSize); + int align = (codePos % 16); + while (align != 0) { + int nopSize = 16 - align; + if (nopSize > 8) nopSize = 8; + emit(NOPX[nopSize - 1], nopSize); + align = (codePos % 16); + } + } + } + emitByte(RET); + } + + template + void JitCompilerX86::generateSuperScalarHash(LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + + void 
JitCompilerX86::generateDatasetInitCode() { + memcpy(code, codeDatasetInit, datasetInitSize); + } + void JitCompilerX86::generateProgramPrologue(Program& prog) { #ifdef RANDOMX_JUMP instructionOffsets.clear(); @@ -253,7 +316,6 @@ namespace RandomX { emit32(prologueSize - codePos - 4); emitByte(JMP); emit32(epilogueOffset - codePos - 4); - emitByte(0x90); } void JitCompilerX86::generateCode(Instruction& instr, int i) { @@ -287,9 +349,9 @@ namespace RandomX { emit32(instr.getImm32() & ScratchpadL3Mask); } - void JitCompilerX86::h_IADD_R(Instruction& instr, int i) { + void JitCompilerX86::h_IADD_RS(Instruction& instr, int i) { registerUsage[instr.dst] = i; - if (instr.src != instr.dst) { + /*if (instr.src != instr.dst) { emit(REX_ADD_RR); emitByte(0xc0 + 8 * instr.dst + instr.src); } @@ -297,7 +359,19 @@ namespace RandomX { emit(REX_81); emitByte(0xc0 + instr.dst); emit32(instr.getImm32()); + }*/ + if (false && NOP_TEST) { + emit(NOP4); + return; } + emit(REX_LEA); + if (instr.dst == 5) //rbp,r13 cannot be the base register without offset + emitByte(0xac); + else + emitByte(0x04 + 8 * instr.dst); + genSIB(instr.mod % 4, instr.src, instr.dst); + if (instr.dst == 5) + emit32(instr.getImm32()); } void JitCompilerX86::h_IADD_M(Instruction& instr, int i) { @@ -330,10 +404,18 @@ namespace RandomX { void JitCompilerX86::h_ISUB_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { + if (false && NOP_TEST) { + emit(NOP3); + return; + } emit(REX_SUB_RR); emitByte(0xc0 + 8 * instr.dst + instr.src); } else { + if (false && NOP_TEST) { + emit(NOP7); + return; + } emit(REX_81); emitByte(0xe8 + instr.dst); emit32(instr.getImm32()); @@ -366,10 +448,18 @@ namespace RandomX { void JitCompilerX86::h_IMUL_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { + if (false && NOP_TEST) { + emit(NOP4); + return; + } emit(REX_IMUL_RR); emitByte(0xc0 + 8 * instr.dst + instr.src); } else { + if (false && NOP_TEST) { + 
emit(NOP7); + return; + } emit(REX_IMUL_RRI); emitByte(0xc0 + 9 * instr.dst); emit32(instr.getImm32()); @@ -393,6 +483,12 @@ namespace RandomX { void JitCompilerX86::h_IMULH_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; + if (false && NOP_TEST) { + emit(NOP3); + emit(NOP3); + emit(NOP3); + return; + } emit(REX_MOV_RR64); emitByte(0xc0 + instr.dst); emit(REX_MUL_R); @@ -422,6 +518,12 @@ namespace RandomX { void JitCompilerX86::h_ISMULH_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; + if (false && NOP_TEST) { + emit(NOP3); + emit(NOP3); + emit(NOP3); + return; + } emit(REX_MOV_RR64); emitByte(0xc0 + instr.dst); emit(REX_MUL_R); @@ -451,6 +553,13 @@ namespace RandomX { void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { if (instr.getImm32() != 0) { + if (false && NOP_TEST) { + emitByte(0x66); + emitByte(0x66); + emit(NOP8); + emit(NOP4); + return; + } registerUsage[instr.dst] = i; emit(MOV_RAX_I); emit64(reciprocal(instr.getImm32())); @@ -472,10 +581,18 @@ namespace RandomX { void JitCompilerX86::h_IXOR_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { + if (false && NOP_TEST) { + emit(NOP3); + return; + } emit(REX_XOR_RR); emitByte(0xc0 + 8 * instr.dst + instr.src); } else { + if (false && NOP_TEST) { + emit(NOP7); + return; + } emit(REX_XOR_RI); emitByte(0xf0 + instr.dst); emit32(instr.getImm32()); @@ -500,12 +617,21 @@ namespace RandomX { void JitCompilerX86::h_IROR_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { + if (false && NOP_TEST) { + emit(NOP3); + emit(NOP3); + return; + } emit(REX_MOV_RR); emitByte(0xc8 + instr.src); emit(REX_ROT_CL); emitByte(0xc8 + instr.dst); } else { + if (NOP_TEST) { + emit(NOP4); + return; + } emit(REX_ROT_I8); emitByte(0xc8 + instr.dst); emitByte(instr.getImm32() & 63); @@ -700,14 +826,21 @@ namespace RandomX { const int conditionMask = ((1 << RANDOMX_CONDITION_BITS) - 1) << shift; int reg = 
getConditionRegister(); int target = registerUsage[reg] + 1; - emit(REX_ADD_I); - emitByte(0xc0 + reg); - emit32(1 << shift); - emit(REX_TEST); - emitByte(0xc0 + reg); - emit32(conditionMask); - emit(JZ); - emit32(instructionOffsets[target] - (codePos + 4)); + if (false && NOP_TEST) { + emit(NOP7); + emit(NOP7); + emit(NOP6); + } + else { + emit(REX_ADD_I); + emitByte(0xc0 + reg); + emit32(1 << shift); + emit(REX_TEST); + emitByte(0xc0 + reg); + emit32(conditionMask); + emit(JZ); + emit32(instructionOffsets[target] - (codePos + 4)); + } for (unsigned j = 0; j < 8; ++j) { //mark all registers as used registerUsage[j] = i; } @@ -717,7 +850,14 @@ namespace RandomX { #ifdef RANDOMX_JUMP handleCondition(instr, i); #endif - emit(XOR_ECX_ECX); + if (false && NOP_TEST) { + emit(NOP3); + emit(NOP7); + emit(NOP3); + emit(NOP3); + return; + } + emit(XOR_RCX_RCX); emit(REX_CMP_R32I); emitByte(0xf8 + instr.src); emit32(instr.getImm32()); @@ -732,7 +872,7 @@ namespace RandomX { #ifdef RANDOMX_JUMP handleCondition(instr, i); #endif - emit(XOR_ECX_ECX); + emit(XOR_RCX_RCX); genAddressReg(instr); emit(REX_CMP_M32I); emit32(instr.getImm32()); @@ -765,7 +905,7 @@ namespace RandomX { #define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x)) InstructionGeneratorX86 JitCompilerX86::engine[256] = { - INST_HANDLE(IADD_R) + INST_HANDLE(IADD_RS) INST_HANDLE(IADD_M) INST_HANDLE(IADD_RC) INST_HANDLE(ISUB_R) diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index f2fd330..16fe26d 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -27,6 +27,7 @@ along with RandomX. If not, see. 
namespace RandomX { class Program; + class LightProgram; class JitCompilerX86; typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int); @@ -36,11 +37,18 @@ namespace RandomX { class JitCompilerX86 { public: JitCompilerX86(); + ~JitCompilerX86(); void generateProgram(Program&); void generateProgramLight(Program&); + template + void generateSuperScalarHash(LightProgram (&programs)[N]); ProgramFunc getProgramFunc() { return (ProgramFunc)code; } + DatasetInitFunc getDatasetInitFunc() { + generateDatasetInitCode(); + return (DatasetInitFunc)code; + } uint8_t* getCode() { return code; } @@ -62,6 +70,8 @@ namespace RandomX { } } + void generateDatasetInitCode(); + void generateProgramPrologue(Program&); void generateProgramEpilogue(Program&); int getConditionRegister(); @@ -100,13 +110,15 @@ namespace RandomX { template void emit(const uint8_t (&src)[N]) { - for (unsigned i = 0; i < N; ++i) { - code[codePos + i] = src[i]; - } - codePos += N; + emit(src, N); } - void h_IADD_R(Instruction&, int); + void emit(const uint8_t* src, size_t count) { + memcpy(code + codePos, src, count); + codePos += count; + } + + void h_IADD_RS(Instruction&, int); void h_IADD_M(Instruction&, int); void h_IADD_RC(Instruction&, int); void h_ISUB_R(Instruction&, int); diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index 5825808..900e2ae 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -26,6 +26,7 @@ along with RandomX. If not, see. 
#include #include #include +#include "LightProgramGenerator.hpp" namespace RandomX { // Intel Ivy Bridge reference @@ -47,8 +48,8 @@ namespace RandomX { } namespace LightInstructionOpcode { - constexpr int IADD_R = 0; - constexpr int IADD_RC = RANDOMX_FREQ_IADD_R + RANDOMX_FREQ_IADD_M; + constexpr int IADD_RS = 0; + constexpr int IADD_RC = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M; constexpr int ISUB_R = IADD_RC + RANDOMX_FREQ_IADD_RC; constexpr int IMUL_9C = ISUB_R + RANDOMX_FREQ_ISUB_R + RANDOMX_FREQ_ISUB_M; constexpr int IMUL_R = IMUL_9C + RANDOMX_FREQ_IMUL_9C; @@ -65,20 +66,18 @@ namespace RandomX { } const int lightInstructionOpcode[] = { - LightInstructionOpcode::IADD_R, - LightInstructionOpcode::IADD_R, - LightInstructionOpcode::IADD_RC, - LightInstructionOpcode::ISUB_R, - LightInstructionOpcode::IMUL_9C, - LightInstructionOpcode::IMUL_R, - LightInstructionOpcode::IMUL_R, + LightInstructionOpcode::IADD_RS, + LightInstructionOpcode::ISUB_R, //ISUB_R + LightInstructionOpcode::ISUB_R, //ISUB_R + LightInstructionOpcode::IMUL_R, //IMUL_R + LightInstructionOpcode::IMUL_R, //IMUL_C LightInstructionOpcode::IMULH_R, LightInstructionOpcode::ISMULH_R, LightInstructionOpcode::IMUL_RCP, - LightInstructionOpcode::IXOR_R, - LightInstructionOpcode::IXOR_R, - LightInstructionOpcode::IROR_R, - LightInstructionOpcode::IROR_R, + LightInstructionOpcode::IXOR_R, //IXOR_R + LightInstructionOpcode::IXOR_R, //IXOR_C + LightInstructionOpcode::IROR_R, //IROR_R + LightInstructionOpcode::IROR_R, //IROR_C LightInstructionOpcode::COND_R }; @@ -93,37 +92,30 @@ namespace RandomX { constexpr type P015 = 6; } - class Blake2Generator { - public: - Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) { - memset(data, 0, sizeof(data)); - memcpy(data, seed, SeedSize); - store32(&data[60], nonce); - } + Blake2Generator::Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) { + memset(data, 0, sizeof(data)); + memcpy(data, seed, SeedSize); + store32(&data[60], 
nonce); + } - uint8_t getByte() { - checkData(1); - return data[dataIndex++]; - } + uint8_t Blake2Generator::getByte() { + checkData(1); + return data[dataIndex++]; + } - uint32_t getInt32() { - checkData(4); - auto ret = load32(&data[dataIndex]); - dataIndex += 4; - return ret; - } + uint32_t Blake2Generator::getInt32() { + checkData(4); + auto ret = load32(&data[dataIndex]); + dataIndex += 4; + return ret; + } - private: - uint8_t data[64]; - size_t dataIndex; - - void checkData(const size_t bytesNeeded) { - if (dataIndex + bytesNeeded > sizeof(data)) { - blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); - dataIndex = 0; - } + void Blake2Generator::checkData(const size_t bytesNeeded) { + if (dataIndex + bytesNeeded > sizeof(data)) { + blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); + dataIndex = 0; } - }; + } class RegisterInfo { public: @@ -201,7 +193,7 @@ namespace RandomX { static const MacroOp Xor_ri; static const MacroOp Ror_rcl; static const MacroOp Ror_ri; - static const MacroOp TestJmp_fused; + static const MacroOp TestJz_fused; static const MacroOp Xor_self; static const MacroOp Cmp_ri; static const MacroOp Setcc_r; @@ -235,13 +227,13 @@ namespace RandomX { const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3); const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015); const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05); - const MacroOp MacroOp::TestJmp_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5); + const MacroOp MacroOp::TestJz_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5); const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr }; const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr }; const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) }; const MacroOp IROR_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Ror_rcl }; - const MacroOp COND_R_ops_array[] = { 
MacroOp::Add_ri, MacroOp(MacroOp::TestJmp_fused, true), MacroOp::Xor_self, MacroOp::Cmp_ri, MacroOp(MacroOp::Setcc_r, true), MacroOp(MacroOp::Add_rr, true) }; + const MacroOp COND_R_ops_array[] = { MacroOp::Add_ri, MacroOp(MacroOp::TestJz_fused, true), MacroOp::Xor_self, MacroOp::Cmp_ri, MacroOp(MacroOp::Setcc_r, true), MacroOp(MacroOp::Add_rr, true) }; class LightInstructionInfo { @@ -349,7 +341,7 @@ namespace RandomX { class DecoderBuffer { public: - static DecoderBuffer Default; + static const DecoderBuffer Default; template DecoderBuffer(const char* name, int index, const int(&arr)[N]) : name_(name), index_(index), counts_(arr), opsCount_(N) {} @@ -365,17 +357,17 @@ namespace RandomX { const char* getName() const { return name_; } - const DecoderBuffer& fetchNext(int prevType, Blake2Generator& gen) { + const DecoderBuffer* fetchNext(int prevType, Blake2Generator& gen) const { if (prevType == LightInstructionType::IMULH_R || prevType == LightInstructionType::ISMULH_R) - return decodeBuffer3310; //2-1-1 decode + return &decodeBuffer3310; //2-1-1 decode if (index_ == 0) { - return decodeBuffer4444; //IMUL_RCP end - } - if (index_ == 2) { - return decodeBuffer133; //COND_R middle + return &decodeBuffer4444; //IMUL_RCP end } + /*if (index_ == 2) { + return &decodeBuffer133; //COND_R middle + }*/ if (index_ == 7) { - return decodeBuffer7333; //COND_R end + return &decodeBuffer7333; //COND_R end } return fetchNextDefault(gen); } @@ -393,12 +385,12 @@ namespace RandomX { static const DecoderBuffer decodeBuffer3373; static const DecoderBuffer decodeBuffer133; static const DecoderBuffer* decodeBuffers[7]; - const DecoderBuffer& fetchNextDefault(Blake2Generator& gen) { + const DecoderBuffer* fetchNextDefault(Blake2Generator& gen) const { int select; do { select = gen.getByte() & 7; } while (select == 7); - return *decodeBuffers[select]; + return decodeBuffers[select]; } }; @@ -420,7 +412,7 @@ namespace RandomX { &DecoderBuffer::decodeBuffer3373, }; - DecoderBuffer 
DecoderBuffer::Default = DecoderBuffer(); + const DecoderBuffer DecoderBuffer::Default = DecoderBuffer(); const LightInstructionInfo* slot_3[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R }; const LightInstructionInfo* slot_3L[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R }; @@ -472,7 +464,7 @@ namespace RandomX { case 4: return create(slot_4[gen.getByte() & 3], gen); case 7: - if (isLast) { + if (false && isLast) { return create(slot_7L, gen); } else { @@ -595,7 +587,7 @@ namespace RandomX { bool selectDestination(int cycle, RegisterInfo (®isters)[8], Blake2Generator& gen) { std::vector availableRegisters; for (unsigned i = 0; i < 8; ++i) { - if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_)) + if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_.getType() != LightInstructionType::IADD_RS || i != 5)) availableRegisters.push_back(i); } return selectRegister(availableRegisters, gen, dst_); @@ -607,6 +599,12 @@ namespace RandomX { if (registers[i].latency <= cycle) availableRegisters.push_back(i); } + if (availableRegisters.size() == 2 && info_.getType() == LightInstructionType::IADD_RS) { + if (availableRegisters[0] == 5 || availableRegisters[1] == 5) { + opGroupPar_ = src_ = 5; + return true; + } + } if (selectRegister(availableRegisters, gen, src_)) { if (groupParIsSource_) opGroupPar_ = src_; @@ -666,7 +664,7 @@ namespace RandomX { constexpr int V4_SRC_INDEX_BITS = 3; constexpr int V4_DST_INDEX_BITS = 3; constexpr int CYCLE_MAP_SIZE = RANDOMX_LPROG_LATENCY + 3; - constexpr bool TRACE = true; + constexpr bool TRACE = false; static int blakeCounter = 0; @@ -782,15 +780,14 @@ namespace RandomX { } } - void generateLightProg2(LightProgram& prog, const void* seed, int 
indexRegister, int nonce) { + double generateLightProg2(LightProgram& prog, Blake2Generator& gen) { ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3]; memset(portBusy, 0, sizeof(portBusy)); RegisterInfo registers[8]; - Blake2Generator gen(seed, nonce); std::vector instructions; - DecoderBuffer& fetchLine = DecoderBuffer::Default; + const DecoderBuffer* fetchLine = &DecoderBuffer::Default; LightInstruction currentInstruction = LightInstruction::Null; int instrIndex = 0; int codeSize = 0; @@ -806,24 +803,24 @@ namespace RandomX { constexpr int MAX_ATTEMPTS = 4; while(!portsSaturated) { - fetchLine = fetchLine.fetchNext(currentInstruction.getType(), gen); - if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine.getName() << ")" << std::endl; + fetchLine = fetchLine->fetchNext(currentInstruction.getType(), gen); + if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine->getName() << ")" << std::endl; mopIndex = 0; - while (mopIndex < fetchLine.getSize()) { + while (mopIndex < fetchLine->getSize()) { int topCycle = cycle; if (instrIndex >= currentInstruction.getInfo().getSize()) { if (portsSaturated) break; - currentInstruction = LightInstruction::createForSlot(gen, fetchLine.getCounts()[mopIndex], fetchLine.getSize() == mopIndex + 1, fetchLine.getIndex() == 0 && mopIndex == 0); + currentInstruction = LightInstruction::createForSlot(gen, fetchLine->getCounts()[mopIndex], fetchLine->getSize() == mopIndex + 1, fetchLine->getIndex() == 0 && mopIndex == 0); instrIndex = 0; if (TRACE) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; } MacroOp& mop = currentInstruction.getInfo().getOp(instrIndex); - if (fetchLine.getCounts()[mopIndex] != mop.getSize()) { - if (TRACE) std::cout << "ERROR instruction " << mop.getName() << " doesn't fit into slot of size " << fetchLine.getCounts()[mopIndex] << std::endl; - return; + if (fetchLine->getCounts()[mopIndex] != mop.getSize()) { + if (TRACE) std::cout << 
"ERROR instruction " << mop.getName() << " doesn't fit into slot of size " << fetchLine->getCounts()[mopIndex] << std::endl; + return DBL_MIN; } if (TRACE) std::cout << mop.getName() << " "; @@ -831,7 +828,7 @@ namespace RandomX { mop.setCycle(scheduleCycle); if (scheduleCycle < 0) { if (TRACE) std::cout << "; Failed at cycle " << cycle << std::endl; - return; + return DBL_MIN; } if (instrIndex == currentInstruction.getInfo().getSrcOp()) { @@ -893,25 +890,29 @@ namespace RandomX { std::cout << "; (* = in use, _ = idle)" << std::endl; int portCycles = 0; - for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { + /*for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { std::cout << "; " << std::setw(3) << i << " "; for (int j = 0; j < 3; ++j) { std::cout << (portBusy[i][j] ? '*' : '_'); portCycles += !!portBusy[i][j]; } std::cout << std::endl; - } + }*/ + + double ipc = (macroOpCount / (double)retireCycle); std::cout << "; code size " << codeSize << " bytes" << std::endl; std::cout << "; x86 macro-ops: " << macroOpCount << std::endl; std::cout << "; RandomX instructions: " << outIndex << std::endl; std::cout << "; Execution time: " << retireCycle << " cycles" << std::endl; - std::cout << "; IPC = " << (macroOpCount / (double)retireCycle) << std::endl; + std::cout << "; IPC = " << ipc << std::endl; std::cout << "; Port-cycles: " << portCycles << std::endl; + std::cout << "; Multiplications: " << mulCount << std::endl; int asicLatency[8]; memset(asicLatency, 0, sizeof(asicLatency)); + for (int i = 0; i < outIndex; ++i) { Instruction& instr = prog(i); int latDst = asicLatency[instr.dst] + 1; @@ -919,7 +920,16 @@ namespace RandomX { asicLatency[instr.dst] = std::max(latDst, latSrc); } - std::cout << "; Multiplications: " << mulCount << std::endl; + int asicLatencyFinal = 0; + int addressReg = 0; + for (int i = 0; i < 8; ++i) { + if (asicLatency[i] > asicLatencyFinal) { + asicLatencyFinal = asicLatency[i]; + addressReg = i; + } + } + + std::cout << "; ASIC latency: " << asicLatencyFinal << 
std::endl; std::cout << "; ASIC latency:" << std::endl; for (int i = 0; i < 8; ++i) { @@ -931,5 +941,7 @@ namespace RandomX { } prog.setSize(outIndex); + prog.setAddressRegister(addressReg); + return addressReg; } } \ No newline at end of file diff --git a/src/LightProgramGenerator.hpp b/src/LightProgramGenerator.hpp index 8027aab..e7b1bda 100644 --- a/src/LightProgramGenerator.hpp +++ b/src/LightProgramGenerator.hpp @@ -20,6 +20,18 @@ along with RandomX. If not, see. #include "Program.hpp" namespace RandomX { - void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister, int nonce); - void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister, int nonce); + + class Blake2Generator { + public: + Blake2Generator(const void* seed, int nonce); + uint8_t getByte(); + uint32_t getInt32(); + private: + uint8_t data[64]; + size_t dataIndex; + + void checkData(const size_t); + }; + + double generateLightProg2(LightProgram& prog, Blake2Generator& gen); } \ No newline at end of file diff --git a/src/Program.hpp b/src/Program.hpp index 53c973b..2b81435 100644 --- a/src/Program.hpp +++ b/src/Program.hpp @@ -68,6 +68,12 @@ namespace RandomX { void setSize(uint32_t val) { size = val; } + int getAddressRegister() { + return addrReg; + } + void setAddressRegister(uint32_t val) { + addrReg = val; + } private: void print(std::ostream& os) const { for (unsigned i = 0; i < size; ++i) { @@ -77,6 +83,7 @@ namespace RandomX { } Instruction programBuffer[RANDOMX_LPROG_MAX_SIZE]; uint32_t size; + int addrReg; }; static_assert(sizeof(Program) % 64 == 0, "Invalid size of class Program"); diff --git a/src/asm/program_sshash_constants.inc b/src/asm/program_sshash_constants.inc new file mode 100644 index 0000000..a25a90e --- /dev/null +++ b/src/asm/program_sshash_constants.inc @@ -0,0 +1,16 @@ +r0_mul: ;# 6364136223846793005 + db 45, 127, 149, 76, 45, 244, 81, 88 +r1_add: ;# 9298410992540426048 + db 64, 159, 245, 89, 136, 151, 10, 129 +r2_add: ;# 
12065312585734608966 + db 70, 216, 194, 56, 223, 153, 112, 167 +r3_add: ;# 9306329213124610396 + db 92, 9, 34, 191, 28, 185, 38, 129 +r4_add: ;# 5281919268842080866 + db 98, 138, 159, 23, 151, 37, 77, 73 +r5_add: ;# 10536153434571861004 + db 12, 236, 170, 206, 185, 239, 55, 146 +r6_add: ;# 3398623926847679864 + db 120, 45, 230, 108, 116, 86, 42, 47 +r7_add: ;# 9549104520008361294 + db 78, 229, 44, 182, 247, 59, 133, 132 \ No newline at end of file diff --git a/src/asm/program_sshash_load.inc b/src/asm/program_sshash_load.inc new file mode 100644 index 0000000..a9ae9a2 --- /dev/null +++ b/src/asm/program_sshash_load.inc @@ -0,0 +1,8 @@ + ;xor r8, qword ptr [rbx+0] + ;xor r9, qword ptr [rbx+8] + ;xor r10, qword ptr [rbx+16] + ;xor r11, qword ptr [rbx+24] + ;xor r12, qword ptr [rbx+32] + ;xor r13, qword ptr [rbx+40] + ;xor r14, qword ptr [rbx+48] + ;xor r15, qword ptr [rbx+56] \ No newline at end of file diff --git a/src/asm/program_sshash_prefetch.inc b/src/asm/program_sshash_prefetch.inc new file mode 100644 index 0000000..78faba4 --- /dev/null +++ b/src/asm/program_sshash_prefetch.inc @@ -0,0 +1,4 @@ + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + ; prefetchnta byte ptr [rbx] \ No newline at end of file diff --git a/src/common.hpp b/src/common.hpp index 118f053..83a9bc7 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -41,7 +41,7 @@ namespace RandomX { static_assert((RANDOMX_SCRATCHPAD_L1 & (RANDOMX_SCRATCHPAD_L1 - 1)) == 0, "RANDOMX_SCRATCHPAD_L1 must be a power of 2."); static_assert(RANDOMX_CACHE_ACCESSES > 1, "RANDOMX_CACHE_ACCESSES must be greater than 1"); - constexpr int wtSum = RANDOMX_FREQ_IADD_R + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_IADD_RC + RANDOMX_FREQ_ISUB_R + \ + constexpr int wtSum = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_IADD_RC + RANDOMX_FREQ_ISUB_R + \ RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_9C + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + \ RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + 
RANDOMX_FREQ_ISMULH_M + RANDOMX_FREQ_IMUL_RCP + \ RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_ISWAP_R + \ @@ -141,6 +141,7 @@ namespace RandomX { typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, int_reg_t(&reg)[RegistersCount]); typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, uint8_t* /* scratchpad */, uint64_t); + typedef void(*DatasetInitFunc)(uint8_t* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock); } std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf); diff --git a/src/configuration.h b/src/configuration.h index 95c1412..72e44a4 100644 --- a/src/configuration.h +++ b/src/configuration.h @@ -37,7 +37,7 @@ along with RandomX. If not, see. //Number of random Cache accesses per Dataset block. Minimum is 2. #define RANDOMX_CACHE_ACCESSES 8 -#define RANDOMX_LPROG_LATENCY 168 +#define RANDOMX_LPROG_LATENCY 130 #define RANDOMX_LPROG_ASIC_LATENCY 84 #define RANDOMX_LPROG_MIN_SIZE 225 #define RANDOMX_LPROG_MAX_SIZE 512 @@ -80,12 +80,12 @@ Instruction frequencies (per 256 opcodes) Total sum of frequencies must be 256 */ -#define RANDOMX_FREQ_IADD_R 12 +#define RANDOMX_FREQ_IADD_RS 32 #define RANDOMX_FREQ_IADD_M 7 -#define RANDOMX_FREQ_IADD_RC 16 -#define RANDOMX_FREQ_ISUB_R 12 +#define RANDOMX_FREQ_IADD_RC 0 +#define RANDOMX_FREQ_ISUB_R 17 #define RANDOMX_FREQ_ISUB_M 7 -#define RANDOMX_FREQ_IMUL_9C 9 +#define RANDOMX_FREQ_IMUL_9C 0 #define RANDOMX_FREQ_IMUL_R 16 #define RANDOMX_FREQ_IMUL_M 4 #define RANDOMX_FREQ_IMULH_R 4 diff --git a/src/main.cpp b/src/main.cpp index 7f37a37..d5e4657 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -37,6 +37,7 @@ along with RandomX. If not, see.
#include "Cache.hpp" #include "hashAes1Rx4.hpp" #include "LightProgramGenerator.hpp" +#include "JitCompilerX86.hpp" const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; @@ -204,7 +205,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, Atomi } int main(int argc, char** argv) { - bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit, genLight; + bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit, genLight, useSuperscalar; int programCount, threadCount, initThreadCount, epoch; readOption("--softAes", argc, argv, softAes); @@ -220,14 +221,16 @@ int main(int argc, char** argv) { readOption("--genNative", argc, argv, genNative); readOption("--help", argc, argv, help); readOption("--genLight", argc, argv, genLight); + readOption("--useSuperscalar", argc, argv, useSuperscalar); if (genLight) { RandomX::LightProgram p; - RandomX::generateLightProg2(p, seed, 0, programCount); - //RandomX::AssemblyGeneratorX86 asmX86; - //asmX86.generateProgram(p); + RandomX::Blake2Generator gen(seed, programCount); + RandomX::generateLightProg2(p, gen); + RandomX::AssemblyGeneratorX86 asmX86; + asmX86.generateProgram(p); //std::ofstream file("lightProg2.asm"); - //asmX86.printCode(std::cout); + asmX86.printCode(std::cout); return 0; } @@ -287,24 +290,37 @@ int main(int argc, char** argv) { dataset.dataset.size = datasetSize; RandomX::datasetAlloc(dataset, largePages); const uint64_t datasetBlockCount = datasetSize / RandomX::CacheLineSize; - if (initThreadCount > 1) { - auto perThread = datasetBlockCount / initThreadCount; - auto remainder = datasetBlockCount % initThreadCount; - for (int i = 0; i < initThreadCount; ++i) { - auto count = perThread + (i == initThreadCount - 1 ? 
remainder : 0); - threads.push_back(std::thread(&RandomX::datasetInit, std::ref(cache), std::ref(dataset.dataset), i * perThread, count)); - } - for (unsigned i = 0; i < threads.size(); ++i) { - threads[i].join(); + if (useSuperscalar) { + RandomX::Blake2Generator gen(seed, programCount); + RandomX::LightProgram programs[RANDOMX_CACHE_ACCESSES]; + for (int i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { + RandomX::generateLightProg2(programs[i], gen); } + RandomX::JitCompilerX86 jit86; + jit86.generateSuperScalarHash(programs); + jit86.getDatasetInitFunc()(cache.memory, dataset.dataset.memory, 0, datasetBlockCount); } else { - RandomX::datasetInit(cache, dataset.dataset, 0, datasetBlockCount); + if (initThreadCount > 1) { + auto perThread = datasetBlockCount / initThreadCount; + auto remainder = datasetBlockCount % initThreadCount; + for (int i = 0; i < initThreadCount; ++i) { + auto count = perThread + (i == initThreadCount - 1 ? remainder : 0); + threads.push_back(std::thread(&RandomX::datasetInit, std::ref(cache), std::ref(dataset.dataset), i * perThread, count)); + } + for (unsigned i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + } + else { + RandomX::datasetInit(cache, dataset.dataset, 0, datasetBlockCount); + } } RandomX::deallocCache(cache, largePages); threads.clear(); std::cout << "Dataset (" << datasetSize << " bytes) initialized in " << sw.getElapsed() << " s" << std::endl; } + return 0; std::cout << "Initializing " << threadCount << " virtual machine(s) ..." << std::endl; for (int i = 0; i < threadCount; ++i) { RandomX::VirtualMachine* vm;