SuperscalarHash JIT compiler

(unfinished)
This commit is contained in:
tevador 2019-04-06 12:00:56 +02:00
parent 690707ef49
commit 77dbe14658
18 changed files with 453 additions and 135 deletions

View file

@ -97,14 +97,12 @@ namespace RandomX {
}
//1 uOP
void AssemblyGeneratorX86::h_IADD_R(Instruction& instr, int i) {
void AssemblyGeneratorX86::h_IADD_RS(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
asmCode << "\tadd " << regR[instr.dst] << ", " << regR[instr.src] << std::endl;
}
else {
asmCode << "\tadd " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl;
}
if(instr.dst == 5)
asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << std::showpos << (int32_t)instr.getImm32() << std::noshowpos << "]" << std::endl;
else
asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << "]" << std::endl;
traceint(instr);
}
@ -517,7 +515,7 @@ namespace RandomX {
InstructionGenerator AssemblyGeneratorX86::engine[256] = {
//Integer
INST_HANDLE(IADD_R)
INST_HANDLE(IADD_RS)
INST_HANDLE(IADD_M)
INST_HANDLE(IADD_RC)
INST_HANDLE(ISUB_R)

View file

@ -68,7 +68,7 @@ namespace RandomX {
void traceflt(Instruction&);
void tracenop(Instruction&);
void h_IADD_R(Instruction&, int);
void h_IADD_RS(Instruction&, int);
void h_IADD_M(Instruction&, int);
void h_IADD_RC(Instruction&, int);
void h_ISUB_R(Instruction&, int);

View file

@ -40,9 +40,9 @@ namespace RandomX {
os << "L3" << "[" << (getImm32() & ScratchpadL3Mask) << "]";
}
void Instruction::h_IADD_R(std::ostream& os) const {
void Instruction::h_IADD_RS(std::ostream& os) const {
if (src != dst) {
os << "r" << (int)dst << ", r" << (int)src << std::endl;
os << "r" << (int)dst << ", r" << (int)src << ", LSH " << (int)(mod % 4) << std::endl;
}
else {
os << "r" << (int)dst << ", " << (int32_t)getImm32() << std::endl;
@ -302,13 +302,13 @@ namespace RandomX {
}
void Instruction::h_COND_R(std::ostream& os) const {
os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << (int32_t)getImm32() << "), " << (int)(mod >> 5) << std::endl;
os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << (int32_t)getImm32() << "), LSH " << (int)(mod >> 5) << std::endl;
}
void Instruction::h_COND_M(std::ostream& os) const {
os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(";
genAddressReg(os);
os << ", " << (int32_t)getImm32() << "), " << (int)(mod >> 5) << std::endl;
os << ", " << (int32_t)getImm32() << "), LSH " << (int)(mod >> 5) << std::endl;
}
void Instruction::h_ISTORE(std::ostream& os) const {
@ -333,7 +333,7 @@ namespace RandomX {
const char* Instruction::names[256] = {
//Integer
INST_NAME(IADD_R)
INST_NAME(IADD_RS)
INST_NAME(IADD_M)
INST_NAME(IADD_RC)
INST_NAME(ISUB_R)
@ -379,7 +379,7 @@ namespace RandomX {
InstructionVisualizer Instruction::engine[256] = {
//Integer
INST_HANDLE(IADD_R)
INST_HANDLE(IADD_RS)
INST_HANDLE(IADD_M)
INST_HANDLE(IADD_RC)
INST_HANDLE(ISUB_R)

View file

@ -98,7 +98,7 @@ namespace RandomX {
void genAddressImm(std::ostream& os) const;
void genAddressRegDst(std::ostream&) const;
void h_IADD_R(std::ostream&) const;
void h_IADD_RS(std::ostream&) const;
void h_IADD_M(std::ostream&) const;
void h_IADD_RC(std::ostream&) const;
void h_ISUB_R(std::ostream&) const;

View file

@ -442,7 +442,7 @@ namespace RandomX {
auto& instr = program(i);
auto& ibc = byteCode[i];
switch (instr.opcode) {
CASE_REP(IADD_R) {
CASE_REP(IADD_RS) {
auto dst = instr.dst % RegistersCount;
auto src = instr.src % RegistersCount;
ibc.type = InstructionType::IADD_R;

View file

@ -26,9 +26,14 @@ PUBLIC randomx_program_start
PUBLIC randomx_program_read_dataset
PUBLIC randomx_program_read_dataset_light
PUBLIC randomx_program_read_dataset_light_sub
PUBLIC randomx_dataset_init
PUBLIC randomx_program_loop_store
PUBLIC randomx_program_loop_end
PUBLIC randomx_program_epilogue
PUBLIC randomx_sshash_load
PUBLIC randomx_sshash_prefetch
PUBLIC randomx_sshash_end
PUBLIC randomx_sshash_init
PUBLIC randomx_program_end
ALIGN 64
@ -75,11 +80,93 @@ randomx_program_read_dataset_light_sub PROC
include asm/squareHash.inc
randomx_program_read_dataset_light_sub ENDP
ALIGN 64
;# randomx_dataset_init (Win64 calling convention)
;#   rcx = cache pointer, rdx = dataset output pointer,
;#   r8d = first block index, r9d = end block index (exclusive upper bound).
;# For each block index it calls the code located 32768 bytes after the start of
;# this routine and then stores the resulting r8-r15 as one 64-byte dataset item.
;# The loop body always runs at least once (the bound is checked at the bottom).
;# The block indices are 32-bit arguments; the Win64 ABI leaves the upper halves of
;# r8/r9 undefined for such arguments, so both are zero-extended before use.
randomx_dataset_init PROC
push rbx
push rbp
push rdi
push rsi
push r12
push r13
push r14
push r15
mov rdi, rcx ;# cache
mov rsi, rdx ;# dataset
mov ebp, r8d ;# block index (32-bit argument, zero-extended into rbp)
mov r9d, r9d ;# zero-extend the 32-bit max. block index
push r9 ;# max. block index
init_block_loop:
prefetchw byte ptr [rsi]
mov rbx, rbp
db 232 ;# 0xE8 = call
dd 32768 - distance ;# rel32 to offset 32768 from the start of this routine
distance equ $ - offset randomx_dataset_init
mov qword ptr [rsi+0], r8
mov qword ptr [rsi+8], r9
mov qword ptr [rsi+16], r10
mov qword ptr [rsi+24], r11
mov qword ptr [rsi+32], r12
mov qword ptr [rsi+40], r13
mov qword ptr [rsi+48], r14
mov qword ptr [rsi+56], r15
add rbp, 1
add rsi, 64
cmp rbp, qword ptr [rsp] ;# compare with the max. block index saved on the stack
jb init_block_loop
pop r9
pop r15
pop r14
pop r13
pop r12
pop rsi
pop rdi
pop rbp
pop rbx
ret
randomx_dataset_init ENDP
ALIGN 64
randomx_program_epilogue PROC
include asm/program_epilogue_win64.inc
randomx_program_epilogue ENDP
ALIGN 64
;# Code fragment whose bytes the JIT compiler emits after each generated program.
;# Its size is measured as the distance to randomx_sshash_prefetch, so nothing may be
;# placed between the two procedures.
randomx_sshash_load PROC
include asm/program_sshash_load.inc
randomx_sshash_load ENDP
;# Code fragment that the JIT compiler emits between consecutive generated programs.
;# Its size is measured as the distance to randomx_sshash_end.
randomx_sshash_prefetch PROC
include asm/program_sshash_prefetch.inc
randomx_sshash_prefetch ENDP
;# End marker: its address is only used to compute the size of randomx_sshash_prefetch.
randomx_sshash_end PROC
nop
randomx_sshash_end ENDP
ALIGN 64
;# Initial part of the SuperscalarHash routine. The JIT compiler copies everything from
;# here up to randomx_program_end (this code, padding and the constants) and appends the
;# generated code right after the copy.
;# Input: rbx = block index. Output: r8-r15 initialized from the block index.
randomx_sshash_init PROC
lea r8, [rbx+1] ;# r8 = block index + 1
include asm/program_sshash_prefetch.inc
imul r8, qword ptr [r0_mul] ;# r8 = (block index + 1) * r0_mul
;# r9-r15 = constant XOR r8 (the constants are named rN_add, but they are combined with xor)
mov r9, qword ptr [r1_add]
xor r9, r8
mov r10, qword ptr [r2_add]
xor r10, r8
mov r11, qword ptr [r3_add]
xor r11, r8
mov r12, qword ptr [r4_add]
xor r12, r8
mov r13, qword ptr [r5_add]
xor r13, r8
mov r14, qword ptr [r6_add]
xor r14, r8
mov r15, qword ptr [r7_add]
xor r15, r8
jmp randomx_program_end ;# skip over the constants that follow this procedure
randomx_sshash_init ENDP
ALIGN 64
include asm/program_sshash_constants.inc
ALIGN 64
randomx_program_end PROC
nop

View file

@ -27,6 +27,11 @@ extern "C" {
void randomx_program_loop_store();
void randomx_program_loop_end();
void randomx_program_read_dataset_light_sub();
void randomx_dataset_init();
void randomx_program_epilogue();
void randomx_sshash_load();
void randomx_sshash_prefetch();
void randomx_sshash_end();
void randomx_sshash_init();
void randomx_program_end();
}

View file

@ -88,29 +88,40 @@ namespace RandomX {
#include "JitCompilerX86-static.hpp"
#define NOP_TEST true
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin;
const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load;
const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start;
const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset;
const uint8_t* codeReadDatasetLight = (uint8_t*)&randomx_program_read_dataset_light;
const uint8_t* codeDatasetInit = (uint8_t*)&randomx_dataset_init;
const uint8_t* codeLoopStore = (uint8_t*)&randomx_program_loop_store;
const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end;
const uint8_t* codeReadDatasetLightSub = (uint8_t*)&randomx_program_read_dataset_light_sub;
const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
const uint8_t* codeShhLoad = (uint8_t*)&randomx_sshash_load;
const uint8_t* codeShhPrefetch = (uint8_t*)&randomx_sshash_prefetch;
const uint8_t* codeShhEnd = (uint8_t*)&randomx_sshash_end;
const uint8_t* codeShhInit = (uint8_t*)&randomx_sshash_init;
const int32_t prologueSize = codeLoopBegin - codePrologue;
const int32_t epilogueSize = codeProgramEnd - codeEpilogue;
const int32_t loopLoadSize = codeProgamStart - codeLoopLoad;
const int32_t readDatasetSize = codeReadDatasetLight - codeReadDataset;
const int32_t readDatasetLightSize = codeLoopStore - codeReadDatasetLight;
const int32_t loopStoreSize = codeLoopEnd - codeLoopStore;
const int32_t readDatasetLightSubSize = codeEpilogue - codeReadDatasetLightSub;
const int32_t readDatasetLightSubSize = codeDatasetInit - codeReadDatasetLightSub;
const int32_t datasetInitSize = codeEpilogue - codeDatasetInit;
const int32_t epilogueSize = codeShhLoad - codeEpilogue;
const int32_t codeSshLoadSize = codeShhPrefetch - codeShhLoad;
const int32_t codeSshPrefetchSize = codeShhEnd - codeShhPrefetch;
const int32_t codeSshInitSize = codeProgramEnd - codeShhInit;
const int32_t epilogueOffset = CodeSize - epilogueSize;
const int32_t readDatasetLightSubOffset = epilogueOffset - readDatasetLightSubSize;
constexpr int32_t superScalarHashOffset = 32768;
static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 };
static const uint8_t REX_ADD_RM[] = { 0x4c, 0x03 };
@ -166,7 +177,7 @@ namespace RandomX {
static const uint8_t SQRTPD[] = { 0x66, 0x0f, 0x51 };
static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x89, 0x44, 0x24, 0xF8, 0x0F, 0xAE, 0x54, 0x24, 0xF8 };
static const uint8_t ROL_RAX[] = { 0x48, 0xc1, 0xc0 };
static const uint8_t XOR_ECX_ECX[] = { 0x33, 0xC9 };
static const uint8_t XOR_RCX_RCX[] = { 0x48, 0x33, 0xC9 };
static const uint8_t REX_CMP_R32I[] = { 0x41, 0x81 };
static const uint8_t REX_CMP_M32I[] = { 0x81, 0x3c, 0x06 };
static const uint8_t MOVAPD[] = { 0x66, 0x0f, 0x29 };
@ -184,6 +195,18 @@ namespace RandomX {
static const uint8_t REX_ADD_I[] = { 0x49, 0x81 };
static const uint8_t REX_TEST[] = { 0x49, 0xF7 };
static const uint8_t JZ[] = { 0x0f, 0x84 };
static const uint8_t RET = 0xc3;
static const uint8_t NOP1[] = { 0x90 };
static const uint8_t NOP2[] = { 0x66, 0x90 };
static const uint8_t NOP3[] = { 0x0F, 0x1F, 0x00 };
static const uint8_t NOP4[] = { 0x0F, 0x1F, 0x40, 0x00 };
static const uint8_t NOP5[] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 };
static const uint8_t NOP6[] = { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 };
static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 };
static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };
static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 };
size_t JitCompilerX86::getCodeSize() {
return codePos - prologueSize;
@ -196,6 +219,10 @@ namespace RandomX {
memcpy(code + readDatasetLightSubOffset, codeReadDatasetLightSub, readDatasetLightSubSize);
}
// Releases the CodeSize-byte code buffer.
// NOTE(review): the class now owns `code` through its destructor; copying a
// JitCompilerX86 would presumably free the buffer twice - confirm that copy
// construction/assignment is disabled or never used.
JitCompilerX86::~JitCompilerX86() {
freePagedMemory(code, CodeSize);
}
void JitCompilerX86::generateProgram(Program& prog) {
generateProgramPrologue(prog);
memcpy(code + codePos, codeReadDataset, readDatasetSize);
@ -216,6 +243,42 @@ namespace RandomX {
generateProgramEpilogue(prog);
}
// Compiles N light programs into one SuperscalarHash routine placed at
// superScalarHashOffset in the code buffer.
//
// Layout: the static init fragment, then for each program its compiled instructions
// followed by the "load" fragment. Between two programs a register move selected by the
// program's address register and the "prefetch" fragment are emitted, and the code is
// padded with NOPs up to the next 16-byte boundary. A final RET terminates the routine.
//
// Note: the src/dst fields of every instruction in `programs` are reduced modulo
// RegistersCount in place.
template<size_t N>
void JitCompilerX86::generateSuperScalarHash(LightProgram(&programs)[N]) {
	memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize);
	codePos = superScalarHashOffset + codeSshInitSize;
	unsigned progIndex = 0;
	for (LightProgram& current : programs) {
		for (unsigned k = 0; k < current.getSize(); ++k) {
			Instruction& ins = current(k);
			ins.src %= RegistersCount;
			ins.dst %= RegistersCount;
			generateCode(ins, k);
		}
		emit(codeShhLoad, codeSshLoadSize);
		const bool hasSuccessor = progIndex + 1 < N;
		if (hasSuccessor) {
			emit(REX_MOV_RR64);
			emitByte(0xd8 + current.getAddressRegister());
			emit(codeShhPrefetch, codeSshPrefetchSize);
			// pad to a 16-byte boundary using NOPs of at most 8 bytes each
			for (int misalign = (codePos % 16); misalign != 0; misalign = (codePos % 16)) {
				int padBytes = 16 - misalign;
				if (padBytes > 8) {
					padBytes = 8;
				}
				emit(NOPX[padBytes - 1], padBytes);
			}
		}
		++progIndex;
	}
	emitByte(RET);
}
template
void JitCompilerX86::generateSuperScalarHash(LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
// Copies the static dataset initialization routine (datasetInitSize bytes) to the
// very beginning of the code buffer.
// NOTE(review): getProgramFunc() also returns the start of `code`, so this presumably
// overwrites the program entry code - confirm one compiler instance is not used for both.
void JitCompilerX86::generateDatasetInitCode() {
memcpy(code, codeDatasetInit, datasetInitSize);
}
void JitCompilerX86::generateProgramPrologue(Program& prog) {
#ifdef RANDOMX_JUMP
instructionOffsets.clear();
@ -253,7 +316,6 @@ namespace RandomX {
emit32(prologueSize - codePos - 4);
emitByte(JMP);
emit32(epilogueOffset - codePos - 4);
emitByte(0x90);
}
void JitCompilerX86::generateCode(Instruction& instr, int i) {
@ -287,9 +349,9 @@ namespace RandomX {
emit32(instr.getImm32() & ScratchpadL3Mask);
}
void JitCompilerX86::h_IADD_R(Instruction& instr, int i) {
void JitCompilerX86::h_IADD_RS(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
/*if (instr.src != instr.dst) {
emit(REX_ADD_RR);
emitByte(0xc0 + 8 * instr.dst + instr.src);
}
@ -297,7 +359,19 @@ namespace RandomX {
emit(REX_81);
emitByte(0xc0 + instr.dst);
emit32(instr.getImm32());
}*/
if (false && NOP_TEST) {
emit(NOP4);
return;
}
emit(REX_LEA);
if (instr.dst == 5) //rbp,r13 cannot be the base register without offset
emitByte(0xac);
else
emitByte(0x04 + 8 * instr.dst);
genSIB(instr.mod % 4, instr.src, instr.dst);
if (instr.dst == 5)
emit32(instr.getImm32());
}
void JitCompilerX86::h_IADD_M(Instruction& instr, int i) {
@ -330,10 +404,18 @@ namespace RandomX {
void JitCompilerX86::h_ISUB_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
if (false && NOP_TEST) {
emit(NOP3);
return;
}
emit(REX_SUB_RR);
emitByte(0xc0 + 8 * instr.dst + instr.src);
}
else {
if (false && NOP_TEST) {
emit(NOP7);
return;
}
emit(REX_81);
emitByte(0xe8 + instr.dst);
emit32(instr.getImm32());
@ -366,10 +448,18 @@ namespace RandomX {
void JitCompilerX86::h_IMUL_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
if (false && NOP_TEST) {
emit(NOP4);
return;
}
emit(REX_IMUL_RR);
emitByte(0xc0 + 8 * instr.dst + instr.src);
}
else {
if (false && NOP_TEST) {
emit(NOP7);
return;
}
emit(REX_IMUL_RRI);
emitByte(0xc0 + 9 * instr.dst);
emit32(instr.getImm32());
@ -393,6 +483,12 @@ namespace RandomX {
void JitCompilerX86::h_IMULH_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (false && NOP_TEST) {
emit(NOP3);
emit(NOP3);
emit(NOP3);
return;
}
emit(REX_MOV_RR64);
emitByte(0xc0 + instr.dst);
emit(REX_MUL_R);
@ -422,6 +518,12 @@ namespace RandomX {
void JitCompilerX86::h_ISMULH_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (false && NOP_TEST) {
emit(NOP3);
emit(NOP3);
emit(NOP3);
return;
}
emit(REX_MOV_RR64);
emitByte(0xc0 + instr.dst);
emit(REX_MUL_R);
@ -451,6 +553,13 @@ namespace RandomX {
void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) {
if (instr.getImm32() != 0) {
if (false && NOP_TEST) {
emitByte(0x66);
emitByte(0x66);
emit(NOP8);
emit(NOP4);
return;
}
registerUsage[instr.dst] = i;
emit(MOV_RAX_I);
emit64(reciprocal(instr.getImm32()));
@ -472,10 +581,18 @@ namespace RandomX {
void JitCompilerX86::h_IXOR_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
if (false && NOP_TEST) {
emit(NOP3);
return;
}
emit(REX_XOR_RR);
emitByte(0xc0 + 8 * instr.dst + instr.src);
}
else {
if (false && NOP_TEST) {
emit(NOP7);
return;
}
emit(REX_XOR_RI);
emitByte(0xf0 + instr.dst);
emit32(instr.getImm32());
@ -500,12 +617,21 @@ namespace RandomX {
void JitCompilerX86::h_IROR_R(Instruction& instr, int i) {
registerUsage[instr.dst] = i;
if (instr.src != instr.dst) {
if (false && NOP_TEST) {
emit(NOP3);
emit(NOP3);
return;
}
emit(REX_MOV_RR);
emitByte(0xc8 + instr.src);
emit(REX_ROT_CL);
emitByte(0xc8 + instr.dst);
}
else {
if (NOP_TEST) {
emit(NOP4);
return;
}
emit(REX_ROT_I8);
emitByte(0xc8 + instr.dst);
emitByte(instr.getImm32() & 63);
@ -700,6 +826,12 @@ namespace RandomX {
const int conditionMask = ((1 << RANDOMX_CONDITION_BITS) - 1) << shift;
int reg = getConditionRegister();
int target = registerUsage[reg] + 1;
if (false && NOP_TEST) {
emit(NOP7);
emit(NOP7);
emit(NOP6);
}
else {
emit(REX_ADD_I);
emitByte(0xc0 + reg);
emit32(1 << shift);
@ -708,6 +840,7 @@ namespace RandomX {
emit32(conditionMask);
emit(JZ);
emit32(instructionOffsets[target] - (codePos + 4));
}
for (unsigned j = 0; j < 8; ++j) { //mark all registers as used
registerUsage[j] = i;
}
@ -717,7 +850,14 @@ namespace RandomX {
#ifdef RANDOMX_JUMP
handleCondition(instr, i);
#endif
emit(XOR_ECX_ECX);
if (false && NOP_TEST) {
emit(NOP3);
emit(NOP7);
emit(NOP3);
emit(NOP3);
return;
}
emit(XOR_RCX_RCX);
emit(REX_CMP_R32I);
emitByte(0xf8 + instr.src);
emit32(instr.getImm32());
@ -732,7 +872,7 @@ namespace RandomX {
#ifdef RANDOMX_JUMP
handleCondition(instr, i);
#endif
emit(XOR_ECX_ECX);
emit(XOR_RCX_RCX);
genAddressReg(instr);
emit(REX_CMP_M32I);
emit32(instr.getImm32());
@ -765,7 +905,7 @@ namespace RandomX {
#define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x))
InstructionGeneratorX86 JitCompilerX86::engine[256] = {
INST_HANDLE(IADD_R)
INST_HANDLE(IADD_RS)
INST_HANDLE(IADD_M)
INST_HANDLE(IADD_RC)
INST_HANDLE(ISUB_R)

View file

@ -27,6 +27,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
namespace RandomX {
class Program;
class LightProgram;
class JitCompilerX86;
typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int);
@ -36,11 +37,18 @@ namespace RandomX {
class JitCompilerX86 {
public:
JitCompilerX86();
~JitCompilerX86();
void generateProgram(Program&);
void generateProgramLight(Program&);
template<size_t N>
void generateSuperScalarHash(LightProgram (&programs)[N]);
ProgramFunc getProgramFunc() {
return (ProgramFunc)code;
}
DatasetInitFunc getDatasetInitFunc() {
generateDatasetInitCode();
return (DatasetInitFunc)code;
}
uint8_t* getCode() {
return code;
}
@ -62,6 +70,8 @@ namespace RandomX {
}
}
void generateDatasetInitCode();
void generateProgramPrologue(Program&);
void generateProgramEpilogue(Program&);
int getConditionRegister();
@ -100,13 +110,15 @@ namespace RandomX {
template<size_t N>
void emit(const uint8_t (&src)[N]) {
for (unsigned i = 0; i < N; ++i) {
code[codePos + i] = src[i];
}
codePos += N;
emit(src, N);
}
void h_IADD_R(Instruction&, int);
void emit(const uint8_t* src, size_t count) {
memcpy(code + codePos, src, count);
codePos += count;
}
void h_IADD_RS(Instruction&, int);
void h_IADD_M(Instruction&, int);
void h_IADD_RC(Instruction&, int);
void h_ISUB_R(Instruction&, int);

View file

@ -26,6 +26,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include <algorithm>
#include <stdexcept>
#include <iomanip>
#include "LightProgramGenerator.hpp"
namespace RandomX {
// Intel Ivy Bridge reference
@ -47,8 +48,8 @@ namespace RandomX {
}
namespace LightInstructionOpcode {
constexpr int IADD_R = 0;
constexpr int IADD_RC = RANDOMX_FREQ_IADD_R + RANDOMX_FREQ_IADD_M;
constexpr int IADD_RS = 0;
constexpr int IADD_RC = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M;
constexpr int ISUB_R = IADD_RC + RANDOMX_FREQ_IADD_RC;
constexpr int IMUL_9C = ISUB_R + RANDOMX_FREQ_ISUB_R + RANDOMX_FREQ_ISUB_M;
constexpr int IMUL_R = IMUL_9C + RANDOMX_FREQ_IMUL_9C;
@ -65,20 +66,18 @@ namespace RandomX {
}
const int lightInstructionOpcode[] = {
LightInstructionOpcode::IADD_R,
LightInstructionOpcode::IADD_R,
LightInstructionOpcode::IADD_RC,
LightInstructionOpcode::ISUB_R,
LightInstructionOpcode::IMUL_9C,
LightInstructionOpcode::IMUL_R,
LightInstructionOpcode::IMUL_R,
LightInstructionOpcode::IADD_RS,
LightInstructionOpcode::ISUB_R, //ISUB_R
LightInstructionOpcode::ISUB_R, //ISUB_R
LightInstructionOpcode::IMUL_R, //IMUL_R
LightInstructionOpcode::IMUL_R, //IMUL_C
LightInstructionOpcode::IMULH_R,
LightInstructionOpcode::ISMULH_R,
LightInstructionOpcode::IMUL_RCP,
LightInstructionOpcode::IXOR_R,
LightInstructionOpcode::IXOR_R,
LightInstructionOpcode::IROR_R,
LightInstructionOpcode::IROR_R,
LightInstructionOpcode::IXOR_R, //IXOR_R
LightInstructionOpcode::IXOR_R, //IXOR_C
LightInstructionOpcode::IROR_R, //IROR_R
LightInstructionOpcode::IROR_R, //IROR_C
LightInstructionOpcode::COND_R
};
@ -93,37 +92,30 @@ namespace RandomX {
constexpr type P015 = 6;
}
class Blake2Generator {
public:
Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) {
Blake2Generator::Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) {
memset(data, 0, sizeof(data));
memcpy(data, seed, SeedSize);
store32(&data[60], nonce);
}
uint8_t getByte() {
uint8_t Blake2Generator::getByte() {
checkData(1);
return data[dataIndex++];
}
uint32_t getInt32() {
uint32_t Blake2Generator::getInt32() {
checkData(4);
auto ret = load32(&data[dataIndex]);
dataIndex += 4;
return ret;
}
private:
uint8_t data[64];
size_t dataIndex;
void checkData(const size_t bytesNeeded) {
void Blake2Generator::checkData(const size_t bytesNeeded) {
if (dataIndex + bytesNeeded > sizeof(data)) {
blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0);
dataIndex = 0;
}
}
};
class RegisterInfo {
public:
@ -201,7 +193,7 @@ namespace RandomX {
static const MacroOp Xor_ri;
static const MacroOp Ror_rcl;
static const MacroOp Ror_ri;
static const MacroOp TestJmp_fused;
static const MacroOp TestJz_fused;
static const MacroOp Xor_self;
static const MacroOp Cmp_ri;
static const MacroOp Setcc_r;
@ -235,13 +227,13 @@ namespace RandomX {
const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3);
const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015);
const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05);
const MacroOp MacroOp::TestJmp_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5);
const MacroOp MacroOp::TestJz_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5);
const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr };
const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr };
const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) };
const MacroOp IROR_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Ror_rcl };
const MacroOp COND_R_ops_array[] = { MacroOp::Add_ri, MacroOp(MacroOp::TestJmp_fused, true), MacroOp::Xor_self, MacroOp::Cmp_ri, MacroOp(MacroOp::Setcc_r, true), MacroOp(MacroOp::Add_rr, true) };
const MacroOp COND_R_ops_array[] = { MacroOp::Add_ri, MacroOp(MacroOp::TestJz_fused, true), MacroOp::Xor_self, MacroOp::Cmp_ri, MacroOp(MacroOp::Setcc_r, true), MacroOp(MacroOp::Add_rr, true) };
class LightInstructionInfo {
@ -349,7 +341,7 @@ namespace RandomX {
class DecoderBuffer {
public:
static DecoderBuffer Default;
static const DecoderBuffer Default;
template <size_t N>
DecoderBuffer(const char* name, int index, const int(&arr)[N])
: name_(name), index_(index), counts_(arr), opsCount_(N) {}
@ -365,17 +357,17 @@ namespace RandomX {
const char* getName() const {
return name_;
}
const DecoderBuffer& fetchNext(int prevType, Blake2Generator& gen) {
const DecoderBuffer* fetchNext(int prevType, Blake2Generator& gen) const {
if (prevType == LightInstructionType::IMULH_R || prevType == LightInstructionType::ISMULH_R)
return decodeBuffer3310; //2-1-1 decode
return &decodeBuffer3310; //2-1-1 decode
if (index_ == 0) {
return decodeBuffer4444; //IMUL_RCP end
}
if (index_ == 2) {
return decodeBuffer133; //COND_R middle
return &decodeBuffer4444; //IMUL_RCP end
}
/*if (index_ == 2) {
return &decodeBuffer133; //COND_R middle
}*/
if (index_ == 7) {
return decodeBuffer7333; //COND_R end
return &decodeBuffer7333; //COND_R end
}
return fetchNextDefault(gen);
}
@ -393,12 +385,12 @@ namespace RandomX {
static const DecoderBuffer decodeBuffer3373;
static const DecoderBuffer decodeBuffer133;
static const DecoderBuffer* decodeBuffers[7];
const DecoderBuffer& fetchNextDefault(Blake2Generator& gen) {
const DecoderBuffer* fetchNextDefault(Blake2Generator& gen) const {
int select;
do {
select = gen.getByte() & 7;
} while (select == 7);
return *decodeBuffers[select];
return decodeBuffers[select];
}
};
@ -420,7 +412,7 @@ namespace RandomX {
&DecoderBuffer::decodeBuffer3373,
};
DecoderBuffer DecoderBuffer::Default = DecoderBuffer();
const DecoderBuffer DecoderBuffer::Default = DecoderBuffer();
const LightInstructionInfo* slot_3[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R };
const LightInstructionInfo* slot_3L[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R };
@ -472,7 +464,7 @@ namespace RandomX {
case 4:
return create(slot_4[gen.getByte() & 3], gen);
case 7:
if (isLast) {
if (false && isLast) {
return create(slot_7L, gen);
}
else {
@ -595,7 +587,7 @@ namespace RandomX {
bool selectDestination(int cycle, RegisterInfo (&registers)[8], Blake2Generator& gen) {
std::vector<int> availableRegisters;
for (unsigned i = 0; i < 8; ++i) {
if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_))
if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_.getType() != LightInstructionType::IADD_RS || i != 5))
availableRegisters.push_back(i);
}
return selectRegister(availableRegisters, gen, dst_);
@ -607,6 +599,12 @@ namespace RandomX {
if (registers[i].latency <= cycle)
availableRegisters.push_back(i);
}
if (availableRegisters.size() == 2 && info_.getType() == LightInstructionType::IADD_RS) {
if (availableRegisters[0] == 5 || availableRegisters[1] == 5) {
opGroupPar_ = src_ = 5;
return true;
}
}
if (selectRegister(availableRegisters, gen, src_)) {
if (groupParIsSource_)
opGroupPar_ = src_;
@ -666,7 +664,7 @@ namespace RandomX {
constexpr int V4_SRC_INDEX_BITS = 3;
constexpr int V4_DST_INDEX_BITS = 3;
constexpr int CYCLE_MAP_SIZE = RANDOMX_LPROG_LATENCY + 3;
constexpr bool TRACE = true;
constexpr bool TRACE = false;
static int blakeCounter = 0;
@ -782,15 +780,14 @@ namespace RandomX {
}
}
void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister, int nonce) {
double generateLightProg2(LightProgram& prog, Blake2Generator& gen) {
ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3];
memset(portBusy, 0, sizeof(portBusy));
RegisterInfo registers[8];
Blake2Generator gen(seed, nonce);
std::vector<LightInstruction> instructions;
DecoderBuffer& fetchLine = DecoderBuffer::Default;
const DecoderBuffer* fetchLine = &DecoderBuffer::Default;
LightInstruction currentInstruction = LightInstruction::Null;
int instrIndex = 0;
int codeSize = 0;
@ -806,24 +803,24 @@ namespace RandomX {
constexpr int MAX_ATTEMPTS = 4;
while(!portsSaturated) {
fetchLine = fetchLine.fetchNext(currentInstruction.getType(), gen);
if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine.getName() << ")" << std::endl;
fetchLine = fetchLine->fetchNext(currentInstruction.getType(), gen);
if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine->getName() << ")" << std::endl;
mopIndex = 0;
while (mopIndex < fetchLine.getSize()) {
while (mopIndex < fetchLine->getSize()) {
int topCycle = cycle;
if (instrIndex >= currentInstruction.getInfo().getSize()) {
if (portsSaturated)
break;
currentInstruction = LightInstruction::createForSlot(gen, fetchLine.getCounts()[mopIndex], fetchLine.getSize() == mopIndex + 1, fetchLine.getIndex() == 0 && mopIndex == 0);
currentInstruction = LightInstruction::createForSlot(gen, fetchLine->getCounts()[mopIndex], fetchLine->getSize() == mopIndex + 1, fetchLine->getIndex() == 0 && mopIndex == 0);
instrIndex = 0;
if (TRACE) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl;
}
MacroOp& mop = currentInstruction.getInfo().getOp(instrIndex);
if (fetchLine.getCounts()[mopIndex] != mop.getSize()) {
if (TRACE) std::cout << "ERROR instruction " << mop.getName() << " doesn't fit into slot of size " << fetchLine.getCounts()[mopIndex] << std::endl;
return;
if (fetchLine->getCounts()[mopIndex] != mop.getSize()) {
if (TRACE) std::cout << "ERROR instruction " << mop.getName() << " doesn't fit into slot of size " << fetchLine->getCounts()[mopIndex] << std::endl;
return DBL_MIN;
}
if (TRACE) std::cout << mop.getName() << " ";
@ -831,7 +828,7 @@ namespace RandomX {
mop.setCycle(scheduleCycle);
if (scheduleCycle < 0) {
if (TRACE) std::cout << "; Failed at cycle " << cycle << std::endl;
return;
return DBL_MIN;
}
if (instrIndex == currentInstruction.getInfo().getSrcOp()) {
@ -893,25 +890,29 @@ namespace RandomX {
std::cout << "; (* = in use, _ = idle)" << std::endl;
int portCycles = 0;
for (int i = 0; i < CYCLE_MAP_SIZE; ++i) {
/*for (int i = 0; i < CYCLE_MAP_SIZE; ++i) {
std::cout << "; " << std::setw(3) << i << " ";
for (int j = 0; j < 3; ++j) {
std::cout << (portBusy[i][j] ? '*' : '_');
portCycles += !!portBusy[i][j];
}
std::cout << std::endl;
}
}*/
double ipc = (macroOpCount / (double)retireCycle);
std::cout << "; code size " << codeSize << " bytes" << std::endl;
std::cout << "; x86 macro-ops: " << macroOpCount << std::endl;
std::cout << "; RandomX instructions: " << outIndex << std::endl;
std::cout << "; Execution time: " << retireCycle << " cycles" << std::endl;
std::cout << "; IPC = " << (macroOpCount / (double)retireCycle) << std::endl;
std::cout << "; IPC = " << ipc << std::endl;
std::cout << "; Port-cycles: " << portCycles << std::endl;
std::cout << "; Multiplications: " << mulCount << std::endl;
int asicLatency[8];
memset(asicLatency, 0, sizeof(asicLatency));
for (int i = 0; i < outIndex; ++i) {
Instruction& instr = prog(i);
int latDst = asicLatency[instr.dst] + 1;
@ -919,7 +920,16 @@ namespace RandomX {
asicLatency[instr.dst] = std::max(latDst, latSrc);
}
std::cout << "; Multiplications: " << mulCount << std::endl;
int asicLatencyFinal = 0;
int addressReg = 0;
for (int i = 0; i < 8; ++i) {
if (asicLatency[i] > asicLatencyFinal) {
asicLatencyFinal = asicLatency[i];
addressReg = i;
}
}
std::cout << "; ASIC latency: " << asicLatencyFinal << std::endl;
std::cout << "; ASIC latency:" << std::endl;
for (int i = 0; i < 8; ++i) {
@ -931,5 +941,7 @@ namespace RandomX {
}
prog.setSize(outIndex);
prog.setAddressRegister(addressReg);
return addressReg;
}
}

View file

@ -20,6 +20,18 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include "Program.hpp"
namespace RandomX {
void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister, int nonce);
void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister, int nonce);
// Byte stream generator backed by a 64-byte buffer that is re-hashed with Blake2b
// whenever it does not hold enough unread bytes for the next request.
class Blake2Generator {
private:
	// Current 64-byte block of generator output.
	uint8_t data[64];
	// Offset of the next unread byte in `data`.
	size_t dataIndex;

public:
	// Initializes the buffer from `seed` and `nonce`.
	Blake2Generator(const void* seed, int nonce);

	// Returns the next byte of the stream.
	uint8_t getByte();

	// Returns the next 4 bytes of the stream as a 32-bit integer.
	uint32_t getInt32();

private:
	// Refills the buffer if fewer than `bytesNeeded` unread bytes remain.
	void checkData(size_t bytesNeeded);
};
double generateLightProg2(LightProgram& prog, Blake2Generator& gen);
}

View file

@ -68,6 +68,12 @@ namespace RandomX {
void setSize(uint32_t val) {
size = val;
}
// Returns the register index previously stored with setAddressRegister().
int getAddressRegister() {
return addrReg;
}
// Stores the index of the address register of this program.
// The value is kept as a signed int internally.
void setAddressRegister(uint32_t val) {
	addrReg = static_cast<int>(val);
}
private:
void print(std::ostream& os) const {
for (unsigned i = 0; i < size; ++i) {
@ -77,6 +83,7 @@ namespace RandomX {
}
Instruction programBuffer[RANDOMX_LPROG_MAX_SIZE];
uint32_t size;
int addrReg;
};
static_assert(sizeof(Program) % 64 == 0, "Invalid size of class Program");

View file

@ -0,0 +1,16 @@
r0_mul: ;# 6364136223846793005
db 45, 127, 149, 76, 45, 244, 81, 88
r1_add: ;# 9298410992540426048
db 64, 159, 245, 89, 136, 151, 10, 129
r2_add: ;# 12065312585734608966
db 70, 216, 194, 56, 223, 153, 112, 167
r3_add: ;# 9306329213124610396
db 92, 9, 34, 191, 28, 185, 38, 129
r4_add: ;# 5281919268842080866
db 98, 138, 159, 23, 151, 37, 77, 73
r5_add: ;# 10536153434571861004
db 12, 236, 170, 206, 185, 239, 55, 146
r6_add: ;# 3398623926847679864
db 120, 45, 230, 108, 116, 86, 42, 47
r7_add: ;# 9549104520008361294
db 78, 229, 44, 182, 247, 59, 133, 132

View file

@ -0,0 +1,8 @@
;xor r8, qword ptr [rbx+0]
;xor r9, qword ptr [rbx+8]
;xor r10, qword ptr [rbx+16]
;xor r11, qword ptr [rbx+24]
;xor r12, qword ptr [rbx+32]
;xor r13, qword ptr [rbx+40]
;xor r14, qword ptr [rbx+48]
;xor r15, qword ptr [rbx+56]

View file

@ -0,0 +1,4 @@
;# Converts the item number in rbx into an address: rbx = rdi + (rbx mod 2^22) * 64
and rbx, 4194303 ;# 4194303 = 2^22 - 1, keeps the low 22 bits
shl rbx, 6 ;# multiply by 64
add rbx, rdi ;# rdi is the base pointer (the Win64 randomx_dataset_init loads the cache pointer into it)
; prefetchnta byte ptr [rbx]

View file

@ -41,7 +41,7 @@ namespace RandomX {
static_assert((RANDOMX_SCRATCHPAD_L1 & (RANDOMX_SCRATCHPAD_L1 - 1)) == 0, "RANDOMX_SCRATCHPAD_L1 must be a power of 2.");
static_assert(RANDOMX_CACHE_ACCESSES > 1, "RANDOMX_CACHE_ACCESSES must be greater than 1");
constexpr int wtSum = RANDOMX_FREQ_IADD_R + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_IADD_RC + RANDOMX_FREQ_ISUB_R + \
constexpr int wtSum = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_IADD_RC + RANDOMX_FREQ_ISUB_R + \
RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_9C + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + \
RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M + RANDOMX_FREQ_IMUL_RCP + \
RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_ISWAP_R + \
@ -141,6 +141,7 @@ namespace RandomX {
// Pointer to a function that takes an address, the memory registers and a
// reference to the full integer register array (RegistersCount entries).
// NOTE(review): presumably performs a dataset read into `reg` - confirm.
typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, int_reg_t(&reg)[RegistersCount]);
// Pointer to a function that takes the register file, the memory registers,
// a pointer to the scratchpad and a 64-bit value whose meaning is not
// documented here. NOTE(review): presumably the entry point of a compiled
// program - confirm.
typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, uint8_t* /* scratchpad */, uint64_t);
// Pointer to a function that receives raw cache and dataset buffers plus a
// block range [startBlock, endBlock).
// NOTE(review): the half-open interpretation of the range is an assumption;
// verify against the implementation.
typedef void(*DatasetInitFunc)(uint8_t* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock);
}
std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf);

View file

@ -37,7 +37,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
//Number of random Cache accesses per Dataset block. Minimum is 2.
#define RANDOMX_CACHE_ACCESSES 8
#define RANDOMX_LPROG_LATENCY 168
#define RANDOMX_LPROG_LATENCY 130
#define RANDOMX_LPROG_ASIC_LATENCY 84
#define RANDOMX_LPROG_MIN_SIZE 225
#define RANDOMX_LPROG_MAX_SIZE 512
@ -80,12 +80,12 @@ Instruction frequencies (per 256 opcodes)
Total sum of frequencies must be 256
*/
#define RANDOMX_FREQ_IADD_R 12
#define RANDOMX_FREQ_IADD_RS 32
#define RANDOMX_FREQ_IADD_M 7
#define RANDOMX_FREQ_IADD_RC 16
#define RANDOMX_FREQ_ISUB_R 12
#define RANDOMX_FREQ_IADD_RC 0
#define RANDOMX_FREQ_ISUB_R 17
#define RANDOMX_FREQ_ISUB_M 7
#define RANDOMX_FREQ_IMUL_9C 9
#define RANDOMX_FREQ_IMUL_9C 0
#define RANDOMX_FREQ_IMUL_R 16
#define RANDOMX_FREQ_IMUL_M 4
#define RANDOMX_FREQ_IMULH_R 4

View file

@ -37,6 +37,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include "Cache.hpp"
#include "hashAes1Rx4.hpp"
#include "LightProgramGenerator.hpp"
#include "JitCompilerX86.hpp"
const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 };
@ -204,7 +205,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic<uint32_t>& atomicNonce, Atomi
}
int main(int argc, char** argv) {
bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit, genLight;
bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit, genLight, useSuperscalar;
int programCount, threadCount, initThreadCount, epoch;
readOption("--softAes", argc, argv, softAes);
@ -220,14 +221,16 @@ int main(int argc, char** argv) {
readOption("--genNative", argc, argv, genNative);
readOption("--help", argc, argv, help);
readOption("--genLight", argc, argv, genLight);
readOption("--useSuperscalar", argc, argv, useSuperscalar);
if (genLight) {
RandomX::LightProgram p;
RandomX::generateLightProg2(p, seed, 0, programCount);
//RandomX::AssemblyGeneratorX86 asmX86;
//asmX86.generateProgram(p);
RandomX::Blake2Generator gen(seed, programCount);
RandomX::generateLightProg2(p, gen);
RandomX::AssemblyGeneratorX86 asmX86;
asmX86.generateProgram(p);
//std::ofstream file("lightProg2.asm");
//asmX86.printCode(std::cout);
asmX86.printCode(std::cout);
return 0;
}
@ -287,6 +290,17 @@ int main(int argc, char** argv) {
dataset.dataset.size = datasetSize;
RandomX::datasetAlloc(dataset, largePages);
const uint64_t datasetBlockCount = datasetSize / RandomX::CacheLineSize;
if (useSuperscalar) {
RandomX::Blake2Generator gen(seed, programCount);
RandomX::LightProgram programs[RANDOMX_CACHE_ACCESSES];
for (int i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
RandomX::generateLightProg2(programs[i], gen);
}
RandomX::JitCompilerX86 jit86;
jit86.generateSuperScalarHash(programs);
jit86.getDatasetInitFunc()(cache.memory, dataset.dataset.memory, 0, datasetBlockCount);
}
else {
if (initThreadCount > 1) {
auto perThread = datasetBlockCount / initThreadCount;
auto remainder = datasetBlockCount % initThreadCount;
@ -301,10 +315,12 @@ int main(int argc, char** argv) {
else {
RandomX::datasetInit(cache, dataset.dataset, 0, datasetBlockCount);
}
}
RandomX::deallocCache(cache, largePages);
threads.clear();
std::cout << "Dataset (" << datasetSize << " bytes) initialized in " << sw.getElapsed() << " s" << std::endl;
}
return 0;
std::cout << "Initializing " << threadCount << " virtual machine(s) ..." << std::endl;
for (int i = 0; i < threadCount; ++i) {
RandomX::VirtualMachine* vm;