Random accesses - JIT compiler

This commit is contained in:
tevador 2019-01-10 22:04:55 +01:00
parent b71e0eec65
commit d1a808643d
24 changed files with 341 additions and 341 deletions

View File

@ -11,7 +11,7 @@ SRCDIR=src
OBJDIR=obj
LDFLAGS=-lpthread
TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o)
ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o)
ifeq ($(PLATFORM),x86_64)
ROBJS += $(OBJDIR)/JitCompilerX86-static.o
endif
@ -60,7 +60,7 @@ $(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp) |
$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@
$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_r.inc read_f.inc)) | $(OBJDIR)
$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read.inc)) | $(OBJDIR)
$(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@
$(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR)
@ -87,6 +87,9 @@ $(OBJDIR)/softAes.o: $(addprefix $(SRCDIR)/,softAes.cpp softAes.h) | $(OBJDIR)
$(OBJDIR)/VirtualMachine.o: $(addprefix $(SRCDIR)/,VirtualMachine.cpp VirtualMachine.hpp common.hpp dataset.hpp) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/VirtualMachine.cpp -o $@
# Build obj/virtualMemory.o from src/virtualMemory.cpp; it is rebuilt when the .cpp or its header changes.
# $(OBJDIR) is an order-only prerequisite: the directory must exist, but its timestamp does not trigger a rebuild.
$(OBJDIR)/virtualMemory.o: $(addprefix $(SRCDIR)/,virtualMemory.cpp virtualMemory.hpp) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/virtualMemory.cpp -o $@
$(OBJDIR)/t1ha2.o: $(addprefix $(SRCDIR)/t1ha/,t1ha2.c t1ha.h t1ha_bits.h) | $(OBJDIR)
$(CC) $(CCFLAGS) -c $(SRCDIR)/t1ha/t1ha2.c -o $@

View File

@ -169,11 +169,12 @@ namespace RandomX {
asmCode << "\t" << instrx86 << " xmm0, " << regF[instr.regb % RegistersCount] << std::endl;
}
void AssemblyGeneratorX86::gencr(Instruction& instr) {
void AssemblyGeneratorX86::gencr(Instruction& instr, bool rax = true) {
switch (instr.locc & 7)
{
case 0:
asmCode << "\tmov rcx, rax" << std::endl;
if(rax)
asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
@ -186,7 +187,8 @@ namespace RandomX {
case 1:
case 2:
case 3:
asmCode << "\tmov rcx, rax" << std::endl;
if (rax)
asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
@ -197,9 +199,9 @@ namespace RandomX {
return;
default:
asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", rax" << std::endl;
asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", " << (rax ? "rax" : "rcx") << std::endl;
if (trace) {
asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rax" << std::endl;
asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], " << (rax ? "rax" : "rcx") << std::endl;
}
return;
}
@ -208,7 +210,7 @@ namespace RandomX {
void AssemblyGeneratorX86::gencf(Instruction& instr, bool move = true) {
if(move)
asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
const char* store = (instr.locc & 8) ? "movhpd" : "movlpd";
const char* store = (instr.locc & 128) ? "movhpd" : "movlpd";
switch (instr.locc & 7)
{
case 4:
@ -463,14 +465,13 @@ namespace RandomX {
void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) {
genar(instr, i);
//asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tshl eax, 13" << std::endl;
//asmCode << "\tand rcx, -2048" << std::endl;
asmCode << "\tand eax, 24576" << std::endl;
//asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
asmCode << "\tor eax, 40896" << std::endl;
asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl;
asmCode << "\tldmxcsr dword ptr [rsp - 8]" << std::endl;
gencr(instr, false);
}
static inline const char* jumpCondition(Instruction& instr, bool invert = false) {

View File

@ -44,7 +44,7 @@ namespace RandomX {
void genbr1(Instruction&);
void genbr132(Instruction&);
void genbf(Instruction&, const char*);
void gencr(Instruction&);
void gencr(Instruction&, bool);
void gencf(Instruction&, bool);
void generateCode(Instruction&, int);

View File

@ -47,8 +47,8 @@ namespace RandomX {
}
void CompiledVirtualMachine::execute() {
executeProgram(reg, mem, scratchpad, readDataset);
//compiler.getProgramFunc()(reg, mem, scratchpad);
//executeProgram(reg, mem, scratchpad, readDataset);
compiler.getProgramFunc()(reg, mem, scratchpad);
#ifdef TRACEVM
for (int32_t i = InstructionCount - 1; i >= 0; --i) {
std::cout << std::hex << tracepad[i].u64 << std::endl;

View File

@ -197,6 +197,17 @@ namespace RandomX {
#define ALU_RETIRE(x) x(a, b, c); \
if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl;
// Statistics hooks pasted as CHECK_NOP_<op>(btemp, c) at the end of the FPU handler macro body.
// There, btemp is a copy of operand b taken before the operation runs and c is the
// reg.f[regc] reference after the operation has run.
// FPDIV has no check: its hook expands to nothing whether or not STATS is defined.
#define CHECK_NOP_FPDIV(b, c)
#ifndef STATS
// Without STATS the remaining hooks also expand to nothing.
#define CHECK_NOP_FPADD(b, c)
#define CHECK_NOP_FPSUB(b, c)
#define CHECK_NOP_FPMUL(b, c)
#else
// With STATS each hook compares the lo and hi 64-bit lanes of b and c:
// count_<op>_nop grows by the number of matching lanes (0, 1 or 2) and
// count_<op>_nop2 is incremented when both lanes match.
// Each hook declares the locals loeq and hieq, so it can be expanded only once per scope.
#define CHECK_NOP_FPADD(b, c) bool loeq = (b.lo.u64 == c.lo.u64); bool hieq = (b.hi.u64 == c.hi.u64); count_FPADD_nop += loeq + hieq; if(loeq && hieq) count_FPADD_nop2++;
// FPSUB masks both sides with INT64_MAX first, so bit 63 is ignored in the comparison
// (presumably the sign bit of a double lane; confirm against the fpu_reg_t layout).
#define CHECK_NOP_FPSUB(b, c) bool loeq = ((b.lo.u64 & INT64_MAX) == (c.lo.u64 & INT64_MAX)); bool hieq = ((b.hi.u64 & INT64_MAX) == (c.hi.u64 & INT64_MAX)); count_FPSUB_nop += loeq + hieq; if(loeq && hieq) count_FPSUB_nop2++;
#define CHECK_NOP_FPMUL(b, c) bool loeq = (b.lo.u64 == c.lo.u64); bool hieq = (b.hi.u64 == c.hi.u64); count_FPMUL_nop += loeq + hieq; if(loeq && hieq) count_FPMUL_nop2++;
#endif
#define FPU_RETIRE(x) x(a, b, c); \
writecf(inst, c); \
if(trace) { \
@ -248,8 +259,10 @@ namespace RandomX {
INC_COUNT(x) \
convertible_t a = loada(inst); \
fpu_reg_t& b = reg.f[inst.regb % RegistersCount]; \
fpu_reg_t btemp = b; \
fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \
FPU_RETIRE(x) \
CHECK_NOP_##x(btemp, c) \
}
#define FPU_INST_NB(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \

View File

@ -83,6 +83,12 @@ namespace RandomX {
int count_retdepth_max = 0;
int count_endstack = 0;
int count_instructions[ProgramLength] = { 0 };
int count_FPADD_nop = 0;
int count_FPADD_nop2 = 0;
int count_FPSUB_nop = 0;
int count_FPSUB_nop2 = 0;
int count_FPMUL_nop = 0;
int count_FPMUL_nop2 = 0;
#endif
convertible_t loada(Instruction&);

View File

@ -29,9 +29,12 @@
.global DECL(randomx_program_prologue)
.global DECL(randomx_program_begin)
.global DECL(randomx_program_epilogue)
.global DECL(randomx_program_read_r)
.global DECL(randomx_program_read_f)
.global DECL(randomx_program_read_l1)
.global DECL(randomx_program_read_l2)
.global DECL(randomx_program_end)
.global DECL(randomx_program_transform)
#define db .byte
.align 64
DECL(randomx_program_prologue):
@ -45,14 +48,26 @@ DECL(randomx_program_begin):
DECL(randomx_program_epilogue):
#include "asm/program_epilogue_linux.inc"
.align 64
DECL(randomx_program_read_r):
#include "asm/program_read_r.inc"
#define scratchpad_mask and ecx, 2040
.align 64
DECL(randomx_program_read_f):
#include "asm/program_read_f.inc"
DECL(randomx_program_read_l1):
#include "asm/program_read.inc"
#undef scratchpad_mask
#define scratchpad_mask and ecx, 32760
.align 64
DECL(randomx_program_read_l2):
#include "asm/program_read.inc"
#undef scratchpad_mask
.align 64
DECL(randomx_program_end):
nop
nop
.align 8
DECL(randomx_program_transform):
#include "asm/program_transform_address.inc"

View File

@ -20,9 +20,11 @@ _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
PUBLIC randomx_program_prologue
PUBLIC randomx_program_begin
PUBLIC randomx_program_epilogue
PUBLIC randomx_program_read_r
PUBLIC randomx_program_read_f
PUBLIC randomx_program_read_l1
PUBLIC randomx_program_read_l2
PUBLIC randomx_program_end
PUBLIC randomx_program_transform
ALIGN 64
randomx_program_prologue PROC
@ -39,21 +41,34 @@ randomx_program_epilogue PROC
include asm/program_epilogue_win64.inc
randomx_program_epilogue ENDP
ALIGN 64
randomx_program_read_r PROC
include asm/program_read_r.inc
randomx_program_read_r ENDP
scratchpad_mask MACRO
and ecx, 2040
ENDM
ALIGN 64
randomx_program_read_f PROC
include asm/program_read_f.inc
randomx_program_read_f ENDP
randomx_program_read_l1 PROC
include asm/program_read.inc
randomx_program_read_l1 ENDP
scratchpad_mask MACRO
and ecx, 32760
ENDM
ALIGN 64
randomx_program_read_l2 PROC
include asm/program_read.inc
randomx_program_read_l2 ENDP
ALIGN 64
randomx_program_end PROC
nop
randomx_program_end ENDP
ALIGN 8
randomx_program_transform PROC
include asm/program_transform_address.inc
randomx_program_transform ENDP
_RANDOMX_JITX86_STATIC ENDS
END

View File

@ -21,7 +21,8 @@ extern "C" {
void randomx_program_prologue();
void randomx_program_begin();
void randomx_program_epilogue();
void randomx_program_read_r();
void randomx_program_read_f();
void randomx_program_transform();
void randomx_program_read_l1();
void randomx_program_read_l2();
void randomx_program_end();
}

View File

@ -48,12 +48,12 @@ namespace RandomX {
REGISTER ALLOCATION:
rax -> temporary
rbx -> MemoryRegisters& memory
rbx -> "ic"
rcx -> temporary
rdx -> temporary
rsi -> convertible_t* scratchpad
rdi -> "ic" (instruction counter)
rbp -> beginning of VM stack
rdi -> beginning of VM stack
rbp -> "ma", "mx"
rsp -> end of VM stack
r8 -> "r0"
r9 -> "r1"
@ -82,7 +82,8 @@ namespace RandomX {
| saved registers
|
v
[rbp] RegisterFile& registerFile
[rdi+8] RegisterFile& registerFile
[rdi] uint8_t* dataset
|
|
| VM stack
@ -97,18 +98,19 @@ namespace RandomX {
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin;
const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
const uint8_t* codeReadDatasetR = (uint8_t*)&randomx_program_read_r;
const uint8_t* codeReadDatasetF = (uint8_t*)&randomx_program_read_f;
const uint8_t* codeReadDatasetL1 = (uint8_t*)&randomx_program_read_l1;
const uint8_t* codeReadDatasetL2 = (uint8_t*)&randomx_program_read_l2;
const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
const uint32_t* addressTransformations = (uint32_t*)&randomx_program_transform;
const int32_t prologueSize = codeProgramBegin - codePrologue;
const int32_t epilogueSize = codeReadDatasetR - codeEpilogue;
const int32_t readDatasetRSize = codeReadDatasetF - codeReadDatasetR;
const int32_t readDatasetFSize = codeProgramEnd - codeReadDatasetF;
const int32_t epilogueSize = codeReadDatasetL1 - codeEpilogue;
const int32_t readDatasetL1Size = codeReadDatasetL2 - codeReadDatasetL1;
const int32_t readDatasetL2Size = codeProgramEnd - codeReadDatasetL2;
const int32_t readDatasetFOffset = CodeSize - readDatasetFSize;
const int32_t readDatasetROffset = readDatasetFOffset - readDatasetRSize;
const int32_t epilogueOffset = readDatasetROffset - epilogueSize;
const int32_t readDatasetL2Offset = CodeSize - readDatasetL2Size;
const int32_t readDatasetL1Offset = readDatasetL2Offset - readDatasetL1Size;
const int32_t epilogueOffset = readDatasetL1Offset - epilogueSize;
JitCompilerX86::JitCompilerX86() {
#ifdef _WIN32
@ -121,9 +123,9 @@ namespace RandomX {
throw std::runtime_error("mmap failed");
#endif
memcpy(code, codePrologue, prologueSize);
memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize - epilogueSize, codeEpilogue, epilogueSize);
memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize, codeReadDatasetR, readDatasetRSize);
memcpy(code + CodeSize - readDatasetFSize, codeReadDatasetF, readDatasetFSize);
memcpy(code + CodeSize - epilogueSize - readDatasetL1Size - readDatasetL2Size, codeEpilogue, epilogueSize);
memcpy(code + CodeSize - readDatasetL1Size - readDatasetL2Size, codeReadDatasetL1, readDatasetL1Size);
memcpy(code + CodeSize - readDatasetL2Size, codeReadDatasetL2, readDatasetL2Size);
}
void JitCompilerX86::generateProgram(Pcg32& gen) {
@ -140,12 +142,33 @@ namespace RandomX {
emitByte(0xe9);
emit(instructionOffsets[0] - (codePos + 4));
fixCallOffsets();
uint32_t transformL1 = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
uint32_t transformL2 = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
*reinterpret_cast<uint32_t*>(code + readDatasetL1Offset + 1) = transformL1;
*reinterpret_cast<uint32_t*>(code + readDatasetL2Offset + 1) = transformL2;
}
void JitCompilerX86::generateCode(Instruction& instr, int i) {
instructionOffsets.push_back(codePos);
emit(0x840fcfff); //dec edx; jz <epilogue>
emit(0x840fcbff); //dec ebx; jz <epilogue>
emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative)
emit(uint16_t(0x8149)); //xor
emitByte(0xf0 + (instr.rega % RegistersCount));
emit(instr.addra);
emit(uint16_t(0x8b41)); //mov
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
emit(0x753fc3f6); //test bl,0x3f; jne
emit(uint16_t(0xe805));
if (instr.loca & 3) { //A.LOC.W
emit(readDatasetL1Offset - (codePos + 4));
}
else {
emit(readDatasetL2Offset - (codePos + 4));
}
if ((instr.loca & 192) == 0) { //A.LOC.X
emit(uint16_t(0x3348));
emitByte(0xe9); //xor rbp, rcx
}
auto generator = engine[instr.opcode];
(this->*generator)(instr, i);
}
@ -157,73 +180,26 @@ namespace RandomX {
}
void JitCompilerX86::genar(Instruction& instr) {
emit(uint16_t(0x8149)); //xor
emitByte(0xf0 + (instr.rega % RegistersCount));
emit(instr.addra);
switch (instr.loca & 7)
{
case 0:
case 1:
case 2:
case 3:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
emitByte(0xe8); //call
emit(readDatasetROffset - (codePos + 4));
return;
case 4:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emitByte(0x25); //and
emit(ScratchpadL2 - 1); //whole scratchpad
emit(0xc6048b48); // mov rax,QWORD PTR [rsi+rax*8]
return;
default:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emitByte(0x25); //and
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
emit(0xc6048b48); // mov rax,QWORD PTR [rsi+rax*8]
return;
emit(uint16_t(0xe181)); //and ecx,
if (instr.loca & 3) {
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
}
else {
emit(ScratchpadL2 - 1); //whole scratchpad
}
emit(0xce048b48); //mov rax,QWORD PTR [rsi+rcx*8]
}
void JitCompilerX86::genaf(Instruction& instr) {
emit(uint16_t(0x8149)); //xor
emitByte(0xf0 + (instr.rega % RegistersCount));
emit(instr.addra);
switch (instr.loca & 7)
{
case 0:
case 1:
case 2:
case 3:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
emitByte(0xe8); //call
emit(readDatasetFOffset - (codePos + 4));
return;
case 4:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emitByte(0x25); //and
emit(ScratchpadL2 - 1); //whole scratchpad
emitByte(0xf3);
emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
return;
default:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emitByte(0x25); //and
emit(uint16_t(0xe181)); //and ecx,
if (instr.loca & 3) {
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
emitByte(0xf3);
emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
return;
}
else {
emit(ScratchpadL2 - 1); //whole scratchpad
}
emitByte(0xf3);
emit(0xce04e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rcx*8]
}
void JitCompilerX86::genbr0(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
@ -274,8 +250,13 @@ namespace RandomX {
}
void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize) {
emit(0x41c88b48); //mov rcx, rax; REX
void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize, bool rax) {
if (rax) {
emit(0x41c88b48); //mov rcx, rax; REX
}
else {
emitByte(0x41);
}
emitByte(0x8b); // mov
emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
emitByte(0x35); // xor eax
@ -285,22 +266,27 @@ namespace RandomX {
emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx
}
void JitCompilerX86::gencr(Instruction& instr) {
void JitCompilerX86::gencr(Instruction& instr, bool rax = true) {
switch (instr.locc & 7)
{
case 0:
scratchpadStoreR(instr, ScratchpadL2);
scratchpadStoreR(instr, ScratchpadL2, rax);
break;
case 1:
case 2:
case 3:
scratchpadStoreR(instr, ScratchpadL1);
scratchpadStoreR(instr, ScratchpadL1, rax);
break;
default:
emit(uint16_t(0x8b4c)); //mov
emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax
if (rax) {
emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax
}
else {
emitByte(0xc1 + 8 * (instr.regc % RegistersCount)); //regc, rcx
}
break;
}
}
@ -322,29 +308,21 @@ namespace RandomX {
emitByte(0xc6);
}
void JitCompilerX86::gencf(Instruction& instr, bool alwaysLow = false) {
void JitCompilerX86::gencf(Instruction& instr) {
int regc = (instr.regc % RegistersCount);
if (!alwaysLow) {
if (regc <= 1) {
emitByte(0x44); //REX
}
emit(uint16_t(0x280f)); //movaps
emitByte(0xc0 + 8 * regc); // regc, xmm0
if (regc <= 1) {
emitByte(0x44); //REX
}
switch (instr.locc & 7)
emit(uint16_t(0x280f)); //movaps
emitByte(0xc0 + 8 * regc); // regc, xmm0
if (instr.locc & 4) //C.LOC.R
{
case 4:
scratchpadStoreF(instr, regc, ScratchpadL2, !alwaysLow && (instr.locc & 8));
break;
case 5:
case 6:
case 7:
scratchpadStoreF(instr, regc, ScratchpadL1, !alwaysLow && (instr.locc & 8));
break;
default:
break;
if (instr.locc & 3) { //C.LOC.W
scratchpadStoreF(instr, regc, ScratchpadL1, (instr.locc & 128)); //first 16 KiB of scratchpad
}
else {
scratchpadStoreF(instr, regc, ScratchpadL2, (instr.locc & 128)); //whole scratchpad
}
}
}
@ -596,24 +574,11 @@ namespace RandomX {
void JitCompilerX86::h_FPROUND(Instruction& instr, int i) {
genar(instr);
emit(0x81480de0c1c88b48);
emit(0x600025fffff800e1);
emit(uint16_t(0x0000));
emitByte(0xf2);
int regc = (instr.regc % RegistersCount);
if (regc <= 1) {
emitByte(0x4c); //REX
}
else {
emitByte(0x48); //REX
}
emit(uint16_t(0x2a0f));
emitByte(0xc1 + 8 * regc);
emitByte(0x0d);
emit(0xf824448900009fc0);
emit(0x2454ae0f); //ldmxcsr DWORD PTR [rsp-0x8]
emit(0x00250de0c1c88b48); //mov rcx,rax; shl eax,0xd
emit(0x00009fc00d000060); //and eax,0x6000; or eax,0x9fc0
emit(0x2454ae0ff8244489); //ldmxcsr DWORD PTR [rsp-0x8]
emitByte(0xf8);
gencf(instr, true);
gencr(instr, false); //result in rcx
}
static inline uint8_t jumpCondition(Instruction& instr, bool invert = false) {
@ -670,7 +635,7 @@ namespace RandomX {
if ((instr.locc & 7) <= 3) {
crlen = 17;
}
emit(0x74e53b48); //cmp rsp, rbp; je
emit(0x74e73b48); //cmp rsp, rdi; je
emitByte(11 + crlen);
emitByte(0x48);
emit(0x08244433); //xor rax,QWORD PTR [rsp+0x8]

View File

@ -64,10 +64,10 @@ namespace RandomX {
void genbr1(Instruction&, uint16_t, uint16_t);
void genbr132(Instruction&, uint16_t, uint8_t);
void genbf(Instruction&, uint8_t);
void scratchpadStoreR(Instruction&, uint32_t);
void scratchpadStoreR(Instruction&, uint32_t, bool);
void scratchpadStoreF(Instruction&, int, uint32_t, bool);
void gencr(Instruction&);
void gencf(Instruction&, bool);
void gencr(Instruction&, bool);
void gencf(Instruction&);
void generateCode(Instruction&, int);
void fixCallOffsets();

View File

@ -1,8 +1,9 @@
;# unroll VM stack
mov rsp, rbp
mov rsp, rdi
;# save VM register values
pop rcx
pop rcx
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10

View File

@ -7,9 +7,11 @@
push r15
;# function arguments
push rdi ;# RegisterFile& registerFile
mov rbx, rsi ;# MemoryRegisters& memory
mov rsi, rdx ;# convertible_t* scratchpad
push rdi ;# RegisterFile& registerFile
mov rbp, qword ptr [rsi] ;# "mx", "ma"
mov rax, qword ptr [rsi+8] ;# uint8_t* dataset
push rax
mov rsi, rdx ;# convertible_t* scratchpad
mov rcx, rdi
#include "program_prologue_load.inc"

View File

@ -1,5 +1,5 @@
mov rbp, rsp ;# beginning of VM stack
mov rdi, 1048577 ;# number of VM instructions to execute + 1
mov rdi, rsp ;# beginning of VM stack
mov ebx, 1048577 ;# number of VM instructions to execute + 1
xorps xmm10, xmm10
cmpeqpd xmm10, xmm10

View File

@ -15,9 +15,11 @@
movdqu xmmword ptr [rsp+0], xmm10
;# function arguments
push rcx ;# RegisterFile& registerFile
mov rbx, rdx ;# MemoryRegisters& memory
mov rsi, r8 ;# convertible_t* scratchpad
push rcx ;# RegisterFile& registerFile
mov rbp, qword ptr [rdx] ;# "mx", "ma"
mov rax, qword ptr [rdx+8] ;# uint8_t* dataset
push rax
mov rsi, r8 ;# convertible_t* scratchpad
include program_prologue_load.inc

32
src/asm/program_read.inc Normal file
View File

@ -0,0 +1,32 @@
;# Dataset read routine. The static assembly sources include this file twice, once per
;# scratchpad size, and define the mask macro used below before each include
;# (and ecx, 2040 for the l1 copy; and ecx, 32760 for the l2 copy).
;# Registers as used by this routine:
;#   rcx - address value supplied by the caller; saved on entry and restored on exit
;#   rdi - points at a stack slot holding the dataset base address
;#   rsi - scratchpad base
;#   rbp - packed "ma" and "mx" values; updated here
;#   rax, rdx - scratch, overwritten
;# Effect: XORs eight consecutive dataset quadwords (64 bytes) into the scratchpad.
push rcx ;# preserve ecx
;# The JIT compiler overwrites these four bytes (routine offset 1, right after the one-byte push)
;# with an entry picked from the address transformation table.
db 0, 0, 0, 0 ;# TransformAddress placeholder
mov rax, qword ptr [rdi] ;# load the dataset address
xor rbp, rcx ;# modify "mx"
;# prefetch cacheline "mx"
;# the mask is applied to the full 64-bit register, so only the low 6 bits are cleared
and rbp, -64 ;# align "mx" to the start of a cache line
mov edx, ebp ;# edx = mx
prefetchnta byte ptr [rax+rdx]
;# read cacheline "ma"
ror rbp, 32 ;# swap "ma" and "mx"
mov edx, ebp ;# edx = ma
scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8
lea rcx, [rsi+rcx*8] ;# scratchpad cache line
lea rax, [rax+rdx] ;# dataset cache line
mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now)
xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
mov rdx, qword ptr [rax+8]
xor qword ptr [rcx+8], rdx
mov rdx, qword ptr [rax+16]
xor qword ptr [rcx+16], rdx
mov rdx, qword ptr [rax+24]
xor qword ptr [rcx+24], rdx
mov rdx, qword ptr [rax+32]
xor qword ptr [rcx+32], rdx
mov rdx, qword ptr [rax+40]
xor qword ptr [rcx+40], rdx
mov rdx, qword ptr [rax+48]
xor qword ptr [rcx+48], rdx
mov rdx, qword ptr [rax+56]
xor qword ptr [rcx+56], rdx
pop rcx ;# restore ecx
ret

View File

@ -1,13 +0,0 @@
mov edx, dword ptr [rbx] ;# ma
mov rax, qword ptr [rbx+8] ;# dataset
cvtdq2pd xmm0, qword ptr [rax+rdx]
add dword ptr [rbx], 8
xor ecx, dword ptr [rbx+4] ;# mx
mov dword ptr [rbx+4], ecx
test ecx, 65528
jne short rx_read_dataset_f_ret
and ecx, -8
mov dword ptr [rbx], ecx
prefetcht0 byte ptr [rax+rcx]
rx_read_dataset_f_ret:
ret 0

View File

@ -1,13 +0,0 @@
mov eax, dword ptr [rbx] ;# ma
mov rdx, qword ptr [rbx+8] ;# dataset
mov rax, qword ptr [rdx+rax]
add dword ptr [rbx], 8
xor ecx, dword ptr [rbx+4] ;# mx
mov dword ptr [rbx+4], ecx
test ecx, 65528
jne short rx_read_dataset_r_ret
and ecx, -8
mov dword ptr [rbx], ecx
prefetcht0 byte ptr [rdx+rcx]
rx_read_dataset_r_ret:
ret 0

View File

@ -77,6 +77,7 @@ namespace RandomX {
constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t);
constexpr uint32_t ScratchpadL1 = ScratchpadSize / 16 / sizeof(convertible_t);
constexpr uint32_t ScratchpadL2 = ScratchpadSize / sizeof(convertible_t);
constexpr uint32_t TransformationCount = 90;
constexpr int RegistersCount = 8;
class Cache;

View File

@ -158,10 +158,14 @@ executeProgram PROC
pslldq xmm7, 8
cvtsi2sd xmm7, qword ptr [rcx+112]
; program body
jmp program_begin
; program body
ALIGN 64
program_begin:
include program.inc
ALIGN 64
rx_finish:
; unroll the stack
mov rsp, rdi

View File

@ -277,10 +277,6 @@ int main(int argc, char** argv) {
if(programCount == 1000)
std::cout << "Reference result: 3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl;
std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl;
/*if (threadCount == 1 && !compiled) {
auto ivm = (RandomX::InterpretedVirtualMachine*)vms[0];
std::cout << ivm->getProgam();
}*/
}
catch (std::exception& e) {
std::cout << "ERROR: " << e.what() << std::endl;

View File

@ -76,11 +76,13 @@ rx_body_3:
xor rbp, rcx
and ecx, 2047
mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13
and eax, 24576
or eax, 40896
mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8]
mov r8, rcx
rx_i_4: ;MULH_64
dec ebx
@ -153,7 +155,7 @@ rx_body_7:
mov eax, r14d
xor eax, 057c8c41bh
and eax, 32767
movhpd qword ptr [rsi + rax * 8], xmm6
movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_8: ;SHL_64
dec ebx
@ -218,7 +220,7 @@ rx_body_11:
mov eax, r12d
xor eax, 0852d40d8h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm4
movlpd qword ptr [rsi + rax * 8], xmm4
rx_i_12: ;CALL
dec ebx
@ -355,7 +357,7 @@ rx_body_18:
mov eax, r11d
xor eax, 0869baa81h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm3
movlpd qword ptr [rsi + rax * 8], xmm3
rx_i_19: ;FPSUB
dec ebx
@ -372,7 +374,7 @@ rx_body_19:
subpd xmm0, xmm8
movaps xmm7, xmm0
rx_i_20: ;FPMUL
rx_i_20: ;FPSUB
dec ebx
jz rx_finish
xor r13, 0ecca967dh
@ -383,15 +385,12 @@ rx_i_20: ;FPMUL
rx_body_20:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm2
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm2
movaps xmm7, xmm0
mov eax, r15d
xor eax, 0aad81365h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm7
movlpd qword ptr [rsi + rax * 8], xmm7
rx_i_21: ;FPADD
dec ebx
@ -482,7 +481,7 @@ rx_body_25:
mov eax, r14d
xor eax, 0baf5c2d4h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm6
movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_26: ;IMUL_32
dec ebx
@ -580,7 +579,7 @@ rx_body_31:
mov eax, r14d
xor eax, 01e2da792h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm6
movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_32: ;XOR_64
dec ebx
@ -668,7 +667,7 @@ rx_body_36:
andps xmm0, xmm1
movaps xmm7, xmm0
rx_i_37: ;FPMUL
rx_i_37: ;FPSUB
dec ebx
jz rx_finish
xor r12, 0d0706601h
@ -679,10 +678,7 @@ rx_i_37: ;FPMUL
rx_body_37:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm2
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm2
movaps xmm9, xmm0
mov eax, r9d
xor eax, 0bca81c78h
@ -764,7 +760,7 @@ taken_call_41:
push rax
call rx_i_127
rx_i_42: ;FPSUB
rx_i_42: ;FPADD
dec ebx
jz rx_finish
xor r15, 0bc1de9f6h
@ -776,7 +772,7 @@ rx_body_42:
xor rbp, rcx
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
subpd xmm0, xmm6
addpd xmm0, xmm6
movaps xmm6, xmm0
rx_i_43: ;SUB_64
@ -887,7 +883,7 @@ rx_body_48:
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_49: ;FPMUL
rx_i_49: ;FPSUB
dec ebx
jz rx_finish
xor r8, 0f96c6a45h
@ -898,10 +894,7 @@ rx_i_49: ;FPMUL
rx_body_49:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm3
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm3
movaps xmm5, xmm0
rx_i_50: ;OR_32
@ -1018,7 +1011,7 @@ rx_body_55:
mov eax, r11d
xor eax, 07c79cddh
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm3
movhpd qword ptr [rsi + rax * 8], xmm3
rx_i_56: ;AND_64
dec ebx
@ -1144,7 +1137,7 @@ taken_call_61:
push rax
call rx_i_120
rx_i_62: ;FPMUL
rx_i_62: ;FPSUB
dec ebx
jz rx_finish
xor r15, 0c3089414h
@ -1155,17 +1148,14 @@ rx_i_62: ;FPMUL
rx_body_62:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm8
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm8
movaps xmm2, xmm0
mov eax, r10d
xor eax, 05c4789e3h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm2
movhpd qword ptr [rsi + rax * 8], xmm2
rx_i_63: ;FPMUL
rx_i_63: ;FPSUB
dec ebx
jz rx_finish
xor r9, 065cf272eh
@ -1176,10 +1166,7 @@ rx_i_63: ;FPMUL
rx_body_63:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm7
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm7
movaps xmm8, xmm0
rx_i_64: ;SUB_64
@ -1253,7 +1240,7 @@ taken_call_67:
push rax
call rx_i_79
rx_i_68: ;FPSUB
rx_i_68: ;FPADD
dec ebx
jz rx_finish
xor r13, 03aa5c3a4h
@ -1264,7 +1251,7 @@ rx_i_68: ;FPSUB
rx_body_68:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
subpd xmm0, xmm2
addpd xmm0, xmm2
movaps xmm4, xmm0
mov eax, r12d
xor eax, 03c51ef39h
@ -1354,11 +1341,16 @@ rx_i_73: ;FPROUND
rx_body_73:
and ecx, 32767
mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13
and eax, 24576
or eax, 40896
mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8]
mov eax, r10d
xor eax, 040624270h
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_74: ;MUL_64
dec ebx
@ -1722,7 +1714,7 @@ rx_body_93:
mov eax, r10d
xor eax, 07e48a0d8h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm2
movhpd qword ptr [rsi + rax * 8], xmm2
rx_i_94: ;RET
dec ebx
@ -1830,7 +1822,7 @@ rx_body_99:
mov eax, r12d
xor eax, 04c21df83h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm4
movhpd qword ptr [rsi + rax * 8], xmm4
rx_i_100: ;ADD_64
dec ebx
@ -1955,7 +1947,7 @@ rx_body_106:
mov eax, r12d
xor eax, 03cb2505h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm4
movlpd qword ptr [rsi + rax * 8], xmm4
rx_i_107: ;CALL
dec ebx
@ -1999,7 +1991,7 @@ rx_body_108:
mov eax, r9d
xor eax, 0678b65beh
and eax, 32767
movlpd qword ptr [rsi + rax * 8], xmm9
movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_109: ;FPADD
dec ebx
@ -2207,7 +2199,7 @@ rx_body_120:
addpd xmm0, xmm4
movaps xmm8, xmm0
rx_i_121: ;FPMUL
rx_i_121: ;FPSUB
dec ebx
jz rx_finish
xor r9, 03ab8f73h
@ -2218,10 +2210,7 @@ rx_i_121: ;FPMUL
rx_body_121:
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm5
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm5
movaps xmm8, xmm0
rx_i_122: ;RET
@ -2813,7 +2802,7 @@ rx_body_153:
mov eax, r8d
xor eax, 09111c981h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm8
movhpd qword ptr [rsi + rax * 8], xmm8
rx_i_154: ;MUL_32
dec ebx
@ -3196,11 +3185,13 @@ rx_i_174: ;FPROUND
rx_body_174:
and ecx, 2047
mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13
and eax, 24576
or eax, 40896
mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8]
mov r14, rcx
rx_i_175: ;SAR_64
dec ebx
@ -3431,7 +3422,7 @@ rx_body_187:
andps xmm0, xmm1
movaps xmm5, xmm0
rx_i_188: ;FPMUL
rx_i_188: ;FPSUB
dec ebx
jz rx_finish
xor r9, 04659becbh
@ -3443,10 +3434,7 @@ rx_body_188:
xor rbp, rcx
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm3
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm3
movaps xmm4, xmm0
rx_i_189: ;FPROUND
@ -3460,11 +3448,16 @@ rx_i_189: ;FPROUND
rx_body_189:
and ecx, 2047
mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13
and eax, 24576
or eax, 40896
mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8]
mov eax, r13d
xor eax, 0e6f1a3b7h
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_190: ;RET
dec ebx
@ -3761,7 +3754,7 @@ rx_body_205:
andps xmm0, xmm1
movaps xmm5, xmm0
rx_i_206: ;FPMUL
rx_i_206: ;FPSUB
dec ebx
jz rx_finish
xor r11, 0e836a177h
@ -3773,10 +3766,7 @@ rx_body_206:
xor rbp, rcx
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm7
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm7
movaps xmm4, xmm0
rx_i_207: ;AND_32
@ -4085,7 +4075,7 @@ rx_body_223:
mov eax, r10d
xor eax, 07fca59eeh
and eax, 32767
movlpd qword ptr [rsi + rax * 8], xmm2
movhpd qword ptr [rsi + rax * 8], xmm2
rx_i_224: ;SAR_64
dec ebx
@ -4171,7 +4161,7 @@ rx_body_227:
mov eax, r11d
xor eax, 0aabe2a0ah
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm3
movhpd qword ptr [rsi + rax * 8], xmm3
rx_i_228: ;CALL
dec ebx
@ -4313,11 +4303,16 @@ rx_i_234: ;FPROUND
rx_body_234:
and ecx, 2047
mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13
and eax, 24576
or eax, 40896
mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8]
mov eax, r12d
xor eax, 04d2e9e7dh
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_235: ;IMUL_32
dec ebx
@ -4438,7 +4433,7 @@ rx_body_241:
mov eax, r15d
xor eax, 0bc2423ebh
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm7
movlpd qword ptr [rsi + rax * 8], xmm7
rx_i_242: ;MULH_64
dec ebx
@ -4734,7 +4729,7 @@ rx_body_257:
mov eax, r11d
xor eax, 0373b1b6fh
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm3
movlpd qword ptr [rsi + rax * 8], xmm3
rx_i_258: ;MUL_32
dec ebx
@ -4771,7 +4766,7 @@ rx_body_259:
addpd xmm0, xmm9
movaps xmm3, xmm0
rx_i_260: ;FPMUL
rx_i_260: ;FPSUB
dec ebx
jz rx_finish
xor r13, 0f94e9fa9h
@ -4783,10 +4778,7 @@ rx_body_260:
xor rbp, rcx
and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm5
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm5
movaps xmm9, xmm0
rx_i_261: ;FPSQRT
@ -4806,7 +4798,7 @@ rx_body_261:
mov eax, r11d
xor eax, 0745a48e9h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm3
movhpd qword ptr [rsi + rax * 8], xmm3
rx_i_262: ;OR_32
dec ebx
@ -5044,7 +5036,7 @@ rx_body_274:
mov eax, r14d
xor eax, 06a2b2b5bh
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm6
movhpd qword ptr [rsi + rax * 8], xmm6
rx_i_275: ;OR_64
dec ebx
@ -5121,7 +5113,7 @@ rx_body_278:
mov eax, r12d
xor eax, 02d00ad10h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm4
movhpd qword ptr [rsi + rax * 8], xmm4
rx_i_279: ;FPSUB
dec ebx
@ -5139,7 +5131,7 @@ rx_body_279:
mov eax, r9d
xor eax, 0475ade01h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm9
movlpd qword ptr [rsi + rax * 8], xmm9
rx_i_280: ;AND_64
dec ebx
@ -5210,7 +5202,7 @@ rx_body_283:
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_284: ;FPSUB
rx_i_284: ;FPADD
dec ebx
jz rx_finish
xor r15, 0e68f36ach
@ -5222,7 +5214,7 @@ rx_body_284:
xor rbp, rcx
and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
subpd xmm0, xmm6
addpd xmm0, xmm6
movaps xmm9, xmm0
mov eax, r9d
xor eax, 0936f2960h
@ -5313,7 +5305,7 @@ rx_body_289:
andps xmm0, xmm1
movaps xmm8, xmm0
rx_i_290: ;FPMUL
rx_i_290: ;FPSUB
dec ebx
jz rx_finish
xor r15, 060665748h
@ -5324,10 +5316,7 @@ rx_i_290: ;FPMUL
; Body of instruction 290. NOTE(review): this is a diff hunk, so it appears to
; hold both variants: the multiply sequence (old, FPMUL) and subpd (new, FPSUB).
rx_body_290:
; clamp the read index to 0..32767 (entries are 8 bytes wide)
and ecx, 32767
; convert the two packed signed 32-bit integers at [rsi+rcx*8] to two doubles
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
; old variant: multiply, then clear NaN lanes. cmpeqpd of a register with
; itself gives all-ones only in non-NaN lanes; andps zeroes the others.
mulpd xmm0, xmm8
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
; new variant: plain packed subtraction, no NaN masking
subpd xmm0, xmm8
; copy the result into xmm9
movaps xmm9, xmm0
rx_i_291: ;RET
@ -5531,7 +5520,7 @@ rx_body_301:
mov eax, r15d
xor eax, 0433cf2d6h
and eax, 32767
movlpd qword ptr [rsi + rax * 8], xmm7
movhpd qword ptr [rsi + rax * 8], xmm7
rx_i_302: ;ADD_64
dec ebx
@ -5937,7 +5926,7 @@ rx_body_324:
mov eax, r9d
xor eax, 0944856d4h
and eax, 32767
movlpd qword ptr [rsi + rax * 8], xmm9
movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_325: ;SHL_64
dec ebx
@ -6076,7 +6065,7 @@ rx_body_332:
mov eax, r11d
xor eax, 0116c919eh
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm3
movlpd qword ptr [rsi + rax * 8], xmm3
rx_i_333: ;XOR_64
dec ebx
@ -6222,7 +6211,7 @@ rx_body_341:
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_342: ;FPMUL
rx_i_342: ;FPSUB
dec ebx
jz rx_finish
xor r9, 09ccc7abah
@ -6233,10 +6222,7 @@ rx_i_342: ;FPMUL
; Body of instruction 342. NOTE(review): this is a diff hunk, so it appears to
; hold both variants: the multiply sequence (old, FPMUL) and subpd (new, FPSUB).
rx_body_342:
; clamp the read index to 0..32767 (entries are 8 bytes wide)
and ecx, 32767
; convert the two packed signed 32-bit integers at [rsi+rcx*8] to two doubles
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
; old variant: multiply, then clear NaN lanes. cmpeqpd of a register with
; itself gives all-ones only in non-NaN lanes; andps zeroes the others.
mulpd xmm0, xmm2
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
; new variant: plain packed subtraction, no NaN masking
subpd xmm0, xmm2
; copy the result into xmm3
movaps xmm3, xmm0
rx_i_343: ;SHR_64
@ -6258,7 +6244,7 @@ rx_body_343:
and eax, 32767
mov qword ptr [rsi + rax * 8], rcx
rx_i_344: ;FPMUL
rx_i_344: ;FPSUB
dec ebx
jz rx_finish
xor r10, 03ef9bcc4h
@ -6269,10 +6255,7 @@ rx_i_344: ;FPMUL
; Body of instruction 344. NOTE(review): this is a diff hunk, so it appears to
; hold both variants: the multiply sequence (old, FPMUL) and subpd (new, FPSUB).
rx_body_344:
; clamp the read index to 0..32767 (entries are 8 bytes wide)
and ecx, 32767
; convert the two packed signed 32-bit integers at [rsi+rcx*8] to two doubles
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
; old variant: multiply, then clear NaN lanes. cmpeqpd of a register with
; itself gives all-ones only in non-NaN lanes; andps zeroes the others.
mulpd xmm0, xmm6
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
; new variant: plain packed subtraction, no NaN masking
subpd xmm0, xmm6
; copy the result into xmm5
movaps xmm5, xmm0
rx_i_345: ;MULH_64
@ -6343,7 +6326,7 @@ rx_body_348:
mov eax, r9d
xor eax, 039c35461h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm9
movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_349: ;XOR_32
dec ebx
@ -6413,9 +6396,9 @@ rx_body_352:
mov eax, r10d
xor eax, 03bf686f2h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm2
movlpd qword ptr [rsi + rax * 8], xmm2
rx_i_353: ;FPMUL
rx_i_353: ;FPSUB
dec ebx
jz rx_finish
xor r13, 02e65278bh
@ -6426,15 +6409,12 @@ rx_i_353: ;FPMUL
; Body of instruction 353. NOTE(review): this is a diff hunk, so it appears to
; hold both variants: the multiply sequence and movhpd store (old, FPMUL), and
; subpd with the movlpd store (new, FPSUB).
rx_body_353:
; clamp the read index to 0..2047 (entries are 8 bytes wide)
and ecx, 2047
; convert the two packed signed 32-bit integers at [rsi+rcx*8] to two doubles
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
; old variant: multiply, then clear NaN lanes. cmpeqpd of a register with
; itself gives all-ones only in non-NaN lanes; andps zeroes the others.
mulpd xmm0, xmm2
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
; new variant: plain packed subtraction, no NaN masking
subpd xmm0, xmm2
; copy the result into xmm7
movaps xmm7, xmm0
; write index = (r15d XOR constant), clamped to 0..2047
mov eax, r15d
xor eax, 0b3c9f7aeh
and eax, 2047
; old variant stores the high double of xmm7; the new one stores the low double
movhpd qword ptr [rsi + rax * 8], xmm7
movlpd qword ptr [rsi + rax * 8], xmm7
rx_i_354: ;MULH_64
dec ebx
@ -6535,7 +6515,7 @@ rx_body_359:
mov eax, r12d
xor eax, 0f16b9be3h
and eax, 32767
movhpd qword ptr [rsi + rax * 8], xmm4
movlpd qword ptr [rsi + rax * 8], xmm4
rx_i_360: ;FPMUL
dec ebx
@ -6570,7 +6550,7 @@ rx_body_361:
mov eax, r14d
xor eax, 0ad0b81f5h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm6
movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_362: ;SUB_64
dec ebx
@ -6726,7 +6706,7 @@ rx_body_370:
mov eax, r14d
xor eax, 0a120e0edh
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm6
movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_371: ;FPADD
dec ebx
@ -6948,7 +6928,7 @@ rx_body_383:
mov eax, r13d
xor eax, 0c9f5cc22h
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm5
movlpd qword ptr [rsi + rax * 8], xmm5
rx_i_384: ;SHR_64
dec ebx
@ -7256,7 +7236,7 @@ rx_body_400:
and eax, 32767
mov qword ptr [rsi + rax * 8], rcx
rx_i_401: ;FPMUL
rx_i_401: ;FPSUB
dec ebx
jz rx_finish
xor r13, 032e81f25h
@ -7267,15 +7247,12 @@ rx_i_401: ;FPMUL
; Body of instruction 401. NOTE(review): this is a diff hunk, so it appears to
; hold both variants: the multiply sequence and movhpd store (old, FPMUL), and
; subpd with the movlpd store (new, FPSUB).
rx_body_401:
; clamp the read index to 0..2047 (entries are 8 bytes wide)
and ecx, 2047
; convert the two packed signed 32-bit integers at [rsi+rcx*8] to two doubles
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
; old variant: multiply, then clear NaN lanes. cmpeqpd of a register with
; itself gives all-ones only in non-NaN lanes; andps zeroes the others.
mulpd xmm0, xmm4
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
; new variant: plain packed subtraction, no NaN masking
subpd xmm0, xmm4
; copy the result into xmm6
movaps xmm6, xmm0
; write index = (r14d XOR constant), clamped to 0..32767
mov eax, r14d
xor eax, 03ea60344h
and eax, 32767
; old variant stores the high double of xmm6; the new one stores the low double
movhpd qword ptr [rsi + rax * 8], xmm6
movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_402: ;RET
dec ebx
@ -7382,13 +7359,15 @@ rx_i_406: ;FPROUND
; FPROUND body: loads a 64-bit value from the table based at rsi, programs the
; SSE control register (MXCSR) from its two lowest bits, then leaves the
; unmodified value in r9.
rx_body_406:
; clamp the read index to 0..32767 (entries are 8 bytes wide)
and ecx, 32767
mov rax, qword ptr [rsi+rcx*8]
; keep a copy of the loaded value, because eax is reused for the MXCSR word
mov rcx, rax
; move bits 0-1 of the value into bits 13-14 (the MXCSR rounding-control field)
shl eax, 13
and eax, 24576
; 40896 = 9FC0h: FTZ (bit 15), the six exception masks (bits 7-12), DAZ (bit 6)
or eax, 40896
; ldmxcsr accepts only a memory operand, so pass the word through [rsp - 8]
mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8]
; put the saved copy of the loaded value into r9
mov r9, rcx
rx_i_407: ;FPMUL
rx_i_407: ;FPSUB
dec ebx
jz rx_finish
xor r14, 09699566fh
@ -7400,10 +7379,7 @@ rx_body_407:
xor rbp, rcx
and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm9
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
subpd xmm0, xmm9
movaps xmm8, xmm0
rx_i_408: ;MUL_64
@ -7493,7 +7469,7 @@ rx_body_412:
mov eax, r11d
xor eax, 0bbd2640ah
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm3
movlpd qword ptr [rsi + rax * 8], xmm3
rx_i_413: ;FPDIV
dec ebx
@ -7704,7 +7680,7 @@ rx_body_424:
mov eax, r9d
xor eax, 0565ae8aah
and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm9
movlpd qword ptr [rsi + rax * 8], xmm9
rx_i_425: ;IMUL_32
dec ebx
@ -7887,7 +7863,7 @@ rx_body_434:
mov eax, r9d
xor eax, 08c1cfc74h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm9
movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_435: ;MUL_64
dec ebx
@ -8068,7 +8044,7 @@ not_taken_ret_443:
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_444: ;FPMUL
rx_i_444: ;FPSUB
dec ebx
jz rx_finish
xor r8, 042455dd8h
@ -8079,15 +8055,12 @@ rx_i_444: ;FPMUL
; Body of instruction 444. NOTE(review): this is a diff hunk, so it appears to
; hold both variants: the multiply sequence and movlpd store (old, FPMUL), and
; subpd with the movhpd store (new, FPSUB).
rx_body_444:
; clamp the read index to 0..32767 (entries are 8 bytes wide)
and ecx, 32767
; convert the two packed signed 32-bit integers at [rsi+rcx*8] to two doubles
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
; old variant: multiply, then clear NaN lanes. cmpeqpd of a register with
; itself gives all-ones only in non-NaN lanes; andps zeroes the others.
mulpd xmm0, xmm7
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
; new variant: plain packed subtraction, no NaN masking
subpd xmm0, xmm7
; copy the result into xmm5
movaps xmm5, xmm0
; write index = (r13d XOR constant), clamped to 0..2047
mov eax, r13d
xor eax, 0ce416070h
and eax, 2047
; old variant stores the low double of xmm5; the new one stores the high double
movlpd qword ptr [rsi + rax * 8], xmm5
movhpd qword ptr [rsi + rax * 8], xmm5
rx_i_445: ;ADD_64
dec ebx
@ -8128,7 +8101,7 @@ rx_body_446:
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_447: ;FPSUB
rx_i_447: ;FPADD
dec ebx
jz rx_finish
xor r8, 01596d0e8h
@ -8139,12 +8112,12 @@ rx_i_447: ;FPSUB
; Body of instruction 447. NOTE(review): this is a diff hunk, so it appears to
; hold both variants: subpd with the movhpd store (old, FPSUB), and addpd with
; the movlpd store (new, FPADD).
rx_body_447:
; clamp the read index to 0..2047 (entries are 8 bytes wide)
and ecx, 2047
; convert the two packed signed 32-bit integers at [rsi+rcx*8] to two doubles
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
; old variant subtracts xmm7; the new variant adds it
subpd xmm0, xmm7
addpd xmm0, xmm7
; copy the result into xmm5
movaps xmm5, xmm0
; write index = (r13d XOR constant), clamped to 0..2047
mov eax, r13d
xor eax, 0b384d4afh
and eax, 2047
; old variant stores the high double of xmm5; the new one stores the low double
movhpd qword ptr [rsi + rax * 8], xmm5
movlpd qword ptr [rsi + rax * 8], xmm5
rx_i_448: ;FPSUB
dec ebx
@ -8668,7 +8641,7 @@ rx_body_477:
mov eax, r14d
xor eax, 0e81fc7a6h
and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm6
movhpd qword ptr [rsi + rax * 8], xmm6
rx_i_478: ;MUL_64
dec ebx
@ -9143,7 +9116,7 @@ rx_body_504:
and eax, 32767
movhpd qword ptr [rsi + rax * 8], xmm4
rx_i_505: ;FPMUL
rx_i_505: ;FPSUB
dec ebx
jz rx_finish
xor r12, 032c0a28ah
@ -9154,17 +9127,14 @@ rx_i_505: ;FPMUL
; Body of instruction 505. NOTE(review): this is a diff hunk, so it appears to
; hold both variants: the multiply sequence and movlpd store (old, FPMUL), and
; subpd with the movhpd store (new, FPSUB).
rx_body_505:
; clamp the read index to 0..32767 (entries are 8 bytes wide)
and ecx, 32767
; convert the two packed signed 32-bit integers at [rsi+rcx*8] to two doubles
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
; old variant: multiply, then clear NaN lanes. cmpeqpd of a register with
; itself gives all-ones only in non-NaN lanes; andps zeroes the others.
mulpd xmm0, xmm4
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
; new variant: plain packed subtraction, no NaN masking
subpd xmm0, xmm4
; copy the result into xmm8
movaps xmm8, xmm0
; write index = (r8d XOR constant), clamped to 0..32767
mov eax, r8d
xor eax, 021b54eaeh
and eax, 32767
; old variant stores the low double of xmm8; the new one stores the high double
movlpd qword ptr [rsi + rax * 8], xmm8
movhpd qword ptr [rsi + rax * 8], xmm8
rx_i_506: ;FPMUL
rx_i_506: ;FPSUB
dec ebx
jz rx_finish
xor r9, 0a973d58ch
@ -9175,10 +9145,7 @@ rx_i_506: ;FPMUL
; Body of instruction 506. NOTE(review): this is a diff hunk, so it appears to
; hold both variants: the multiply sequence (old, FPMUL) and subpd (new, FPSUB).
rx_body_506:
; clamp the read index to 0..2047 (entries are 8 bytes wide)
and ecx, 2047
; convert the two packed signed 32-bit integers at [rsi+rcx*8] to two doubles
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
; old variant: multiply, then clear NaN lanes. cmpeqpd of a register with
; itself gives all-ones only in non-NaN lanes; andps zeroes the others.
mulpd xmm0, xmm9
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
; new variant: plain packed subtraction, no NaN masking
subpd xmm0, xmm9
; copy the result into xmm3
movaps xmm3, xmm0
rx_i_507: ;RET
@ -9238,7 +9205,7 @@ taken_call_509:
push rax
call rx_i_42
rx_i_510: ;FPSUB
rx_i_510: ;FPADD
dec ebx
jz rx_finish
xor r8, 0db65513ch
@ -9249,7 +9216,7 @@ rx_i_510: ;FPSUB
; Body of instruction 510. NOTE(review): this is a diff hunk, so it appears to
; hold both variants: subpd (old, FPSUB) and addpd (new, FPADD).
rx_body_510:
; clamp the read index to 0..2047 (entries are 8 bytes wide)
and ecx, 2047
; convert the two packed signed 32-bit integers at [rsi+rcx*8] to two doubles
cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
; old variant subtracts xmm2; the new variant adds it
subpd xmm0, xmm2
addpd xmm0, xmm2
; copy the result into xmm9
movaps xmm9, xmm0
rx_i_511: ;ROL_64

View File

@ -74,21 +74,21 @@ void setPrivilege(const char* pszPrivilege, BOOL bEnable) {
}
#endif
// Allocates a region of memory that is readable, writable and executable and
// returns its address. Throws std::runtime_error when the OS call fails.
// NOTE(review): this excerpt is a diff hunk, so the signature and the mmap
// call each appear twice: the line before the change, then the line after it.
void* allocExecutableMemory(size_t bytes) {
void* allocExecutableMemory(std::size_t bytes) {
void* mem;
#ifdef _WIN32
// Windows: commit `bytes` bytes with execute/read/write page protection.
mem = VirtualAlloc(nullptr, bytes, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
if (mem == nullptr)
throw std::runtime_error(getErrorMessage("allocExecutableMemory - VirtualAlloc"));
#else
// Elsewhere: anonymous private mapping. The first call sizes the mapping with
// CodeSize and ignores `bytes`; the second uses the size the caller asked for.
mem = mmap(nullptr, CodeSize, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
// mmap signals failure with MAP_FAILED, not with a null pointer.
if (mem == MAP_FAILED)
throw std::runtime_error("allocExecutableMemory - mmap failed");
#endif
return mem;
}
void* allocLargePagesMemory(size_t bytes) {
void* allocLargePagesMemory(std::size_t bytes) {
void* mem;
#ifdef _WIN32
setPrivilege("SeLockMemoryPrivilege", 1);

View File

@ -19,5 +19,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#pragma once
// Declarations of the two memory allocation helpers; each takes a std::size_t
// and returns a void*.
// NOTE(review): this excerpt is a diff hunk. The unqualified size_t pair looks
// like the old side; the <cstddef> include and std::size_t pair the new side.
void* allocExecutableMemory(size_t);
void* allocLargePagesMemory(size_t);
// <cstddef> declares std::size_t, which the declarations below use.
#include <cstddef>
void* allocExecutableMemory(std::size_t);
void* allocLargePagesMemory(std::size_t);