Random accesses - JIT compiler

This commit is contained in:
tevador 2019-01-10 22:04:55 +01:00
parent b71e0eec65
commit d1a808643d
24 changed files with 341 additions and 341 deletions

View file

@ -11,7 +11,7 @@ SRCDIR=src
OBJDIR=obj OBJDIR=obj
LDFLAGS=-lpthread LDFLAGS=-lpthread
TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o) ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o)
ifeq ($(PLATFORM),x86_64) ifeq ($(PLATFORM),x86_64)
ROBJS += $(OBJDIR)/JitCompilerX86-static.o ROBJS += $(OBJDIR)/JitCompilerX86-static.o
endif endif
@ -60,7 +60,7 @@ $(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp) |
$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR) $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@ $(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@
$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_r.inc read_f.inc)) | $(OBJDIR) $(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read.inc)) | $(OBJDIR)
$(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@ $(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@
$(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR) $(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR)
@ -87,6 +87,9 @@ $(OBJDIR)/softAes.o: $(addprefix $(SRCDIR)/,softAes.cpp softAes.h) | $(OBJDIR)
$(OBJDIR)/VirtualMachine.o: $(addprefix $(SRCDIR)/,VirtualMachine.cpp VirtualMachine.hpp common.hpp dataset.hpp) | $(OBJDIR) $(OBJDIR)/VirtualMachine.o: $(addprefix $(SRCDIR)/,VirtualMachine.cpp VirtualMachine.hpp common.hpp dataset.hpp) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/VirtualMachine.cpp -o $@ $(CXX) $(CXXFLAGS) -c $(SRCDIR)/VirtualMachine.cpp -o $@
$(OBJDIR)/virtualMemory.o: $(addprefix $(SRCDIR)/,virtualMemory.cpp virtualMemory.hpp) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/virtualMemory.cpp -o $@
$(OBJDIR)/t1ha2.o: $(addprefix $(SRCDIR)/t1ha/,t1ha2.c t1ha.h t1ha_bits.h) | $(OBJDIR) $(OBJDIR)/t1ha2.o: $(addprefix $(SRCDIR)/t1ha/,t1ha2.c t1ha.h t1ha_bits.h) | $(OBJDIR)
$(CC) $(CCFLAGS) -c $(SRCDIR)/t1ha/t1ha2.c -o $@ $(CC) $(CCFLAGS) -c $(SRCDIR)/t1ha/t1ha2.c -o $@

View file

@ -169,11 +169,12 @@ namespace RandomX {
asmCode << "\t" << instrx86 << " xmm0, " << regF[instr.regb % RegistersCount] << std::endl; asmCode << "\t" << instrx86 << " xmm0, " << regF[instr.regb % RegistersCount] << std::endl;
} }
void AssemblyGeneratorX86::gencr(Instruction& instr) { void AssemblyGeneratorX86::gencr(Instruction& instr, bool rax = true) {
switch (instr.locc & 7) switch (instr.locc & 7)
{ {
case 0: case 0:
asmCode << "\tmov rcx, rax" << std::endl; if(rax)
asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
@ -186,7 +187,8 @@ namespace RandomX {
case 1: case 1:
case 2: case 2:
case 3: case 3:
asmCode << "\tmov rcx, rax" << std::endl; if (rax)
asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl;
asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl;
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
@ -197,9 +199,9 @@ namespace RandomX {
return; return;
default: default:
asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", rax" << std::endl; asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", " << (rax ? "rax" : "rcx") << std::endl;
if (trace) { if (trace) {
asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rax" << std::endl; asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], " << (rax ? "rax" : "rcx") << std::endl;
} }
return; return;
} }
@ -208,7 +210,7 @@ namespace RandomX {
void AssemblyGeneratorX86::gencf(Instruction& instr, bool move = true) { void AssemblyGeneratorX86::gencf(Instruction& instr, bool move = true) {
if(move) if(move)
asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
const char* store = (instr.locc & 8) ? "movhpd" : "movlpd"; const char* store = (instr.locc & 128) ? "movhpd" : "movlpd";
switch (instr.locc & 7) switch (instr.locc & 7)
{ {
case 4: case 4:
@ -463,14 +465,13 @@ namespace RandomX {
void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) { void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) {
genar(instr, i); genar(instr, i);
//asmCode << "\tmov rcx, rax" << std::endl; asmCode << "\tmov rcx, rax" << std::endl;
asmCode << "\tshl eax, 13" << std::endl; asmCode << "\tshl eax, 13" << std::endl;
//asmCode << "\tand rcx, -2048" << std::endl;
asmCode << "\tand eax, 24576" << std::endl; asmCode << "\tand eax, 24576" << std::endl;
//asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl;
asmCode << "\tor eax, 40896" << std::endl; asmCode << "\tor eax, 40896" << std::endl;
asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl; asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl;
asmCode << "\tldmxcsr dword ptr [rsp - 8]" << std::endl; asmCode << "\tldmxcsr dword ptr [rsp - 8]" << std::endl;
gencr(instr, false);
} }
static inline const char* jumpCondition(Instruction& instr, bool invert = false) { static inline const char* jumpCondition(Instruction& instr, bool invert = false) {

View file

@ -44,7 +44,7 @@ namespace RandomX {
void genbr1(Instruction&); void genbr1(Instruction&);
void genbr132(Instruction&); void genbr132(Instruction&);
void genbf(Instruction&, const char*); void genbf(Instruction&, const char*);
void gencr(Instruction&); void gencr(Instruction&, bool);
void gencf(Instruction&, bool); void gencf(Instruction&, bool);
void generateCode(Instruction&, int); void generateCode(Instruction&, int);

View file

@ -47,8 +47,8 @@ namespace RandomX {
} }
void CompiledVirtualMachine::execute() { void CompiledVirtualMachine::execute() {
executeProgram(reg, mem, scratchpad, readDataset); //executeProgram(reg, mem, scratchpad, readDataset);
//compiler.getProgramFunc()(reg, mem, scratchpad); compiler.getProgramFunc()(reg, mem, scratchpad);
#ifdef TRACEVM #ifdef TRACEVM
for (int32_t i = InstructionCount - 1; i >= 0; --i) { for (int32_t i = InstructionCount - 1; i >= 0; --i) {
std::cout << std::hex << tracepad[i].u64 << std::endl; std::cout << std::hex << tracepad[i].u64 << std::endl;

View file

@ -197,6 +197,17 @@ namespace RandomX {
#define ALU_RETIRE(x) x(a, b, c); \ #define ALU_RETIRE(x) x(a, b, c); \
if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl; if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl;
// Statistics hooks that count floating point instructions whose result is
// bitwise identical to their register operand (i.e. the instruction acted as a NOP).
//   b - copy of the register operand taken before the instruction executed
//   c - destination register after the instruction executed
// Each 64-bit half (lo/hi) is compared separately: count_<OP>_nop is increased by
// the number of matching halves, count_<OP>_nop2 is increased when both halves match.
// FPSUB masks off the top (sign) bit with INT64_MAX, so halves that differ only in
// sign are counted as matching.
// The macros expand to nothing unless STATS is defined; FPDIV is never tracked.
// Every non-empty expansion is a self-contained brace block, so the temporaries
// cannot collide with locals at the expansion site and the macro may be expanded
// more than once in a scope. Arguments are parenthesised so that any expression
// can be passed. Call sites do not need (and currently do not use) a trailing semicolon.
#define CHECK_NOP_FPDIV(b, c)
#ifndef STATS
#define CHECK_NOP_FPADD(b, c)
#define CHECK_NOP_FPSUB(b, c)
#define CHECK_NOP_FPMUL(b, c)
#else
#define CHECK_NOP_FPADD(b, c) { const bool loeq = ((b).lo.u64 == (c).lo.u64); const bool hieq = ((b).hi.u64 == (c).hi.u64); count_FPADD_nop += loeq + hieq; if(loeq && hieq) count_FPADD_nop2++; }
#define CHECK_NOP_FPSUB(b, c) { const bool loeq = (((b).lo.u64 & INT64_MAX) == ((c).lo.u64 & INT64_MAX)); const bool hieq = (((b).hi.u64 & INT64_MAX) == ((c).hi.u64 & INT64_MAX)); count_FPSUB_nop += loeq + hieq; if(loeq && hieq) count_FPSUB_nop2++; }
#define CHECK_NOP_FPMUL(b, c) { const bool loeq = ((b).lo.u64 == (c).lo.u64); const bool hieq = ((b).hi.u64 == (c).hi.u64); count_FPMUL_nop += loeq + hieq; if(loeq && hieq) count_FPMUL_nop2++; }
#endif
#define FPU_RETIRE(x) x(a, b, c); \ #define FPU_RETIRE(x) x(a, b, c); \
writecf(inst, c); \ writecf(inst, c); \
if(trace) { \ if(trace) { \
@ -248,8 +259,10 @@ namespace RandomX {
INC_COUNT(x) \ INC_COUNT(x) \
convertible_t a = loada(inst); \ convertible_t a = loada(inst); \
fpu_reg_t& b = reg.f[inst.regb % RegistersCount]; \ fpu_reg_t& b = reg.f[inst.regb % RegistersCount]; \
fpu_reg_t btemp = b; \
fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \ fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \
FPU_RETIRE(x) \ FPU_RETIRE(x) \
CHECK_NOP_##x(btemp, c) \
} }
#define FPU_INST_NB(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ #define FPU_INST_NB(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \

View file

@ -83,6 +83,12 @@ namespace RandomX {
int count_retdepth_max = 0; int count_retdepth_max = 0;
int count_endstack = 0; int count_endstack = 0;
int count_instructions[ProgramLength] = { 0 }; int count_instructions[ProgramLength] = { 0 };
int count_FPADD_nop = 0;
int count_FPADD_nop2 = 0;
int count_FPSUB_nop = 0;
int count_FPSUB_nop2 = 0;
int count_FPMUL_nop = 0;
int count_FPMUL_nop2 = 0;
#endif #endif
convertible_t loada(Instruction&); convertible_t loada(Instruction&);

View file

@ -29,9 +29,12 @@
.global DECL(randomx_program_prologue) .global DECL(randomx_program_prologue)
.global DECL(randomx_program_begin) .global DECL(randomx_program_begin)
.global DECL(randomx_program_epilogue) .global DECL(randomx_program_epilogue)
.global DECL(randomx_program_read_r) .global DECL(randomx_program_read_l1)
.global DECL(randomx_program_read_f) .global DECL(randomx_program_read_l2)
.global DECL(randomx_program_end) .global DECL(randomx_program_end)
.global DECL(randomx_program_transform)
#define db .byte
.align 64 .align 64
DECL(randomx_program_prologue): DECL(randomx_program_prologue):
@ -45,14 +48,26 @@ DECL(randomx_program_begin):
DECL(randomx_program_epilogue): DECL(randomx_program_epilogue):
#include "asm/program_epilogue_linux.inc" #include "asm/program_epilogue_linux.inc"
.align 64 #define scratchpad_mask and ecx, 2040
DECL(randomx_program_read_r):
#include "asm/program_read_r.inc"
.align 64 .align 64
DECL(randomx_program_read_f): DECL(randomx_program_read_l1):
#include "asm/program_read_f.inc" #include "asm/program_read.inc"
#undef scratchpad_mask
#define scratchpad_mask and ecx, 32760
.align 64
DECL(randomx_program_read_l2):
#include "asm/program_read.inc"
#undef scratchpad_mask
.align 64 .align 64
DECL(randomx_program_end): DECL(randomx_program_end):
nop nop
.align 8
DECL(randomx_program_transform):
#include "asm/program_transform_address.inc"

View file

@ -20,9 +20,11 @@ _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
PUBLIC randomx_program_prologue PUBLIC randomx_program_prologue
PUBLIC randomx_program_begin PUBLIC randomx_program_begin
PUBLIC randomx_program_epilogue PUBLIC randomx_program_epilogue
PUBLIC randomx_program_read_r PUBLIC randomx_program_read_l1
PUBLIC randomx_program_read_f PUBLIC randomx_program_read_l2
PUBLIC randomx_program_end PUBLIC randomx_program_end
PUBLIC randomx_program_transform
ALIGN 64 ALIGN 64
randomx_program_prologue PROC randomx_program_prologue PROC
@ -39,21 +41,34 @@ randomx_program_epilogue PROC
include asm/program_epilogue_win64.inc include asm/program_epilogue_win64.inc
randomx_program_epilogue ENDP randomx_program_epilogue ENDP
ALIGN 64 scratchpad_mask MACRO
randomx_program_read_r PROC and ecx, 2040
include asm/program_read_r.inc ENDM
randomx_program_read_r ENDP
ALIGN 64 ALIGN 64
randomx_program_read_f PROC randomx_program_read_l1 PROC
include asm/program_read_f.inc include asm/program_read.inc
randomx_program_read_f ENDP randomx_program_read_l1 ENDP
scratchpad_mask MACRO
and ecx, 32760
ENDM
ALIGN 64
randomx_program_read_l2 PROC
include asm/program_read.inc
randomx_program_read_l2 ENDP
ALIGN 64 ALIGN 64
randomx_program_end PROC randomx_program_end PROC
nop nop
randomx_program_end ENDP randomx_program_end ENDP
ALIGN 8
randomx_program_transform PROC
include asm/program_transform_address.inc
randomx_program_transform ENDP
_RANDOMX_JITX86_STATIC ENDS _RANDOMX_JITX86_STATIC ENDS
END END

View file

@ -21,7 +21,8 @@ extern "C" {
void randomx_program_prologue(); void randomx_program_prologue();
void randomx_program_begin(); void randomx_program_begin();
void randomx_program_epilogue(); void randomx_program_epilogue();
void randomx_program_read_r(); void randomx_program_transform();
void randomx_program_read_f(); void randomx_program_read_l1();
void randomx_program_read_l2();
void randomx_program_end(); void randomx_program_end();
} }

View file

@ -48,12 +48,12 @@ namespace RandomX {
REGISTER ALLOCATION: REGISTER ALLOCATION:
rax -> temporary rax -> temporary
rbx -> MemoryRegisters& memory rbx -> "ic"
rcx -> temporary rcx -> temporary
rdx -> temporary rdx -> temporary
rsi -> convertible_t* scratchpad rsi -> convertible_t* scratchpad
rdi -> "ic" (instruction counter) rdi -> beginning of VM stack
rbp -> beginning of VM stack rbp -> "ma", "mx"
rsp -> end of VM stack rsp -> end of VM stack
r8 -> "r0" r8 -> "r0"
r9 -> "r1" r9 -> "r1"
@ -82,7 +82,8 @@ namespace RandomX {
| saved registers | saved registers
| |
v v
[rbp] RegisterFile& registerFile [rdi+8] RegisterFile& registerFile
[rdi] uint8_t* dataset
| |
| |
| VM stack | VM stack
@ -97,18 +98,19 @@ namespace RandomX {
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin; const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin;
const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue; const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
const uint8_t* codeReadDatasetR = (uint8_t*)&randomx_program_read_r; const uint8_t* codeReadDatasetL1 = (uint8_t*)&randomx_program_read_l1;
const uint8_t* codeReadDatasetF = (uint8_t*)&randomx_program_read_f; const uint8_t* codeReadDatasetL2 = (uint8_t*)&randomx_program_read_l2;
const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
const uint32_t* addressTransformations = (uint32_t*)&randomx_program_transform;
const int32_t prologueSize = codeProgramBegin - codePrologue; const int32_t prologueSize = codeProgramBegin - codePrologue;
const int32_t epilogueSize = codeReadDatasetR - codeEpilogue; const int32_t epilogueSize = codeReadDatasetL1 - codeEpilogue;
const int32_t readDatasetRSize = codeReadDatasetF - codeReadDatasetR; const int32_t readDatasetL1Size = codeReadDatasetL2 - codeReadDatasetL1;
const int32_t readDatasetFSize = codeProgramEnd - codeReadDatasetF; const int32_t readDatasetL2Size = codeProgramEnd - codeReadDatasetL2;
const int32_t readDatasetFOffset = CodeSize - readDatasetFSize; const int32_t readDatasetL2Offset = CodeSize - readDatasetL2Size;
const int32_t readDatasetROffset = readDatasetFOffset - readDatasetRSize; const int32_t readDatasetL1Offset = readDatasetL2Offset - readDatasetL1Size;
const int32_t epilogueOffset = readDatasetROffset - epilogueSize; const int32_t epilogueOffset = readDatasetL1Offset - epilogueSize;
JitCompilerX86::JitCompilerX86() { JitCompilerX86::JitCompilerX86() {
#ifdef _WIN32 #ifdef _WIN32
@ -121,9 +123,9 @@ namespace RandomX {
throw std::runtime_error("mmap failed"); throw std::runtime_error("mmap failed");
#endif #endif
memcpy(code, codePrologue, prologueSize); memcpy(code, codePrologue, prologueSize);
memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize - epilogueSize, codeEpilogue, epilogueSize); memcpy(code + CodeSize - epilogueSize - readDatasetL1Size - readDatasetL2Size, codeEpilogue, epilogueSize);
memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize, codeReadDatasetR, readDatasetRSize); memcpy(code + CodeSize - readDatasetL1Size - readDatasetL2Size, codeReadDatasetL1, readDatasetL1Size);
memcpy(code + CodeSize - readDatasetFSize, codeReadDatasetF, readDatasetFSize); memcpy(code + CodeSize - readDatasetL2Size, codeReadDatasetL2, readDatasetL2Size);
} }
void JitCompilerX86::generateProgram(Pcg32& gen) { void JitCompilerX86::generateProgram(Pcg32& gen) {
@ -140,12 +142,33 @@ namespace RandomX {
emitByte(0xe9); emitByte(0xe9);
emit(instructionOffsets[0] - (codePos + 4)); emit(instructionOffsets[0] - (codePos + 4));
fixCallOffsets(); fixCallOffsets();
uint32_t transformL1 = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
uint32_t transformL2 = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
*reinterpret_cast<uint32_t*>(code + readDatasetL1Offset + 1) = transformL1;
*reinterpret_cast<uint32_t*>(code + readDatasetL2Offset + 1) = transformL2;
} }
void JitCompilerX86::generateCode(Instruction& instr, int i) { void JitCompilerX86::generateCode(Instruction& instr, int i) {
instructionOffsets.push_back(codePos); instructionOffsets.push_back(codePos);
emit(0x840fcfff); //dec edx; jz <epilogue> emit(0x840fcbff); //dec ebx; jz <epilogue>
emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative) emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative)
emit(uint16_t(0x8149)); //xor
emitByte(0xf0 + (instr.rega % RegistersCount));
emit(instr.addra);
emit(uint16_t(0x8b41)); //mov
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
emit(0x753fc3f6); //test bl,0x3f; jne
emit(uint16_t(0xe805));
if (instr.loca & 3) { //A.LOC.W
emit(readDatasetL1Offset - (codePos + 4));
}
else {
emit(readDatasetL2Offset - (codePos + 4));
}
if ((instr.loca & 192) == 0) { //A.LOC.X
emit(uint16_t(0x3348));
emitByte(0xe9); //xor rbp, rcx
}
auto generator = engine[instr.opcode]; auto generator = engine[instr.opcode];
(this->*generator)(instr, i); (this->*generator)(instr, i);
} }
@ -157,73 +180,26 @@ namespace RandomX {
} }
void JitCompilerX86::genar(Instruction& instr) { void JitCompilerX86::genar(Instruction& instr) {
emit(uint16_t(0x8149)); //xor emit(uint16_t(0xe181)); //and ecx,
emitByte(0xf0 + (instr.rega % RegistersCount)); if (instr.loca & 3) {
emit(instr.addra); emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
switch (instr.loca & 7)
{
case 0:
case 1:
case 2:
case 3:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
emitByte(0xe8); //call
emit(readDatasetROffset - (codePos + 4));
return;
case 4:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emitByte(0x25); //and
emit(ScratchpadL2 - 1); //whole scratchpad
emit(0xc6048b48); // mov rax,QWORD PTR [rsi+rax*8]
return;
default:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emitByte(0x25); //and
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
emit(0xc6048b48); // mov rax,QWORD PTR [rsi+rax*8]
return;
} }
else {
emit(ScratchpadL2 - 1); //whole scratchpad
}
emit(0xce048b48); //mov rax,QWORD PTR [rsi+rcx*8]
} }
void JitCompilerX86::genaf(Instruction& instr) { void JitCompilerX86::genaf(Instruction& instr) {
emit(uint16_t(0x8149)); //xor emit(uint16_t(0xe181)); //and ecx,
emitByte(0xf0 + (instr.rega % RegistersCount)); if (instr.loca & 3) {
emit(instr.addra);
switch (instr.loca & 7)
{
case 0:
case 1:
case 2:
case 3:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
emitByte(0xe8); //call
emit(readDatasetFOffset - (codePos + 4));
return;
case 4:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emitByte(0x25); //and
emit(ScratchpadL2 - 1); //whole scratchpad
emitByte(0xf3);
emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
return;
default:
emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emitByte(0x25); //and
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
emitByte(0xf3);
emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
return;
} }
else {
emit(ScratchpadL2 - 1); //whole scratchpad
}
emitByte(0xf3);
emit(0xce04e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rcx*8]
} }
void JitCompilerX86::genbr0(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { void JitCompilerX86::genbr0(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
@ -274,8 +250,13 @@ namespace RandomX {
} }
void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize) { void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize, bool rax) {
emit(0x41c88b48); //mov rcx, rax; REX if (rax) {
emit(0x41c88b48); //mov rcx, rax; REX
}
else {
emitByte(0x41);
}
emitByte(0x8b); // mov emitByte(0x8b); // mov
emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
emitByte(0x35); // xor eax emitByte(0x35); // xor eax
@ -285,22 +266,27 @@ namespace RandomX {
emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx
} }
void JitCompilerX86::gencr(Instruction& instr) { void JitCompilerX86::gencr(Instruction& instr, bool rax = true) {
switch (instr.locc & 7) switch (instr.locc & 7)
{ {
case 0: case 0:
scratchpadStoreR(instr, ScratchpadL2); scratchpadStoreR(instr, ScratchpadL2, rax);
break; break;
case 1: case 1:
case 2: case 2:
case 3: case 3:
scratchpadStoreR(instr, ScratchpadL1); scratchpadStoreR(instr, ScratchpadL1, rax);
break; break;
default: default:
emit(uint16_t(0x8b4c)); //mov emit(uint16_t(0x8b4c)); //mov
emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax if (rax) {
emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax
}
else {
emitByte(0xc1 + 8 * (instr.regc % RegistersCount)); //regc, rcx
}
break; break;
} }
} }
@ -322,29 +308,21 @@ namespace RandomX {
emitByte(0xc6); emitByte(0xc6);
} }
void JitCompilerX86::gencf(Instruction& instr, bool alwaysLow = false) { void JitCompilerX86::gencf(Instruction& instr) {
int regc = (instr.regc % RegistersCount); int regc = (instr.regc % RegistersCount);
if (!alwaysLow) { if (regc <= 1) {
if (regc <= 1) { emitByte(0x44); //REX
emitByte(0x44); //REX
}
emit(uint16_t(0x280f)); //movaps
emitByte(0xc0 + 8 * regc); // regc, xmm0
} }
switch (instr.locc & 7) emit(uint16_t(0x280f)); //movaps
emitByte(0xc0 + 8 * regc); // regc, xmm0
if (instr.locc & 4) //C.LOC.R
{ {
case 4: if (instr.locc & 3) { //C.LOC.W
scratchpadStoreF(instr, regc, ScratchpadL2, !alwaysLow && (instr.locc & 8)); scratchpadStoreF(instr, regc, ScratchpadL1, (instr.locc & 128)); //first 16 KiB of scratchpad
break; }
else {
case 5: scratchpadStoreF(instr, regc, ScratchpadL2, (instr.locc & 128)); //whole scratchpad
case 6: }
case 7:
scratchpadStoreF(instr, regc, ScratchpadL1, !alwaysLow && (instr.locc & 8));
break;
default:
break;
} }
} }
@ -596,24 +574,11 @@ namespace RandomX {
void JitCompilerX86::h_FPROUND(Instruction& instr, int i) { void JitCompilerX86::h_FPROUND(Instruction& instr, int i) {
genar(instr); genar(instr);
emit(0x81480de0c1c88b48); emit(0x00250de0c1c88b48); //mov rcx,rax; shl eax,0xd
emit(0x600025fffff800e1); emit(0x00009fc00d000060); //and eax,0x6000; or eax,0x9fc0
emit(uint16_t(0x0000)); emit(0x2454ae0ff8244489); //ldmxcsr DWORD PTR [rsp-0x8]
emitByte(0xf2);
int regc = (instr.regc % RegistersCount);
if (regc <= 1) {
emitByte(0x4c); //REX
}
else {
emitByte(0x48); //REX
}
emit(uint16_t(0x2a0f));
emitByte(0xc1 + 8 * regc);
emitByte(0x0d);
emit(0xf824448900009fc0);
emit(0x2454ae0f); //ldmxcsr DWORD PTR [rsp-0x8]
emitByte(0xf8); emitByte(0xf8);
gencf(instr, true); gencr(instr, false); //result in rcx
} }
static inline uint8_t jumpCondition(Instruction& instr, bool invert = false) { static inline uint8_t jumpCondition(Instruction& instr, bool invert = false) {
@ -670,7 +635,7 @@ namespace RandomX {
if ((instr.locc & 7) <= 3) { if ((instr.locc & 7) <= 3) {
crlen = 17; crlen = 17;
} }
emit(0x74e53b48); //cmp rsp, rbp; je emit(0x74e73b48); //cmp rsp, rdi; je
emitByte(11 + crlen); emitByte(11 + crlen);
emitByte(0x48); emitByte(0x48);
emit(0x08244433); //xor rax,QWORD PTR [rsp+0x8] emit(0x08244433); //xor rax,QWORD PTR [rsp+0x8]

View file

@ -64,10 +64,10 @@ namespace RandomX {
void genbr1(Instruction&, uint16_t, uint16_t); void genbr1(Instruction&, uint16_t, uint16_t);
void genbr132(Instruction&, uint16_t, uint8_t); void genbr132(Instruction&, uint16_t, uint8_t);
void genbf(Instruction&, uint8_t); void genbf(Instruction&, uint8_t);
void scratchpadStoreR(Instruction&, uint32_t); void scratchpadStoreR(Instruction&, uint32_t, bool);
void scratchpadStoreF(Instruction&, int, uint32_t, bool); void scratchpadStoreF(Instruction&, int, uint32_t, bool);
void gencr(Instruction&); void gencr(Instruction&, bool);
void gencf(Instruction&, bool); void gencf(Instruction&);
void generateCode(Instruction&, int); void generateCode(Instruction&, int);
void fixCallOffsets(); void fixCallOffsets();

View file

@ -1,8 +1,9 @@
;# unroll VM stack ;# unroll VM stack
mov rsp, rbp mov rsp, rdi
;# save VM register values ;# save VM register values
pop rcx pop rcx
pop rcx
mov qword ptr [rcx+0], r8 mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9 mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10 mov qword ptr [rcx+16], r10

View file

@ -7,9 +7,11 @@
push r15 push r15
;# function arguments ;# function arguments
push rdi ;# RegisterFile& registerFile push rdi ;# RegisterFile& registerFile
mov rbx, rsi ;# MemoryRegisters& memory mov rbp, qword ptr [rsi] ;# "mx", "ma"
mov rsi, rdx ;# convertible_t* scratchpad mov rax, qword ptr [rsi+8] ;# uint8_t* dataset
push rax
mov rsi, rdx ;# convertible_t* scratchpad
mov rcx, rdi mov rcx, rdi
#include "program_prologue_load.inc" #include "program_prologue_load.inc"

View file

@ -1,5 +1,5 @@
mov rbp, rsp ;# beginning of VM stack mov rdi, rsp ;# beginning of VM stack
mov rdi, 1048577 ;# number of VM instructions to execute + 1 mov ebx, 1048577 ;# number of VM instructions to execute + 1
xorps xmm10, xmm10 xorps xmm10, xmm10
cmpeqpd xmm10, xmm10 cmpeqpd xmm10, xmm10

View file

@ -15,9 +15,11 @@
movdqu xmmword ptr [rsp+0], xmm10 movdqu xmmword ptr [rsp+0], xmm10
;# function arguments ;# function arguments
push rcx ;# RegisterFile& registerFile push rcx ;# RegisterFile& registerFile
mov rbx, rdx ;# MemoryRegisters& memory mov rbp, qword ptr [rdx] ;# "mx", "ma"
mov rsi, r8 ;# convertible_t* scratchpad mov rax, qword ptr [rdx+8] ;# uint8_t* dataset
push rax
mov rsi, r8 ;# convertible_t* scratchpad
include program_prologue_load.inc include program_prologue_load.inc

32
src/asm/program_read.inc Normal file
View file

@ -0,0 +1,32 @@
;# Dataset read routine shared by randomx_program_read_l1 and randomx_program_read_l2.
;# The including file must define scratchpad_mask (an "and ecx, <mask>" instruction)
;# before including this file; the mask selects which part of the scratchpad is used.
;# What the routine does:
;#   1. applies the patched-in address transformation (see the placeholder below)
;#   2. XORs rcx into rbp, aligns the low half ("mx") to 64 bytes and prefetches dataset + mx
;#   3. rotates rbp by 32 bits so the former high half ("ma") becomes the low half
;#   4. XORs the 64 bytes at dataset + ma into the scratchpad at rsi + masked rcx * 8
;# Registers as used here:
;#   rcx  address value supplied by the caller, saved on entry and restored on exit
;#   rsi  scratchpad base address
;#   rdi  address of the stack slot that holds the dataset pointer
;#   rbp  "ma" and "mx" packed as two 32-bit halves, updated by this routine
;#   rax, rdx  scratch, not preserved
push rcx ;# preserve rcx (restored by the pop before ret)
;# The JIT compiler overwrites the following four bytes (routine offset +1) with a
;# randomly selected address transformation, so the placeholder has to stay directly
;# after the one-byte push.
db 0, 0, 0, 0 ;# TransformAddress placeholder
mov rax, qword ptr [rdi] ;# load the dataset address
xor rbp, rcx ;# modify "mx"
;# prefetch cacheline "mx"
and rbp, -64 ;# align "mx" to the start of a cache line
mov edx, ebp ;# edx = mx
prefetchnta byte ptr [rax+rdx]
;# read cacheline "ma"
ror rbp, 32 ;# swap "ma" and "mx"
mov edx, ebp ;# edx = ma
scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8
lea rcx, [rsi+rcx*8] ;# scratchpad cache line
lea rax, [rax+rdx] ;# dataset cache line
;# the eight load/xor pairs below cover one 64-byte line, one quadword at a time
mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now)
xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
mov rdx, qword ptr [rax+8]
xor qword ptr [rcx+8], rdx
mov rdx, qword ptr [rax+16]
xor qword ptr [rcx+16], rdx
mov rdx, qword ptr [rax+24]
xor qword ptr [rcx+24], rdx
mov rdx, qword ptr [rax+32]
xor qword ptr [rcx+32], rdx
mov rdx, qword ptr [rax+40]
xor qword ptr [rcx+40], rdx
mov rdx, qword ptr [rax+48]
xor qword ptr [rcx+48], rdx
mov rdx, qword ptr [rax+56]
xor qword ptr [rcx+56], rdx
pop rcx ;# restore rcx to the value it had on entry
ret

View file

@ -1,13 +0,0 @@
;# Dataset read helper, floating point variant.
;# In:  rbx points at a block holding ma at +0, mx at +4 and the dataset pointer at +8
;#      ecx holds the value that is XORed into mx
;# Out: xmm0 = the two 32-bit integers at dataset + ma converted to doubles
;# Clobbers rax, ecx, edx.
mov edx, dword ptr [rbx] ;# ma
mov rax, qword ptr [rbx+8] ;# dataset
cvtdq2pd xmm0, qword ptr [rax+rdx] ;# read 8 bytes at dataset + ma
add dword ptr [rbx], 8 ;# advance ma to the next quadword
xor ecx, dword ptr [rbx+4] ;# mx
mov dword ptr [rbx+4], ecx ;# store the updated mx
test ecx, 65528 ;# 65528 = 0FFF8h, checks bits 3-15 of the new mx
jne short rx_read_dataset_f_ret ;# any of those bits set: keep reading sequentially
and ecx, -8 ;# align mx down to a multiple of 8
mov dword ptr [rbx], ecx ;# ma = aligned mx
prefetcht0 byte ptr [rax+rcx] ;# prefetch the new read position
rx_read_dataset_f_ret:
ret 0

View file

@ -1,13 +0,0 @@
;# Dataset read helper, integer variant.
;# In:  rbx points at a block holding ma at +0, mx at +4 and the dataset pointer at +8
;#      ecx holds the value that is XORed into mx
;# Out: rax = the quadword at dataset + ma
;# Clobbers ecx, rdx.
mov eax, dword ptr [rbx] ;# ma
mov rdx, qword ptr [rbx+8] ;# dataset
mov rax, qword ptr [rdx+rax] ;# read 8 bytes at dataset + ma
add dword ptr [rbx], 8 ;# advance ma to the next quadword
xor ecx, dword ptr [rbx+4] ;# mx
mov dword ptr [rbx+4], ecx ;# store the updated mx
test ecx, 65528 ;# 65528 = 0FFF8h, checks bits 3-15 of the new mx
jne short rx_read_dataset_r_ret ;# any of those bits set: keep reading sequentially
and ecx, -8 ;# align mx down to a multiple of 8
mov dword ptr [rbx], ecx ;# ma = aligned mx
prefetcht0 byte ptr [rdx+rcx] ;# prefetch the new read position
rx_read_dataset_r_ret:
ret 0

View file

@ -77,6 +77,7 @@ namespace RandomX {
constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t); constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t);
constexpr uint32_t ScratchpadL1 = ScratchpadSize / 16 / sizeof(convertible_t); constexpr uint32_t ScratchpadL1 = ScratchpadSize / 16 / sizeof(convertible_t);
constexpr uint32_t ScratchpadL2 = ScratchpadSize / sizeof(convertible_t); constexpr uint32_t ScratchpadL2 = ScratchpadSize / sizeof(convertible_t);
constexpr uint32_t TransformationCount = 90;
constexpr int RegistersCount = 8; constexpr int RegistersCount = 8;
class Cache; class Cache;

View file

@ -158,10 +158,14 @@ executeProgram PROC
pslldq xmm7, 8 pslldq xmm7, 8
cvtsi2sd xmm7, qword ptr [rcx+112] cvtsi2sd xmm7, qword ptr [rcx+112]
; program body jmp program_begin
; program body
ALIGN 64
program_begin:
include program.inc include program.inc
ALIGN 64
rx_finish: rx_finish:
; unroll the stack ; unroll the stack
mov rsp, rdi mov rsp, rdi

View file

@ -277,10 +277,6 @@ int main(int argc, char** argv) {
if(programCount == 1000) if(programCount == 1000)
std::cout << "Reference result: 3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl; std::cout << "Reference result: 3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl;
std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl; std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl;
/*if (threadCount == 1 && !compiled) {
auto ivm = (RandomX::InterpretedVirtualMachine*)vms[0];
std::cout << ivm->getProgam();
}*/
} }
catch (std::exception& e) { catch (std::exception& e) {
std::cout << "ERROR: " << e.what() << std::endl; std::cout << "ERROR: " << e.what() << std::endl;

View file

@ -76,11 +76,13 @@ rx_body_3:
xor rbp, rcx xor rbp, rcx
and ecx, 2047 and ecx, 2047
mov rax, qword ptr [rsi+rcx*8] mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13 shl eax, 13
and eax, 24576 and eax, 24576
or eax, 40896 or eax, 40896
mov dword ptr [rsp - 8], eax mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8] ldmxcsr dword ptr [rsp - 8]
mov r8, rcx
rx_i_4: ;MULH_64 rx_i_4: ;MULH_64
dec ebx dec ebx
@ -153,7 +155,7 @@ rx_body_7:
mov eax, r14d mov eax, r14d
xor eax, 057c8c41bh xor eax, 057c8c41bh
and eax, 32767 and eax, 32767
movhpd qword ptr [rsi + rax * 8], xmm6 movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_8: ;SHL_64 rx_i_8: ;SHL_64
dec ebx dec ebx
@ -218,7 +220,7 @@ rx_body_11:
mov eax, r12d mov eax, r12d
xor eax, 0852d40d8h xor eax, 0852d40d8h
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm4 movlpd qword ptr [rsi + rax * 8], xmm4
rx_i_12: ;CALL rx_i_12: ;CALL
dec ebx dec ebx
@ -355,7 +357,7 @@ rx_body_18:
mov eax, r11d mov eax, r11d
xor eax, 0869baa81h xor eax, 0869baa81h
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm3 movlpd qword ptr [rsi + rax * 8], xmm3
rx_i_19: ;FPSUB rx_i_19: ;FPSUB
dec ebx dec ebx
@ -372,7 +374,7 @@ rx_body_19:
subpd xmm0, xmm8 subpd xmm0, xmm8
movaps xmm7, xmm0 movaps xmm7, xmm0
rx_i_20: ;FPMUL rx_i_20: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r13, 0ecca967dh xor r13, 0ecca967dh
@ -383,15 +385,12 @@ rx_i_20: ;FPMUL
rx_body_20: rx_body_20:
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm2 subpd xmm0, xmm2
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm7, xmm0 movaps xmm7, xmm0
mov eax, r15d mov eax, r15d
xor eax, 0aad81365h xor eax, 0aad81365h
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm7 movlpd qword ptr [rsi + rax * 8], xmm7
rx_i_21: ;FPADD rx_i_21: ;FPADD
dec ebx dec ebx
@ -482,7 +481,7 @@ rx_body_25:
mov eax, r14d mov eax, r14d
xor eax, 0baf5c2d4h xor eax, 0baf5c2d4h
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm6 movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_26: ;IMUL_32 rx_i_26: ;IMUL_32
dec ebx dec ebx
@ -580,7 +579,7 @@ rx_body_31:
mov eax, r14d mov eax, r14d
xor eax, 01e2da792h xor eax, 01e2da792h
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm6 movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_32: ;XOR_64 rx_i_32: ;XOR_64
dec ebx dec ebx
@ -668,7 +667,7 @@ rx_body_36:
andps xmm0, xmm1 andps xmm0, xmm1
movaps xmm7, xmm0 movaps xmm7, xmm0
rx_i_37: ;FPMUL rx_i_37: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r12, 0d0706601h xor r12, 0d0706601h
@ -679,10 +678,7 @@ rx_i_37: ;FPMUL
rx_body_37: rx_body_37:
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm2 subpd xmm0, xmm2
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm9, xmm0 movaps xmm9, xmm0
mov eax, r9d mov eax, r9d
xor eax, 0bca81c78h xor eax, 0bca81c78h
@ -764,7 +760,7 @@ taken_call_41:
push rax push rax
call rx_i_127 call rx_i_127
rx_i_42: ;FPSUB rx_i_42: ;FPADD
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r15, 0bc1de9f6h xor r15, 0bc1de9f6h
@ -776,7 +772,7 @@ rx_body_42:
xor rbp, rcx xor rbp, rcx
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
subpd xmm0, xmm6 addpd xmm0, xmm6
movaps xmm6, xmm0 movaps xmm6, xmm0
rx_i_43: ;SUB_64 rx_i_43: ;SUB_64
@ -887,7 +883,7 @@ rx_body_48:
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm9 movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_49: ;FPMUL rx_i_49: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r8, 0f96c6a45h xor r8, 0f96c6a45h
@ -898,10 +894,7 @@ rx_i_49: ;FPMUL
rx_body_49: rx_body_49:
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm3 subpd xmm0, xmm3
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm5, xmm0 movaps xmm5, xmm0
rx_i_50: ;OR_32 rx_i_50: ;OR_32
@ -1018,7 +1011,7 @@ rx_body_55:
mov eax, r11d mov eax, r11d
xor eax, 07c79cddh xor eax, 07c79cddh
and eax, 2047 and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm3 movhpd qword ptr [rsi + rax * 8], xmm3
rx_i_56: ;AND_64 rx_i_56: ;AND_64
dec ebx dec ebx
@ -1144,7 +1137,7 @@ taken_call_61:
push rax push rax
call rx_i_120 call rx_i_120
rx_i_62: ;FPMUL rx_i_62: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r15, 0c3089414h xor r15, 0c3089414h
@ -1155,17 +1148,14 @@ rx_i_62: ;FPMUL
rx_body_62: rx_body_62:
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm8 subpd xmm0, xmm8
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm2, xmm0 movaps xmm2, xmm0
mov eax, r10d mov eax, r10d
xor eax, 05c4789e3h xor eax, 05c4789e3h
and eax, 2047 and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm2 movhpd qword ptr [rsi + rax * 8], xmm2
rx_i_63: ;FPMUL rx_i_63: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r9, 065cf272eh xor r9, 065cf272eh
@ -1176,10 +1166,7 @@ rx_i_63: ;FPMUL
rx_body_63: rx_body_63:
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm7 subpd xmm0, xmm7
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm8, xmm0 movaps xmm8, xmm0
rx_i_64: ;SUB_64 rx_i_64: ;SUB_64
@ -1253,7 +1240,7 @@ taken_call_67:
push rax push rax
call rx_i_79 call rx_i_79
rx_i_68: ;FPSUB rx_i_68: ;FPADD
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r13, 03aa5c3a4h xor r13, 03aa5c3a4h
@ -1264,7 +1251,7 @@ rx_i_68: ;FPSUB
rx_body_68: rx_body_68:
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
subpd xmm0, xmm2 addpd xmm0, xmm2
movaps xmm4, xmm0 movaps xmm4, xmm0
mov eax, r12d mov eax, r12d
xor eax, 03c51ef39h xor eax, 03c51ef39h
@ -1354,11 +1341,16 @@ rx_i_73: ;FPROUND
rx_body_73: rx_body_73:
and ecx, 32767 and ecx, 32767
mov rax, qword ptr [rsi+rcx*8] mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13 shl eax, 13
and eax, 24576 and eax, 24576
or eax, 40896 or eax, 40896
mov dword ptr [rsp - 8], eax mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8] ldmxcsr dword ptr [rsp - 8]
mov eax, r10d
xor eax, 040624270h
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_74: ;MUL_64 rx_i_74: ;MUL_64
dec ebx dec ebx
@ -1722,7 +1714,7 @@ rx_body_93:
mov eax, r10d mov eax, r10d
xor eax, 07e48a0d8h xor eax, 07e48a0d8h
and eax, 2047 and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm2 movhpd qword ptr [rsi + rax * 8], xmm2
rx_i_94: ;RET rx_i_94: ;RET
dec ebx dec ebx
@ -1830,7 +1822,7 @@ rx_body_99:
mov eax, r12d mov eax, r12d
xor eax, 04c21df83h xor eax, 04c21df83h
and eax, 2047 and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm4 movhpd qword ptr [rsi + rax * 8], xmm4
rx_i_100: ;ADD_64 rx_i_100: ;ADD_64
dec ebx dec ebx
@ -1955,7 +1947,7 @@ rx_body_106:
mov eax, r12d mov eax, r12d
xor eax, 03cb2505h xor eax, 03cb2505h
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm4 movlpd qword ptr [rsi + rax * 8], xmm4
rx_i_107: ;CALL rx_i_107: ;CALL
dec ebx dec ebx
@ -1999,7 +1991,7 @@ rx_body_108:
mov eax, r9d mov eax, r9d
xor eax, 0678b65beh xor eax, 0678b65beh
and eax, 32767 and eax, 32767
movlpd qword ptr [rsi + rax * 8], xmm9 movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_109: ;FPADD rx_i_109: ;FPADD
dec ebx dec ebx
@ -2207,7 +2199,7 @@ rx_body_120:
addpd xmm0, xmm4 addpd xmm0, xmm4
movaps xmm8, xmm0 movaps xmm8, xmm0
rx_i_121: ;FPMUL rx_i_121: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r9, 03ab8f73h xor r9, 03ab8f73h
@ -2218,10 +2210,7 @@ rx_i_121: ;FPMUL
rx_body_121: rx_body_121:
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm5 subpd xmm0, xmm5
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm8, xmm0 movaps xmm8, xmm0
rx_i_122: ;RET rx_i_122: ;RET
@ -2813,7 +2802,7 @@ rx_body_153:
mov eax, r8d mov eax, r8d
xor eax, 09111c981h xor eax, 09111c981h
and eax, 2047 and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm8 movhpd qword ptr [rsi + rax * 8], xmm8
rx_i_154: ;MUL_32 rx_i_154: ;MUL_32
dec ebx dec ebx
@ -3196,11 +3185,13 @@ rx_i_174: ;FPROUND
rx_body_174: rx_body_174:
and ecx, 2047 and ecx, 2047
mov rax, qword ptr [rsi+rcx*8] mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13 shl eax, 13
and eax, 24576 and eax, 24576
or eax, 40896 or eax, 40896
mov dword ptr [rsp - 8], eax mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8] ldmxcsr dword ptr [rsp - 8]
mov r14, rcx
rx_i_175: ;SAR_64 rx_i_175: ;SAR_64
dec ebx dec ebx
@ -3431,7 +3422,7 @@ rx_body_187:
andps xmm0, xmm1 andps xmm0, xmm1
movaps xmm5, xmm0 movaps xmm5, xmm0
rx_i_188: ;FPMUL rx_i_188: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r9, 04659becbh xor r9, 04659becbh
@ -3443,10 +3434,7 @@ rx_body_188:
xor rbp, rcx xor rbp, rcx
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm3 subpd xmm0, xmm3
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm4, xmm0 movaps xmm4, xmm0
rx_i_189: ;FPROUND rx_i_189: ;FPROUND
@ -3460,11 +3448,16 @@ rx_i_189: ;FPROUND
rx_body_189: rx_body_189:
and ecx, 2047 and ecx, 2047
mov rax, qword ptr [rsi+rcx*8] mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13 shl eax, 13
and eax, 24576 and eax, 24576
or eax, 40896 or eax, 40896
mov dword ptr [rsp - 8], eax mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8] ldmxcsr dword ptr [rsp - 8]
mov eax, r13d
xor eax, 0e6f1a3b7h
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_190: ;RET rx_i_190: ;RET
dec ebx dec ebx
@ -3761,7 +3754,7 @@ rx_body_205:
andps xmm0, xmm1 andps xmm0, xmm1
movaps xmm5, xmm0 movaps xmm5, xmm0
rx_i_206: ;FPMUL rx_i_206: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r11, 0e836a177h xor r11, 0e836a177h
@ -3773,10 +3766,7 @@ rx_body_206:
xor rbp, rcx xor rbp, rcx
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm7 subpd xmm0, xmm7
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm4, xmm0 movaps xmm4, xmm0
rx_i_207: ;AND_32 rx_i_207: ;AND_32
@ -4085,7 +4075,7 @@ rx_body_223:
mov eax, r10d mov eax, r10d
xor eax, 07fca59eeh xor eax, 07fca59eeh
and eax, 32767 and eax, 32767
movlpd qword ptr [rsi + rax * 8], xmm2 movhpd qword ptr [rsi + rax * 8], xmm2
rx_i_224: ;SAR_64 rx_i_224: ;SAR_64
dec ebx dec ebx
@ -4171,7 +4161,7 @@ rx_body_227:
mov eax, r11d mov eax, r11d
xor eax, 0aabe2a0ah xor eax, 0aabe2a0ah
and eax, 2047 and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm3 movhpd qword ptr [rsi + rax * 8], xmm3
rx_i_228: ;CALL rx_i_228: ;CALL
dec ebx dec ebx
@ -4313,11 +4303,16 @@ rx_i_234: ;FPROUND
rx_body_234: rx_body_234:
and ecx, 2047 and ecx, 2047
mov rax, qword ptr [rsi+rcx*8] mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13 shl eax, 13
and eax, 24576 and eax, 24576
or eax, 40896 or eax, 40896
mov dword ptr [rsp - 8], eax mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8] ldmxcsr dword ptr [rsp - 8]
mov eax, r12d
xor eax, 04d2e9e7dh
and eax, 2047
mov qword ptr [rsi + rax * 8], rcx
rx_i_235: ;IMUL_32 rx_i_235: ;IMUL_32
dec ebx dec ebx
@ -4438,7 +4433,7 @@ rx_body_241:
mov eax, r15d mov eax, r15d
xor eax, 0bc2423ebh xor eax, 0bc2423ebh
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm7 movlpd qword ptr [rsi + rax * 8], xmm7
rx_i_242: ;MULH_64 rx_i_242: ;MULH_64
dec ebx dec ebx
@ -4734,7 +4729,7 @@ rx_body_257:
mov eax, r11d mov eax, r11d
xor eax, 0373b1b6fh xor eax, 0373b1b6fh
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm3 movlpd qword ptr [rsi + rax * 8], xmm3
rx_i_258: ;MUL_32 rx_i_258: ;MUL_32
dec ebx dec ebx
@ -4771,7 +4766,7 @@ rx_body_259:
addpd xmm0, xmm9 addpd xmm0, xmm9
movaps xmm3, xmm0 movaps xmm3, xmm0
rx_i_260: ;FPMUL rx_i_260: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r13, 0f94e9fa9h xor r13, 0f94e9fa9h
@ -4783,10 +4778,7 @@ rx_body_260:
xor rbp, rcx xor rbp, rcx
and ecx, 32767 and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm5 subpd xmm0, xmm5
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm9, xmm0 movaps xmm9, xmm0
rx_i_261: ;FPSQRT rx_i_261: ;FPSQRT
@ -4806,7 +4798,7 @@ rx_body_261:
mov eax, r11d mov eax, r11d
xor eax, 0745a48e9h xor eax, 0745a48e9h
and eax, 2047 and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm3 movhpd qword ptr [rsi + rax * 8], xmm3
rx_i_262: ;OR_32 rx_i_262: ;OR_32
dec ebx dec ebx
@ -5044,7 +5036,7 @@ rx_body_274:
mov eax, r14d mov eax, r14d
xor eax, 06a2b2b5bh xor eax, 06a2b2b5bh
and eax, 2047 and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm6 movhpd qword ptr [rsi + rax * 8], xmm6
rx_i_275: ;OR_64 rx_i_275: ;OR_64
dec ebx dec ebx
@ -5121,7 +5113,7 @@ rx_body_278:
mov eax, r12d mov eax, r12d
xor eax, 02d00ad10h xor eax, 02d00ad10h
and eax, 2047 and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm4 movhpd qword ptr [rsi + rax * 8], xmm4
rx_i_279: ;FPSUB rx_i_279: ;FPSUB
dec ebx dec ebx
@ -5139,7 +5131,7 @@ rx_body_279:
mov eax, r9d mov eax, r9d
xor eax, 0475ade01h xor eax, 0475ade01h
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm9 movlpd qword ptr [rsi + rax * 8], xmm9
rx_i_280: ;AND_64 rx_i_280: ;AND_64
dec ebx dec ebx
@ -5210,7 +5202,7 @@ rx_body_283:
and eax, 2047 and eax, 2047
mov qword ptr [rsi + rax * 8], rcx mov qword ptr [rsi + rax * 8], rcx
rx_i_284: ;FPSUB rx_i_284: ;FPADD
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r15, 0e68f36ach xor r15, 0e68f36ach
@ -5222,7 +5214,7 @@ rx_body_284:
xor rbp, rcx xor rbp, rcx
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
subpd xmm0, xmm6 addpd xmm0, xmm6
movaps xmm9, xmm0 movaps xmm9, xmm0
mov eax, r9d mov eax, r9d
xor eax, 0936f2960h xor eax, 0936f2960h
@ -5313,7 +5305,7 @@ rx_body_289:
andps xmm0, xmm1 andps xmm0, xmm1
movaps xmm8, xmm0 movaps xmm8, xmm0
rx_i_290: ;FPMUL rx_i_290: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r15, 060665748h xor r15, 060665748h
@ -5324,10 +5316,7 @@ rx_i_290: ;FPMUL
rx_body_290: rx_body_290:
and ecx, 32767 and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm8 subpd xmm0, xmm8
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm9, xmm0 movaps xmm9, xmm0
rx_i_291: ;RET rx_i_291: ;RET
@ -5531,7 +5520,7 @@ rx_body_301:
mov eax, r15d mov eax, r15d
xor eax, 0433cf2d6h xor eax, 0433cf2d6h
and eax, 32767 and eax, 32767
movlpd qword ptr [rsi + rax * 8], xmm7 movhpd qword ptr [rsi + rax * 8], xmm7
rx_i_302: ;ADD_64 rx_i_302: ;ADD_64
dec ebx dec ebx
@ -5937,7 +5926,7 @@ rx_body_324:
mov eax, r9d mov eax, r9d
xor eax, 0944856d4h xor eax, 0944856d4h
and eax, 32767 and eax, 32767
movlpd qword ptr [rsi + rax * 8], xmm9 movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_325: ;SHL_64 rx_i_325: ;SHL_64
dec ebx dec ebx
@ -6076,7 +6065,7 @@ rx_body_332:
mov eax, r11d mov eax, r11d
xor eax, 0116c919eh xor eax, 0116c919eh
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm3 movlpd qword ptr [rsi + rax * 8], xmm3
rx_i_333: ;XOR_64 rx_i_333: ;XOR_64
dec ebx dec ebx
@ -6222,7 +6211,7 @@ rx_body_341:
and eax, 2047 and eax, 2047
mov qword ptr [rsi + rax * 8], rcx mov qword ptr [rsi + rax * 8], rcx
rx_i_342: ;FPMUL rx_i_342: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r9, 09ccc7abah xor r9, 09ccc7abah
@ -6233,10 +6222,7 @@ rx_i_342: ;FPMUL
rx_body_342: rx_body_342:
and ecx, 32767 and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm2 subpd xmm0, xmm2
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm3, xmm0 movaps xmm3, xmm0
rx_i_343: ;SHR_64 rx_i_343: ;SHR_64
@ -6258,7 +6244,7 @@ rx_body_343:
and eax, 32767 and eax, 32767
mov qword ptr [rsi + rax * 8], rcx mov qword ptr [rsi + rax * 8], rcx
rx_i_344: ;FPMUL rx_i_344: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r10, 03ef9bcc4h xor r10, 03ef9bcc4h
@ -6269,10 +6255,7 @@ rx_i_344: ;FPMUL
rx_body_344: rx_body_344:
and ecx, 32767 and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm6 subpd xmm0, xmm6
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm5, xmm0 movaps xmm5, xmm0
rx_i_345: ;MULH_64 rx_i_345: ;MULH_64
@ -6343,7 +6326,7 @@ rx_body_348:
mov eax, r9d mov eax, r9d
xor eax, 039c35461h xor eax, 039c35461h
and eax, 2047 and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm9 movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_349: ;XOR_32 rx_i_349: ;XOR_32
dec ebx dec ebx
@ -6413,9 +6396,9 @@ rx_body_352:
mov eax, r10d mov eax, r10d
xor eax, 03bf686f2h xor eax, 03bf686f2h
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm2 movlpd qword ptr [rsi + rax * 8], xmm2
rx_i_353: ;FPMUL rx_i_353: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r13, 02e65278bh xor r13, 02e65278bh
@ -6426,15 +6409,12 @@ rx_i_353: ;FPMUL
rx_body_353: rx_body_353:
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm2 subpd xmm0, xmm2
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm7, xmm0 movaps xmm7, xmm0
mov eax, r15d mov eax, r15d
xor eax, 0b3c9f7aeh xor eax, 0b3c9f7aeh
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm7 movlpd qword ptr [rsi + rax * 8], xmm7
rx_i_354: ;MULH_64 rx_i_354: ;MULH_64
dec ebx dec ebx
@ -6535,7 +6515,7 @@ rx_body_359:
mov eax, r12d mov eax, r12d
xor eax, 0f16b9be3h xor eax, 0f16b9be3h
and eax, 32767 and eax, 32767
movhpd qword ptr [rsi + rax * 8], xmm4 movlpd qword ptr [rsi + rax * 8], xmm4
rx_i_360: ;FPMUL rx_i_360: ;FPMUL
dec ebx dec ebx
@ -6570,7 +6550,7 @@ rx_body_361:
mov eax, r14d mov eax, r14d
xor eax, 0ad0b81f5h xor eax, 0ad0b81f5h
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm6 movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_362: ;SUB_64 rx_i_362: ;SUB_64
dec ebx dec ebx
@ -6726,7 +6706,7 @@ rx_body_370:
mov eax, r14d mov eax, r14d
xor eax, 0a120e0edh xor eax, 0a120e0edh
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm6 movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_371: ;FPADD rx_i_371: ;FPADD
dec ebx dec ebx
@ -6948,7 +6928,7 @@ rx_body_383:
mov eax, r13d mov eax, r13d
xor eax, 0c9f5cc22h xor eax, 0c9f5cc22h
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm5 movlpd qword ptr [rsi + rax * 8], xmm5
rx_i_384: ;SHR_64 rx_i_384: ;SHR_64
dec ebx dec ebx
@ -7256,7 +7236,7 @@ rx_body_400:
and eax, 32767 and eax, 32767
mov qword ptr [rsi + rax * 8], rcx mov qword ptr [rsi + rax * 8], rcx
rx_i_401: ;FPMUL rx_i_401: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r13, 032e81f25h xor r13, 032e81f25h
@ -7267,15 +7247,12 @@ rx_i_401: ;FPMUL
rx_body_401: rx_body_401:
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm4 subpd xmm0, xmm4
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm6, xmm0 movaps xmm6, xmm0
mov eax, r14d mov eax, r14d
xor eax, 03ea60344h xor eax, 03ea60344h
and eax, 32767 and eax, 32767
movhpd qword ptr [rsi + rax * 8], xmm6 movlpd qword ptr [rsi + rax * 8], xmm6
rx_i_402: ;RET rx_i_402: ;RET
dec ebx dec ebx
@ -7382,13 +7359,15 @@ rx_i_406: ;FPROUND
rx_body_406: rx_body_406:
and ecx, 32767 and ecx, 32767
mov rax, qword ptr [rsi+rcx*8] mov rax, qword ptr [rsi+rcx*8]
mov rcx, rax
shl eax, 13 shl eax, 13
and eax, 24576 and eax, 24576
or eax, 40896 or eax, 40896
mov dword ptr [rsp - 8], eax mov dword ptr [rsp - 8], eax
ldmxcsr dword ptr [rsp - 8] ldmxcsr dword ptr [rsp - 8]
mov r9, rcx
rx_i_407: ;FPMUL rx_i_407: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r14, 09699566fh xor r14, 09699566fh
@ -7400,10 +7379,7 @@ rx_body_407:
xor rbp, rcx xor rbp, rcx
and ecx, 32767 and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm9 subpd xmm0, xmm9
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm8, xmm0 movaps xmm8, xmm0
rx_i_408: ;MUL_64 rx_i_408: ;MUL_64
@ -7493,7 +7469,7 @@ rx_body_412:
mov eax, r11d mov eax, r11d
xor eax, 0bbd2640ah xor eax, 0bbd2640ah
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm3 movlpd qword ptr [rsi + rax * 8], xmm3
rx_i_413: ;FPDIV rx_i_413: ;FPDIV
dec ebx dec ebx
@ -7704,7 +7680,7 @@ rx_body_424:
mov eax, r9d mov eax, r9d
xor eax, 0565ae8aah xor eax, 0565ae8aah
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm9 movlpd qword ptr [rsi + rax * 8], xmm9
rx_i_425: ;IMUL_32 rx_i_425: ;IMUL_32
dec ebx dec ebx
@ -7887,7 +7863,7 @@ rx_body_434:
mov eax, r9d mov eax, r9d
xor eax, 08c1cfc74h xor eax, 08c1cfc74h
and eax, 2047 and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm9 movhpd qword ptr [rsi + rax * 8], xmm9
rx_i_435: ;MUL_64 rx_i_435: ;MUL_64
dec ebx dec ebx
@ -8068,7 +8044,7 @@ not_taken_ret_443:
and eax, 2047 and eax, 2047
mov qword ptr [rsi + rax * 8], rcx mov qword ptr [rsi + rax * 8], rcx
rx_i_444: ;FPMUL rx_i_444: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r8, 042455dd8h xor r8, 042455dd8h
@ -8079,15 +8055,12 @@ rx_i_444: ;FPMUL
rx_body_444: rx_body_444:
and ecx, 32767 and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm7 subpd xmm0, xmm7
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm5, xmm0 movaps xmm5, xmm0
mov eax, r13d mov eax, r13d
xor eax, 0ce416070h xor eax, 0ce416070h
and eax, 2047 and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm5 movhpd qword ptr [rsi + rax * 8], xmm5
rx_i_445: ;ADD_64 rx_i_445: ;ADD_64
dec ebx dec ebx
@ -8128,7 +8101,7 @@ rx_body_446:
and eax, 2047 and eax, 2047
mov qword ptr [rsi + rax * 8], rcx mov qword ptr [rsi + rax * 8], rcx
rx_i_447: ;FPSUB rx_i_447: ;FPADD
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r8, 01596d0e8h xor r8, 01596d0e8h
@ -8139,12 +8112,12 @@ rx_i_447: ;FPSUB
rx_body_447: rx_body_447:
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
subpd xmm0, xmm7 addpd xmm0, xmm7
movaps xmm5, xmm0 movaps xmm5, xmm0
mov eax, r13d mov eax, r13d
xor eax, 0b384d4afh xor eax, 0b384d4afh
and eax, 2047 and eax, 2047
movhpd qword ptr [rsi + rax * 8], xmm5 movlpd qword ptr [rsi + rax * 8], xmm5
rx_i_448: ;FPSUB rx_i_448: ;FPSUB
dec ebx dec ebx
@ -8668,7 +8641,7 @@ rx_body_477:
mov eax, r14d mov eax, r14d
xor eax, 0e81fc7a6h xor eax, 0e81fc7a6h
and eax, 2047 and eax, 2047
movlpd qword ptr [rsi + rax * 8], xmm6 movhpd qword ptr [rsi + rax * 8], xmm6
rx_i_478: ;MUL_64 rx_i_478: ;MUL_64
dec ebx dec ebx
@ -9143,7 +9116,7 @@ rx_body_504:
and eax, 32767 and eax, 32767
movhpd qword ptr [rsi + rax * 8], xmm4 movhpd qword ptr [rsi + rax * 8], xmm4
rx_i_505: ;FPMUL rx_i_505: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r12, 032c0a28ah xor r12, 032c0a28ah
@ -9154,17 +9127,14 @@ rx_i_505: ;FPMUL
rx_body_505: rx_body_505:
and ecx, 32767 and ecx, 32767
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm4 subpd xmm0, xmm4
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm8, xmm0 movaps xmm8, xmm0
mov eax, r8d mov eax, r8d
xor eax, 021b54eaeh xor eax, 021b54eaeh
and eax, 32767 and eax, 32767
movlpd qword ptr [rsi + rax * 8], xmm8 movhpd qword ptr [rsi + rax * 8], xmm8
rx_i_506: ;FPMUL rx_i_506: ;FPSUB
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r9, 0a973d58ch xor r9, 0a973d58ch
@ -9175,10 +9145,7 @@ rx_i_506: ;FPMUL
rx_body_506: rx_body_506:
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
mulpd xmm0, xmm9 subpd xmm0, xmm9
movaps xmm1, xmm0
cmpeqpd xmm1, xmm1
andps xmm0, xmm1
movaps xmm3, xmm0 movaps xmm3, xmm0
rx_i_507: ;RET rx_i_507: ;RET
@ -9238,7 +9205,7 @@ taken_call_509:
push rax push rax
call rx_i_42 call rx_i_42
rx_i_510: ;FPSUB rx_i_510: ;FPADD
dec ebx dec ebx
jz rx_finish jz rx_finish
xor r8, 0db65513ch xor r8, 0db65513ch
@ -9249,7 +9216,7 @@ rx_i_510: ;FPSUB
rx_body_510: rx_body_510:
and ecx, 2047 and ecx, 2047
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] cvtdq2pd xmm0, qword ptr [rsi+rcx*8]
subpd xmm0, xmm2 addpd xmm0, xmm2
movaps xmm9, xmm0 movaps xmm9, xmm0
rx_i_511: ;ROL_64 rx_i_511: ;ROL_64

View file

@ -74,21 +74,21 @@ void setPrivilege(const char* pszPrivilege, BOOL bEnable) {
} }
#endif #endif
void* allocExecutableMemory(size_t bytes) { void* allocExecutableMemory(std::size_t bytes) {
void* mem; void* mem;
#ifdef _WIN32 #ifdef _WIN32
mem = VirtualAlloc(nullptr, bytes, MEM_COMMIT, PAGE_EXECUTE_READWRITE); mem = VirtualAlloc(nullptr, bytes, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
if (mem == nullptr) if (mem == nullptr)
throw std::runtime_error(getErrorMessage("allocExecutableMemory - VirtualAlloc")); throw std::runtime_error(getErrorMessage("allocExecutableMemory - VirtualAlloc"));
#else #else
mem = mmap(nullptr, CodeSize, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
if (mem == MAP_FAILED) if (mem == MAP_FAILED)
throw std::runtime_error("allocExecutableMemory - mmap failed"); throw std::runtime_error("allocExecutableMemory - mmap failed");
#endif #endif
return mem; return mem;
} }
void* allocLargePagesMemory(size_t bytes) { void* allocLargePagesMemory(std::size_t bytes) {
void* mem; void* mem;
#ifdef _WIN32 #ifdef _WIN32
setPrivilege("SeLockMemoryPrivilege", 1); setPrivilege("SeLockMemoryPrivilege", 1);

View file

@ -19,5 +19,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#pragma once #pragma once
void* allocExecutableMemory(size_t); #include <cstddef>
void* allocLargePagesMemory(size_t);
void* allocExecutableMemory(std::size_t);
void* allocLargePagesMemory(std::size_t);