diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp
index f1c3de8..11bb3f0 100644
--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@@ -75,6 +75,11 @@ namespace RandomX {
asmCode << "\tand " << reg << ", " << ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl;
}
+ // Writes assembly text that copies the regR32 entry for instr.dst into eax and
+ // masks it into a scratchpad offset. The L1 mask is chosen when (alt % 4) is
+ // non-zero, the L2 mask otherwise; and-ing with (-maskAlign) additionally clears
+ // the low bits of the mask (for a power-of-two maskAlign).
+ void AssemblyGeneratorX86::genAddressRegDst(Instruction& instr, int maskAlign = 8) {
+ const auto alignedL1 = ScratchpadL1Mask & (-maskAlign);
+ const auto alignedL2 = ScratchpadL2Mask & (-maskAlign);
+ asmCode << "\tmov eax" << ", " << regR32[instr.dst] << std::endl;
+ asmCode << "\tand eax" << ", ";
+ if (instr.alt % 4)
+ asmCode << alignedL1;
+ else
+ asmCode << alignedL2;
+ asmCode << std::endl;
+ }
+
int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) {
return instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask);
}
@@ -425,7 +430,7 @@ namespace RandomX {
//6 uOPs
void AssemblyGeneratorX86::h_CFROUND(Instruction& instr, int i) {
- asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
+ asmCode << "\tmov rax, " << regR[instr.src] << std::endl;
int rotate = (13 - (instr.alt & 63)) & 63;
if (rotate != 0)
asmCode << "\trol rax, " << rotate << std::endl;
@@ -474,6 +479,18 @@ namespace RandomX {
asmCode << "\tadd " << regR[instr.dst] << ", rcx" << std::endl;
}
+ //3 uOPs
+ // ISTORE: stores the integer register named by src as a qword at [rsi+rax],
+ // where eax is the masked address produced from dst (default maskAlign).
+ void AssemblyGeneratorX86::h_ISTORE(Instruction& instr, int i) {
+ genAddressRegDst(instr);
+ const auto& sourceName = regR[instr.src];
+ asmCode << "\tmov qword ptr [rsi+rax], " << sourceName << std::endl;
+ }
+
+ //3 uOPs
+ // FSTORE: stores the regFE entry for src as an xmmword at [rsi+rax].
+ // movapd needs a 16-byte aligned memory operand, hence maskAlign = 16.
+ void AssemblyGeneratorX86::h_FSTORE(Instruction& instr, int i) {
+ const int storeAlignment = 16;
+ genAddressRegDst(instr, storeAlignment);
+ const auto& sourceName = regFE[instr.src];
+ asmCode << "\tmovapd xmmword ptr [rsi+rax], " << sourceName << std::endl;
+ }
+
#include "instructionWeights.hpp"
#define INST_HANDLE(x) REPN(&AssemblyGeneratorX86::h_##x, WT(x))
@@ -520,5 +537,8 @@ namespace RandomX {
INST_HANDLE(COND_R)
INST_HANDLE(COND_M)
INST_HANDLE(CFROUND)
+
+ INST_HANDLE(ISTORE)
+ INST_HANDLE(FSTORE)
};
}
\ No newline at end of file
diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp
index 2d3c9a6..5c22142 100644
--- a/src/AssemblyGeneratorX86.hpp
+++ b/src/AssemblyGeneratorX86.hpp
@@ -38,16 +38,8 @@ namespace RandomX {
static InstructionGenerator engine[256];
std::stringstream asmCode;
- void gena(Instruction&, int);
- void genar(Instruction&, int);
- void genaf(Instruction&, int);
- void genbiashift(Instruction&, const char*);
- void genbia(Instruction&);
- void genbia32(Instruction&);
- void genbf(Instruction&, const char*);
- void gencr(Instruction&, bool);
- void gencf(Instruction&, bool);
void genAddressReg(Instruction&, const char*);
+ void genAddressRegDst(Instruction&, int);
int32_t genAddressImm(Instruction&);
void generateCode(Instruction&, int);
@@ -85,5 +77,7 @@ namespace RandomX {
void h_COND_R(Instruction&, int);
void h_COND_M(Instruction&, int);
void h_CFROUND(Instruction&, int);
+ void h_ISTORE(Instruction&, int);
+ void h_FSTORE(Instruction&, int);
};
}
\ No newline at end of file
diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp
index f0a63d1..f5d33d0 100644
--- a/src/CompiledVirtualMachine.cpp
+++ b/src/CompiledVirtualMachine.cpp
@@ -71,14 +71,14 @@ namespace RandomX {
reg.a[i].hi.u64 = getSmallPositiveFloatBits(reg.f[i].hi.u64);
}
compiler.generateProgram(gen);
- mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7;
+ mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & -64;
mem.mx = *(((uint32_t*)seed) + 5);
}
void CompiledVirtualMachine::execute() {
- executeProgram(reg, mem, scratchpad, InstructionCount);
+ //executeProgram(reg, mem, scratchpad, InstructionCount);
totalSize += compiler.getCodeSize();
- //compiler.getProgramFunc()(reg, mem, scratchpad);
+ compiler.getProgramFunc()(reg, mem, scratchpad, InstructionCount);
#ifdef TRACEVM
for (int32_t i = InstructionCount - 1; i >= 0; --i) {
std::cout << std::hex << tracepad[i].u64 << std::endl;
diff --git a/src/Instruction.cpp b/src/Instruction.cpp
index c766ffd..13cfc1d 100644
--- a/src/Instruction.cpp
+++ b/src/Instruction.cpp
@@ -32,6 +32,10 @@ namespace RandomX {
os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)src << "]";
}
+ // Prints the scratchpad operand addressed through the destination register,
+ // e.g. "L1[r3]": "L1" when (alt % 4) is non-zero, "L2" otherwise.
+ void Instruction::genAddressRegDst(std::ostream& os) const {
+ const char* level = "L2";
+ if (alt % 4)
+ level = "L1";
+ os << level << "[r" << static_cast<int>(dst) << "]";
+ }
+
void Instruction::genAddressImm(std::ostream& os) const {
os << ((alt % 4) ? "L1" : "L2") << "[" << (imm32 & ((alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]";
}
@@ -276,7 +280,7 @@ namespace RandomX {
}
void Instruction::h_CFROUND(std::ostream& os) const {
- os << "r" << (int)dst << ", " << (alt & 63) << std::endl;
+ os << "r" << (int)src << ", " << (alt & 63) << std::endl;
}
static inline const char* condition(int index) {
@@ -311,6 +315,18 @@ namespace RandomX {
os << ", " << imm32 << ")" << std::endl;
}
+ // Prints the ISTORE operands: the dst-addressed scratchpad location, then ", r<src>".
+ void Instruction::h_ISTORE(std::ostream& os) const {
+ const int sourceIndex = static_cast<int>(src);
+ genAddressRegDst(os);
+ os << ", r" << sourceIndex << std::endl;
+ }
+
+ // Prints the FSTORE operands: the dst-addressed scratchpad location, then the
+ // source register written as 'f' (src < 4) or 'e' (src >= 4) followed by src % 4.
+ void Instruction::h_FSTORE(std::ostream& os) const {
+ genAddressRegDst(os);
+ os << ", " << ((src >= 4) ? 'e' : 'f') << (src % 4) << std::endl;
+ }
+
#include "instructionWeights.hpp"
#define INST_NAME(x) REPN(#x, WT(x))
#define INST_HANDLE(x) REPN(&Instruction::h_##x, WT(x))
@@ -358,6 +374,9 @@ namespace RandomX {
INST_NAME(COND_R)
INST_NAME(COND_M)
INST_NAME(CFROUND)
+
+ INST_NAME(ISTORE)
+ INST_NAME(FSTORE)
};
InstructionVisualizer Instruction::engine[256] = {
@@ -403,6 +422,9 @@ namespace RandomX {
INST_HANDLE(COND_R)
INST_HANDLE(COND_M)
INST_HANDLE(CFROUND)
+
+ INST_HANDLE(ISTORE)
+ INST_HANDLE(FSTORE)
};
}
\ No newline at end of file
diff --git a/src/Instruction.hpp b/src/Instruction.hpp
index becb983..017d92f 100644
--- a/src/Instruction.hpp
+++ b/src/Instruction.hpp
@@ -49,6 +49,7 @@ namespace RandomX {
void genAddressReg(std::ostream& os) const;
void genAddressImm(std::ostream& os) const;
+ void genAddressRegDst(std::ostream&) const;
void h_IADD_R(std::ostream&) const;
void h_IADD_M(std::ostream&) const;
@@ -83,6 +84,8 @@ namespace RandomX {
void h_COND_R(std::ostream&) const;
void h_COND_M(std::ostream&) const;
void h_CFROUND(std::ostream&) const;
+ void h_ISTORE(std::ostream&) const;
+ void h_FSTORE(std::ostream&) const;
};
static_assert(sizeof(Instruction) == 8, "Invalid alignment of struct Instruction");
diff --git a/src/JitCompilerX86-static.S b/src/JitCompilerX86-static.S
index e0e8f62..a799e11 100644
--- a/src/JitCompilerX86-static.S
+++ b/src/JitCompilerX86-static.S
@@ -27,11 +27,16 @@
#define DECL(x) x
#endif
.global DECL(randomx_program_prologue)
-.global DECL(randomx_program_begin)
+.global DECL(randomx_loop_begin)
+.global DECL(randomx_program_load_int)
+.global DECL(randomx_program_load_flt)
+.global DECL(randomx_program_start)
+.global DECL(randomx_program_read_dataset)
+.global DECL(randomx_program_store_int)
+.global DECL(randomx_program_store_flt)
+.global DECL(randomx_program_loop_end)
.global DECL(randomx_program_epilogue)
-.global DECL(randomx_program_read)
.global DECL(randomx_program_end)
-.global DECL(randomx_program_transform)
#define db .byte
@@ -40,21 +45,37 @@ DECL(randomx_program_prologue):
#include "asm/program_prologue_linux.inc"
.align 64
-DECL(randomx_program_begin):
+ #include "asm/program_xmm_constants.inc"
+
+.align 64
+DECL(randomx_loop_begin):
+ nop
+
+DECL(randomx_program_load_int):
+ #include "asm/program_load_int.inc"
+
+DECL(randomx_program_load_flt):
+ #include "asm/program_load_flt.inc"
+
+DECL(randomx_program_start):
+ nop
+
+DECL(randomx_program_read_dataset):
+ #include "asm/program_read_dataset.inc"
+
+DECL(randomx_program_store_int):
+ #include "asm/program_store_int.inc"
+
+DECL(randomx_program_store_flt):
+ #include "asm/program_store_flt.inc"
+
+DECL(randomx_program_loop_end):
nop
.align 64
DECL(randomx_program_epilogue):
#include "asm/program_epilogue_linux.inc"
-.align 64
-DECL(randomx_program_read):
- #include "asm/program_read.inc"
-
.align 64
DECL(randomx_program_end):
nop
-
-.align 8
-DECL(randomx_program_transform):
- #include "asm/program_transform_address.inc"
diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm
index 031c2e4..8d5a4fe 100644
--- a/src/JitCompilerX86-static.asm
+++ b/src/JitCompilerX86-static.asm
@@ -20,12 +20,16 @@ IFDEF RAX
_RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
PUBLIC randomx_program_prologue
-PUBLIC randomx_program_begin
+PUBLIC randomx_loop_begin
+PUBLIC randomx_program_load_int
+PUBLIC randomx_program_load_flt
+PUBLIC randomx_program_start
+PUBLIC randomx_program_read_dataset
+PUBLIC randomx_program_store_int
+PUBLIC randomx_program_store_flt
+PUBLIC randomx_program_loop_end
PUBLIC randomx_program_epilogue
-PUBLIC randomx_program_read
PUBLIC randomx_program_end
-PUBLIC randomx_program_transform
-
ALIGN 64
randomx_program_prologue PROC
@@ -33,30 +37,51 @@ randomx_program_prologue PROC
randomx_program_prologue ENDP
ALIGN 64
-randomx_program_begin PROC
+ include asm/program_xmm_constants.inc
+
+ALIGN 64
+randomx_loop_begin PROC
nop
-randomx_program_begin ENDP
+randomx_loop_begin ENDP
+
+randomx_program_load_int PROC
+ include asm/program_load_int.inc
+randomx_program_load_int ENDP
+
+randomx_program_load_flt PROC
+ include asm/program_load_flt.inc
+randomx_program_load_flt ENDP
+
+randomx_program_start PROC
+ nop
+randomx_program_start ENDP
+
+randomx_program_read_dataset PROC
+ include asm/program_read_dataset.inc
+randomx_program_read_dataset ENDP
+
+randomx_program_store_int PROC
+ include asm/program_store_int.inc
+randomx_program_store_int ENDP
+
+randomx_program_store_flt PROC
+ include asm/program_store_flt.inc
+randomx_program_store_flt ENDP
+
+randomx_program_loop_end PROC
+ nop
+randomx_program_loop_end ENDP
ALIGN 64
randomx_program_epilogue PROC
include asm/program_epilogue_win64.inc
randomx_program_epilogue ENDP
-ALIGN 64
-randomx_program_read PROC
- include asm/program_read.inc
-randomx_program_read ENDP
-
ALIGN 64
randomx_program_end PROC
nop
randomx_program_end ENDP
-ALIGN 8
-randomx_program_transform PROC
- include asm/program_transform_address.inc
-randomx_program_transform ENDP
-
_RANDOMX_JITX86_STATIC ENDS
ENDIF
diff --git a/src/JitCompilerX86-static.hpp b/src/JitCompilerX86-static.hpp
index e72244a..df5cd28 100644
--- a/src/JitCompilerX86-static.hpp
+++ b/src/JitCompilerX86-static.hpp
@@ -18,10 +18,15 @@ along with RandomX. If not, see.
*/
extern "C" {
- void randomx_program_prologue();
- void randomx_program_begin();
- void randomx_program_epilogue();
- void randomx_program_transform();
- void randomx_program_read();
- void randomx_program_end();
+ void randomx_program_prologue();
+ void randomx_loop_begin();
+ void randomx_program_load_int();
+ void randomx_program_load_flt();
+ void randomx_program_start();
+ void randomx_program_read_dataset();
+ void randomx_program_store_int();
+ void randomx_program_store_flt();
+ void randomx_program_loop_end();
+ void randomx_program_epilogue();
+ void randomx_program_end();
}
\ No newline at end of file
diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp
index 8776d61..e001464 100644
--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
@@ -38,7 +38,7 @@ along with RandomX. If not, see.
namespace RandomX {
-#if true || !defined(_M_X64) && !defined(__x86_64__)
+#if !defined(_M_X64) && !defined(__x86_64__)
JitCompilerX86::JitCompilerX86() {
//throw std::runtime_error("JIT compiler only supports x86-64 CPUs");
}
@@ -53,69 +53,132 @@ namespace RandomX {
#else
/*
- REGISTER ALLOCATION:
- rax -> temporary
- rbx -> "ic"
- rcx -> temporary
- rdx -> temporary
- rsi -> convertible_t* scratchpad
- rdi -> beginning of VM stack
- rbp -> "ma", "mx"
- rsp -> end of VM stack
- r8 -> "r0"
- r9 -> "r1"
- r10 -> "r2"
- r11 -> "r3"
- r12 -> "r4"
- r13 -> "r5"
- r14 -> "r6"
- r15 -> "r7"
- xmm0 -> temporary
- xmm1 -> temporary
- xmm2 -> "f2"
- xmm3 -> "f3"
- xmm4 -> "f4"
- xmm5 -> "f5"
- xmm6 -> "f6"
- xmm7 -> "f7"
- xmm8 -> "f0"
- xmm9 -> "f1"
- xmm10 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff
+ REGISTER ALLOCATION:
- STACK STRUCTURE:
-
- |
- |
- | saved registers
- |
- v
- [rdi+8] RegisterFile& registerFile
- [rdi] uint8_t* dataset
- |
- |
- | VM stack
- |
- v
- [rsp] last element of VM stack
+ ; rax -> temporary
+ ; rbx -> loop counter "lc"
+ ; rcx -> temporary
+ ; rdx -> temporary
+ ; rsi -> scratchpad pointer
+ ; rdi -> dataset pointer
+ ; rbp -> memory registers "ma" (high 32 bits), "mx" (low 32 bits)
+ ; rsp -> stack pointer
+ ; r8 -> "r0"
+ ; r9 -> "r1"
+ ; r10 -> "r2"
+ ; r11 -> "r3"
+ ; r12 -> "r4"
+ ; r13 -> "r5"
+ ; r14 -> "r6"
+ ; r15 -> "r7"
+ ; xmm0 -> "f0"
+ ; xmm1 -> "f1"
+ ; xmm2 -> "f2"
+ ; xmm3 -> "f3"
+ ; xmm4 -> "e0"
+ ; xmm5 -> "e1"
+ ; xmm6 -> "e2"
+ ; xmm7 -> "e3"
+ ; xmm8 -> "a0"
+ ; xmm9 -> "a1"
+ ; xmm10 -> "a2"
+ ; xmm11 -> "a3"
+ ; xmm12 -> temporary
+ ; xmm13 -> DBL_MIN
+ ; xmm14 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff
+ ; xmm15 -> sign mask 0x80000000000000008000000000000000
*/
#include "JitCompilerX86-static.hpp"
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
- const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin;
+ const uint8_t* codeLoopBegin = (uint8_t*)&randomx_loop_begin;
+ const uint8_t* codeLoadInt = (uint8_t*)&randomx_program_load_int;
+ const uint8_t* codeLoadFlt = (uint8_t*)&randomx_program_load_flt;
+ const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start;
+ const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset;
+ const uint8_t* codeStoreInt = (uint8_t*)&randomx_program_store_int;
+ const uint8_t* codeStoreFlt = (uint8_t*)&randomx_program_store_flt;
+ const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end;
const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
- const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read;
const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
- const uint32_t* addressTransformations = (uint32_t*)&randomx_program_transform;
- const int32_t prologueSize = codeProgramBegin - codePrologue;
- const int32_t epilogueSize = codeReadDataset - codeEpilogue;
- const int32_t readDatasetSize = codeProgramEnd - codeReadDataset;
+ const int32_t prologueSize = codeLoopBegin - codePrologue;
+ const int32_t epilogueSize = codeProgramEnd - codeEpilogue;
- const int32_t readDatasetOffset = CodeSize - readDatasetSize;
- const int32_t epilogueOffset = readDatasetOffset - epilogueSize;
+ const int32_t loadIntSize = codeLoadFlt - codeLoadInt;
+ const int32_t loadFltSize = codeProgamStart - codeLoadFlt;
+ const int32_t readDatasetSize = codeStoreInt - codeReadDataset;
+ const int32_t storeIntSize = codeStoreFlt - codeStoreInt;
+ const int32_t storeFltSize = codeLoopEnd - codeStoreFlt;
+
+ const int32_t epilogueOffset = CodeSize - epilogueSize;
+
+ static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 };
+ static const uint8_t REX_ADD_RM[] = { 0x4c, 0x03 };
+ static const uint8_t REX_SUB_RR[] = { 0x4d, 0x2b };
+ static const uint8_t REX_SUB_RM[] = { 0x4c, 0x2b };
+ static const uint8_t REX_MOV_RR[] = { 0x41, 0x8b };
+ static const uint8_t REX_MOV_RR64[] = { 0x49, 0x8b };
+ static const uint8_t REX_MOV_R64R[] = { 0x4c, 0x8b };
+ static const uint8_t REX_IMUL_RR[] = { 0x4d, 0x0f, 0xaf };
+ static const uint8_t REX_IMUL_RRI[] = { 0x4d, 0x69 };
+ static const uint8_t REX_IMUL_RM[] = { 0x4c, 0x0f, 0xaf };
+ static const uint8_t REX_MUL_R[] = { 0x49, 0xf7 };
+ static const uint8_t REX_MUL_M[] = { 0x48, 0xf7 };
+ static const uint8_t REX_81[] = { 0x49, 0x81 };
+ static const uint8_t AND_EAX_I = 0x25;
+ static const uint8_t MOV_EAX_I = 0xb8;
+ static const uint8_t MOV_RAX_I[] = { 0x48, 0xb8 };
+ static const uint8_t MOV_RCX_I[] = { 0x48, 0xb9 };
+ static const uint8_t REX_LEA[] = { 0x4f, 0x8d };
+ static const uint8_t REX_MUL_MEM[] = { 0x48, 0xf7, 0x24, 0x0e };
+ static const uint8_t REX_IMUL_MEM[] = { 0x48, 0xf7, 0x2c, 0x0e };
+ static const uint8_t REX_SHR_RAX[] = { 0x48, 0xc1, 0xe8 };
+ static const uint8_t RAX_ADD_SBB_1[] = { 0x48, 0x83, 0xC0, 0x01, 0x48, 0x83, 0xD8, 0x00 };
+ static const uint8_t MUL_RCX[] = { 0x48, 0xf7, 0xe1 };
+ static const uint8_t REX_SHR_RDX[] = { 0x48, 0xc1, 0xea };
+ static const uint8_t REX_SH[] = { 0x49, 0xc1 };
+ static const uint8_t MOV_RCX_RAX_SAR_RCX_63[] = { 0x48, 0x89, 0xc1, 0x48, 0xc1, 0xf9, 0x3f };
+ static const uint8_t AND_ECX_I[] = { 0x81, 0xe1 };
+ static const uint8_t ADD_RAX_RCX[] = { 0x48, 0x01, 0xC8 };
+ static const uint8_t SAR_RAX_I8[] = { 0x48, 0xC1, 0xF8 };
+ static const uint8_t NEG_RAX[] = { 0x48, 0xF7, 0xD8 };
+ static const uint8_t ADD_R_RAX[] = { 0x49, 0x01 };
+ static const uint8_t XOR_EAX_EAX[] = { 0x31, 0xC0 };
+ static const uint8_t ADD_RDX_R[] = { 0x4c, 0x01 };
+ static const uint8_t SUB_RDX_R[] = { 0x4c, 0x29 };
+ static const uint8_t SAR_RDX_I8[] = { 0x48, 0xC1, 0xFA };
+ static const uint8_t TEST_RDX_RDX[] = { 0x48, 0x85, 0xD2 };
+ static const uint8_t SETS_AL_ADD_RDX_RAX[] = { 0x0F, 0x98, 0xC0, 0x48, 0x01, 0xC2 };
+ static const uint8_t REX_NEG[] = { 0x49, 0xF7 };
+ static const uint8_t REX_XOR_RR[] = { 0x4D, 0x33 };
+ static const uint8_t REX_XOR_RI[] = { 0x49, 0x81 };
+ static const uint8_t REX_XOR_RM[] = { 0x4c, 0x33 };
+ static const uint8_t REX_ROT_CL[] = { 0x49, 0xd3 };
+ static const uint8_t REX_ROT_I8[] = { 0x49, 0xc1 };
+ static const uint8_t SHUFPD[] = { 0x66, 0x0f, 0xc6 };
+ static const uint8_t REX_ADDPD[] = { 0x66, 0x41, 0x0f, 0x58 };
+ static const uint8_t REX_CVTDQ2PD_XMM12[] = { 0xf3, 0x44, 0x0f, 0xe6, 0x24, 0x06 };
+ static const uint8_t REX_SUBPD[] = { 0x66, 0x41, 0x0f, 0x5c };
+ static const uint8_t REX_XORPS[] = { 0x41, 0x0f, 0x57 };
+ static const uint8_t REX_MULPD[] = { 0x66, 0x41, 0x0f, 0x59 };
+ static const uint8_t REX_MAXPD[] = { 0x66, 0x41, 0x0f, 0x5f };
+ static const uint8_t REX_DIVPD[] = { 0x66, 0x41, 0x0f, 0x5e };
+ static const uint8_t SQRTPD[] = { 0x66, 0x0f, 0x51 };
+ static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x89, 0x44, 0x24, 0xF8, 0x0F, 0xAE, 0x54, 0x24, 0xF8 };
+ static const uint8_t ROL_RAX[] = { 0x48, 0xc1, 0xc0 };
+ static const uint8_t XOR_ECX_ECX[] = { 0x33, 0xC9 };
+ static const uint8_t REX_CMP_R32I[] = { 0x41, 0x81 };
+ static const uint8_t REX_CMP_M32I[] = { 0x81, 0x3c, 0x06 };
+ static const uint8_t MOVAPD[] = { 0x66, 0x0f, 0x29 };
+ static const uint8_t REX_MOV_MR[] = { 0x4c, 0x89 };
+ static const uint8_t REX_XOR_EAX[] = { 0x41, 0x33 };
+ static const uint8_t SUB_EBX[] = { 0x83, 0xEB, 0x01 };
+ static const uint8_t JNZ[] = { 0x0f, 0x85 };
+ static const uint8_t JMP = 0xe9;
size_t JitCompilerX86::getCodeSize() {
return codePos - prologueSize + readDatasetSize;
@@ -132,687 +195,613 @@ namespace RandomX {
throw std::runtime_error("mmap failed");
#endif
memcpy(code, codePrologue, prologueSize);
- memcpy(code + CodeSize - epilogueSize - readDatasetSize, codeEpilogue, epilogueSize);
- memcpy(code + CodeSize - readDatasetSize, codeReadDataset, readDatasetSize);
+ memcpy(code + CodeSize - epilogueSize, codeEpilogue, epilogueSize);
}
void JitCompilerX86::generateProgram(Pcg32& gen) {
- instructionOffsets.clear();
- callOffsets.clear();
+ auto addressRegisters = gen(); // one generator output; its low four bits pick the address registers below
+ int readReg1 = addressRegisters & 1; // 0 or 1
+ addressRegisters >>= 1;
+ int readReg2 = 2 + (addressRegisters & 1); // 2 or 3
+ addressRegisters >>= 1;
+ int writeReg1 = 4 + (addressRegisters & 1); // 4 or 5
+ addressRegisters >>= 1;
+ int writeReg2 = 6 + (addressRegisters & 1); // 6 or 7
codePos = prologueSize;
+ emit(REX_XOR_EAX); // 41 33 /r: xor eax, r(8+n)d - first byte of the loop body (offset prologueSize)
+ emitByte(0xc0 + readReg1); // ModRM: reg = eax, rm = readReg1
+ memcpy(code + codePos, codeLoadInt, loadIntSize); // static block between randomx_program_load_int and randomx_program_load_flt
+ codePos += loadIntSize;
+ emit(REX_XOR_EAX); // xor eax, readReg2
+ emitByte(0xc0 + readReg2);
+ memcpy(code + codePos, codeLoadFlt, loadFltSize); // static block between randomx_program_load_flt and randomx_program_start
+ codePos += loadFltSize;
Instruction instr;
for (unsigned i = 0; i < ProgramLength; ++i) {
for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) {
*(((uint32_t*)&instr) + j) = gen();
}
- generateCode(instr, i);
+ instr.src %= RegistersCount; // reduce register indices once here so the handlers can use them directly in ModRM/SIB bytes
+ instr.dst %= RegistersCount;
+ generateCode(instr);
}
- emitByte(0xe9);
- emit(instructionOffsets[0] - (codePos + 4));
- fixCallOffsets();
- uint32_t transform = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
- *reinterpret_cast(code + readDatasetOffset) = transform;
+ emit(REX_MOV_RR); // 41 8B /r: mov eax, readReg1
+ emitByte(0xc0 + readReg1);
+ emit(REX_XOR_EAX); // xor eax, readReg2
+ emitByte(0xc0 + readReg2);
+ memcpy(code + codePos, codeReadDataset, readDatasetSize); // static block between randomx_program_read_dataset and randomx_program_store_int
+ codePos += readDatasetSize;
+ emit(REX_MOV_RR); // mov eax, writeReg1
+ emitByte(0xc0 + writeReg1);
+ memcpy(code + codePos, codeStoreInt, storeIntSize); // static block between randomx_program_store_int and randomx_program_store_flt
+ codePos += storeIntSize;
+ emit(REX_XOR_EAX); // xor eax, writeReg2
+ emitByte(0xc0 + writeReg2);
+ memcpy(code + codePos, codeStoreFlt, storeFltSize); // static block between randomx_program_store_flt and randomx_program_loop_end
+ codePos += storeFltSize;
+ emit(SUB_EBX); // 83 EB 01: sub ebx, 1 (loop counter)
+ emit(JNZ); // 0F 85 rel32: repeat the loop body while the counter is non-zero
+ emit32(prologueSize - codePos - 4); // rel32 is measured from the end of the 4-byte displacement; target is offset prologueSize
+ emitByte(JMP); // E9 rel32: leave the loop
+ emit32(epilogueOffset - codePos - 4); // target is the epilogue copied to the end of the code buffer
+ emitByte(0x90); // nop
}
- void JitCompilerX86::generateCode(Instruction& instr, int i) {
- instructionOffsets.push_back(codePos);
- emit(0x840fcbff); //dec ebx; jz
- emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative)
+ void JitCompilerX86::generateCode(Instruction& instr) {
auto generator = engine[instr.opcode];
- (this->*generator)(instr, i);
+ (this->*generator)(instr);
}
- void JitCompilerX86::fixCallOffsets() {
- for (CallOffset& co : callOffsets) {
- *reinterpret_cast(code + co.pos) = instructionOffsets[co.index] - (co.pos + 4);
- }
+ void JitCompilerX86::genAddressReg(Instruction& instr, bool rax = true) { // emits "mov eax|ecx, src; and eax|ecx, mask": scratchpad offset from the src register (eax when rax is true, ecx otherwise)
+ emit(REX_MOV_RR); // 41 8B /r: mov r32, r(8+n)d
+ emitByte((rax ? 0xc0 : 0xc8) + instr.src); // ModRM: reg = eax (0xc0) or ecx (0xc8), rm = src
+ if (rax)
+ emitByte(AND_EAX_I); // 25 id: and eax, imm32
+ else
+ emit(AND_ECX_I); // 81 E1 id: and ecx, imm32
+ emit32((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); // imm32: L1 mask when (alt % 4) != 0, L2 mask otherwise
}
- void JitCompilerX86::gena(Instruction& instr) {
- emit(uint16_t(0x8149)); //xor
- emitByte(0xf0 + (instr.rega % RegistersCount));
- emit(instr.addra);
- emit(uint16_t(0x8b41)); //mov
- emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
- emit(0x753fc3f6); //test bl,0x3f; jne
- emit(uint16_t(0xe805));
- emit(readDatasetOffset - (codePos + 4));
- if ((instr.loca & 192) == 0) { //A.LOC.X
- emit(uint16_t(0x3348));
- emitByte(0xe8); //xor rbp, rax
- }
- emitByte(0x25); //and eax,
- //if (instr.loca & 15) {
- if (instr.loca & 3) {
- emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
- }
- else {
- emit(ScratchpadL2 - 1); //first 256 KiB of scratchpad
- }
- /*}
- else {
- emit(ScratchpadL3 - 1); //whole scratchpad
- }*/
+ void JitCompilerX86::genAddressRegDst(Instruction& instr, bool align16 = false) { // emits "mov eax, dst; and eax, mask": scratchpad offset from the dst register (used by the store handlers)
+ emit(REX_MOV_RR); // 41 8B /r: mov r32, r(8+n)d
+ emitByte(0xc0 + instr.dst); // ModRM: reg = eax, rm = dst
+ emitByte(AND_EAX_I); // 25 id: and eax, imm32
+ int32_t maskL1 = align16 ? ScratchpadL1Mask16 : ScratchpadL1Mask; // align16 selects the *Mask16 variants - presumably 16-byte aligned masks, confirm against their definitions
+ int32_t maskL2 = align16 ? ScratchpadL2Mask16 : ScratchpadL2Mask;
+ emit32((instr.alt % 4) ? maskL1 : maskL2); // imm32: L1 mask when (alt % 4) != 0, L2 mask otherwise
}
- void JitCompilerX86::genar(Instruction& instr) {
- gena(instr);
- emit(0xc6048b48); //mov rax,QWORD PTR [rsi+rax*8]
+ void JitCompilerX86::genAddressImm(Instruction& instr) { // emits a 32-bit scratchpad displacement derived from imm32; only meaningful as a memory offset, not as an arithmetic immediate
+ emit32(instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)); // L1 mask when (alt % 4) != 0, L2 mask otherwise
}
- void JitCompilerX86::genaf(Instruction& instr) {
- gena(instr);
- emitByte(0xf3);
- emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
- }
-
- void JitCompilerX86::genbiashift(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
- if (instr.locb & 1) {
- emit(uint16_t(0x8b49)); //mov
- emitByte(0xc8 + (instr.regb % RegistersCount)); //rcx, regb
- emitByte(0x48); //REX.W
- emit(opcodeReg); //xxx rax, cl
+ void JitCompilerX86::h_IADD_R(Instruction& instr) { // IADD_R: add dst, src - or add dst, imm32 when src and dst are the same register
+ if (instr.src != instr.dst) {
+ emit(REX_ADD_RR); // 4D 03 /r: add r64, r64 (both in r8-r15)
+ emitByte(0xc0 + 8 * instr.dst + instr.src); // ModRM: reg = dst, rm = src
}
else {
- emitByte(0x48); //REX.W
- emit(opcodeImm); //xxx rax, imm8
- emitByte((instr.imm8 & 63));
+ emit(REX_81); // 49 81 /0 id: add r64, imm32
+ emitByte(0xc0 + instr.dst); // ModRM: /0 (add), rm = dst
+ emit32(instr.imm32); // raw immediate; the CPU sign-extends it to 64 bits
}
}
- void JitCompilerX86::genbia(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
- if (instr.locb & 3) {
- emit(opcodeReg); // xxx rax, r64
- emitByte(0xc0 + (instr.regb % RegistersCount));
+ void JitCompilerX86::h_IADD_M(Instruction& instr) { // IADD_M: add dst, qword ptr [scratchpad]; address comes from src, or from imm32 when src == dst
+ if (instr.src != instr.dst) {
+ genAddressReg(instr); // eax = src & mask
+ emit(REX_ADD_RM); // 4C 03 /r: add r64, r/m64
+ emitByte(0x04 + 8 * instr.dst); // ModRM: mod = 00, reg = dst, rm = 100 (SIB follows)
+ emitByte(0x06); // SIB: base = rsi, index = rax, scale 1 -> [rsi+rax]
}
else {
- emit(opcodeImm); // xxx rax, imm32
- emit(instr.imm32);
+ emit(REX_ADD_RM); // 4C 03 /r: add r64, r/m64
+ emitByte(0x86 + 8 * instr.dst); // ModRM: mod = 10, reg = dst, rm = rsi -> [rsi+disp32]
+ genAddressImm(instr); // disp32 = masked imm32
}
}
- void JitCompilerX86::genbia32(Instruction& instr, uint16_t opcodeReg, uint8_t opcodeImm) {
- if (instr.locb & 3) {
- emit(opcodeReg); // xxx eax, r32
- emitByte(0xc0 + (instr.regb % RegistersCount));
+ // Emits an x86 SIB byte from its three raw fields: scale occupies bits 7-6
+ // (0..3, i.e. log2 of the index multiplier), index bits 5-3 and base bits 2-0.
+ // The previous shift of 5 put the scale into bits 6-5, so scale = 3 was encoded
+ // as x2 and also set the top bit of the index field.
+ void JitCompilerX86::genSIB(int scale, int index, int base) {
+ emitByte((scale << 6) | (index << 3) | base);
+ }
+
+ // IADD_RC: lea dst, [dst + src + disp32] with disp32 = imm32.
+ void JitCompilerX86::h_IADD_RC(Instruction& instr) {
+ // ModRM: mod = 10 (disp32), reg = dst, rm = 100 (SIB follows)
+ const int modrm = 0x84 + 8 * instr.dst;
+ emit(REX_LEA);
+ emitByte(modrm);
+ // SIB: scale field 0 (x1), index = src, base = dst
+ genSIB(0, instr.src, instr.dst);
+ emit32(instr.imm32);
+ }
+
+ void JitCompilerX86::h_ISUB_R(Instruction& instr) { // ISUB_R: sub dst, src - or sub dst, imm32 when src and dst are the same register
+ if (instr.src != instr.dst) {
+ emit(REX_SUB_RR); // 4D 2B /r: sub r64, r64
+ emitByte(0xc0 + 8 * instr.dst + instr.src); // ModRM: reg = dst, rm = src
}
else {
- emitByte(opcodeImm); // xxx eax, imm32
- emit(instr.imm32);
+ emit(REX_81); // 49 81 /5 id: sub r64, imm32
+ emitByte(0xe8 + instr.dst); // ModRM: /5 (sub), rm = dst
+ emit32(instr.imm32); // raw immediate, matching h_IADD_R; genAddressImm would mask it like a scratchpad address
}
}
- void JitCompilerX86::genbf(Instruction& instr, uint8_t opcode) {
- int regb = (instr.regb % RegistersCount);
- emitByte(0x66); //xxxpd xmm0,regb
- if (regb <= 1) {
- emitByte(0x41); //REX
- }
- emitByte(0x0f);
- emitByte(opcode);
- emitByte(0xc0 + regb);
- }
-
-
- void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize, bool rax) {
- if (rax) {
- emit(0x41c88b48); //mov rcx, rax; REX
+ void JitCompilerX86::h_ISUB_M(Instruction& instr) { // ISUB_M: sub dst, qword ptr [scratchpad]; address comes from src, or from imm32 when src == dst
+ if (instr.src != instr.dst) {
+ genAddressReg(instr); // eax = src & mask
+ emit(REX_SUB_RM); // 4C 2B /r: sub r64, r/m64
+ emitByte(0x04 + 8 * instr.dst); // ModRM: mod = 00, reg = dst, rm = 100 (SIB follows)
+ emitByte(0x06); // SIB: base = rsi, index = rax, scale 1 -> [rsi+rax]
}
else {
- emitByte(0x41);
+ emit(REX_SUB_RM); // 4C 2B /r: sub r64, r/m64
+ emitByte(0x86 + 8 * instr.dst); // ModRM: mod = 10, reg = dst, rm = rsi -> [rsi+disp32]
+ genAddressImm(instr); // disp32 = masked imm32
}
- emitByte(0x8b); // mov
- emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc
- emitByte(0x35); // xor eax
- emit(instr.addrc);
- emitByte(0x25); //and
- emit(scratchpadSize - 1);
- emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx
}
- void JitCompilerX86::gencr(Instruction& instr, bool rax = true) {
- if (instr.locc & 8) { //write to register
- emit(uint16_t(0x8b4c)); //mov
- if (rax) {
- emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax
- }
- else {
- emitByte(0xc1 + 8 * (instr.regc % RegistersCount)); //regc, rcx
- }
+ // IMUL_9C: dst = 9 * dst + imm32, encoded as lea dst, [dst + dst*8 + disp32].
+ // The index register must be dst itself; using src as the index computed
+ // dst + 8 * src + imm32, which is not a multiplication of dst by 9.
+ void JitCompilerX86::h_IMUL_9C(Instruction& instr) {
+ emit(REX_LEA);
+ emitByte(0x84 + 8 * instr.dst); // ModRM: mod = 10 (disp32), reg = dst, rm = 100 (SIB follows)
+ genSIB(3, instr.dst, instr.dst); // scale field 3 (x8), index = dst, base = dst
+ emit32(instr.imm32);
+ }
+
+ void JitCompilerX86::h_IMUL_R(Instruction& instr) { // IMUL_R: imul dst, src - or imul dst, dst, imm32 when src and dst are the same register
+ if (instr.src != instr.dst) {
+ emit(REX_IMUL_RR); // 4D 0F AF /r: imul r64, r64
+ emitByte(0xc0 + 8 * instr.dst + instr.src); // ModRM: reg = dst, rm = src
+ }
else {
- //if (instr.locc & 7) {
- if (instr.locc & 1) {
- scratchpadStoreR(instr, ScratchpadL1, rax);
+ emit(REX_IMUL_RRI); // 4D 69 /r id: imul r64, r/m64, imm32
+ emitByte(0xc0 + 9 * instr.dst); // ModRM: reg = dst and rm = dst
+ emit32(instr.imm32); // raw immediate, matching h_IADD_R; genAddressImm would mask it like a scratchpad address
+ }
+ }
+
+ // IMUL_M: imul dst, qword ptr [scratchpad]. The address is imm32-based when
+ // src == dst and src-register-based otherwise.
+ void JitCompilerX86::h_IMUL_M(Instruction& instr) {
+ if (instr.src == instr.dst) {
+ emit(REX_IMUL_RM);
+ emitByte(0x86 + 8 * instr.dst); // ModRM: mod = 10, reg = dst, rm = rsi -> [rsi+disp32]
+ genAddressImm(instr);
+ return;
+ }
+ genAddressReg(instr); // eax = src & mask
+ emit(REX_IMUL_RM);
+ emitByte(0x04 + 8 * instr.dst); // ModRM: mod = 00, reg = dst, rm = 100 (SIB follows)
+ emitByte(0x06); // SIB: [rsi+rax]
+ }
+
+ // IMULH_R: unsigned high multiply. With distinct registers dst receives the upper
+ // 64 bits of dst * src; when src == dst the upper 64 bits of imm32 * dst are
+ // added to dst instead.
+ void JitCompilerX86::h_IMULH_R(Instruction& instr) {
+ if (instr.src == instr.dst) {
+ emitByte(MOV_EAX_I); // mov eax, imm32
+ emit32(instr.imm32);
+ emit(REX_MUL_R); // mul dst (F7 /4)
+ emitByte(0xe0 + instr.dst);
+ emit(REX_ADD_RM); // add dst, rdx
+ emitByte(0xc2 + 8 * instr.dst);
+ return;
+ }
+ emit(REX_MOV_RR64); // mov rax, dst
+ emitByte(0xc0 + instr.dst);
+ emit(REX_MUL_R); // mul src (F7 /4)
+ emitByte(0xe0 + instr.src);
+ emit(REX_MOV_R64R); // mov dst, rdx
+ emitByte(0xc2 + 8 * instr.dst);
+ }
+
+ // IMULH_M: dst = upper 64 bits of the unsigned product dst * qword ptr [scratchpad].
+ // The address is imm32-based when src == dst and src-register-based (in ecx,
+ // because rax holds the multiplicand) otherwise.
+ void JitCompilerX86::h_IMULH_M(Instruction& instr) {
+ const bool immediateAddress = (instr.src == instr.dst);
+ if (!immediateAddress)
+ genAddressReg(instr, false); // ecx = src & mask
+ emit(REX_MOV_RR64); // mov rax, dst
+ emitByte(0xc0 + instr.dst);
+ if (immediateAddress) {
+ emit(REX_MUL_M); // mul qword ptr [rsi+disp32] (F7 /4)
+ emitByte(0xa6);
+ genAddressImm(instr);
+ }
+ else {
+ emit(REX_MUL_MEM); // mul qword ptr [rsi+rcx]
+ }
+ emit(REX_MOV_R64R); // mov dst, rdx
+ emitByte(0xc2 + 8 * instr.dst);
+ }
+
+ // ISMULH_R: signed high multiply. With distinct registers dst receives the upper
+ // 64 bits of dst * src; when src == dst the upper 64 bits of imm32 * dst are
+ // added to dst instead.
+ void JitCompilerX86::h_ISMULH_R(Instruction& instr) {
+ if (instr.src == instr.dst) {
+ // NOTE(review): "mov eax, imm32" zero-extends into rax before a signed multiply - confirm a sign-extended immediate is not intended.
+ emitByte(MOV_EAX_I); // mov eax, imm32
+ emit32(instr.imm32);
+ emit(REX_MUL_R); // imul dst (F7 /5)
+ emitByte(0xe8 + instr.dst);
+ emit(REX_ADD_RM); // add dst, rdx
+ emitByte(0xc2 + 8 * instr.dst);
+ return;
+ }
+ emit(REX_MOV_RR64); // mov rax, dst
+ emitByte(0xc0 + instr.dst);
+ emit(REX_MUL_R); // imul src (F7 /5)
+ emitByte(0xe8 + instr.src);
+ emit(REX_MOV_R64R); // mov dst, rdx
+ emitByte(0xc2 + 8 * instr.dst);
+ }
+
+ // ISMULH_M: dst = upper 64 bits of the signed product dst * qword ptr [scratchpad].
+ // The address is imm32-based when src == dst and src-register-based (in ecx,
+ // because rax holds the multiplicand) otherwise.
+ void JitCompilerX86::h_ISMULH_M(Instruction& instr) {
+ const bool immediateAddress = (instr.src == instr.dst);
+ if (!immediateAddress)
+ genAddressReg(instr, false); // ecx = src & mask
+ emit(REX_MOV_RR64); // mov rax, dst
+ emitByte(0xc0 + instr.dst);
+ if (immediateAddress) {
+ emit(REX_MUL_M); // imul qword ptr [rsi+disp32] (F7 /5)
+ emitByte(0xae);
+ genAddressImm(instr);
+ }
+ else {
+ emit(REX_IMUL_MEM); // imul qword ptr [rsi+rcx]
+ }
+ emit(REX_MOV_R64R); // mov dst, rdx
+ emitByte(0xc2 + 8 * instr.dst);
+ }
+
+ void JitCompilerX86::h_IDIV_C(Instruction& instr) {
+ if (instr.imm32 != 0) {
+ uint32_t divisor = instr.imm32;
+ if (divisor & (divisor - 1)) {
+ magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8);
+ if (mi.pre_shift == 0 && !mi.increment) {
+ emit(MOV_RAX_I);
+ emit64(mi.multiplier);
+ emit(REX_MUL_R);
+ emitByte(0xe0 + instr.dst);
}
else {
- scratchpadStoreR(instr, ScratchpadL2, rax);
- }
- /*}
- else {
- scratchpadStoreR(instr, ScratchpadL3, rax);
- }*/
- }
- }
-
- void JitCompilerX86::scratchpadStoreF(Instruction& instr, int regc, uint32_t scratchpadSize, bool storeHigh) {
- emit(uint16_t(0x8b41)); //mov
- emitByte(0xc0 + regc); //eax, regc
- emitByte(0x35); // xor eax
- emit(instr.addrc);
- emitByte(0x25); //and
- emit(scratchpadSize - 1);
- emitByte(0x66); //movhpd/movlpd QWORD PTR [rsi+rax*8], regc
- if (regc <= 1) {
- emitByte(0x44); //REX
- }
- emitByte(0x0f);
- emitByte(storeHigh ? 0x17 : 0x13);
- emitByte(4 + 8 * regc);
- emitByte(0xc6);
- }
-
- void JitCompilerX86::gencf(Instruction& instr) {
- int regc = (instr.regc % RegistersCount);
- if (regc <= 1) {
- emitByte(0x44); //REX
- }
- emit(uint16_t(0x280f)); //movaps
- emitByte(0xc0 + 8 * regc); // regc, xmm0
- if (instr.locc & 8) { //write to scratchpad
- //if (instr.locc & 7) {
- if (instr.locc & 1) { //C.LOC.W
- scratchpadStoreF(instr, regc, ScratchpadL1, (instr.locc & 128)); //first 16 KiB of scratchpad
- }
- else {
- scratchpadStoreF(instr, regc, ScratchpadL2, (instr.locc & 128)); //first 256 KiB of scratchpad
- }
- //}
- /*else {
- scratchpadStoreF(instr, regc, ScratchpadL3, (instr.locc & 128)); //whole scratchpad
- }*/
- }
- }
-
- void JitCompilerX86::h_ADD_64(Instruction& instr, int i) {
- genar(instr);
- genbia(instr, 0x0349, 0x0548);
- gencr(instr);
- }
-
- void JitCompilerX86::h_ADD_32(Instruction& instr, int i) {
- genar(instr);
- genbia32(instr, 0x0341, 0x05);
- gencr(instr);
- }
-
- void JitCompilerX86::h_SUB_64(Instruction& instr, int i) {
- genar(instr);
- genbia(instr, 0x2b49, 0x2d48);
- gencr(instr);
- }
-
- void JitCompilerX86::h_SUB_32(Instruction& instr, int i) {
- genar(instr);
- genbia32(instr, 0x2b41, 0x2d);
- gencr(instr);
- }
-
- void JitCompilerX86::h_MUL_64(Instruction& instr, int i) {
- genar(instr);
- if ((instr.locb & 7) <= 5) {
- emitByte(0x49); //REX
- emit(uint16_t(0xaf0f)); // imul rax, r64
- emitByte(0xc0 + (instr.regb % RegistersCount));
- }
- else {
- emitByte(0x48); //REX
- emit(uint16_t(0xc069)); // imul rax, rax, imm32
- emit(instr.imm32);
- }
- gencr(instr);
- }
-
- void JitCompilerX86::h_MULH_64(Instruction& instr, int i) {
- genar(instr);
- if ((instr.locb & 7) <= 5) {
- emit(uint16_t(0x8b49)); //mov rcx, r64
- emitByte(0xc8 + (instr.regb % RegistersCount));
- }
- else {
- emitByte(0x48);
- emit(uint16_t(0xc1c7)); // mov rcx, imm32
- emit(instr.imm32);
- }
- emitByte(0x48);
- emit(uint16_t(0xe1f7)); // mul rcx
- emitByte(0x48);
- emit(uint16_t(0xc28b)); // mov rax,rdx
- gencr(instr);
- }
-
- void JitCompilerX86::h_MUL_32(Instruction& instr, int i) {
- genar(instr);
- emit(uint16_t(0xc88b)); //mov ecx, eax
- if ((instr.locb & 7) <= 5) {
- emit(uint16_t(0x8b41)); // mov eax, r32
- emitByte(0xc0 + (instr.regb % RegistersCount));
- }
- else {
- emitByte(0xb8); // mov eax, imm32
- emit(instr.imm32);
- }
- emit(0xc1af0f48); //imul rax,rcx
- gencr(instr);
- }
-
- void JitCompilerX86::h_IMUL_32(Instruction& instr, int i) {
- genar(instr);
- emitByte(0x48);
- emit(uint16_t(0xc863)); //movsxd rcx,eax
- if ((instr.locb & 7) <= 5) {
- emit(uint16_t(0x6349)); //movsxd rax,r32
- emitByte(0xc0 + (instr.regb % RegistersCount));
- }
- else {
- emitByte(0x48);
- emit(uint16_t(0xc0c7)); // mov rax, imm32
- emit(instr.imm32);
- }
- emit(0xc1af0f48); //imul rax,rcx
- gencr(instr);
- }
-
- void JitCompilerX86::h_IMULH_64(Instruction& instr, int i) {
- genar(instr);
- if ((instr.locb & 7) <= 5) {
- emit(uint16_t(0x8b49)); //mov rcx, r64
- emitByte(0xc8 + (instr.regb % RegistersCount));
- }
- else {
- emitByte(0x48);
- emit(uint16_t(0xc1c7)); // mov rcx, imm32
- emit(instr.imm32);
- }
- emitByte(0x48);
- emit(uint16_t(0xe9f7)); // imul rcx
- emitByte(0x48);
- emit(uint16_t(0xc28b)); // mov rax,rdx
- gencr(instr);
- }
-
- void JitCompilerX86::h_DIV_64(Instruction& instr, int i) {
- genar(instr);
- if (instr.locb & 7) {
-#ifdef MAGIC_DIVISION
- if (instr.imm32 != 0) {
- uint32_t divisor = instr.imm32;
- if (divisor & (divisor - 1)) {
- magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8);
+ emit(REX_MOV_RR64);
+ emitByte(0xc0 + instr.dst);
if (mi.pre_shift > 0) {
- if (mi.pre_shift == 1) {
- emitByte(0x48);
- emit(uint16_t(0xe8d1)); //shr rax,1
- }
- else {
- emit(0x00e8c148 | (mi.pre_shift << 24)); //shr rax, pre_shift
- }
+ emit(REX_SHR_RAX);
+ emitByte(mi.pre_shift);
}
if (mi.increment) {
- emit(0x00d8834801c08348); //add rax,1; sbb rax,0
+ emit(RAX_ADD_SBB_1);
}
- emit(uint16_t(0xb948)); //movabs rcx, multiplier
- emit(mi.multiplier);
- emit(0x48e1f748); //mul rcx; REX
- emit(uint16_t(0xc28b)); //mov rax,rdx
- if (mi.post_shift > 0)
- emit(0x00e8c148 | (mi.post_shift << 24)); //shr rax, post_shift
- }
- else { //divisor is a power of two
- int shift = 0;
- while (divisor >>= 1)
- ++shift;
- if (shift > 0)
- emit(0x00e8c148 | (shift << 24)); //shr rax, shift
+ emit(MOV_RCX_I);
+ emit64(mi.multiplier);
+ emit(MUL_RCX);
}
- }
-#else
- emitByte(0xb9); //mov ecx, imm32
- emit(instr.imm32 != 0 ? instr.imm32 : 1);
-#endif
- }
- else {
- emitByte(0xb9); //mov ecx, 1
- emit(1);
- emit(uint16_t(0x8b41)); //mov edx, r32
- emitByte(0xd0 + (instr.regb % RegistersCount));
- emit(0x450fd285); //test edx, edx; cmovne ecx,edx
- emitByte(0xca);
-#ifdef MAGIC_DIVISION
- emit(0xf748d233); //xor edx,edx; div rcx
- emitByte(0xf1);
-#endif
- }
-#ifndef MAGIC_DIVISION
- emit(0xf748d233); //xor edx,edx; div rcx
- emitByte(0xf1);
-#endif
- gencr(instr);
- }
-
- void JitCompilerX86::h_IDIV_64(Instruction& instr, int i) {
- genar(instr);
- if (instr.locb & 7) {
-#ifdef MAGIC_DIVISION
- int64_t divisor = instr.imm32;
- if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) {
- // +/- power of two
- bool negative = divisor < 0;
- if (negative)
- divisor = -divisor;
+ if (mi.post_shift > 0) {
+ emit(REX_SHR_RDX); // shr rdx, post_shift - the quotient is the high half of the multiplication
+ emitByte(mi.post_shift);
+ }
+ emit(REX_ADD_RM); // 4C 03 /r: add dst, rdx. REX_ADD_RR also sets REX.B, which would turn rm = 010 into r10 instead of rdx
+ emitByte(0xc2 + 8 * instr.dst); // ModRM: reg = dst, rm = rdx
+ }
+ else { //divisor is a power of two
int shift = 0;
- uint64_t unsignedDivisor = divisor;
- while (unsignedDivisor >>= 1)
+ while (divisor >>= 1)
++shift;
if (shift > 0) {
- emitByte(0x48);
- emit(uint16_t(0xc88b)); //mov rcx, rax
- emit(0x3ff9c148); //sar rcx, 63
- uint32_t mask = (1ULL << shift) - 1;
- emit(uint16_t(0xe181)); //and ecx, mask
- emit(mask);
- emitByte(0x48);
- emit(uint16_t(0xc103)); //add rax, rcx
- emit(0x00f8c148 | (shift << 24)); //sar rax, shift
- }
- if (negative) {
- emitByte(0x48);
- emit(uint16_t(0xd8f7)); //neg rax
+ emit(REX_SH); // 49 C1 /5 ib: shr r64, imm8
+ emitByte(0xe8 + instr.dst); // ModRM: /5 (shr), rm = dst
+ emitByte(shift); // imm8 shift count; without it the next instruction's first byte is consumed as the count
}
}
- else if (divisor != 0) {
- magics_info mi = compute_signed_magic_info(divisor);
- if ((divisor >= 0) != (mi.multiplier >= 0)) {
- emitByte(0x48);
- emit(uint16_t(0xc88b)); //mov rcx, rax
- }
- emit(uint16_t(0xba48)); //movabs rdx, multiplier
- emit(mi.multiplier);
- emit(0xd233c28b48eaf748); //imul rdx; mov rax,rdx; xor edx,edx
- bool haveSF = false;
- if (divisor > 0 && mi.multiplier < 0) {
- emitByte(0x48);
- emit(uint16_t(0xc103)); //add rax, rcx
- haveSF = true;
- }
- if (divisor < 0 && mi.multiplier > 0) {
- emitByte(0x48);
- emit(uint16_t(0xc12b)); //sub rax, rcx
- haveSF = true;
- }
- if (mi.shift > 0) {
- emit(0x00f8c148 | (mi.shift << 24)); //sar rax, shift
- haveSF = true;
- }
- if (!haveSF) {
- emitByte(0x48);
- emit(uint16_t(0x85c0));
- }
- emit(0x48c2980f); //sets dl; add rax, rdx
- emit(uint16_t(0xc203));
+ }
+ }
+
+ void JitCompilerX86::h_ISDIV_C(Instruction& instr) { // ISDIV_C: emit code that divides register dst (signed) by the constant imm32 and adds the quotient to dst
+ int64_t divisor = instr.imm32; // work in 64 bits so the negations below cannot overflow a 32-bit value
+ if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { // lowest set bit equals |divisor|: +/- power of two (divisor == 0 also takes this branch)
+ emit(REX_MOV_RR64); // with the next byte: mov rax, dst
+ emitByte(0xc0 + instr.dst);
+ // +/- power of two: divide with shifts instead of a multiplication
+ bool negative = divisor < 0;
+ if (negative)
+ divisor = -divisor; // continue with the magnitude; the sign is re-applied by NEG_RAX below
+ int shift = 0;
+ uint64_t unsignedDivisor = divisor;
+ while (unsignedDivisor >>= 1) // shift = log2(|divisor|)
+ ++shift;
+ if (shift > 0) { // dividing by +/-1 (or the degenerate 0) needs no shift sequence
+ emit(MOV_RCX_RAX_SAR_RCX_63); // per the name: rcx = rax >> 63 (arithmetic), all ones for a negative dividend
+ uint32_t mask = (1ULL << shift) - 1; // |divisor| - 1
+ emit(AND_ECX_I); // and ecx, mask: bias that is non-zero only for negative dividends
+ emit32(mask);
+ emit(ADD_RAX_RCX); // adding the bias makes the arithmetic shift round toward zero
+ emit(SAR_RAX_I8); // sar rax, shift
+ emitByte(shift);
}
-#else
- emitByte(0xba); // mov edx, imm32
- emit(instr.imm32);
-#endif
+ if (negative)
+ emit(NEG_RAX); // negative divisor: negate the quotient
+ emit(ADD_R_RAX); // with the next byte: add rax to dst
+ emitByte(0xc0 + instr.dst);
+ }
+ else if (divisor != 0) { // general case: multiply by a precomputed magic constant
+ magics_info mi = compute_signed_magic_info(divisor);
+ emit(MOV_RAX_I); // mov rax, multiplier (64-bit immediate follows)
+ emit64(mi.multiplier);
+ emit(REX_MUL_R); // presumably a signed multiply of rax by dst with the high half in rdx (the replaced code used "imul rdx") - confirm against the opcode table
+ emitByte(0xe8 + instr.dst);
+ emit(XOR_EAX_EAX); // clear rax so the SETS below yields exactly 0 or 1
+ bool haveSF = false; // becomes true once an emitted instruction has set the sign flag from rdx
+ if (divisor > 0 && mi.multiplier < 0) {
+ emit(ADD_RDX_R); // correction term, per the name: rdx += dst
+ emitByte(0xc2 + 8 * instr.dst);
+ haveSF = true;
+ }
+ if (divisor < 0 && mi.multiplier > 0) {
+ emit(SUB_RDX_R); // correction term, per the name: rdx -= dst
+ emitByte(0xc2 + 8 * instr.dst);
+ haveSF = true;
+ }
+ if (mi.shift > 0) {
+ emit(SAR_RDX_I8); // sar rdx, mi.shift
+ emitByte(mi.shift);
+ haveSF = true;
+ }
+ if (!haveSF)
+ emit(TEST_RDX_RDX); // no emitted instruction has set SF from rdx yet, so set it explicitly
+ emit(SETS_AL_ADD_RDX_RAX); // per the name: al = sign of rdx, then rdx += rax (adds 1 to a negative quotient)
+ emit(ADD_R_RAX); // NOTE(review): operand byte is 0xd0 + dst here, not 0xc0 + dst as in the branch above - presumably selects rdx as the source; confirm
+ emitByte(0xd0 + instr.dst);
+ }
+ }
+
+ void JitCompilerX86::h_INEG_R(Instruction& instr) { // INEG_R: emit a negation of integer register dst
+ emit(REX_NEG); // opcode bytes (defined outside this chunk)
+ emitByte(0xd8 + instr.dst); // operand byte selecting register dst
+ }
+
+ void JitCompilerX86::h_IXOR_R(Instruction& instr) { // IXOR_R: XOR register dst with register src, or with imm32 when src == dst
+ if (instr.src != instr.dst) {
+ emit(REX_XOR_RR); // register-register form
+ emitByte(0xc0 + 8 * instr.dst + instr.src); // operand byte: dst in bits 3-5, src in bits 0-2 (assumes both indices are below 8)
}
else {
- emit(uint16_t(0x8b41)); //mov edx, r32
- emitByte(0xd0 + (instr.regb % RegistersCount));
-#ifndef MAGIC_DIVISION
+ emit(REX_XOR_RI); // register-immediate form, used when both indices name the same register
+ emitByte(0xf0 + instr.dst);
+ emit32(instr.imm32); // 32-bit immediate operand
}
-#endif
- emit(0xd8f7480575fffa83); //cmp edx,-1
- emit(uint16_t(0x12eb)); //jmp result
- emit(0x0fd28500000001b9);
- emit(0x489948c96348ca45);
- emit(uint16_t(0xf9f7)); //idiv rcx
-#ifdef MAGIC_DIVISION
- }
-#endif
- gencr(instr);
}
- void JitCompilerX86::h_AND_64(Instruction& instr, int i) {
- genar(instr);
- genbia(instr, 0x2349, 0x2548);
- gencr(instr);
+ void JitCompilerX86::h_IXOR_M(Instruction& instr) { // IXOR_M: XOR register dst with a 64-bit memory operand
+ if (instr.src != instr.dst) {
+ genAddressReg(instr); // helper defined outside this chunk; presumably leaves the masked address derived from src in rax
+ emit(REX_XOR_RM);
+ emitByte(0x04 + 8 * instr.dst); // ModRM: register dst, memory operand described by the SIB byte that follows
+ emitByte(0x06); // SIB: base rsi, index rax, i.e. [rsi+rax]
+ }
+ else {
+ emit(REX_XOR_RM);
+ emitByte(0x86 + 8 * instr.dst); // ModRM: register dst, memory operand [rsi + 32-bit displacement]
+ genAddressImm(instr); // presumably emits the masked imm32 as that displacement - confirm in the helper
+ }
}
- void JitCompilerX86::h_AND_32(Instruction& instr, int i) {
- genar(instr);
- genbia32(instr, 0x2341, 0x25);
- gencr(instr);
+ void JitCompilerX86::h_IROR_R(Instruction& instr) { // IROR_R: rotate register dst right by register src, or by imm32 & 63 when src == dst
+ if (instr.src != instr.dst) {
+ emit(REX_MOV_RR); // with the next byte: copy src into (e)cx, the count register for the CL form
+ emitByte(0xc8 + instr.src);
+ emit(REX_ROT_CL); // rotate by CL; the 0xc8 operand byte selects rotate-right (IROL_R uses 0xc0)
+ emitByte(0xc8 + instr.dst);
+ }
+ else {
+ emit(REX_ROT_I8); // rotate by an 8-bit immediate
+ emitByte(0xc8 + instr.dst);
+ emitByte(instr.imm32 & 63); // rotate count reduced to 0..63
+ }
}
- void JitCompilerX86::h_OR_64(Instruction& instr, int i) {
- genar(instr);
- genbia(instr, 0x0b49, 0x0d48);
- gencr(instr);
+ void JitCompilerX86::h_IROL_R(Instruction& instr) { // IROL_R: rotate register dst left by register src, or by imm32 & 63 when src == dst
+ if (instr.src != instr.dst) {
+ emit(REX_MOV_RR); // with the next byte: copy src into (e)cx, the count register for the CL form
+ emitByte(0xc8 + instr.src);
+ emit(REX_ROT_CL); // rotate by CL; the 0xc0 operand byte selects rotate-left (IROR_R uses 0xc8)
+ emitByte(0xc0 + instr.dst);
+ }
+ else {
+ emit(REX_ROT_I8); // rotate by an 8-bit immediate
+ emitByte(0xc0 + instr.dst);
+ emitByte(instr.imm32 & 63); // rotate count reduced to 0..63
+ }
}
- void JitCompilerX86::h_OR_32(Instruction& instr, int i) {
- genar(instr);
- genbia32(instr, 0x0b41, 0x0d);
- gencr(instr);
+ void JitCompilerX86::h_FPSWAP_R(Instruction& instr) { // FPSWAP_R: exchange the two 64-bit halves of xmm register dst (dst is used unreduced here, without the %= 4 the other FP handlers apply)
+ emit(SHUFPD); // shufpd opcode bytes (defined outside this chunk)
+ emitByte(0xc0 + 9 * instr.dst); // 9 * dst places dst in both register fields of the operand byte
+ emitByte(1); // selector 1 with identical operands swaps the low and high doubles
}
- void JitCompilerX86::h_XOR_64(Instruction& instr, int i) {
- genar(instr);
- genbia(instr, 0x3349, 0x3548);
- gencr(instr);
+ void JitCompilerX86::h_FPADD_R(Instruction& instr) { // FPADD_R: packed-double add with a register operand
+ instr.dst %= 4; // reduce both indices to 0..3; note this modifies the caller's Instruction
+ instr.src %= 4;
+ emit(REX_ADDPD); // opcode bytes incl. REX prefix (defined outside this chunk)
+ emitByte(0xc0 + instr.src + 8 * instr.dst); // operand byte: dst in bits 3-5, src in bits 0-2
}
- void JitCompilerX86::h_XOR_32(Instruction& instr, int i) {
- genar(instr);
- genbia32(instr, 0x3341, 0x35);
- gencr(instr);
+ void JitCompilerX86::h_FPADD_M(Instruction& instr) { // FPADD_M: packed-double add with a converted memory operand
+ instr.dst %= 4; // reduce the index to 0..3; note this modifies the caller's Instruction
+ genAddressReg(instr); // helper defined outside this chunk; presumably computes the operand address from src
+ emit(REX_CVTDQ2PD_XMM12); // per the name: convert the packed int32 memory operand to doubles in xmm12
+ emit(REX_ADDPD);
+ emitByte(0xc4 + 8 * instr.dst); // operand byte: dst in bits 3-5, r/m field 4 (xmm12 if the REX prefix extends r/m - TODO confirm)
}
- void JitCompilerX86::h_SHL_64(Instruction& instr, int i) {
- genar(instr);
- genbiashift(instr, 0xe0d3, 0xe0c1);
- gencr(instr);
+ void JitCompilerX86::h_FPSUB_R(Instruction& instr) { // FPSUB_R: packed-double subtract with a register operand
+ instr.dst %= 4; // reduce both indices to 0..3; note this modifies the caller's Instruction
+ instr.src %= 4;
+ emit(REX_SUBPD); // opcode bytes incl. REX prefix (defined outside this chunk)
+ emitByte(0xc0 + instr.src + 8 * instr.dst); // operand byte: dst in bits 3-5, src in bits 0-2
}
- void JitCompilerX86::h_SHR_64(Instruction& instr, int i) {
- genar(instr);
- genbiashift(instr, 0xe8d3, 0xe8c1);
- gencr(instr);
+ void JitCompilerX86::h_FPSUB_M(Instruction& instr) { // FPSUB_M: packed-double subtract with a converted memory operand
+ instr.dst %= 4; // reduce the index to 0..3; note this modifies the caller's Instruction
+ genAddressReg(instr); // helper defined outside this chunk; presumably computes the operand address from src
+ emit(REX_CVTDQ2PD_XMM12); // per the name: convert the packed int32 memory operand to doubles in xmm12
+ emit(REX_SUBPD);
+ emitByte(0xc4 + 8 * instr.dst); // operand byte: dst in bits 3-5, r/m field 4 (xmm12 if the REX prefix extends r/m - TODO confirm)
}
- void JitCompilerX86::h_SAR_64(Instruction& instr, int i) {
- genar(instr);
- genbiashift(instr, 0xf8d3, 0xf8c1);
- gencr(instr);
+ void JitCompilerX86::h_FPNEG_R(Instruction& instr) { // FPNEG_R: negate a floating point register by XOR-ing it with a mask register
+ instr.dst %= 4; // reduce the index to 0..3; note this modifies the caller's Instruction
+ emit(REX_XORPS);
+ emitByte(0xc7 + 8 * instr.dst); // operand byte: dst in bits 3-5, r/m field 7 (presumably xmm15, which the prologue loads with signMask - TODO confirm REX bits)
}
- void JitCompilerX86::h_ROL_64(Instruction& instr, int i) {
- genar(instr);
- genbiashift(instr, 0xc0d3, 0xc0c1);
- gencr(instr);
+ void JitCompilerX86::h_FPMUL_R(Instruction& instr) { // FPMUL_R: packed-double multiply with a register operand
+ instr.dst %= 4; // reduce both indices to 0..3; note this modifies the caller's Instruction
+ instr.src %= 4;
+ emit(REX_MULPD); // opcode bytes incl. REX prefix (defined outside this chunk)
+ emitByte(0xe0 + instr.src + 8 * instr.dst); // 0xe0 base: register field holds 4 + dst (registers 4..7), r/m field holds src
}
- void JitCompilerX86::h_ROR_64(Instruction& instr, int i) {
- genar(instr);
- genbiashift(instr, 0xc8d3, 0xc8c1);
- gencr(instr);
+ void JitCompilerX86::h_FPMUL_M(Instruction& instr) { // FPMUL_M: packed-double multiply with a converted memory operand, followed by a MAXPD clamp
+ instr.dst %= 4; // reduce the index to 0..3; note this modifies the caller's Instruction
+ genAddressReg(instr); // helper defined outside this chunk; presumably computes the operand address from src
+ emit(REX_CVTDQ2PD_XMM12); // per the name: convert the packed int32 memory operand to doubles in xmm12
+ emit(REX_MULPD);
+ emitByte(0xe4 + 8 * instr.dst); // register field 4 + dst, r/m field 4 (xmm12 if the REX prefix extends r/m - TODO confirm)
+ emit(REX_MAXPD); // clamp the product from below
+ emitByte(0xe5 + 8 * instr.dst); // register field 4 + dst, r/m field 5 (presumably xmm13, which the prologue loads with minDbl - TODO confirm)
}
- void JitCompilerX86::h_FPADD(Instruction& instr, int i) {
- genaf(instr);
- genbf(instr, 0x58);
- gencf(instr);
+ void JitCompilerX86::h_FPDIV_R(Instruction& instr) { // FPDIV_R: packed-double divide by a register operand, followed by a MAXPD clamp
+ instr.dst %= 4; // reduce both indices to 0..3; note this modifies the caller's Instruction
+ instr.src %= 4;
+ emit(REX_DIVPD);
+ emitByte(0xe0 + instr.src + 8 * instr.dst); // 0xe0 base: register field holds 4 + dst (registers 4..7), r/m field holds src
+ emit(REX_MAXPD); // clamp the quotient from below
+ emitByte(0xe5 + 8 * instr.dst); // register field 4 + dst, r/m field 5 (presumably xmm13, which the prologue loads with minDbl - TODO confirm)
}
- void JitCompilerX86::h_FPSUB(Instruction& instr, int i) {
- genaf(instr);
- genbf(instr, 0x5c);
- gencf(instr);
+ void JitCompilerX86::h_FPDIV_M(Instruction& instr) { // FPDIV_M: packed-double divide by a converted memory operand, followed by a MAXPD clamp
+ instr.dst %= 4; // reduce the index to 0..3; note this modifies the caller's Instruction
+ genAddressReg(instr); // helper defined outside this chunk; presumably computes the operand address from src
+ emit(REX_CVTDQ2PD_XMM12); // per the name: convert the packed int32 memory operand to doubles in xmm12
+ emit(REX_DIVPD);
+ emitByte(0xe4 + 8 * instr.dst); // register field 4 + dst, r/m field 4 (xmm12 if the REX prefix extends r/m - TODO confirm)
+ emit(REX_MAXPD); // clamp the quotient from below
+ emitByte(0xe5 + 8 * instr.dst); // register field 4 + dst, r/m field 5 (presumably xmm13, which the prologue loads with minDbl - TODO confirm)
}
- void JitCompilerX86::h_FPMUL(Instruction& instr, int i) {
- genaf(instr);
- genbf(instr, 0x59);
- emit(0x00c9c20f66c8280f); //movaps xmm1,xmm0; cmpeqpd xmm1,xmm1
- emit(uint16_t(0x540f)); //andps xmm0,xmm1
- emitByte(0xc1);
- gencf(instr);
+ void JitCompilerX86::h_FPSQRT_R(Instruction& instr) { // FPSQRT_R: in-place packed-double square root
+ instr.dst %= 4; // reduce the index to 0..3; note this modifies the caller's Instruction
+ emit(SQRTPD);
+ emitByte(0xe4 + 9 * instr.dst); // 9 * dst on a 0xe4 base: both register fields hold 4 + dst, so source and destination coincide
}
- void JitCompilerX86::h_FPDIV(Instruction& instr, int i) {
- genaf(instr);
- genbf(instr, 0x5e);
- emit(0x00c9c20f66c8280f); //movaps xmm1,xmm0; cmpeqpd xmm1,xmm1
- emit(uint16_t(0x540f)); //andps xmm0,xmm1
- emitByte(0xc1);
- gencf(instr);
- }
-
- void JitCompilerX86::h_FPSQRT(Instruction& instr, int i) {
- genaf(instr);
- emit(0xc0510f66c2540f41); //andps xmm0,xmm10; sqrtpd xmm0,xmm0
- gencf(instr);
- }
-
- void JitCompilerX86::h_FPROUND(Instruction& instr, int i) {
- genar(instr);
- emitByte(0x48);
- emit(uint16_t(0xc88b)); //mov rcx,rax
- int rotate = (13 - (instr.imm8 & 63)) & 63;
+ void JitCompilerX86::h_CFROUND(Instruction& instr) { // CFROUND: derive the MXCSR rounding mode from bits of integer register src
+ emit(REX_MOV_RR64); // with the next byte: mov rax, src
+ emitByte(0xc0 + instr.src);
+ int rotate = (13 - (instr.alt & 63)) & 63; // left-rotation that moves bit (alt & 63) of src to bit 13
if (rotate != 0) {
- emitByte(0x48);
- emit(uint16_t(0xc0c1)); //rol rax
+ emit(ROL_RAX); // rol rax, rotate (count byte follows)
emitByte(rotate);
}
- emit(uint16_t(0x0025));
- emit(0x00009fc00d000060); //and eax,0x6000; or eax,0x9fc0
- emit(0x2454ae0ff8244489); //ldmxcsr DWORD PTR [rsp-0x8]
- emitByte(0xf8);
- gencr(instr, false); //result in rcx
+ emit(AND_OR_MOV_LDMXCSR); // per the name (and the sequence it replaces): mask/or eax, spill it to memory and load it with ldmxcsr
}
- static inline uint8_t jumpCondition(Instruction& instr, bool invert = false) {
- switch ((instr.locb & 7) ^ invert)
+ static inline uint8_t condition(Instruction& instr, bool invert = false) { // maps the low 3 bits of instr.alt to the condition byte of a SETcc instruction (emitted after 0x0f by the COND handlers)
+ switch ((instr.alt & 7) ^ invert) // invert flips bit 0, picking the opposite condition of each pair; the value stays in 0..7, so every case is covered even without a default label
{
case 0:
- return 0x76; //jbe
+ return 0x96; //setbe
case 1:
- return 0x77; //ja
+ return 0x97; //seta
case 2:
- return 0x78; //js
+ return 0x98; //sets
case 3:
- return 0x79; //jns
+ return 0x99; //setns
case 4:
- return 0x70; //jo
+ return 0x90; //seto
case 5:
- return 0x71; //jno
+ return 0x91; //setno
case 6:
- return 0x7c; //jl
+ return 0x9c; //setl
case 7:
- return 0x7d; //jge
+ return 0x9d; //setge
}
}
- void JitCompilerX86::h_JUMP(Instruction& instr, int i) {
- genar(instr);
- gencr(instr);
- emit(uint16_t(0x8141)); //cmp regb, imm32
- emitByte(0xf8 + (instr.regb % RegistersCount));
- emit(instr.imm32);
- emitByte(0x0f); //near jump
- emitByte(jumpCondition(instr) + 0x10);
- i = wrapInstr(i + (instr.imm8 & 127) + 2);
- if (i < instructionOffsets.size()) {
- emit(instructionOffsets[i] - (codePos + 4));
- }
- else {
- callOffsets.push_back(CallOffset(codePos, i));
- codePos += 4;
- }
+ void JitCompilerX86::h_COND_R(Instruction& instr) { // COND_R: compare register src with imm32 and add the 0/1 outcome of the selected condition to dst
+ emit(XOR_ECX_ECX); // clear ecx first: SETcc below writes only cl, and clearing later would destroy the compare flags
+ emit(REX_CMP_R32I); // with the next bytes: 32-bit compare of src against imm32
+ emitByte(0xf8 + instr.src);
+ emit32(instr.imm32);
+ emitByte(0x0f); // 0x0f, condition byte, 0xc1: SETcc cl
+ emitByte(condition(instr));
+ emitByte(0xc1);
+ emit(REX_ADD_RM); // with the next byte: add rcx (0 or 1) to dst
+ emitByte(0xc1 + 8 * instr.dst);
}
- void JitCompilerX86::h_CALL(Instruction& instr, int i) {
- genar(instr);
- gencr(instr);
- emit(uint16_t(0x8141)); //cmp regb, imm32
- emitByte(0xf8 + (instr.regb % RegistersCount));
- emit(instr.imm32);
- emitByte(jumpCondition(instr, true));
- emitByte(0x05);
- emitByte(0xe8); //call
- i = wrapInstr(i + (instr.imm8 & 127) + 2);
- if (i < instructionOffsets.size()) {
- emit(instructionOffsets[i] - (codePos + 4));
- }
- else {
- callOffsets.push_back(CallOffset(codePos, i));
- codePos += 4;
- }
+ void JitCompilerX86::h_COND_M(Instruction& instr) { // COND_M: compare a 32-bit memory operand with imm32 and add the 0/1 outcome of the selected condition to dst
+ emit(XOR_ECX_ECX); // clear ecx before the compare: SETcc below writes only cl
+ genAddressReg(instr); // helper defined outside this chunk; presumably computes the operand address from src
+ emit(REX_CMP_M32I); // per the name: compare the 32-bit memory operand against the immediate that follows
+ emit32(instr.imm32);
+ emitByte(0x0f); // 0x0f, condition byte, 0xc1: SETcc cl
+ emitByte(condition(instr));
+ emitByte(0xc1);
+ emit(REX_ADD_RM); // with the next byte: add rcx (0 or 1) to dst
+ emitByte(0xc1 + 8 * instr.dst);
}
- void JitCompilerX86::h_RET(Instruction& instr, int i) {
- genar(instr);
- int crlen = 0;
- if ((instr.locc & 7) <= 3) {
- crlen = 17;
- }
- emit(0x74e73b48); //cmp rsp, rdi; je
- emitByte(0x01);
- emitByte(0xc3); //ret
+ void JitCompilerX86::h_ISTORE(Instruction& instr) { // ISTORE: store integer register src to memory at an address derived from register dst
+ genAddressRegDst(instr); // address comes from dst, not src (helper defined outside this chunk; presumably leaves it in rax)
+ emit(REX_MOV_MR);
+ emitByte(0x04 + 8 * instr.src); // ModRM: register src, memory operand described by the SIB byte that follows
+ emitByte(0x06); // SIB: base rsi, index rax, i.e. [rsi+rax]
}
- void JitCompilerX86::h_NOP(Instruction& instr, int i) {
- genar(instr);
+ void JitCompilerX86::h_FSTORE(Instruction& instr) { // FSTORE: store xmm register src (128 bits) to memory at an address derived from register dst
+ genAddressRegDst(instr, true); // presumably requests a 16-byte-aligned address, which the aligned MOVAPD store needs - confirm in the helper
+ emit(MOVAPD);
+ emitByte(0x04 + 8 * instr.src); // ModRM: register src, memory operand described by the SIB byte that follows
+ emitByte(0x06); // SIB: base rsi, index rax, i.e. [rsi+rax]
}
#include "instructionWeights.hpp"
#define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x))
InstructionGeneratorX86 JitCompilerX86::engine[256] = {
- INST_HANDLE(ADD_64)
- INST_HANDLE(ADD_32)
- INST_HANDLE(SUB_64)
- INST_HANDLE(SUB_32)
- INST_HANDLE(MUL_64)
- INST_HANDLE(MULH_64)
- INST_HANDLE(MUL_32)
- INST_HANDLE(IMUL_32)
- INST_HANDLE(IMULH_64)
- INST_HANDLE(DIV_64)
- INST_HANDLE(IDIV_64)
- INST_HANDLE(AND_64)
- INST_HANDLE(AND_32)
- INST_HANDLE(OR_64)
- INST_HANDLE(OR_32)
- INST_HANDLE(XOR_64)
- INST_HANDLE(XOR_32)
- INST_HANDLE(SHL_64)
- INST_HANDLE(SHR_64)
- INST_HANDLE(SAR_64)
- INST_HANDLE(ROL_64)
- INST_HANDLE(ROR_64)
- INST_HANDLE(FPADD)
- INST_HANDLE(FPSUB)
- INST_HANDLE(FPMUL)
- INST_HANDLE(FPDIV)
- INST_HANDLE(FPSQRT)
- INST_HANDLE(FPROUND)
- INST_HANDLE(JUMP)
- INST_HANDLE(CALL)
- INST_HANDLE(RET)
- INST_HANDLE(NOP)
+ INST_HANDLE(IADD_R) // each INST_HANDLE(x) expands through REPN to (presumably) WT(x) copies of &JitCompilerX86::h_x; the weights must sum to 256 to fill the table
+ INST_HANDLE(IADD_M)
+ INST_HANDLE(IADD_RC)
+ INST_HANDLE(ISUB_R)
+ INST_HANDLE(ISUB_M)
+ INST_HANDLE(IMUL_9C)
+ INST_HANDLE(IMUL_R)
+ INST_HANDLE(IMUL_M)
+ INST_HANDLE(IMULH_R)
+ INST_HANDLE(IMULH_M)
+ INST_HANDLE(ISMULH_R)
+ INST_HANDLE(ISMULH_M)
+ INST_HANDLE(IDIV_C)
+ INST_HANDLE(ISDIV_C)
+ INST_HANDLE(INEG_R)
+ INST_HANDLE(IXOR_R)
+ INST_HANDLE(IXOR_M)
+ INST_HANDLE(IROR_R)
+ INST_HANDLE(IROL_R)
+ INST_HANDLE(FPSWAP_R)
+ INST_HANDLE(FPADD_R)
+ INST_HANDLE(FPADD_M)
+ INST_HANDLE(FPSUB_R)
+ INST_HANDLE(FPSUB_M)
+ INST_HANDLE(FPNEG_R)
+ INST_HANDLE(FPMUL_R)
+ INST_HANDLE(FPMUL_M)
+ INST_HANDLE(FPDIV_R)
+ INST_HANDLE(FPDIV_M)
+ INST_HANDLE(FPSQRT_R)
+ INST_HANDLE(COND_R)
+ INST_HANDLE(COND_M)
+ INST_HANDLE(CFROUND)
+ INST_HANDLE(ISTORE) // store handlers come last, matching their position in the weight list
+ INST_HANDLE(FSTORE)
};
+
#endif
}
\ No newline at end of file
diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp
index e6a7e6d..fa5aa93 100644
--- a/src/JitCompilerX86.hpp
+++ b/src/JitCompilerX86.hpp
@@ -30,16 +30,10 @@ namespace RandomX {
class JitCompilerX86;
- typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int);
+ typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&);
constexpr uint32_t CodeSize = 64 * 1024;
- struct CallOffset {
- CallOffset(int32_t p, int32_t i) : pos(p), index(i) {}
- int32_t pos;
- int32_t index;
- };
-
class JitCompilerX86 {
public:
JitCompilerX86();
@@ -55,66 +49,82 @@ namespace RandomX {
static InstructionGeneratorX86 engine[256];
uint8_t* code;
int32_t codePos;
- std::vector instructionOffsets;
- std::vector callOffsets;
- void gena(Instruction&);
- void genar(Instruction&);
- void genaf(Instruction&);
- void genbiashift(Instruction&, uint16_t, uint16_t);
- void genbia(Instruction&, uint16_t, uint16_t);
- void genbia32(Instruction&, uint16_t, uint8_t);
- void genbf(Instruction&, uint8_t);
- void scratchpadStoreR(Instruction&, uint32_t, bool);
- void scratchpadStoreF(Instruction&, int, uint32_t, bool);
- void gencr(Instruction&, bool);
- void gencf(Instruction&);
- void generateCode(Instruction&, int);
- void fixCallOffsets();
+ void genAddressReg(Instruction&, bool); // NOTE(review): handlers call this with one argument, so a default for the bool must be supplied elsewhere (presumably at the definition) - confirm
+ void genAddressRegDst(Instruction&, bool); // same as above: called both as genAddressRegDst(instr) and genAddressRegDst(instr, true)
+ void genAddressImm(Instruction&); // used by IXOR_M right after the opcode/operand bytes when src == dst
+ void genSIB(int scale, int index, int base);
+
+ void generateCode(Instruction&); // no longer takes the instruction index
void emitByte(uint8_t val) {
code[codePos] = val;
codePos++;
}
- template
- void emit(T val) {
- *reinterpret_cast(code + codePos) = val;
- codePos += sizeof(T);
+ void emit32(uint32_t val) { // appends val to the code buffer as 4 bytes, least significant byte first
+ code[codePos + 0] = val; // each assignment keeps only the low 8 bits of its right-hand side
+ code[codePos + 1] = val >> 8;
+ code[codePos + 2] = val >> 16;
+ code[codePos + 3] = val >> 24;
+ codePos += 4; // advance the write position past the emitted bytes
}
- void h_ADD_64(Instruction&, int);
- void h_ADD_32(Instruction&, int);
- void h_SUB_64(Instruction&, int);
- void h_SUB_32(Instruction&, int);
- void h_MUL_64(Instruction&, int);
- void h_MULH_64(Instruction&, int);
- void h_MUL_32(Instruction&, int);
- void h_IMUL_32(Instruction&, int);
- void h_IMULH_64(Instruction&, int);
- void h_DIV_64(Instruction&, int);
- void h_IDIV_64(Instruction&, int);
- void h_AND_64(Instruction&, int);
- void h_AND_32(Instruction&, int);
- void h_OR_64(Instruction&, int);
- void h_OR_32(Instruction&, int);
- void h_XOR_64(Instruction&, int);
- void h_XOR_32(Instruction&, int);
- void h_SHL_64(Instruction&, int);
- void h_SHR_64(Instruction&, int);
- void h_SAR_64(Instruction&, int);
- void h_ROL_64(Instruction&, int);
- void h_ROR_64(Instruction&, int);
- void h_FPADD(Instruction&, int);
- void h_FPSUB(Instruction&, int);
- void h_FPMUL(Instruction&, int);
- void h_FPDIV(Instruction&, int);
- void h_FPSQRT(Instruction&, int);
- void h_FPROUND(Instruction&, int);
- void h_JUMP(Instruction&, int);
- void h_CALL(Instruction&, int);
- void h_RET(Instruction&, int);
- void h_NOP(Instruction&, int);
+ void emit64(uint64_t val) { // appends val to the code buffer as 8 bytes, least significant byte first
+ code[codePos + 0] = val; // each assignment keeps only the low 8 bits of its right-hand side
+ code[codePos + 1] = val >> 8;
+ code[codePos + 2] = val >> 16;
+ code[codePos + 3] = val >> 24;
+ code[codePos + 4] = val >> 32;
+ code[codePos + 5] = val >> 40;
+ code[codePos + 6] = val >> 48;
+ code[codePos + 7] = val >> 56;
+ codePos += 8; // advance the write position past the emitted bytes
+ }
+
+ template<size_t N> // N is deduced from the length of the byte array passed in
+ void emit(const uint8_t (&src)[N]) { // appends the N bytes of src to the code buffer in order
+ for (size_t i = 0; i < N; ++i) { // unsigned index, same type as N (no signed/unsigned comparison)
+ code[codePos + i] = src[i];
+ }
+ codePos += N; // advance the write position past the emitted bytes
+ }
+
+ void h_IADD_R(Instruction&); // one code generator per instruction; every signature matches the InstructionGeneratorX86 member-pointer type so it can be stored in engine[]
+ void h_IADD_M(Instruction&);
+ void h_IADD_RC(Instruction&);
+ void h_ISUB_R(Instruction&);
+ void h_ISUB_M(Instruction&);
+ void h_IMUL_9C(Instruction&);
+ void h_IMUL_R(Instruction&);
+ void h_IMUL_M(Instruction&);
+ void h_IMULH_R(Instruction&);
+ void h_IMULH_M(Instruction&);
+ void h_ISMULH_R(Instruction&);
+ void h_ISMULH_M(Instruction&);
+ void h_IDIV_C(Instruction&);
+ void h_ISDIV_C(Instruction&);
+ void h_INEG_R(Instruction&);
+ void h_IXOR_R(Instruction&);
+ void h_IXOR_M(Instruction&);
+ void h_IROR_R(Instruction&);
+ void h_IROL_R(Instruction&);
+ void h_FPSWAP_R(Instruction&); // floating point handlers
+ void h_FPADD_R(Instruction&);
+ void h_FPADD_M(Instruction&);
+ void h_FPSUB_R(Instruction&);
+ void h_FPSUB_M(Instruction&);
+ void h_FPNEG_R(Instruction&);
+ void h_FPMUL_R(Instruction&);
+ void h_FPMUL_M(Instruction&);
+ void h_FPDIV_R(Instruction&);
+ void h_FPDIV_M(Instruction&);
+ void h_FPSQRT_R(Instruction&);
+ void h_COND_R(Instruction&); // control handlers
+ void h_COND_M(Instruction&);
+ void h_CFROUND(Instruction&);
+ void h_ISTORE(Instruction&); // store handlers
+ void h_FSTORE(Instruction&);
};
}
\ No newline at end of file
diff --git a/src/asm/program_epilogue_store.inc b/src/asm/program_epilogue_store.inc
index 95a4752..b94fa4d 100644
--- a/src/asm/program_epilogue_store.inc
+++ b/src/asm/program_epilogue_store.inc
@@ -1,9 +1,5 @@
- ;# unroll VM stack
- mov rsp, rdi
-
;# save VM register values
pop rcx
- pop rcx
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10
@@ -12,12 +8,12 @@
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
- movapd xmmword ptr [rcx+64], xmm8
- movapd xmmword ptr [rcx+80], xmm9
- movapd xmmword ptr [rcx+96], xmm2
- movapd xmmword ptr [rcx+112], xmm3
+ movdqa xmmword ptr [rcx+64], xmm0 ;# xmm0-xmm3 go to bytes 64..127 of the register file, right after the eight integer registers
+ movdqa xmmword ptr [rcx+80], xmm1
+ movdqa xmmword ptr [rcx+96], xmm2
+ movdqa xmmword ptr [rcx+112], xmm3
lea rcx, [rcx+64]
- movapd xmmword ptr [rcx+64], xmm4
- movapd xmmword ptr [rcx+80], xmm5
- movapd xmmword ptr [rcx+96], xmm6
- movapd xmmword ptr [rcx+112], xmm7
\ No newline at end of file
+ movdqa xmmword ptr [rcx+64], xmm4 ;# rcx was advanced by 64 above, so xmm4-xmm7 land at bytes 128..191
+ movdqa xmmword ptr [rcx+80], xmm5
+ movdqa xmmword ptr [rcx+96], xmm6
+ movdqa xmmword ptr [rcx+112], xmm7
\ No newline at end of file
diff --git a/src/asm/program_epilogue_win64.inc b/src/asm/program_epilogue_win64.inc
index 220bed8..f2e4b44 100644
--- a/src/asm/program_epilogue_win64.inc
+++ b/src/asm/program_epilogue_win64.inc
@@ -1,6 +1,12 @@
include program_epilogue_store.inc
;# restore callee-saved registers - Microsoft x64 calling convention
+ movdqu xmm15, xmmword ptr [rsp] ;# xmm11-xmm15 were saved last by the prologue, so they are restored first
+ movdqu xmm14, xmmword ptr [rsp+16]
+ movdqu xmm13, xmmword ptr [rsp+32]
+ movdqu xmm12, xmmword ptr [rsp+48]
+ movdqu xmm11, xmmword ptr [rsp+64]
+ add rsp, 80 ;# release the 5 x 16-byte save area
movdqu xmm10, xmmword ptr [rsp]
movdqu xmm9, xmmword ptr [rsp+16]
movdqu xmm8, xmmword ptr [rsp+32]
@@ -17,4 +23,4 @@
pop rbx
;# program finished
- ret 0
\ No newline at end of file
+ ret
diff --git a/src/asm/program_load_flt.inc b/src/asm/program_load_flt.inc
new file mode 100644
index 0000000..af6f1b7
--- /dev/null
+++ b/src/asm/program_load_flt.inc
@@ -0,0 +1,14 @@
+ and eax, 262080 ;# 262080 = 262144 - 64: keep a 64-byte-aligned offset below 256 KiB
+ lea rcx, [rsi+rax] ;# rcx = rsi + offset (rsi holds the scratchpad pointer per the prologue)
+ cvtdq2pd xmm0, qword ptr [rcx+0] ;# each load converts two packed 32-bit integers into two doubles
+ cvtdq2pd xmm1, qword ptr [rcx+8]
+ cvtdq2pd xmm2, qword ptr [rcx+16]
+ cvtdq2pd xmm3, qword ptr [rcx+24]
+ cvtdq2pd xmm4, qword ptr [rcx+32]
+ cvtdq2pd xmm5, qword ptr [rcx+40]
+ cvtdq2pd xmm6, qword ptr [rcx+48]
+ cvtdq2pd xmm7, qword ptr [rcx+56]
+ andps xmm4, xmm14 ;# xmm14 holds absMask (loaded in the prologue): clear the sign bits of xmm4-xmm7
+ andps xmm5, xmm14
+ andps xmm6, xmm14
+ andps xmm7, xmm14
diff --git a/src/asm/program_load_int.inc b/src/asm/program_load_int.inc
new file mode 100644
index 0000000..d139549
--- /dev/null
+++ b/src/asm/program_load_int.inc
@@ -0,0 +1,10 @@
+ and eax, 262080 ;# 262080 = 262144 - 64: keep a 64-byte-aligned offset below 256 KiB
+ lea rcx, [rsi+rax] ;# rcx = rsi + offset (rsi holds the scratchpad pointer per the prologue)
+ xor r8, qword ptr [rcx+0] ;# mix the 64 bytes at rcx into the eight integer registers
+ xor r9, qword ptr [rcx+8]
+ xor r10, qword ptr [rcx+16]
+ xor r11, qword ptr [rcx+24]
+ xor r12, qword ptr [rcx+32]
+ xor r13, qword ptr [rcx+40]
+ xor r14, qword ptr [rcx+48]
+ xor r15, qword ptr [rcx+56]
diff --git a/src/asm/program_prologue_linux.inc b/src/asm/program_prologue_linux.inc
index 6bc3bd2..67a967d 100644
--- a/src/asm/program_prologue_linux.inc
+++ b/src/asm/program_prologue_linux.inc
@@ -7,13 +7,14 @@
push r15
;# function arguments
+ mov rbx, rcx ;# loop counter (4th integer argument); saved first because rcx is overwritten below
push rdi ;# RegisterFile& registerFile
- mov rbp, qword ptr [rsi] ;# "mx", "ma"
- mov rax, qword ptr [rsi+8] ;# uint8_t* dataset
- push rax
- mov rsi, rdx ;# convertible_t* scratchpad
mov rcx, rdi
+ mov rbp, qword ptr [rsi] ;# "mx", "ma"
+ mov eax, ebp ;# "mx" (low 32 bits of rbp)
+ mov rdi, qword ptr [rsi+8] ;# uint8_t* dataset (rdi is free: the register file pointer was copied to rcx above)
+ mov rsi, rdx ;# convertible_t* scratchpad (done last, after both reads through rsi)
#include "program_prologue_load.inc"
- jmp randomx_program_begin
\ No newline at end of file
+ jmp DECL(randomx_loop_begin)
\ No newline at end of file
diff --git a/src/asm/program_prologue_load.inc b/src/asm/program_prologue_load.inc
index 9ceeed6..ecdd4f9 100644
--- a/src/asm/program_prologue_load.inc
+++ b/src/asm/program_prologue_load.inc
@@ -1,27 +1,20 @@
- mov rdi, rsp ;# beginning of VM stack
- mov ebx, 262145 ;# number of VM instructions to execute + 1
+ ;# zero integer registers (r8-r15 are the eight VM integer registers)
+ xor r8, r8
+ xor r9, r9
+ xor r10, r10
+ xor r11, r11
+ xor r12, r12
+ xor r13, r13
+ xor r14, r14
+ xor r15, r15
- xorps xmm10, xmm10
- cmpeqpd xmm10, xmm10
- psrlq xmm10, 1 ;# mask for absolute value = 0x7fffffffffffffff7fffffffffffffff
+ ;# load constant registers: xmm8-xmm11 from the register file, xmm13-xmm15 from the constant pool
+ lea rcx, [rcx+120] ;# bias the pointer; presumably so the offsets 72..120 below fit in a signed 8-bit displacement
+ movapd xmm8, xmmword ptr [rcx+72] ;# = byte 192 of the register file (120 + 72)
+ movapd xmm9, xmmword ptr [rcx+88]
+ movapd xmm10, xmmword ptr [rcx+104]
+ movapd xmm11, xmmword ptr [rcx+120]
+ movapd xmm13, xmmword ptr [minDbl] ;# movapd needs these labels to be 16-byte aligned
+ movapd xmm14, xmmword ptr [absMask]
+ movapd xmm15, xmmword ptr [signMask]
- ;# load integer registers
- mov r8, qword ptr [rcx+0]
- mov r9, qword ptr [rcx+8]
- mov r10, qword ptr [rcx+16]
- mov r11, qword ptr [rcx+24]
- mov r12, qword ptr [rcx+32]
- mov r13, qword ptr [rcx+40]
- mov r14, qword ptr [rcx+48]
- mov r15, qword ptr [rcx+56]
-
- ;# load floating point registers
- movapd xmm8, xmmword ptr [rcx+64]
- movapd xmm9, xmmword ptr [rcx+80]
- movapd xmm2, xmmword ptr [rcx+96]
- movapd xmm3, xmmword ptr [rcx+112]
- lea rcx, [rcx+64]
- movapd xmm4, xmmword ptr [rcx+64]
- movapd xmm5, xmmword ptr [rcx+80]
- movapd xmm6, xmmword ptr [rcx+96]
- movapd xmm7, xmmword ptr [rcx+112]
diff --git a/src/asm/program_prologue_win64.inc b/src/asm/program_prologue_win64.inc
index bbf7851..83ae2a5 100644
--- a/src/asm/program_prologue_win64.inc
+++ b/src/asm/program_prologue_win64.inc
@@ -13,14 +13,21 @@
movdqu xmmword ptr [rsp+32], xmm8
movdqu xmmword ptr [rsp+16], xmm9
movdqu xmmword ptr [rsp+0], xmm10
+ sub rsp, 80 ; room for xmm11-xmm15 (5 x 16 bytes); the epilogue releases it with "add rsp, 80"
+ movdqu xmmword ptr [rsp+64], xmm11
+ movdqu xmmword ptr [rsp+48], xmm12
+ movdqu xmmword ptr [rsp+32], xmm13
+ movdqu xmmword ptr [rsp+16], xmm14
+ movdqu xmmword ptr [rsp+0], xmm15
- ;# function arguments
- push rcx ;# RegisterFile& registerFile
- mov rbp, qword ptr [rdx] ;# "mx", "ma"
- mov rax, qword ptr [rdx+8] ;# uint8_t* dataset
- push rax
- mov rsi, r8 ;# convertible_t* scratchpad
+ ; function arguments
+ push rcx ; RegisterFile& registerFile (popped again by the epilogue's "pop rcx")
+ mov rbp, qword ptr [rdx] ; "mx", "ma"
+ mov eax, ebp ; "mx" (low 32 bits of rbp)
+ mov rdi, qword ptr [rdx+8] ; uint8_t* dataset
+ mov rsi, r8 ; convertible_t* scratchpad
+ mov rbx, r9 ; loop counter (4th argument)
include program_prologue_load.inc
- jmp randomx_program_begin
\ No newline at end of file
+ jmp randomx_loop_begin
\ No newline at end of file
diff --git a/src/asm/program_read.inc b/src/asm/program_read.inc
deleted file mode 100644
index c7650ea..0000000
--- a/src/asm/program_read.inc
+++ /dev/null
@@ -1,20 +0,0 @@
- db 0, 0, 0, 0 ;# TransformAddress placeholder
- mov rcx, qword ptr [rdi] ;# load the dataset address
- xor rbp, rax ;# modify "mx"
- ;# prefetch cacheline "mx"
- and rbp, -64 ;# align "mx" to the start of a cache line
- mov edx, ebp ;# edx = mx
- prefetchnta byte ptr [rcx+rdx]
- ;# read cacheline "ma"
- ror rbp, 32 ;# swap "ma" and "mx"
- mov edx, ebp ;# edx = ma
- lea rcx, [rcx+rdx] ;# dataset cache line
- xor r8, qword ptr [rcx+0]
- xor r9, qword ptr [rcx+8]
- xor r10, qword ptr [rcx+16]
- xor r11, qword ptr [rcx+24]
- xor r12, qword ptr [rcx+32]
- xor r13, qword ptr [rcx+40]
- xor r14, qword ptr [rcx+48]
- xor r15, qword ptr [rcx+56]
- ret
\ No newline at end of file
diff --git a/src/asm/program_read_dataset.inc b/src/asm/program_read_dataset.inc
new file mode 100644
index 0000000..bae4817
--- /dev/null
+++ b/src/asm/program_read_dataset.inc
@@ -0,0 +1,16 @@
+ xor rbp, rax ;# modify "mx"
+ and rbp, -64 ;# align "mx" to the start of a cache line
+ mov edx, ebp ;# edx = mx
+ prefetchnta byte ptr [rdi+rdx] ;# prefetch cache line "mx" (rdi = dataset pointer per the prologue)
+ ror rbp, 32 ;# swap "ma" and "mx"
+ mov edx, ebp ;# edx = ma
+ lea rcx, [rdi+rdx] ;# dataset cache line
+ xor r8, qword ptr [rcx+0] ;# mix the 64-byte line at "ma" into the eight integer registers
+ xor r9, qword ptr [rcx+8]
+ xor r10, qword ptr [rcx+16]
+ xor r11, qword ptr [rcx+24]
+ xor r12, qword ptr [rcx+32]
+ xor r13, qword ptr [rcx+40]
+ xor r14, qword ptr [rcx+48]
+ xor r15, qword ptr [rcx+56]
+
\ No newline at end of file
diff --git a/src/asm/program_store_flt.inc b/src/asm/program_store_flt.inc
new file mode 100644
index 0000000..d6ca7f1
--- /dev/null
+++ b/src/asm/program_store_flt.inc
@@ -0,0 +1,11 @@
+ and eax, 262080 ;# 262080 = 262144 - 64: keep a 64-byte-aligned offset below 256 KiB
+ lea rcx, [rsi+rax] ;# rcx = rsi + offset (rsi holds the scratchpad pointer per the prologue)
+ mulpd xmm0, xmm4 ;# combine the register pairs: xmm0-xmm3 *= xmm4-xmm7
+ mulpd xmm1, xmm5
+ mulpd xmm2, xmm6
+ mulpd xmm3, xmm7
+ movapd xmmword ptr [rcx+0], xmm0 ;# write the four products as one 64-byte line (aligned store; the mask above guarantees the alignment of the offset)
+ movapd xmmword ptr [rcx+16], xmm1
+ movapd xmmword ptr [rcx+32], xmm2
+ movapd xmmword ptr [rcx+48], xmm3
+
diff --git a/src/asm/program_store_int.inc b/src/asm/program_store_int.inc
new file mode 100644
index 0000000..75c973f
--- /dev/null
+++ b/src/asm/program_store_int.inc
@@ -0,0 +1,10 @@
+ and eax, 262080 ;# 262080 = 262144 - 64: keep a 64-byte-aligned offset below 256 KiB
+ lea rcx, [rsi+rax] ;# rcx = rsi + offset (rsi holds the scratchpad pointer per the prologue)
+ mov qword ptr [rcx+0], r8 ;# write the eight integer registers as one 64-byte line
+ mov qword ptr [rcx+8], r9
+ mov qword ptr [rcx+16], r10
+ mov qword ptr [rcx+24], r11
+ mov qword ptr [rcx+32], r12
+ mov qword ptr [rcx+40], r13
+ mov qword ptr [rcx+48], r14
+ mov qword ptr [rcx+56], r15
diff --git a/src/asm/program_xmm_constants.inc b/src/asm/program_xmm_constants.inc
new file mode 100644
index 0000000..38c897c
--- /dev/null
+++ b/src/asm/program_xmm_constants.inc
@@ -0,0 +1,6 @@
+minDbl: ;# two copies of 0x0010000000000000 (little-endian), the smallest positive normal double
+ db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0
+absMask: ;# two copies of 0x7fffffffffffffff: AND with it clears the sign bit of each double
+ db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127
+signMask: ;# two copies of 0x8000000000000000: XOR with it flips the sign bit of each double
+ db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128
\ No newline at end of file
diff --git a/src/common.hpp b/src/common.hpp
index bf235ec..053f2a1 100644
--- a/src/common.hpp
+++ b/src/common.hpp
@@ -81,6 +81,8 @@ namespace RandomX {
constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(convertible_t);
constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8;
constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8;
+ constexpr int ScratchpadL1Mask16 = (ScratchpadL1 / 2 - 1) * 16; // byte-offset mask like ScratchpadL1Mask, but in 16-byte instead of 8-byte steps
+ constexpr int ScratchpadL2Mask16 = (ScratchpadL2 / 2 - 1) * 16; // same for L2; assumes ScratchpadL2 is even - TODO confirm it is a power of two
constexpr uint32_t TransformationCount = 90;
constexpr int RegistersCount = 8;
@@ -129,7 +131,7 @@ namespace RandomX {
typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, RegisterFile&);
- typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*);
+ typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t);
extern "C" {
void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t);
diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm
index 17e593d..be3bc82 100644
--- a/src/executeProgram-win64.asm
+++ b/src/executeProgram-win64.asm
@@ -21,14 +21,6 @@ _RANDOMX_EXECUTE_PROGRAM SEGMENT PAGE READ EXECUTE
PUBLIC executeProgram
-ALIGN 16
-minDbl:
-db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0
-absMask:
-db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127
-signMask:
-db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128
-
executeProgram PROC
; REGISTER ALLOCATION:
; rax -> temporary
@@ -114,6 +106,17 @@ executeProgram PROC
movapd xmm14, xmmword ptr [absMask]
movapd xmm15, xmmword ptr [signMask]
+ jmp program_begin ; skip over the constant pool that now sits inside the procedure
+
+ALIGN 64
+minDbl: ; two copies of 0x0010000000000000 (little-endian), the smallest positive normal double
+ db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0
+absMask: ; two copies of 0x7fffffffffffffff
+ db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127
+signMask: ; two copies of 0x8000000000000000
+ db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128
+
+ALIGN 64
program_begin:
xor eax, r8d ;# read address register 1
and eax, 262080
@@ -144,7 +147,7 @@ program_begin:
;# 256 instructions
include program.inc
-
+
mov eax, r8d ;# read address register 1
xor eax, r9d ;# read address register 2
xor rbp, rax ;# modify "mx"
diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp
index 242b5bd..86285de 100644
--- a/src/instructionWeights.hpp
+++ b/src/instructionWeights.hpp
@@ -22,21 +22,21 @@ along with RandomX. If not, see.
//Integer
#define WT_IADD_R 10
#define WT_IADD_M 3
-#define WT_IADD_RC 12
+#define WT_IADD_RC 10
#define WT_ISUB_R 10
#define WT_ISUB_M 3
-#define WT_IMUL_9C 12
-#define WT_IMUL_R 24
-#define WT_IMUL_M 8
+#define WT_IMUL_9C 10
+#define WT_IMUL_R 20
+#define WT_IMUL_M 6
#define WT_IMULH_R 6
#define WT_IMULH_M 2
#define WT_ISMULH_R 6
#define WT_ISMULH_M 2
#define WT_IDIV_C 4
-#define WT_ISDIV_C 2
-#define WT_INEG_R 4
-#define WT_IXOR_R 15
-#define WT_IXOR_M 5
+#define WT_ISDIV_C 4
+#define WT_INEG_R 2
+#define WT_IXOR_R 12
+#define WT_IXOR_M 4
#define WT_IROR_R 10
#define WT_IROL_R 10
@@ -58,10 +58,14 @@ along with RandomX. If not, see.
#define WT_FPSQRT_R 6
//Control
-#define WT_COND_R 15
-#define WT_COND_M 5
+#define WT_COND_R 12
+#define WT_COND_M 4
#define WT_CFROUND 1
+//Store
+#define WT_ISTORE 12
+#define WT_FSTORE 6
+
#define WT_NOP 0
constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \
@@ -70,7 +74,7 @@ WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \
WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \
WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \
WT_FPNEG_R + WT_FPMUL_R + WT_FPMUL_M + WT_FPDIV_R + WT_FPDIV_M + \
-WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_NOP;
+WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP;
static_assert(wtSum == 256,
"Sum of instruction weights must be 256");
@@ -116,3 +120,40 @@ static_assert(wtSum == 256,
#define REPN(x,N) REPNX(x,N)
#define NUM(x) x
#define WT(x) NUM(WT_##x)
+
+#define REPCASE0(x)
+#define REPCASE1(x) case __COUNTER__:
+#define REPCASE2(x) REPCASE1(x) case __COUNTER__:
+#define REPCASE3(x) REPCASE2(x) case __COUNTER__:
+#define REPCASE4(x) REPCASE3(x) case __COUNTER__:
+#define REPCASE5(x) REPCASE4(x) case __COUNTER__:
+#define REPCASE6(x) REPCASE5(x) case __COUNTER__:
+#define REPCASE7(x) REPCASE6(x) case __COUNTER__:
+#define REPCASE8(x) REPCASE7(x) case __COUNTER__:
+#define REPCASE9(x) REPCASE8(x) case __COUNTER__:
+#define REPCASE10(x) REPCASE9(x) case __COUNTER__:
+#define REPCASE11(x) REPCASE10(x) case __COUNTER__:
+#define REPCASE12(x) REPCASE11(x) case __COUNTER__:
+#define REPCASE13(x) REPCASE12(x) case __COUNTER__:
+#define REPCASE14(x) REPCASE13(x) case __COUNTER__:
+#define REPCASE15(x) REPCASE14(x) case __COUNTER__:
+#define REPCASE16(x) REPCASE15(x) case __COUNTER__:
+#define REPCASE17(x) REPCASE16(x) case __COUNTER__:
+#define REPCASE18(x) REPCASE17(x) case __COUNTER__:
+#define REPCASE19(x) REPCASE18(x) case __COUNTER__:
+#define REPCASE20(x) REPCASE19(x) case __COUNTER__:
+#define REPCASE21(x) REPCASE20(x) case __COUNTER__:
+#define REPCASE22(x) REPCASE21(x) case __COUNTER__:
+#define REPCASE23(x) REPCASE22(x) case __COUNTER__:
+#define REPCASE24(x) REPCASE23(x) case __COUNTER__:
+#define REPCASE25(x) REPCASE24(x) case __COUNTER__:
+#define REPCASE26(x) REPCASE25(x) case __COUNTER__:
+#define REPCASE27(x) REPCASE26(x) case __COUNTER__:
+#define REPCASE28(x) REPCASE27(x) case __COUNTER__:
+#define REPCASE29(x) REPCASE28(x) case __COUNTER__:
+#define REPCASE30(x) REPCASE29(x) case __COUNTER__:
+#define REPCASE31(x) REPCASE30(x) case __COUNTER__:
+#define REPCASE32(x) REPCASE31(x) case __COUNTER__:
+#define REPCASENX(x,N) REPCASE##N(x)
+#define REPCASEN(x,N) REPCASENX(x,N)
+#define CASE_REP(x) REPCASEN(x, WT(x))
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index 0b09a74..12e9cdb 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -174,7 +174,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic<int>& atomicNonce, AtomicHash
for (int chain = 0; chain < 16; ++chain) {
vm->initializeProgram(hash);
int segment = hash[3] & 3;
- vm->setScratchpad(scratchpad);// +segment * RandomX::ScratchpadSize / 4);
+ vm->setScratchpad(scratchpad + segment * RandomX::ScratchpadSize / 4);
vm->execute();
vm->getResult(nullptr, 0, hash);
}
diff --git a/src/program.inc b/src/program.inc
index a91240e..21f7d0b 100644
--- a/src/program.inc
+++ b/src/program.inc
@@ -1,745 +1,793 @@
- ; ISUB_R r0, r4
- sub r8, r12
- ; IROR_R r5, 15
- ror r13, 15
- ; ISUB_M r6, L1[r5]
- mov eax, r13d
- and eax, 16376
- sub r14, qword ptr [rsi+rax]
- ; IMUL_R r7, r6
- imul r15, r14
- ; FPADD_R f3, a1
- addpd xmm3, xmm9
- ; FPMUL_R e1, a3
- mulpd xmm5, xmm11
- ; IMUL_R r2, r4
- imul r10, r12
- ; IADD_RC r4, r5, 1789610138
- lea r12, [r12+r13+1789610138]
- ; IADD_R r1, r4
- add r9, r12
- ; IADD_R r6, r0
- add r14, r8
- ; IXOR_R r7, r2
- xor r15, r10
- ; ISMULH_M r6, L1[6816]
- mov rax, r14
- imul qword ptr [rsi+6816]
- mov r14, rdx
- ; ISUB_R r0, r4
- sub r8, r12
- ; IXOR_R r7, r2
- xor r15, r10
- ; INEG_R r4
- neg r12
- ; IROL_R r3, r0
- mov ecx, r8d
- rol r11, cl
- ; IADD_RC r2, r5, -1667142135
- lea r10, [r10+r13-1667142135]
- ; ISUB_R r6, r2
- sub r14, r10
- ; IDIV_C r3, 2650709570
- mov rax, 3736177069856446853
- mul r11
- shr rdx, 29
- add r11, rdx
- ; IMULH_R r3, r0
- mov rax, r11
- mul r8
- mov r11, rdx
- ; FPSUB_R f0, a2
- subpd xmm0, xmm10
- ; FPADD_M f3, L2[r4]
- mov eax, r12d
+ ; FPMUL_R e0, a2
+ mulpd xmm4, xmm10
+ ; IADD_RC r2, r5, -1621224194
+ lea r10, [r10+r13-1621224194]
+ ; ISTORE L2[r2], r7
+ mov eax, r10d
and eax, 262136
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- addpd xmm3, xmm12
- ; FPMUL_M e1, L1[r5]
- mov eax, r13d
+ mov qword ptr [rsi+rax], r15
+ ; FPMUL_R e2, a2
+ mulpd xmm6, xmm10
+ ; IMUL_R r6, r3
+ imul r14, r11
+ ; FPMUL_R e1, a0
+ mulpd xmm5, xmm8
+ ; IROR_R r5, r3
+ mov ecx, r11d
+ ror r13, cl
+ ; FPMUL_R e2, a0
+ mulpd xmm6, xmm8
+ ; FPNEG_R f3
+ xorps xmm3, xmm15
+ ; IXOR_R r0, r4
+ xor r8, r12
+ ; ISMULH_R r3, r7
+ mov rax, r11
+ imul r15
+ mov r11, rdx
+ ; FPSWAP_R f2
+ shufpd xmm2, xmm2, 1
+ ; ISMULH_R r6, r0
+ mov rax, r14
+ imul r8
+ mov r14, rdx
+ ; FPMUL_R e0, a2
+ mulpd xmm4, xmm10
+ ; ISUB_R r3, r4
+ sub r11, r12
+ ; IADD_R r7, -1138617760
+ add r15, -1138617760
+ ; IROR_R r2, r6
+ mov ecx, r14d
+ ror r10, cl
+ ; FPMUL_R e2, a1
+ mulpd xmm6, xmm9
+ ; IROR_R r7, r1
+ mov ecx, r9d
+ ror r15, cl
+ ; COND_M r2, lt(L1[r7], -41618808)
+ xor ecx, ecx
+ mov eax, r15d
and eax, 16376
+ cmp dword ptr [rsi+rax], -41618808
+ setl cl
+ add r10, rcx
+ ; FPMUL_M e3, L1[r0]
+ mov eax, r8d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ mulpd xmm7, xmm12
+ maxpd xmm7, xmm13
+ ; CFROUND r1, 43
+ mov rax, r9
+ rol rax, 34
+ and eax, 24576
+ or eax, 40896
+ mov dword ptr [rsp-8], eax
+ ldmxcsr dword ptr [rsp-8]
+ ; FPADD_R f2, a1
+ addpd xmm2, xmm9
+ ; FPNEG_R f0
+ xorps xmm0, xmm15
+ ; FSTORE L1[r6], f2
+ mov eax, r14d
+ and eax, 16368
+ movapd xmmword ptr [rsi+rax], xmm2
+ ; IMUL_9C r6, -45112665
+ lea r14, [r14+r14*8-45112665]
+ ; IADD_M r0, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ add r8, qword ptr [rsi+rax]
+ ; ISTORE L1[r4], r3
+ mov eax, r12d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r11
+ ; ISTORE L1[r6], r6
+ mov eax, r14d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r14
+ ; COND_R r4, sg(r1, -1189096105)
+ xor ecx, ecx
+ cmp r9d, -1189096105
+ sets cl
+ add r12, rcx
+ ; IXOR_R r2, r5
+ xor r10, r13
+ ; COND_R r1, be(r5, -965180434)
+ xor ecx, ecx
+ cmp r13d, -965180434
+ setbe cl
+ add r9, rcx
+ ; FPMUL_M e1, L2[r3]
+ mov eax, r11d
+ and eax, 262136
cvtdq2pd xmm12, qword ptr [rsi+rax]
mulpd xmm5, xmm12
maxpd xmm5, xmm13
- ; IMUL_9C r7, -778247271
- lea r15, [r15+r15*8-778247271]
- ; IXOR_R r4, 1846379510
- xor r12, 1846379510
- ; COND_M r6, of(L1[r1], -397786451)
+ ; IMULH_R r7, r6
+ mov rax, r15
+ mul r14
+ mov r15, rdx
+ ; ISMULH_M r0, L1[r4]
+ mov ecx, r12d
+ and ecx, 16376
+ mov rax, r8
+ imul qword ptr [rsi+rcx]
+ mov r8, rdx
+ ; IMUL_R r5, r3
+ imul r13, r11
+ ; COND_R r2, of(r0, -1045938770)
xor ecx, ecx
- mov eax, r9d
- and eax, 16376
- cmp dword ptr [rsi+rax], -397786451
+ cmp r8d, -1045938770
seto cl
- add r14, rcx
- ; COND_R r6, of(r3, -1033710571)
- xor ecx, ecx
- cmp r11d, -1033710571
- seto cl
- add r14, rcx
- ; COND_M r6, sg(L1[r6], 1413230028)
- xor ecx, ecx
- mov eax, r14d
- and eax, 16376
- cmp dword ptr [rsi+rax], 1413230028
- sets cl
- add r14, rcx
- ; IDIV_C r0, 2791108943
- mov rax, 1774119268816201525
- mul r8
- shr rdx, 28
- add r8, rdx
- ; FPSUB_M f1, L1[r6]
- mov eax, r14d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm1, xmm12
- ; FPSWAP_R f0
- shufpd xmm0, xmm0, 1
- ; IADD_RC r6, r5, -640194892
- lea r14, [r14+r13-640194892]
- ; FPADD_M f0, L1[r2]
- mov eax, r10d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- addpd xmm0, xmm12
- ; IMUL_R r6, r5
- imul r14, r13
- ; IROL_R r4, r1
- mov ecx, r9d
- rol r12, cl
- ; FPDIV_R e2, a0
- divpd xmm6, xmm8
- maxpd xmm6, xmm13
- ; IADD_RC r0, r2, -487084195
- lea r8, [r8+r10-487084195]
- ; FPADD_R f0, a0
- addpd xmm0, xmm8
- ; IXOR_R r5, r3
- xor r13, r11
- ; IMUL_R r2, r4
- imul r10, r12
- ; FPMUL_R e0, a0
- mulpd xmm4, xmm8
- ; FPSUB_R f3, a3
- subpd xmm3, xmm11
- ; IMUL_M r4, L1[4856]
- imul r12, qword ptr [rsi+4856]
- ; IMUL_9C r2, 7951348
- lea r10, [r10+r10*8+7951348]
- ; COND_R r3, ab(r7, 984532162)
- xor ecx, ecx
- cmp r15d, 984532162
- seta cl
- add r11, rcx
- ; IXOR_M r7, L1[r4]
+ add r10, rcx
+ ; FPADD_M f3, L1[r4]
mov eax, r12d
and eax, 16376
- xor r15, qword ptr [rsi+rax]
- ; IMUL_R r4, 248971329
- imul r12, 248971329
- ; IXOR_R r3, r1
- xor r11, r9
- ; IMUL_R r3, 2098482639
- imul r11, 2098482639
- ; IXOR_R r6, r3
- xor r14, r11
- ; IXOR_R r5, r4
- xor r13, r12
- ; IADD_R r5, r4
- add r13, r12
- ; IMUL_9C r7, 66530302
- lea r15, [r15+r15*8+66530302]
- ; IMULH_R r0, r5
- mov rax, r8
- mul r13
- mov r8, rdx
- ; IMUL_R r2, r7
- imul r10, r15
- ; IMUL_R r1, 770985098
- imul r9, 770985098
- ; COND_R r7, be(r5, 58538265)
- xor ecx, ecx
- cmp r13d, 58538265
- setbe cl
- add r15, rcx
- ; IMUL_9C r3, 245704334
- lea r11, [r11+r11*8+245704334]
- ; ISMULH_R r2, r4
- mov rax, r10
- imul r12
- mov r10, rdx
- ; FPDIV_R e3, a3
- divpd xmm7, xmm11
- maxpd xmm7, xmm13
- ; IMULH_R r5, r2
- mov rax, r13
- mul r10
- mov r13, rdx
- ; ISUB_M r7, L1[r5]
- mov eax, r13d
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm3, xmm12
+ ; IADD_R r3, r2
+ add r11, r10
+ ; FPADD_R f1, a0
+ addpd xmm1, xmm8
+ ; FPSQRT_R e3
+ sqrtpd xmm7, xmm7
+ ; FPSUB_R f0, a1
+ subpd xmm0, xmm9
+ ; IMUL_M r5, L1[r6]
+ mov eax, r14d
and eax, 16376
- sub r15, qword ptr [rsi+rax]
- ; FPMUL_R e3, a3
- mulpd xmm7, xmm11
- ; IMUL_R r3, r4
- imul r11, r12
- ; FPSWAP_R f1
- shufpd xmm1, xmm1, 1
- ; IMULH_R r1, 633797287
- mov eax, 633797287
- mul r9
- add r9, rdx
- ; IADD_R r4, r3
- add r12, r11
- ; IROR_R r2, r7
- mov ecx, r15d
- ror r10, cl
- ; FPSUB_R f0, a2
- subpd xmm0, xmm10
- ; FPSUB_R f2, a2
- subpd xmm2, xmm10
- ; FPMUL_R e0, a2
- mulpd xmm4, xmm10
- ; IMUL_M r4, L1[r3]
- mov eax, r11d
- and eax, 16376
- imul r12, qword ptr [rsi+rax]
- ; IMUL_9C r1, -1901091890
- lea r9, [r9+r9*8-1901091890]
- ; IROR_R r2, r6
- mov ecx, r14d
- ror r10, cl
- ; IMULH_R r5, r3
- mov rax, r13
- mul r11
- mov r13, rdx
- ; FPSUB_M f1, L1[r7]
+ imul r13, qword ptr [rsi+rax]
+ ; ISUB_R r1, r2
+ sub r9, r10
+ ; IMUL_R r4, r6
+ imul r12, r14
+ ; FPSWAP_R e3
+ shufpd xmm7, xmm7, 1
+ ; IMUL_M r0, L1[r7]
mov eax, r15d
and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm1, xmm12
- ; IMUL_M r2, L1[r1]
- mov eax, r9d
- and eax, 16376
- imul r10, qword ptr [rsi+rax]
- ; IMUL_R r6, r0
- imul r14, r8
- ; IADD_R r7, r6
- add r15, r14
- ; FPSUB_R f2, a3
- subpd xmm2, xmm11
- ; COND_R r5, no(r2, -1589295370)
- xor ecx, ecx
- cmp r10d, -1589295370
- setno cl
- add r13, rcx
- ; IMUL_9C r7, 420978486
- lea r15, [r15+r15*8+420978486]
- ; IROL_R r4, r2
- mov ecx, r10d
- rol r12, cl
- ; IMUL_9C r0, -1084530831
- lea r8, [r8+r8*8-1084530831]
- ; FPNEG_R f3
- xorps xmm3, xmm15
- ; IROR_R r6, r4
- mov ecx, r12d
- ror r14, cl
- ; IROL_R r4, r5
- mov ecx, r13d
- rol r12, cl
- ; FPSUB_R f2, a3
- subpd xmm2, xmm11
- ; FPMUL_R e2, a2
- mulpd xmm6, xmm10
- ; ISMULH_M r6, L2[98600]
- mov rax, r14
- imul qword ptr [rsi+98600]
- mov r14, rdx
- ; IXOR_R r0, r6
- xor r8, r14
- ; FPSWAP_R f1
- shufpd xmm1, xmm1, 1
- ; FPADD_R f0, a1
- addpd xmm0, xmm9
- ; COND_R r1, ab(r3, -991705199)
- xor ecx, ecx
- cmp r11d, -991705199
- seta cl
- add r9, rcx
- ; IMULH_M r4, L2[r2]
- mov ecx, r10d
- and ecx, 262136
- mov rax, r12
- mul qword ptr [rsi+rcx]
- mov r12, rdx
- ; IROR_R r2, r6
- mov ecx, r14d
- ror r10, cl
- ; FPDIV_R e0, a1
- divpd xmm4, xmm9
- maxpd xmm4, xmm13
- ; IMUL_R r1, r7
- imul r9, r15
- ; COND_R r6, ns(r2, 939392855)
- xor ecx, ecx
- cmp r10d, 939392855
- setns cl
- add r14, rcx
- ; FPMUL_R e3, a1
- mulpd xmm7, xmm9
- ; COND_R r2, ab(r2, -499266314)
- xor ecx, ecx
- cmp r10d, -499266314
- seta cl
- add r10, rcx
- ; COND_M r7, lt(L1[r1], -1624420482)
- xor ecx, ecx
- mov eax, r9d
- and eax, 16376
- cmp dword ptr [rsi+rax], -1624420482
- setl cl
- add r15, rcx
- ; COND_R r1, lt(r1, 1525413977)
- xor ecx, ecx
- cmp r9d, 1525413977
- setl cl
- add r9, rcx
- ; IMUL_R r4, r5
- imul r12, r13
- ; IMUL_R r4, r2
- imul r12, r10
- ; FPSQRT_R e1
- sqrtpd xmm5, xmm5
- ; ISUB_R r2, r6
- sub r10, r14
- ; FPDIV_R e1, a0
- divpd xmm5, xmm8
- maxpd xmm5, xmm13
- ; FPMUL_R e2, a3
- mulpd xmm6, xmm11
- ; IADD_R r6, 671627590
- add r14, 671627590
- ; COND_M r6, sg(L1[r4], -780452820)
- xor ecx, ecx
- mov eax, r12d
- and eax, 16376
- cmp dword ptr [rsi+rax], -780452820
- sets cl
- add r14, rcx
- ; IMULH_R r4, r7
- mov rax, r12
- mul r15
- mov r12, rdx
- ; FPMUL_R e3, a1
- mulpd xmm7, xmm9
- ; FPADD_R f0, a0
- addpd xmm0, xmm8
- ; FPMUL_R e0, a1
- mulpd xmm4, xmm9
- ; IMUL_R r7, r3
- imul r15, r11
- ; IROL_R r0, r7
- mov ecx, r15d
- rol r8, cl
- ; IMUL_R r1, r7
- imul r9, r15
- ; COND_R r0, no(r7, 449007464)
- xor ecx, ecx
- cmp r15d, 449007464
- setno cl
- add r8, rcx
- ; ISMULH_M r6, L2[134288]
- mov rax, r14
- imul qword ptr [rsi+134288]
- mov r14, rdx
- ; IMULH_R r5, r2
- mov rax, r13
- mul r10
- mov r13, rdx
- ; IMULH_R r7, r4
- mov rax, r15
- mul r12
- mov r15, rdx
- ; FPDIV_R e3, a0
- divpd xmm7, xmm8
- maxpd xmm7, xmm13
- ; IXOR_R r3, r4
- xor r11, r12
- ; IDIV_C r1, 72349044
- mov rax, 8555331009525020641
- mul r9
- shr rdx, 25
- add r9, rdx
- ; IADD_R r5, r4
- add r13, r12
- ; IROR_R r2, r4
- mov ecx, r12d
- ror r10, cl
- ; FPSUB_M f1, L1[r2]
- mov eax, r10d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm1, xmm12
- ; FPMUL_R e2, a3
- mulpd xmm6, xmm11
- ; IADD_R r5, r6
- add r13, r14
- ; IXOR_M r1, L1[r4]
- mov eax, r12d
- and eax, 16376
- xor r9, qword ptr [rsi+rax]
- ; ISUB_R r2, -1544880589
- sub r10, -1544880589
- ; FPNEG_R f0
- xorps xmm0, xmm15
+ imul r8, qword ptr [rsi+rax]
; IROR_R r1, r6
mov ecx, r14d
ror r9, cl
- ; IMUL_R r6, r4
- imul r14, r12
- ; IMULH_M r4, L2[r1]
- mov ecx, r9d
- and ecx, 262136
- mov rax, r12
- mul qword ptr [rsi+rcx]
- mov r12, rdx
- ; IXOR_R r3, r0
- xor r11, r8
- ; FPSWAP_R f0
- shufpd xmm0, xmm0, 1
- ; FPSWAP_R f0
- shufpd xmm0, xmm0, 1
- ; COND_R r0, ns(r2, -308295242)
- xor ecx, ecx
- cmp r10d, -308295242
- setns cl
- add r8, rcx
- ; IMUL_9C r1, 591587965
- lea r9, [r9+r9*8+591587965]
- ; FPADD_R f3, a1
- addpd xmm3, xmm9
- ; IMUL_R r5, r4
- imul r13, r12
- ; IMUL_M r7, L1[r0]
- mov eax, r8d
- and eax, 16376
- imul r15, qword ptr [rsi+rax]
- ; COND_R r6, sg(r5, -1119525789)
- xor ecx, ecx
- cmp r13d, -1119525789
- sets cl
- add r14, rcx
- ; IMUL_M r0, L1[r1]
- mov eax, r9d
- and eax, 16376
- imul r8, qword ptr [rsi+rax]
- ; IADD_M r3, L2[r7]
- mov eax, r15d
- and eax, 262136
- add r11, qword ptr [rsi+rax]
- ; IADD_R r0, r1
- add r8, r9
- ; FPSUB_R f2, a1
- subpd xmm2, xmm9
- ; IXOR_M r0, L2[r7]
- mov eax, r15d
- and eax, 262136
- xor r8, qword ptr [rsi+rax]
- ; COND_R r6, be(r6, 1481939391)
- xor ecx, ecx
- cmp r14d, 1481939391
- setbe cl
- add r14, rcx
- ; FPADD_R f0, a1
- addpd xmm0, xmm9
- ; IXOR_R r3, r2
- xor r11, r10
- ; FPSUB_R f0, a1
- subpd xmm0, xmm9
- ; IXOR_R r7, r3
- xor r15, r11
- ; IXOR_M r6, L1[r4]
- mov eax, r12d
- and eax, 16376
- xor r14, qword ptr [rsi+rax]
- ; IMULH_R r2, r7
- mov rax, r10
- mul r15
- mov r10, rdx
- ; ISUB_R r5, r1
- sub r13, r9
- ; FPMUL_R e1, a3
- mulpd xmm5, xmm11
- ; FPADD_R f3, a2
- addpd xmm3, xmm10
- ; FPSWAP_R f1
- shufpd xmm1, xmm1, 1
- ; FPSUB_R f1, a3
- subpd xmm1, xmm11
- ; FPSUB_M f0, L1[r4]
- mov eax, r12d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm0, xmm12
- ; FPMUL_R e1, a2
- mulpd xmm5, xmm10
- ; FPADD_R f3, a0
- addpd xmm3, xmm8
- ; IROL_R r2, r4
+ ; IROR_R r2, r4
mov ecx, r12d
- rol r10, cl
- ; COND_M r7, ab(L2[r7], -2012390318)
- xor ecx, ecx
- mov eax, r15d
- and eax, 262136
- cmp dword ptr [rsi+rax], -2012390318
- seta cl
- add r15, rcx
- ; IMUL_9C r4, -38079585
- lea r12, [r12+r12*8-38079585]
- ; IXOR_R r0, r1
- xor r8, r9
- ; FPMUL_R e1, a3
- mulpd xmm5, xmm11
- ; FPMUL_R e1, a1
- mulpd xmm5, xmm9
- ; FPSUB_R f1, a2
- subpd xmm1, xmm10
- ; IMUL_9C r4, -847745598
- lea r12, [r12+r12*8-847745598]
- ; FPSQRT_R e1
- sqrtpd xmm5, xmm5
- ; IADD_R r7, r6
- add r15, r14
- ; FPSUB_R f3, a0
- subpd xmm3, xmm8
- ; FPSUB_R f1, a1
- subpd xmm1, xmm9
- ; IADD_R r7, r6
- add r15, r14
- ; IROL_R r2, r5
- mov ecx, r13d
- rol r10, cl
- ; IADD_RC r4, r2, 1338806320
- lea r12, [r12+r10+1338806320]
- ; FPSQRT_R e3
- sqrtpd xmm7, xmm7
- ; IMUL_R r5, r0
- imul r13, r8
- ; FPADD_R f2, a1
- addpd xmm2, xmm9
- ; INEG_R r6
- neg r14
- ; IXOR_M r6, L1[r2]
- mov eax, r10d
- and eax, 16376
- xor r14, qword ptr [rsi+rax]
- ; FPSUB_R f2, a2
- subpd xmm2, xmm10
- ; FPADD_R f2, a2
- addpd xmm2, xmm10
- ; FPADD_R f1, a2
- addpd xmm1, xmm10
- ; COND_R r3, be(r4, 174667458)
- xor ecx, ecx
- cmp r12d, 174667458
- setbe cl
- add r11, rcx
- ; INEG_R r6
- neg r14
- ; IXOR_R r6, r3
- xor r14, r11
- ; COND_M r5, sg(L1[r0], -864345921)
- xor ecx, ecx
- mov eax, r8d
- and eax, 16376
- cmp dword ptr [rsi+rax], -864345921
- sets cl
- add r13, rcx
- ; IROL_R r7, r3
- mov ecx, r11d
- rol r15, cl
- ; FPSUB_R f1, a2
- subpd xmm1, xmm10
- ; IADD_M r1, L1[r0]
- mov eax, r8d
- and eax, 16376
- add r9, qword ptr [rsi+rax]
- ; IMULH_R r1, r3
- mov rax, r9
- mul r11
- mov r9, rdx
- ; IMUL_R r0, -1489192296
- imul r8, -1489192296
- ; FPMUL_R e0, a2
- mulpd xmm4, xmm10
- ; COND_R r1, ge(r1, -1358904097)
- xor ecx, ecx
- cmp r9d, -1358904097
- setge cl
- add r9, rcx
- ; FPSUB_R f1, a1
- subpd xmm1, xmm9
- ; FPADD_R f2, a3
- addpd xmm2, xmm11
- ; IROR_R r4, r7
- mov ecx, r15d
- ror r12, cl
- ; ISDIV_C r1, -1368098113
- mov rax, -7238896260565957085
- imul r9
- xor eax, eax
- sar rdx, 29
- sets al
- add rdx, rax
- add r9, rdx
- ; IADD_M r4, L1[r1]
- mov eax, r9d
- and eax, 16376
- add r12, qword ptr [rsi+rax]
- ; IMUL_R r0, -1011605520
- imul r8, -1011605520
+ ror r10, cl
; FPSUB_R f3, a1
subpd xmm3, xmm9
- ; IADD_RC r1, r4, 272540736
- lea r9, [r9+r12+272540736]
+ ; FSTORE L1[r0], e1
+ mov eax, r8d
+ and eax, 16368
+ movapd xmmword ptr [rsi+rax], xmm5
+ ; COND_R r2, sg(r3, 1269153133)
+ xor ecx, ecx
+ cmp r11d, 1269153133
+ sets cl
+ add r10, rcx
; FPSWAP_R f2
shufpd xmm2, xmm2, 1
- ; IROR_R r3, r2
- mov ecx, r10d
- ror r11, cl
- ; IMUL_R r3, 2085105439
- imul r11, 2085105439
- ; FPMUL_R e0, a0
- mulpd xmm4, xmm8
- ; IMUL_9C r6, -483723153
- lea r14, [r14+r14*8-483723153]
- ; FPSUB_M f3, L1[r7]
- mov eax, r15d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm3, xmm12
- ; IMUL_R r3, r2
- imul r11, r10
- ; ISMULH_R r7, r1
- mov rax, r15
- imul r9
- mov r15, rdx
- ; COND_R r1, of(r7, 778804236)
+ ; IADD_R r7, r5
+ add r15, r13
+ ; COND_R r0, be(r4, -1486502150)
xor ecx, ecx
- cmp r15d, 778804236
- seto cl
- add r9, rcx
- ; FPSUB_R f3, a2
- subpd xmm3, xmm10
- ; IROL_R r5, r7
- mov ecx, r15d
- rol r13, cl
- ; FPADD_R f1, a0
- addpd xmm1, xmm8
- ; FPADD_R f2, a3
- addpd xmm2, xmm11
- ; IMUL_R r6, r0
- imul r14, r8
- ; ISUB_M r2, L2[r4]
- mov eax, r12d
- and eax, 262136
- sub r10, qword ptr [rsi+rax]
- ; IXOR_R r0, r6
- xor r8, r14
- ; INEG_R r6
- neg r14
- ; FPMUL_R e2, a3
- mulpd xmm6, xmm11
- ; IADD_RC r4, r6, -1312075035
- lea r12, [r12+r14-1312075035]
- ; IMUL_R r1, r5
- imul r9, r13
- ; IXOR_M r7, L2[r6]
- mov eax, r14d
- and eax, 262136
- xor r15, qword ptr [rsi+rax]
- ; IROR_R r2, 23
- ror r10, 23
- ; FPMUL_R e0, a2
- mulpd xmm4, xmm10
- ; ISMULH_M r5, L1[r2]
- mov ecx, r10d
- and ecx, 16376
- mov rax, r13
- imul qword ptr [rsi+rcx]
- mov r13, rdx
- ; ISUB_M r7, L1[r4]
- mov eax, r12d
- and eax, 16376
- sub r15, qword ptr [rsi+rax]
- ; COND_R r0, sg(r2, 1538841628)
- xor ecx, ecx
- cmp r10d, 1538841628
- sets cl
+ cmp r12d, -1486502150
+ setbe cl
add r8, rcx
- ; IMUL_R r6, r2
- imul r14, r10
- ; ISUB_R r0, r1
- sub r8, r9
+ ; FPSUB_R f3, a1
+ subpd xmm3, xmm9
+ ; FPADD_R f0, a3
+ addpd xmm0, xmm11
+ ; IADD_R r2, r0
+ add r10, r8
+ ; FSTORE L1[r3], e2
+ mov eax, r11d
+ and eax, 16368
+ movapd xmmword ptr [rsi+rax], xmm6
+ ; IXOR_R r1, r7
+ xor r9, r15
; IMUL_R r5, r7
imul r13, r15
- ; IADD_RC r1, r0, 516706834
- lea r9, [r9+r8+516706834]
- ; INEG_R r5
- neg r13
+ ; IXOR_R r7, 266992378
+ xor r15, 266992378
+ ; COND_R r7, no(r4, 1983804692)
+ xor ecx, ecx
+ cmp r12d, 1983804692
+ setno cl
+ add r15, rcx
+ ; IMUL_M r2, L2[r0]
+ mov eax, r8d
+ and eax, 262136
+ imul r10, qword ptr [rsi+rax]
+ ; FPDIV_R e3, a2
+ divpd xmm7, xmm10
+ maxpd xmm7, xmm13
+ ; IMUL_M r0, L2[r6]
+ mov eax, r14d
+ and eax, 262136
+ imul r8, qword ptr [rsi+rax]
+ ; ISTORE L1[r0], r7
+ mov eax, r8d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r15
+ ; FPMUL_R e0, a1
+ mulpd xmm4, xmm9
+ ; FPSUB_R f3, a1
+ subpd xmm3, xmm9
+ ; IROR_R r5, r4
+ mov ecx, r12d
+ ror r13, cl
+ ; ISTORE L2[r7], r2
+ mov eax, r15d
+ and eax, 262136
+ mov qword ptr [rsi+rax], r10
+ ; FPSWAP_R e2
+ shufpd xmm6, xmm6, 1
+ ; FPADD_M f3, L1[r2]
+ mov eax, r10d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm3, xmm12
+ ; IDIV_C r5, 2218798981
+ mov rax, 17853839665672790751
+ mul r13
+ shr rdx, 31
+ add r13, rdx
+ ; IADD_RC r0, r4, -1321374359
+ lea r8, [r8+r12-1321374359]
+ ; CFROUND r6, 28
+ mov rax, r14
+ rol rax, 49
+ and eax, 24576
+ or eax, 40896
+ mov dword ptr [rsp-8], eax
+ ldmxcsr dword ptr [rsp-8]
+ ; FPADD_R f2, a2
+ addpd xmm2, xmm10
+ ; IROL_R r7, r6
+ mov ecx, r14d
+ rol r15, cl
+ ; ISUB_R r2, r4
+ sub r10, r12
+ ; IMULH_M r0, L1[12400]
+ mov rax, r8
+ mul qword ptr [rsi+12400]
+ mov r8, rdx
+ ; IADD_R r2, r3
+ add r10, r11
+ ; COND_R r6, lt(r1, -1124202227)
+ xor ecx, ecx
+ cmp r9d, -1124202227
+ setl cl
+ add r14, rcx
+ ; IROR_R r7, r4
+ mov ecx, r12d
+ ror r15, cl
+ ; IMUL_R r4, r2
+ imul r12, r10
+ ; ISUB_R r3, r7
+ sub r11, r15
+ ; IADD_R r2, r7
+ add r10, r15
; FPSQRT_R e3
sqrtpd xmm7, xmm7
- ; IADD_RC r5, r4, -1679394922
- lea r13, [r13+r12-1679394922]
- ; FPSUB_R f1, a1
- subpd xmm1, xmm9
- ; IMUL_R r0, r2
- imul r8, r10
- ; ISUB_R r3, r2
- sub r11, r10
+ ; ISUB_R r6, 540663146
+ sub r14, 540663146
+ ; IROL_R r5, 58
+ rol r13, 58
+ ; FPADD_R f2, a1
+ addpd xmm2, xmm9
+ ; FPADD_R f2, a2
+ addpd xmm2, xmm10
+ ; FPSQRT_R e1
+ sqrtpd xmm5, xmm5
+ ; FPADD_R f1, a2
+ addpd xmm1, xmm10
+ ; IADD_R r5, r3
+ add r13, r11
+ ; IADD_M r7, L1[880]
+ add r15, qword ptr [rsi+880]
+ ; ISUB_R r7, r0
+ sub r15, r8
+ ; ISTORE L2[r0], r7
+ mov eax, r8d
+ and eax, 262136
+ mov qword ptr [rsi+rax], r15
+ ; IDIV_C r2, 1014940364
+ mov rax, r10
+ shr rax, 2
+ mov rcx, 1219717022984988185
+ mul rcx
+ shr rdx, 24
+ add r10, rdx
+ ; FPMUL_R e0, a2
+ mulpd xmm4, xmm10
+ ; IDIV_C r2, 3059159304
+ mov rax, 12949335853590502915
+ mul r10
+ shr rdx, 31
+ add r10, rdx
+ ; IADD_R r0, r3
+ add r8, r11
+ ; IMUL_9C r7, -2124093035
+ lea r15, [r15+r15*8-2124093035]
+ ; FPSUB_R f2, a0
+ subpd xmm2, xmm8
+ ; FPDIV_R e0, a2
+ divpd xmm4, xmm10
+ maxpd xmm4, xmm13
+ ; FPSUB_R f2, a3
+ subpd xmm2, xmm11
+ ; IMUL_R r1, r2
+ imul r9, r10
+ ; ISMULH_R r7, r5
+ mov rax, r15
+ imul r13
+ mov r15, rdx
+ ; IMULH_R r3, r2
+ mov rax, r11
+ mul r10
+ mov r11, rdx
+ ; IXOR_M r1, L2[r0]
+ mov eax, r8d
+ and eax, 262136
+ xor r9, qword ptr [rsi+rax]
+ ; FPMUL_R e0, a1
+ mulpd xmm4, xmm9
+ ; ISUB_R r4, 1456841848
+ sub r12, 1456841848
+ ; IXOR_M r3, L2[r2]
+ mov eax, r10d
+ and eax, 262136
+ xor r11, qword ptr [rsi+rax]
+ ; COND_M r0, of(L1[r4], 1678513610)
+ xor ecx, ecx
+ mov eax, r12d
+ and eax, 16376
+ cmp dword ptr [rsi+rax], 1678513610
+ seto cl
+ add r8, rcx
+ ; IDIV_C r4, 2674394209
+ mov rax, 925772300223658071
+ mul r12
+ shr rdx, 27
+ add r12, rdx
+ ; IMUL_R r4, r1
+ imul r12, r9
+ ; FPADD_R f1, a2
+ addpd xmm1, xmm10
+ ; FPSUB_R f2, a0
+ subpd xmm2, xmm8
+ ; FPMUL_M e1, L2[r6]
+ mov eax, r14d
+ and eax, 262136
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ mulpd xmm5, xmm12
+ maxpd xmm5, xmm13
+ ; FPSUB_M f0, L2[r3]
+ mov eax, r11d
+ and eax, 262136
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm0, xmm12
+ ; IROR_R r0, r7
+ mov ecx, r15d
+ ror r8, cl
+ ; FSTORE L2[r1], e0
+ mov eax, r9d
+ and eax, 262128
+ movapd xmmword ptr [rsi+rax], xmm4
+ ; IROR_R r7, r6
+ mov ecx, r14d
+ ror r15, cl
+ ; IMUL_9C r2, 266593902
+ lea r10, [r10+r10*8+266593902]
+ ; IMUL_R r4, r6
+ imul r12, r14
+ ; FPSUB_R f2, a2
+ subpd xmm2, xmm10
+ ; FPMUL_R e3, a0
+ mulpd xmm7, xmm8
+ ; IXOR_M r7, L1[r2]
+ mov eax, r10d
+ and eax, 16376
+ xor r15, qword ptr [rsi+rax]
+ ; IROR_R r0, r5
+ mov ecx, r13d
+ ror r8, cl
+ ; FPADD_R f1, a2
+ addpd xmm1, xmm10
+ ; FPSQRT_R e3
+ sqrtpd xmm7, xmm7
+ ; FPADD_R f3, a1
+ addpd xmm3, xmm9
+ ; FPADD_R f1, a0
+ addpd xmm1, xmm8
+ ; COND_M r2, ge(L2[r2], -226330940)
+ xor ecx, ecx
+ mov eax, r10d
+ and eax, 262136
+ cmp dword ptr [rsi+rax], -226330940
+ setge cl
+ add r10, rcx
+ ; FPDIV_R e2, a3
+ divpd xmm6, xmm11
+ maxpd xmm6, xmm13
+ ; FPMUL_R e2, a1
+ mulpd xmm6, xmm9
+ ; FPSUB_R f1, a0
+ subpd xmm1, xmm8
+ ; IMUL_R r7, r5
+ imul r15, r13
+ ; IMUL_R r0, r1
+ imul r8, r9
+ ; FPSUB_R f3, a1
+ subpd xmm3, xmm9
+ ; IROL_R r3, r5
+ mov ecx, r13d
+ rol r11, cl
+ ; IADD_RC r5, r2, 795784298
+ lea r13, [r13+r10+795784298]
+ ; ISUB_R r0, r4
+ sub r8, r12
+ ; IMUL_R r5, r4
+ imul r13, r12
+ ; FPSUB_R f0, a2
+ subpd xmm0, xmm10
+ ; FPMUL_R e3, a1
+ mulpd xmm7, xmm9
+ ; ISDIV_C r3, 1662492575
+ mov rax, 2978515652703905219
+ imul r11
+ xor eax, eax
+ sar rdx, 28
+ sets al
+ add rdx, rax
+ add r11, rdx
+ ; ISMULH_R r5, r0
+ mov rax, r13
+ imul r8
+ mov r13, rdx
+ ; ISDIV_C r4, 1963597892
+ mov rax, -8359627607928540073
+ imul r12
+ xor eax, eax
+ add rdx, r12
+ sar rdx, 30
+ sets al
+ add rdx, rax
+ add r12, rdx
+ ; IMUL_R r7, r0
+ imul r15, r8
+ ; IMULH_M r0, L1[r3]
+ mov ecx, r11d
+ and ecx, 16376
+ mov rax, r8
+ mul qword ptr [rsi+rcx]
+ mov r8, rdx
+ ; IXOR_R r3, r7
+ xor r11, r15
+ ; IDIV_C r4, 1146125335
+ mov rax, 8640870253760721727
+ mul r12
+ shr rdx, 29
+ add r12, rdx
+ ; FPSWAP_R f3
+ shufpd xmm3, xmm3, 1
+ ; IXOR_M r2, L1[r0]
+ mov eax, r8d
+ and eax, 16376
+ xor r10, qword ptr [rsi+rax]
+ ; IROR_R r0, r1
+ mov ecx, r9d
+ ror r8, cl
+ ; IXOR_R r7, r4
+ xor r15, r12
+ ; ISMULH_R r6, r2
+ mov rax, r14
+ imul r10
+ mov r14, rdx
+ ; FPMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; IADD_RC r4, r2, 1704868083
+ lea r12, [r12+r10+1704868083]
+ ; FPSUB_R f2, a0
+ subpd xmm2, xmm8
+ ; ISTORE L1[r0], r0
+ mov eax, r8d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r8
+ ; FPSUB_R f0, a3
+ subpd xmm0, xmm11
; FPDIV_R e0, a3
divpd xmm4, xmm11
maxpd xmm4, xmm13
- ; ISUB_R r1, r5
- sub r9, r13
- ; COND_M r2, be(L2[r2], 1840094725)
+ ; FPMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; ISUB_R r7, 1302457878
+ sub r15, 1302457878
+ ; IMUL_9C r1, 1330165941
+ lea r9, [r9+r9*8+1330165941]
+ ; FPMUL_R e1, a3
+ mulpd xmm5, xmm11
+ ; IROL_R r0, r4
+ mov ecx, r12d
+ rol r8, cl
+ ; FPSUB_M f1, L1[r0]
+ mov eax, r8d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm1, xmm12
+ ; IROL_R r5, r6
+ mov ecx, r14d
+ rol r13, cl
+ ; COND_M r0, ab(L1[r1], -310933871)
xor ecx, ecx
+ mov eax, r9d
+ and eax, 16376
+ cmp dword ptr [rsi+rax], -310933871
+ seta cl
+ add r8, rcx
+ ; CFROUND r7, 39
+ mov rax, r15
+ rol rax, 38
+ and eax, 24576
+ or eax, 40896
+ mov dword ptr [rsp-8], eax
+ ldmxcsr dword ptr [rsp-8]
+ ; FPDIV_R e0, a1
+ divpd xmm4, xmm9
+ maxpd xmm4, xmm13
+ ; IMUL_M r1, L1[r3]
+ mov eax, r11d
+ and eax, 16376
+ imul r9, qword ptr [rsi+rax]
+ ; IMUL_9C r3, 1573236728
+ lea r11, [r11+r11*8+1573236728]
+ ; FPNEG_R f3
+ xorps xmm3, xmm15
+ ; COND_R r1, lt(r4, -1805702334)
+ xor ecx, ecx
+ cmp r12d, -1805702334
+ setl cl
+ add r9, rcx
+ ; FPSWAP_R f1
+ shufpd xmm1, xmm1, 1
+ ; IADD_R r7, -1421188024
+ add r15, -1421188024
+ ; FPMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; FPSUB_M f2, L2[r7]
+ mov eax, r15d
+ and eax, 262136
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm2, xmm12
+ ; FPSUB_R f3, a1
+ subpd xmm3, xmm9
+ ; FPSQRT_R e1
+ sqrtpd xmm5, xmm5
+ ; ISUB_R r2, r4
+ sub r10, r12
+ ; ISMULH_R r4, r5
+ mov rax, r12
+ imul r13
+ mov r12, rdx
+ ; COND_R r1, of(r7, 1294727006)
+ xor ecx, ecx
+ cmp r15d, 1294727006
+ seto cl
+ add r9, rcx
+ ; IADD_M r5, L2[r2]
mov eax, r10d
and eax, 262136
- cmp dword ptr [rsi+rax], 1840094725
- setbe cl
+ add r13, qword ptr [rsi+rax]
+ ; IMUL_9C r4, 401020510
+ lea r12, [r12+r12*8+401020510]
+ ; IROL_R r3, r0
+ mov ecx, r8d
+ rol r11, cl
+ ; ISTORE L1[r7], r0
+ mov eax, r15d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r8
+ ; FPSUB_R f2, a1
+ subpd xmm2, xmm9
+ ; FPSQRT_R e3
+ sqrtpd xmm7, xmm7
+ ; IMUL_R r3, 720965215
+ imul r11, 720965215
+ ; IMUL_R r6, r2
+ imul r14, r10
+ ; ISTORE L1[r7], r3
+ mov eax, r15d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r11
+ ; IROR_R r2, r6
+ mov ecx, r14d
+ ror r10, cl
+ ; FPSQRT_R e3
+ sqrtpd xmm7, xmm7
+ ; IMUL_9C r4, 788211341
+ lea r12, [r12+r12*8+788211341]
+ ; IMUL_9C r3, -67993446
+ lea r11, [r11+r11*8-67993446]
+ ; FPSWAP_R e3
+ shufpd xmm7, xmm7, 1
+ ; IMUL_M r2, L1[r6]
+ mov eax, r14d
+ and eax, 16376
+ imul r10, qword ptr [rsi+rax]
+ ; COND_M r2, ge(L1[r2], -1892157506)
+ xor ecx, ecx
+ mov eax, r10d
+ and eax, 16376
+ cmp dword ptr [rsi+rax], -1892157506
+ setge cl
add r10, rcx
+ ; FPADD_M f1, L1[r3]
+ mov eax, r11d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm1, xmm12
+ ; IADD_M r7, L1[r0]
+ mov eax, r8d
+ and eax, 16376
+ add r15, qword ptr [rsi+rax]
+ ; ISDIV_C r1, 624867857
+ mov rax, 7924491717200811467
+ imul r9
+ xor eax, eax
+ sar rdx, 28
+ sets al
+ add rdx, rax
+ add r9, rdx
+ ; FPADD_R f0, a1
+ addpd xmm0, xmm9
+ ; ISUB_R r5, r7
+ sub r13, r15
+ ; FPNEG_R f0
+ xorps xmm0, xmm15
+ ; IMUL_R r6, r2
+ imul r14, r10
+ ; FPMUL_M e3, L1[r1]
+ mov eax, r9d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ mulpd xmm7, xmm12
+ maxpd xmm7, xmm13
+ ; IADD_R r0, r4
+ add r8, r12
+ ; FPSUB_M f3, L1[r1]
+ mov eax, r9d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm3, xmm12
+ ; FPMUL_R e2, a0
+ mulpd xmm6, xmm8
+ ; INEG_R r2
+ neg r10
+ ; FPMUL_R e2, a2
+ mulpd xmm6, xmm10
+ ; FPSUB_M f3, L1[r6]
+ mov eax, r14d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm3, xmm12
+ ; FPADD_R f1, a3
+ addpd xmm1, xmm11
+ ; IMULH_R r3, r2
+ mov rax, r11
+ mul r10
+ mov r11, rdx
+ ; FPSUB_R f0, a3
+ subpd xmm0, xmm11
+ ; IDIV_C r5, 2887845607
+ mov rax, 13717520480010955377
+ mul r13
+ shr rdx, 31
+ add r13, rdx
+ ; ISMULH_M r6, L1[r2]
+ mov ecx, r10d
+ and ecx, 16376
+ mov rax, r14
+ imul qword ptr [rsi+rcx]
+ mov r14, rdx
+ ; FPSUB_R f3, a3
+ subpd xmm3, xmm11
; IMUL_M r6, L1[r7]
mov eax, r15d
and eax, 16376
imul r14, qword ptr [rsi+rax]
- ; IMULH_M r6, L1[r5]
- mov ecx, r13d
- and ecx, 16376
- mov rax, r14
- mul qword ptr [rsi+rcx]
- mov r14, rdx
- ; IMUL_9C r7, -1048659408
- lea r15, [r15+r15*8-1048659408]
- ; IMUL_R r6, r3
- imul r14, r11
- ; FPADD_R f3, a0
- addpd xmm3, xmm8
- ; IMULH_R r0, r3
- mov rax, r8
- mul r11
- mov r8, rdx
- ; FPSWAP_R f0
- shufpd xmm0, xmm0, 1
+ ; FPNEG_R f0
+ xorps xmm0, xmm15
+ ; FPMUL_R e2, a0
+ mulpd xmm6, xmm8
+ ; IMUL_9C r6, 295130073
+ lea r14, [r14+r14*8+295130073]
+ ; FPADD_R f1, a1
+ addpd xmm1, xmm9
+ ; IXOR_R r0, r5
+ xor r8, r13
+ ; FPADD_R f2, a1
+ addpd xmm2, xmm9
+ ; FPSWAP_R e3
+ shufpd xmm7, xmm7, 1
; FPSQRT_R e3
sqrtpd xmm7, xmm7
- ; IMULH_R r2, r0
- mov rax, r10
- mul r8
- mov r10, rdx
- ; FPDIV_R e1, a1
- divpd xmm5, xmm9
- maxpd xmm5, xmm13
+ ; IADD_RC r3, r6, -1317630728
+ lea r11, [r11+r14-1317630728]
+ ; IMUL_M r2, L1[r3]
+ mov eax, r11d
+ and eax, 16376
+ imul r10, qword ptr [rsi+rax]
+ ; IADD_RC r1, r4, 894105694
+ lea r9, [r9+r12+894105694]
+ ; IMUL_R r7, r0
+ imul r15, r8
+ ; FPSUB_R f1, a0
+ subpd xmm1, xmm8
+ ; IMUL_M r7, L1[r1]
+ mov eax, r9d
+ and eax, 16376
+ imul r15, qword ptr [rsi+rax]
+ ; IXOR_R r2, r4
+ xor r10, r12
+ ; ISUB_M r0, L1[r1]
+ mov eax, r9d
+ and eax, 16376
+ sub r8, qword ptr [rsi+rax]
+ ; INEG_R r4
+ neg r12
+ ; IMUL_9C r4, -285272388
+ lea r12, [r12+r12*8-285272388]
+ ; IMUL_R r7, r4
+ imul r15, r12
+ ; IMULH_M r5, L1[r7]
+ mov ecx, r15d
+ and ecx, 16376
+ mov rax, r13
+ mul qword ptr [rsi+rcx]
+ mov r13, rdx
+ ; IROL_R r1, r7
+ mov ecx, r15d
+ rol r9, cl
+ ; IXOR_R r4, -757532727
+ xor r12, -757532727
+ ; IMUL_R r3, 1863959234
+ imul r11, 1863959234
+ ; IROL_R r4, 59
+ rol r12, 59
+ ; ISMULH_R r1, 2122681086
+ mov rax, 2122681086
+ imul r9
+ add r9, rdx
+ ; ISTORE L2[r6], r7
+ mov eax, r14d
+ and eax, 262136
+ mov qword ptr [rsi+rax], r15
+ ; ISTORE L1[r1], r5
+ mov eax, r9d
+ and eax, 16376
+ mov qword ptr [rsi+rax], r13
+ ; FPMUL_R e0, a1
+ mulpd xmm4, xmm9
+ ; COND_R r2, ns(r1, 486049737)
+ xor ecx, ecx
+ cmp r9d, 486049737
+ setns cl
+ add r10, rcx
+ ; FPMUL_M e0, L2[r7]
+ mov eax, r15d
+ and eax, 262136
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ mulpd xmm4, xmm12
+ maxpd xmm4, xmm13
+ ; FPMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; IROL_R r5, r2
+ mov ecx, r10d
+ rol r13, cl
+ ; IADD_M r0, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ add r8, qword ptr [rsi+rax]