From 619bee54183278008b5ea60847cce935fdc16d5f Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 4 Jan 2019 19:44:15 +0100 Subject: [PATCH 01/35] Random dataset accesses - asm only Initial support for large pages --- src/AssemblyGeneratorX86.cpp | 102 +- src/AssemblyGeneratorX86.hpp | 2 +- src/CompiledVirtualMachine.cpp | 4 +- src/VirtualMachine.hpp | 2 +- src/dataset.cpp | 14 +- src/dataset.hpp | 2 +- src/executeProgram-win64.asm | 174 +- src/main.cpp | 5 +- src/program.inc | 2905 +++++++++++++++----------------- src/virtualMemory.cpp | 108 ++ src/virtualMemory.hpp | 23 + 11 files changed, 1616 insertions(+), 1725 deletions(-) create mode 100644 src/virtualMemory.cpp create mode 100644 src/virtualMemory.hpp diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index bb0e106..5c3f9a2 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -28,6 +28,11 @@ namespace RandomX { static const char* regR32[8] = { "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d" }; static const char* regF[8] = { "xmm8", "xmm9", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" }; + static const char* regMx = "edi"; + static const char* regIc = "ebp"; + static const char* regStackBeginAddr = "rbx"; + static const char* regScratchpadAddr = "rsi"; + void AssemblyGeneratorX86::generateProgram(const void* seed) { asmCode.str(std::string()); //clear Pcg32 gen(seed); @@ -48,7 +53,7 @@ namespace RandomX { void AssemblyGeneratorX86::generateCode(Instruction& instr, int i) { asmCode << "rx_i_" << i << ": ;" << instr.getName() << std::endl; - asmCode << "\tdec edi" << std::endl; + asmCode << "\tdec " << regIc << std::endl; asmCode << "\tjz rx_finish" << std::endl; auto generator = engine[instr.opcode]; (this->*generator)(instr, i); @@ -56,54 +61,34 @@ namespace RandomX { void AssemblyGeneratorX86::genar(Instruction& instr) { asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; - 
switch (instr.loca & 7) + asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; + switch (instr.loca & 3) { - case 0: - case 1: - case 2: - case 3: - asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; - asmCode << "\tcall rx_read_dataset_r" << std::endl; - return; - - case 4: - asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl; - asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; - asmCode << "\tmov rax, qword ptr [rsi + rax * 8]" << std::endl; - return; - - default: - asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl; - asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; - asmCode << "\tmov rax, qword ptr [rsi + rax * 8]" << std::endl; - return; + case 0: + case 1: + case 2: + asmCode << "\tcall rx_readint_l1" << std::endl; + return; + default: //3 + asmCode << "\tcall rx_readint_l2" << std::endl; + return; } } void AssemblyGeneratorX86::genaf(Instruction& instr) { asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; - switch (instr.loca & 7) + asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; + switch (instr.loca & 3) { - case 0: - case 1: - case 2: - case 3: - asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; - asmCode << "\tcall rx_read_dataset_f" << std::endl; - return; - - case 4: - asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl; - asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; - asmCode << "\tcvtdq2pd xmm0, qword ptr [rsi + rax * 8]" << std::endl; - return; - - default: - asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl; - asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; - asmCode << "\tcvtdq2pd xmm0, qword ptr [rsi + rax * 8]" << std::endl; - return; + case 0: + case 1: + case 2: + asmCode << "\tcall rx_readfloat_l1" << 
std::endl; + return; + default: //3 + asmCode << "\tcall rx_readfloat_l2" << std::endl; + return; } } @@ -169,9 +154,9 @@ namespace RandomX { asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; - asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl; + asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rax * 8], rcx" << std::endl; if (trace) { - asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rcx" << std::endl; + asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rdi * 8 + 262136], rcx" << std::endl; } return; @@ -182,31 +167,31 @@ namespace RandomX { asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; - asmCode << "\tmov qword ptr [rsi + rax * 8], rcx" << std::endl; + asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rax * 8], rcx" << std::endl; if (trace) { - asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rcx" << std::endl; + asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rdi * 8 + 262136], rcx" << std::endl; } return; default: asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", rax" << std::endl; if (trace) { - asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rax" << std::endl; + asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rdi * 8 + 262136], rax" << std::endl; } + return; } } - void AssemblyGeneratorX86::gencf(Instruction& instr, bool alwaysLow = false) { - if(!alwaysLow) - asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; - const char* store = (!alwaysLow && (instr.locc & 8)) ? 
"movhpd" : "movlpd"; + void AssemblyGeneratorX86::gencf(Instruction& instr) { + asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; + const char* store = (instr.locc & 8) ? "movhpd" : "movlpd"; switch (instr.locc & 7) { case 4: asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; - asmCode << "\t" << store << " qword ptr [rsi + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl; + asmCode << "\t" << store << " qword ptr [" << regScratchpadAddr << " + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl; break; case 5: @@ -215,11 +200,11 @@ namespace RandomX { asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; - asmCode << "\t" << store << " qword ptr [rsi + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl; + asmCode << "\t" << store << " qword ptr [" << regScratchpadAddr << " + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl; break; } if (trace) { - asmCode << "\t" << store << " qword ptr [rsi + rdi * 8 + 262136], " << regF[instr.regc % RegistersCount] << std::endl; + asmCode << "\t" << store << " qword ptr [" << regScratchpadAddr << " + rdi * 8 + 262136], " << regF[instr.regc % RegistersCount] << std::endl; } } @@ -454,15 +439,14 @@ namespace RandomX { void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) { genar(instr); - asmCode << "\tmov rcx, rax" << std::endl; + //asmCode << "\tmov rcx, rax" << std::endl; asmCode << "\tshl eax, 13" << std::endl; - asmCode << "\tand rcx, -2048" << std::endl; + //asmCode << "\tand rcx, -2048" << std::endl; asmCode << "\tand eax, 24576" << std::endl; - asmCode << "\tcvtsi2sd " << regF[instr.regc % 
RegistersCount] << ", rcx" << std::endl; + //asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; asmCode << "\tor eax, 40896" << std::endl; asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl; asmCode << "\tldmxcsr dword ptr [rsp - 8]" << std::endl; - gencf(instr, true); } static inline const char* jumpCondition(Instruction& instr, bool invert = false) { @@ -496,7 +480,7 @@ namespace RandomX { asmCode << "\tjmp rx_i_" << wrapInstr(i + 1) << std::endl; asmCode << "taken_call_" << i << ":" << std::endl; if (trace) { - asmCode << "\tmov qword ptr [rsi + rdi * 8 + 262136], rax" << std::endl; + asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rdi * 8 + 262136], rax" << std::endl; } asmCode << "\tpush rax" << std::endl; asmCode << "\tcall rx_i_" << wrapInstr(i + (instr.imm8 & 127) + 2) << std::endl; @@ -504,7 +488,7 @@ namespace RandomX { void AssemblyGeneratorX86::h_RET(Instruction& instr, int i) { genar(instr); - asmCode << "\tcmp rsp, rbp" << std::endl; + asmCode << "\tcmp rsp, " << regStackBeginAddr << std::endl; asmCode << "\tje short not_taken_ret_" << i << std::endl; asmCode << "\txor rax, qword ptr [rsp + 8]" << std::endl; gencr(instr); diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 3097a94..bdcbcec 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -45,7 +45,7 @@ namespace RandomX { void genbr132(Instruction&); void genbf(Instruction&, const char*); void gencr(Instruction&); - void gencf(Instruction&, bool); + void gencf(Instruction&); void generateCode(Instruction&, int); diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index 7803003..8ae2f83 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -47,8 +47,8 @@ namespace RandomX { } void CompiledVirtualMachine::execute() { - //executeProgram(reg, mem, scratchpad, readDataset); - compiler.getProgramFunc()(reg, mem, scratchpad); + 
executeProgram(reg, mem, scratchpad, readDataset); + //compiler.getProgramFunc()(reg, mem, scratchpad); #ifdef TRACEVM for (int32_t i = InstructionCount - 1; i >= 0; --i) { std::cout << std::hex << tracepad[i].u64 << std::endl; diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp index f7fdcd0..bbcfec3 100644 --- a/src/VirtualMachine.hpp +++ b/src/VirtualMachine.hpp @@ -40,6 +40,6 @@ namespace RandomX { DatasetReadFunc readDataset; alignas(16) RegisterFile reg; MemoryRegisters mem; - alignas(16) convertible_t scratchpad[ScratchpadLength]; + alignas(64) convertible_t scratchpad[ScratchpadLength]; }; } \ No newline at end of file diff --git a/src/dataset.cpp b/src/dataset.cpp index dee40c5..70561c1 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -26,6 +26,7 @@ along with RandomX. If not, see. #include "dataset.hpp" #include "Pcg32.hpp" #include "Cache.hpp" +#include "virtualMemory.hpp" #if defined(__SSE2__) #include @@ -161,12 +162,17 @@ namespace RandomX { template convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory); - void datasetAlloc(dataset_t& ds) { + void datasetAlloc(dataset_t& ds, bool largePages) { if (sizeof(size_t) <= 4) throw std::runtime_error("Platform doesn't support enough memory for the dataset"); - ds.dataset = (uint8_t*)_mm_malloc(DatasetSize, /*sizeof(__m128i)*/ 64); - if (ds.dataset == nullptr) { - throw std::runtime_error("Dataset memory allocation failed. >4 GiB of free virtual memory is needed."); + if (largePages) { + ds.dataset = (uint8_t*)allocLargePagesMemory(DatasetSize); + } + else { + ds.dataset = (uint8_t*)_mm_malloc(DatasetSize, 64); + if (ds.dataset == nullptr) { + throw std::runtime_error("Dataset memory allocation failed. 
>4 GiB of free virtual memory is needed."); + } } } diff --git a/src/dataset.hpp b/src/dataset.hpp index bb29197..5f9836c 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -35,7 +35,7 @@ namespace RandomX { template void initBlock(const uint8_t* cache, uint8_t* block, uint32_t blockNumber, const KeysContainer& keys); - void datasetAlloc(dataset_t& ds); + void datasetAlloc(dataset_t& ds, bool largePages); template void datasetInit(Cache* cache, dataset_t ds, uint32_t startBlock, uint32_t blockCount); diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 356428c..1e7e7a4 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -15,19 +15,19 @@ ;# You should have received a copy of the GNU General Public License ;# along with RandomX. If not, see. -PUBLIC executeProgram +_RANDOMX_EXECUTE_PROGRAM SEGMENT PAGE READ EXECUTE -.code +PUBLIC executeProgram executeProgram PROC ; REGISTER ALLOCATION: ; rax -> temporary - ; rbx -> MemoryRegisters& memory + ; rbx -> beginning of VM stack ; rcx -> temporary ; rdx -> temporary ; rsi -> convertible_t& scratchpad - ; rdi -> "ic" (instruction counter) - ; rbp -> beginning of VM stack + ; rdi -> "mx" + ; rbp -> "ic" ; rsp -> end of VM stack ; r8 -> "r0" ; r9 -> "r1" @@ -55,7 +55,8 @@ executeProgram PROC ; | saved registers ; | ; v - ; [rbp] RegisterFile& registerFile + ; [rbx+8] RegisterFile& registerFile + ; [rbx+0] uint8_t* dataset ; | ; | ; | VM stack @@ -80,17 +81,18 @@ executeProgram PROC movdqu xmmword ptr [rsp+0], xmm10 ; function arguments - push rcx ; RegisterFile& registerFile - mov rbx, rdx ; MemoryRegisters& memory - mov rsi, r8 ; convertible_t& scratchpad - push r9 + push rcx ; RegisterFile& registerFile + mov edi, dword ptr [rdx] ; "mx" + mov rax, qword ptr [rdx+8] ; uint8_t* dataset + push rax + mov rsi, r8 ; convertible_t* scratchpad - mov rbp, rsp ; beginning of VM stack - mov rdi, 1048577 ; number of VM instructions to execute + 1 + mov rbx, rsp ; beginning of VM 
stack + mov ebp, 524289 ; number of VM instructions to execute + 1 xorps xmm10, xmm10 cmpeqpd xmm10, xmm10 - psrlq xmm10, 1 ; mask for absolute value = 0x7fffffffffffffff7fffffffffffffff + psrlq xmm10, 1 ; mask for absolute value = 0x7fffffffffffffff7fffffffffffffff ; reset rounding mode mov dword ptr [rsp-8], 40896 @@ -162,7 +164,7 @@ executeProgram PROC rx_finish: ; unroll the stack - mov rsp, rbp + mov rsp, rbx ; save VM register values pop rcx @@ -202,57 +204,103 @@ rx_finish: pop rbx ; return - ret 0 + ret + +TransformAddress MACRO reg32, reg64 +;# Transforms the address in the register so that the transformed address +;# lies in a different cache line than the original address (mod 2^N). +;# This is done to prevent a load-store dependency. +;# There are 3 different transformations that can be used: x -> 9*x+C, x -> x+C, x -> x^C + lea reg32, [reg64+reg64*8+127] ;# C = -119 -110 -101 -92 -83 -74 -65 -55 -46 -37 -28 -19 -10 -1 9 18 27 36 45 54 63 73 82 91 100 109 118 127 + ;lea reg32, [reg64-128] ;# C = all except -7 to +7 + ;xor reg32, -8 ;# C = all except 0 to 7 +ENDM +ReadMemoryRandom MACRO spmask, float +;# IN ecx = random 32-bit address +;# OUT rax = 64-bit integer return value +;# OUT xmm0 = 128-bit floating point return value +;# GLOBAL rbp = "ic" number of instructions until the end of the program +;# GLOBAL rbx = address of the dataset address +;# GLOBAL rsi = address of the scratchpad +;# GLOBAL rdi = "mx" random 32-bit dataset address +;# MODIFY rcx, rdx +LOCAL L_prefetch, L_read, L_return + mov eax, ebp + and al, 63 + jz short L_prefetch ;# "ic" divisible by 64 -> prefetch + xor edx, edx + cmp al, 14 + je short L_read ;# "ic" = 14 (mod 64) -> random read + cmovb edx, ecx ;# "ic" < 14 (mod 64) -> modify random read address + xor edi, edx +L_return: + and ecx, spmask ;# limit address to the specified scratchpad size +IF float + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] +ELSE + mov rax, qword ptr [rsi+rcx*8] +ENDIF + ret +L_prefetch: + mov rax, qword ptr 
[rbx] ;# load the dataset address + and edi, -64 ;# align "mx" to the start of a cache line + prefetchnta byte ptr [rax+rdi] + jmp short L_return +L_read: + push rcx + TransformAddress ecx, rcx ;# TransformAddress function + and ecx, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8 + call rx_read_dataset + pop rcx + jmp short L_return +ENDM + +ALIGN 64 +rx_readint_l1: +ReadMemoryRandom 2047, 0 + +ALIGN 64 +rx_readint_l2: +ReadMemoryRandom 32767, 0 + +ALIGN 64 +rx_readfloat_l1: +ReadMemoryRandom 2047, 1 + +ALIGN 64 +rx_readfloat_l2: +ReadMemoryRandom 32767, 1 + +ALIGN 64 rx_read_dataset: - push r8 - push r9 - push r10 - push r11 - mov rdx, rbx - movd qword ptr [rsp - 8], xmm1 - movd qword ptr [rsp - 16], xmm2 - sub rsp, 48 - call qword ptr [rbp] - add rsp, 48 - movd xmm2, qword ptr [rsp - 16] - movd xmm1, qword ptr [rsp - 8] - pop r11 - pop r10 - pop r9 - pop r8 - ret 0 - -rx_read_dataset_r: - mov edx, dword ptr [rbx] ; ma - mov rax, qword ptr [rbx+8] ; dataset - mov rax, qword ptr [rax+rdx] - add dword ptr [rbx], 8 - xor ecx, dword ptr [rbx+4] ; mx - mov dword ptr [rbx+4], ecx - test ecx, 0FFF8h - jne short rx_read_dataset_r_ret - and ecx, -8 - mov dword ptr [rbx], ecx - mov rdx, qword ptr [rbx+8] - prefetcht0 byte ptr [rdx+rcx] -rx_read_dataset_r_ret: - ret 0 - -rx_read_dataset_f: - mov edx, dword ptr [rbx] ; ma - mov rax, qword ptr [rbx+8] ; dataset - cvtdq2pd xmm0, qword ptr [rax+rdx] - add dword ptr [rbx], 8 - xor ecx, dword ptr [rbx+4] ; mx - mov dword ptr [rbx+4], ecx - test ecx, 0FFF8h - jne short rx_read_dataset_f_ret - and ecx, -8 - mov dword ptr [rbx], ecx - prefetcht0 byte ptr [rax+rcx] -rx_read_dataset_f_ret: - ret 0 +;# IN rcx = scratchpad index - must be divisible by 8 +;# GLOBAL rbx = address of the dataset address +;# GLOBAL rsi = address of the scratchpad +;# GLOBAL rdi = "mx" random 32-bit dataset address +;# MODIFY rax, rcx, rdx + mov rax, qword ptr [rbx] ;# load the dataset address + lea rcx, [rsi+rcx*8] ;# 
scratchpad cache line + lea rax, [rax+rdi] ;# dataset cache line + mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now) + xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline + mov rdx, qword ptr [rax+8] + xor qword ptr [rcx+8], rdx + mov rdx, qword ptr [rax+16] + xor qword ptr [rcx+16], rdx + mov rdx, qword ptr [rax+24] + xor qword ptr [rcx+24], rdx + mov rdx, qword ptr [rax+32] + xor qword ptr [rcx+32], rdx + mov rdx, qword ptr [rax+40] + xor qword ptr [rcx+40], rdx + mov rdx, qword ptr [rax+48] + xor qword ptr [rcx+48], rdx + mov rdx, qword ptr [rax+56] + xor qword ptr [rcx+56], rdx + ret executeProgram ENDP +_RANDOMX_EXECUTE_PROGRAM ENDS + END diff --git a/src/main.cpp b/src/main.cpp index 8bb5492..81d49ec 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -162,7 +162,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash } int main(int argc, char** argv) { - bool softAes, lightClient, genAsm, compiled, help; + bool softAes, lightClient, genAsm, compiled, help, largePages; int programCount, threadCount; readOption("--help", argc, argv, help); @@ -177,6 +177,7 @@ int main(int argc, char** argv) { readOption("--compiled", argc, argv, compiled); readIntOption("--threads", argc, argv, threadCount, 1); readIntOption("--nonces", argc, argv, programCount, 1000); + readOption("--largePages", argc, argv, largePages); if (genAsm) { generateAsm(programCount); @@ -216,7 +217,7 @@ int main(int argc, char** argv) { } else { RandomX::Cache* cache = dataset.cache; - RandomX::datasetAlloc(dataset); + RandomX::datasetAlloc(dataset, largePages); if (threadCount > 1) { auto perThread = RandomX::DatasetBlockCount / threadCount; auto remainder = RandomX::DatasetBlockCount % threadCount; diff --git a/src/program.inc b/src/program.inc index 081647f..8450044 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,11 +1,10 @@ rx_i_0: ;RET - dec edi + dec ebp jz 
rx_finish xor r9, 0ca9788ah - mov eax, r9d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r9d + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_0 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -22,21 +21,20 @@ not_taken_ret_0: mov qword ptr [rsi + rax * 8], rcx rx_i_1: ;AND_64 - dec edi + dec ebp jz rx_finish xor r15, 06afc2fa4h mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l1 and rax, r10 mov r12, rax rx_i_2: ;CALL - dec edi + dec ebp jz rx_finish xor r15, 097210f7bh - mov eax, r15d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readint_l1 cmp r11d, 1348521207 jno short taken_call_2 mov rcx, rax @@ -50,31 +48,23 @@ taken_call_2: call rx_i_47 rx_i_3: ;FPROUND - dec edi + dec ebp jz rx_finish xor r13, 082c73195h mov ecx, r13d - call rx_read_dataset_r - mov rcx, rax + call rx_readint_l1 shl eax, 13 - and rcx, -2048 and eax, 24576 - cvtsi2sd xmm8, rcx or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] - mov eax, r8d - xor eax, 06bb1a0b2h - and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_4: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r14, 077daefb4h - mov eax, r14d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l1 mov rcx, r14 mul rcx mov rax, rdx @@ -85,22 +75,22 @@ rx_i_4: ;MULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_5: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r15, 0379f9ee0h mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l2 movsxd rcx, eax movsxd rax, r12d imul rax, rcx mov r12, rax rx_i_6: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r8, 03bae7272h mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r15 mov rcx, rax mov eax, r9d @@ -109,12 +99,11 @@ rx_i_6: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_7: ;FPADD - dec edi + dec ebp jz rx_finish xor r10, 0e264ed81h - mov eax, r10d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r10d + call 
rx_readfloat_l1 addpd xmm0, xmm6 movaps xmm6, xmm0 mov eax, r14d @@ -123,11 +112,11 @@ rx_i_7: ;FPADD movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_8: ;SHL_64 - dec edi + dec ebp jz rx_finish xor r13, 068c1e5d2h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 shl rax, 47 mov rcx, rax mov eax, r12d @@ -136,30 +125,29 @@ rx_i_8: ;SHL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_9: ;AND_64 - dec edi + dec ebp jz rx_finish xor r14, 085121c54h - mov eax, r14d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l1 and rax, 565870810 mov r10, rax rx_i_10: ;OR_64 - dec edi + dec ebp jz rx_finish xor r8, 052efde3eh mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 or rax, -727859809 mov r13, rax rx_i_11: ;FPADD - dec edi + dec ebp jz rx_finish xor r10, 0a9bf8aa1h mov ecx, r10d - call rx_read_dataset_f + call rx_readfloat_l2 addpd xmm0, xmm5 movaps xmm4, xmm0 mov eax, r12d @@ -168,11 +156,11 @@ rx_i_11: ;FPADD movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_12: ;CALL - dec edi + dec ebp jz rx_finish xor r10, 0db2691ch mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l2 cmp r8d, -1763940407 jge short taken_call_12 mov r8, rax @@ -182,20 +170,20 @@ taken_call_12: call rx_i_35 rx_i_13: ;FPSUB - dec edi + dec ebp jz rx_finish xor r12, 061c0d34dh mov ecx, r12d - call rx_read_dataset_f + call rx_readfloat_l1 subpd xmm0, xmm3 movaps xmm9, xmm0 rx_i_14: ;SHR_64 - dec edi + dec ebp jz rx_finish xor r10, 0e761d1beh mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 shr rax, 4 mov rcx, rax mov eax, r10d @@ -204,12 +192,12 @@ rx_i_14: ;SHR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_15: ;RET - dec edi + dec ebp jz rx_finish xor r11, 074ddb688h mov ecx, r11d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l2 + cmp rsp, rbx je short not_taken_ret_15 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -226,12 +214,11 @@ not_taken_ret_15: mov qword ptr [rsi + rax * 8], rcx rx_i_16: ;ADD_64 - dec edi + dec ebp jz rx_finish 
xor r14, 06be90627h - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l1 add rax, r10 mov rcx, rax mov eax, r9d @@ -240,12 +227,11 @@ rx_i_16: ;ADD_64 mov qword ptr [rsi + rax * 8], rcx rx_i_17: ;FPMUL - dec edi + dec ebp jz rx_finish xor r11, 0fbc6fc35h - mov eax, r11d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readfloat_l1 mulpd xmm0, xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -257,12 +243,11 @@ rx_i_17: ;FPMUL movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_18: ;FPSUB - dec edi + dec ebp jz rx_finish xor r14, 0c28ca080h - mov eax, r14d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readfloat_l1 subpd xmm0, xmm4 movaps xmm3, xmm0 mov eax, r11d @@ -271,21 +256,20 @@ rx_i_18: ;FPSUB movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_19: ;FPSUB - dec edi + dec ebp jz rx_finish xor r13, 0ac009c30h - mov eax, r13d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readfloat_l1 subpd xmm0, xmm8 movaps xmm7, xmm0 rx_i_20: ;FPMUL - dec edi + dec ebp jz rx_finish xor r13, 0ecca967dh mov ecx, r13d - call rx_read_dataset_f + call rx_readfloat_l1 mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -297,20 +281,20 @@ rx_i_20: ;FPMUL movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_21: ;FPADD - dec edi + dec ebp jz rx_finish xor r8, 0977f0284h mov ecx, r8d - call rx_read_dataset_f + call rx_readfloat_l2 addpd xmm0, xmm9 movaps xmm7, xmm0 rx_i_22: ;ADD_32 - dec edi + dec ebp jz rx_finish xor r13, 080bdfefah mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 add eax, r8d mov rcx, rax mov eax, r10d @@ -319,20 +303,20 @@ rx_i_22: ;ADD_32 mov qword ptr [rsi + rax * 8], rcx rx_i_23: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r15, 0e1e0d3c4h mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r11 mov r8, rax rx_i_24: ;IMULH_64 - dec edi + dec ebp jz rx_finish xor r8, 070d3b8c7h mov ecx, r8d - call 
rx_read_dataset_r + call rx_readint_l1 mov rcx, r15 imul rcx mov rax, rdx @@ -343,11 +327,11 @@ rx_i_24: ;IMULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_25: ;FPMUL - dec edi + dec ebp jz rx_finish xor r12, 01cf77a04h mov ecx, r12d - call rx_read_dataset_f + call rx_readfloat_l2 mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -359,11 +343,11 @@ rx_i_25: ;FPMUL movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_26: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r11, 0e311468ch mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l1 movsxd rcx, eax movsxd rax, r13d imul rax, rcx @@ -374,12 +358,11 @@ rx_i_26: ;IMUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_27: ;FPMUL - dec edi + dec ebp jz rx_finish xor r12, 01fd9911ah - mov eax, r12d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l2 mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -387,41 +370,38 @@ rx_i_27: ;FPMUL movaps xmm6, xmm0 rx_i_28: ;XOR_64 - dec edi + dec ebp jz rx_finish xor r13, 067df757eh - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l1 xor rax, r13 mov r14, rax rx_i_29: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r12, 0be2e7c42h mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l2 sub rax, 1944166515 mov r14, rax rx_i_30: ;FPADD - dec edi + dec ebp jz rx_finish xor r11, 084d067f7h - mov eax, r11d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readfloat_l1 addpd xmm0, xmm3 movaps xmm7, xmm0 rx_i_31: ;FPADD - dec edi + dec ebp jz rx_finish xor r14, 0d352ce37h - mov eax, r14d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readfloat_l2 addpd xmm0, xmm3 movaps xmm6, xmm0 mov eax, r14d @@ -430,33 +410,31 @@ rx_i_31: ;FPADD movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_32: ;XOR_64 - dec edi + dec ebp jz rx_finish xor r12, 0a1f248dah mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l1 xor rax, 
-1936869641 mov r9, rax rx_i_33: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r9, 0554720fch - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l2 mov rcx, r15 mul rcx mov rax, rdx mov r12, rax rx_i_34: ;CALL - dec edi + dec ebp jz rx_finish xor r13, 0665e91f1h - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l1 cmp r14d, -380224718 js short taken_call_34 mov r15, rax @@ -466,13 +444,12 @@ taken_call_34: call rx_i_108 rx_i_35: ;RET - dec edi + dec ebp jz rx_finish xor r15, 05ef1be79h - mov eax, r15d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r15d + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_35 xor rax, qword ptr [rsp + 8] mov r8, rax @@ -481,12 +458,11 @@ not_taken_ret_35: mov r8, rax rx_i_36: ;FPMUL - dec edi + dec ebp jz rx_finish xor r8, 012ec7e3ah - mov eax, r8d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l1 mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -494,12 +470,11 @@ rx_i_36: ;FPMUL movaps xmm7, xmm0 rx_i_37: ;FPMUL - dec edi + dec ebp jz rx_finish xor r12, 0d0706601h - mov eax, r12d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l1 mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -511,32 +486,30 @@ rx_i_37: ;FPMUL movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_38: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r9, 064056913h - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 sub rax, r14 mov r10, rax rx_i_39: ;ADD_32 - dec edi + dec ebp jz rx_finish xor r14, 02c1f1eb0h mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l1 add eax, r14d mov r14, rax rx_i_40: ;RET - dec edi + dec ebp jz rx_finish xor r10, 068fd9009h - mov eax, r10d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r10d + call rx_readint_l1 
+ cmp rsp, rbx je short not_taken_ret_40 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -553,12 +526,11 @@ not_taken_ret_40: mov qword ptr [rsi + rax * 8], rcx rx_i_41: ;CALL - dec edi + dec ebp jz rx_finish xor r9, 037a30933h - mov eax, r9d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 cmp r14d, -1070581824 jo short taken_call_41 mov r9, rax @@ -568,21 +540,20 @@ taken_call_41: call rx_i_127 rx_i_42: ;FPSUB - dec edi + dec ebp jz rx_finish xor r15, 0bc1de9f6h - mov eax, r15d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readfloat_l1 subpd xmm0, xmm6 movaps xmm6, xmm0 rx_i_43: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r12, 02b2a2eech mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l1 sub rax, 1693705407 mov rcx, rax mov eax, r11d @@ -591,31 +562,30 @@ rx_i_43: ;SUB_64 mov qword ptr [rsi + rax * 8], rcx rx_i_44: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r11, 0685817abh mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r9 rol rax, cl mov r15, rax rx_i_45: ;FPSUB - dec edi + dec ebp jz rx_finish xor r12, 08cd244ebh - mov eax, r12d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l2 subpd xmm0, xmm2 movaps xmm5, xmm0 rx_i_46: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r8, 06d8f4254h mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l2 add rax, r9 mov rcx, rax mov eax, r8d @@ -624,11 +594,11 @@ rx_i_46: ;ADD_64 mov qword ptr [rsi + rax * 8], rcx rx_i_47: ;CALL - dec edi + dec ebp jz rx_finish xor r12, 05ba232c6h mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l2 cmp r10d, 119251505 jbe short taken_call_47 mov rcx, rax @@ -642,12 +612,11 @@ taken_call_47: call rx_i_131 rx_i_48: ;FPSQRT - dec edi + dec ebp jz rx_finish xor r8, 0aaed618fh - mov eax, r8d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l1 andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm9, 
xmm0 @@ -657,12 +626,11 @@ rx_i_48: ;FPSQRT movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_49: ;FPMUL - dec edi + dec ebp jz rx_finish xor r8, 0f96c6a45h - mov eax, r8d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l1 mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -670,12 +638,11 @@ rx_i_49: ;FPMUL movaps xmm5, xmm0 rx_i_50: ;OR_32 - dec edi + dec ebp jz rx_finish xor r9, 0da3e4842h - mov eax, r9d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 or eax, r10d mov rcx, rax mov eax, r15d @@ -684,20 +651,20 @@ rx_i_50: ;OR_32 mov qword ptr [rsi + rax * 8], rcx rx_i_51: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r10, 0302b676ah mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l2 sub rax, 419241919 mov r15, rax rx_i_52: ;CALL - dec edi + dec ebp jz rx_finish xor r11, 0fa88f48bh mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l2 cmp r13d, -534426193 js short taken_call_52 mov rcx, rax @@ -711,13 +678,12 @@ taken_call_52: call rx_i_94 rx_i_53: ;RET - dec edi + dec ebp jz rx_finish xor r13, 03dff9b9eh - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r13d + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_53 xor rax, qword ptr [rsp + 8] mov r13, rax @@ -726,12 +692,11 @@ not_taken_ret_53: mov r13, rax rx_i_54: ;IMULH_64 - dec edi + dec ebp jz rx_finish xor r11, 060638de0h - mov eax, r11d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readint_l2 mov rcx, 282209221 imul rcx mov rax, rdx @@ -742,12 +707,11 @@ rx_i_54: ;IMULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_55: ;FPMUL - dec edi + dec ebp jz rx_finish xor r10, 0dda983d4h - mov eax, r10d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readfloat_l1 mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -759,12 +723,11 @@ rx_i_55: ;FPMUL movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_56: 
;AND_64 - dec edi + dec ebp jz rx_finish xor r14, 0f1456b8eh - mov eax, r14d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l1 and rax, r15 mov rcx, rax mov eax, r8d @@ -773,12 +736,11 @@ rx_i_56: ;AND_64 mov qword ptr [rsi + rax * 8], rcx rx_i_57: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r9, 010dc4571h - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l2 imul rax, r14 mov rcx, rax mov eax, r15d @@ -787,12 +749,11 @@ rx_i_57: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_58: ;IDIV_64 - dec edi + dec ebp jz rx_finish xor r14, 0bcec0ebah - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l2 mov edx, r13d cmp edx, -1 jne short safe_idiv_58 @@ -811,22 +772,21 @@ result_idiv_58: mov r8, rax rx_i_59: ;FPSUB - dec edi + dec ebp jz rx_finish xor r11, 0980dd402h - mov eax, r11d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readfloat_l1 subpd xmm0, xmm8 movaps xmm7, xmm0 rx_i_60: ;RET - dec edi + dec ebp jz rx_finish xor r15, 03de14d1eh mov ecx, r15d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_60 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -843,12 +803,11 @@ not_taken_ret_60: mov qword ptr [rsi + rax * 8], rcx rx_i_61: ;CALL - dec edi + dec ebp jz rx_finish xor r13, 05058ce64h - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l1 cmp r15d, 1933164545 jns short taken_call_61 mov r11, rax @@ -858,11 +817,11 @@ taken_call_61: call rx_i_120 rx_i_62: ;FPMUL - dec edi + dec ebp jz rx_finish xor r15, 0c3089414h mov ecx, r15d - call rx_read_dataset_f + call rx_readfloat_l1 mulpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -874,12 +833,11 @@ rx_i_62: ;FPMUL movlpd qword ptr [rsi + rax * 8], xmm2 rx_i_63: ;FPMUL - dec edi + dec ebp jz rx_finish xor r9, 065cf272eh - mov eax, r9d - and 
eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readfloat_l1 mulpd xmm0, xmm7 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -887,20 +845,20 @@ rx_i_63: ;FPMUL movaps xmm8, xmm0 rx_i_64: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r13, 0ae54dfbfh mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 sub rax, r15 mov r9, rax rx_i_65: ;CALL - dec edi + dec ebp jz rx_finish xor r13, 07b366ce6h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 cmp r8d, 1498056607 js short taken_call_65 mov r11, rax @@ -910,11 +868,11 @@ taken_call_65: call rx_i_129 rx_i_66: ;FPSQRT - dec edi + dec ebp jz rx_finish xor r15, 015a1b689h mov ecx, r15d - call rx_read_dataset_f + call rx_readfloat_l2 andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm9, xmm0 @@ -924,11 +882,11 @@ rx_i_66: ;FPSQRT movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_67: ;CALL - dec edi + dec ebp jz rx_finish xor r14, 088393ba0h mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l1 cmp r13d, 2031541081 jns short taken_call_67 mov r9, rax @@ -938,11 +896,11 @@ taken_call_67: call rx_i_79 rx_i_68: ;FPSUB - dec edi + dec ebp jz rx_finish xor r13, 03aa5c3a4h mov ecx, r13d - call rx_read_dataset_f + call rx_readfloat_l1 subpd xmm0, xmm2 movaps xmm4, xmm0 mov eax, r12d @@ -951,33 +909,31 @@ rx_i_68: ;FPSUB movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_69: ;FPADD - dec edi + dec ebp jz rx_finish xor r15, 0376c9c27h mov ecx, r15d - call rx_read_dataset_f + call rx_readfloat_l2 addpd xmm0, xmm5 movaps xmm8, xmm0 rx_i_70: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r8, 0bbbec3fah - mov eax, r8d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readint_l2 mov rcx, r9 mul rcx mov rax, rdx mov r13, rax rx_i_71: ;FPMUL - dec edi + dec ebp jz rx_finish xor r14, 0e9efb350h - mov eax, r14d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readfloat_l1 mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -985,11 +941,11 @@ 
rx_i_71: ;FPMUL movaps xmm7, xmm0 rx_i_72: ;CALL - dec edi + dec ebp jz rx_finish xor r13, 0f4e51e28h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 cmp r9d, -631091751 jno short taken_call_72 mov rcx, rax @@ -1003,26 +959,23 @@ taken_call_72: call rx_i_191 rx_i_73: ;FPROUND - dec edi + dec ebp jz rx_finish xor r12, 0c24ddbd4h mov ecx, r12d - call rx_read_dataset_r - mov rcx, rax + call rx_readint_l2 shl eax, 13 - and rcx, -2048 and eax, 24576 - cvtsi2sd xmm2, rcx or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] rx_i_74: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r8, 04c4b0c7fh mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 imul rax, rax, -1431647438 mov rcx, rax mov eax, r9d @@ -1031,13 +984,12 @@ rx_i_74: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_75: ;RET - dec edi + dec ebp jz rx_finish xor r14, 03bcc02e3h - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r14d + call rx_readint_l2 + cmp rsp, rbx je short not_taken_ret_75 xor rax, qword ptr [rsp + 8] mov r13, rax @@ -1046,12 +998,11 @@ not_taken_ret_75: mov r13, rax rx_i_76: ;FPADD - dec edi + dec ebp jz rx_finish xor r11, 04b0ff63eh - mov eax, r11d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readfloat_l1 addpd xmm0, xmm3 movaps xmm7, xmm0 mov eax, r15d @@ -1060,13 +1011,12 @@ rx_i_76: ;FPADD movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_77: ;RET - dec edi + dec ebp jz rx_finish xor r14, 0b956b3e8h - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r14d + call rx_readint_l2 + cmp rsp, rbx je short not_taken_ret_77 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -1083,25 +1033,23 @@ not_taken_ret_77: mov qword ptr [rsi + rax * 8], rcx rx_i_78: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r9, 0edeca680h - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 mov ecx, eax mov eax, r8d imul 
rax, rcx mov r15, rax rx_i_79: ;RET - dec edi + dec ebp jz rx_finish xor r11, 0fbdddcb5h - mov eax, r11d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r11d + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_79 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -1118,30 +1066,29 @@ not_taken_ret_79: mov qword ptr [rsi + rax * 8], rcx rx_i_80: ;FPADD - dec edi + dec ebp jz rx_finish xor r13, 09cec97a1h - mov eax, r13d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readfloat_l2 addpd xmm0, xmm3 movaps xmm3, xmm0 rx_i_81: ;OR_64 - dec edi + dec ebp jz rx_finish xor r15, 078228167h mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l1 or rax, r13 mov r8, rax rx_i_82: ;CALL - dec edi + dec ebp jz rx_finish xor r11, 078cae1ffh mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l1 cmp r12d, -68969733 jo short taken_call_82 mov rcx, rax @@ -1155,21 +1102,20 @@ taken_call_82: call rx_i_145 rx_i_83: ;AND_64 - dec edi + dec ebp jz rx_finish xor r10, 0d9b6a533h - mov eax, r10d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readint_l1 and rax, r10 mov r12, rax rx_i_84: ;ROR_64 - dec edi + dec ebp jz rx_finish xor r15, 0e9e75336h mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l2 mov rcx, r10 ror rax, cl mov rcx, rax @@ -1179,21 +1125,20 @@ rx_i_84: ;ROR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_85: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r13, 04c0d378ah - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l1 imul rax, r8 mov r10, rax rx_i_86: ;OR_64 - dec edi + dec ebp jz rx_finish xor r11, 04386e368h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l1 or rax, r8 mov rcx, rax mov eax, r12d @@ -1202,22 +1147,20 @@ rx_i_86: ;OR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_87: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r9, 0d75a0ecfh - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + 
rax * 8] + mov ecx, r9d + call rx_readint_l1 sub rax, r12 mov r8, rax rx_i_88: ;FPADD - dec edi + dec ebp jz rx_finish xor r9, 031bb7f7ah - mov eax, r9d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readfloat_l1 addpd xmm0, xmm6 movaps xmm9, xmm0 mov eax, r9d @@ -1226,11 +1169,11 @@ rx_i_88: ;FPADD movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_89: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r9, 03b45ecebh mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l2 imul rax, r8 mov rcx, rax mov eax, r10d @@ -1239,21 +1182,20 @@ rx_i_89: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_90: ;FPADD - dec edi + dec ebp jz rx_finish xor r12, 0ee08e76bh - mov eax, r12d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l1 addpd xmm0, xmm3 movaps xmm6, xmm0 rx_i_91: ;FPMUL - dec edi + dec ebp jz rx_finish xor r9, 042e28e94h mov ecx, r9d - call rx_read_dataset_f + call rx_readfloat_l1 mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1261,11 +1203,11 @@ rx_i_91: ;FPMUL movaps xmm4, xmm0 rx_i_92: ;CALL - dec edi + dec ebp jz rx_finish xor r8, 0729260e1h mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l2 cmp r14d, 1288893603 jge short taken_call_92 mov r12, rax @@ -1275,12 +1217,11 @@ taken_call_92: call rx_i_170 rx_i_93: ;FPADD - dec edi + dec ebp jz rx_finish xor r8, 0bfcebaf4h - mov eax, r8d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l1 addpd xmm0, xmm2 movaps xmm2, xmm0 mov eax, r10d @@ -1289,13 +1230,12 @@ rx_i_93: ;FPADD movlpd qword ptr [rsi + rax * 8], xmm2 rx_i_94: ;RET - dec edi + dec ebp jz rx_finish xor r13, 0ea326630h - mov eax, r13d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r13d + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_94 xor rax, qword ptr [rsp + 8] mov r8, rax @@ -1304,11 +1244,11 @@ not_taken_ret_94: mov r8, rax rx_i_95: ;MUL_64 - dec edi + dec ebp jz 
rx_finish xor r13, 0b5451a2dh mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r10 mov rcx, rax mov eax, r15d @@ -1317,22 +1257,22 @@ rx_i_95: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_96: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r11, 04f912ef8h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l2 movsxd rcx, eax mov rax, -1354397081 imul rax, rcx mov r11, rax rx_i_97: ;FPSQRT - dec edi + dec ebp jz rx_finish xor r15, 0acc45b3bh mov ecx, r15d - call rx_read_dataset_f + call rx_readfloat_l1 andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm5, xmm0 @@ -1342,20 +1282,20 @@ rx_i_97: ;FPSQRT movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_98: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r14, 09900a4e8h mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l1 sub rax, r15 mov r14, rax rx_i_99: ;FPDIV - dec edi + dec ebp jz rx_finish xor r9, 0841b2984h mov ecx, r9d - call rx_read_dataset_f + call rx_readfloat_l2 divpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1367,31 +1307,29 @@ rx_i_99: ;FPDIV movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_100: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r15, 07ebea48fh mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l1 add rax, r9 mov r14, rax rx_i_101: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r10, 0631209d3h - mov eax, r10d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readint_l1 sub rax, r8 mov r11, rax rx_i_102: ;FPDIV - dec edi + dec ebp jz rx_finish xor r10, 0e50bf07ah - mov eax, r10d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readfloat_l1 divpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1399,12 +1337,11 @@ rx_i_102: ;FPDIV movaps xmm7, xmm0 rx_i_103: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r10, 02b7096f1h - mov eax, r10d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readint_l1 imul rax, r13 mov rcx, rax mov eax, r15d @@ -1413,12 +1350,11 @@ rx_i_103: 
;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_104: ;IMULH_64 - dec edi + dec ebp jz rx_finish xor r11, 075deaf71h - mov eax, r11d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readint_l1 mov rcx, -1913070089 imul rcx mov rax, rdx @@ -1429,11 +1365,11 @@ rx_i_104: ;IMULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_105: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r13, 036a51f72h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 mov ecx, eax mov eax, r15d imul rax, rcx @@ -1444,11 +1380,11 @@ rx_i_105: ;MUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_106: ;FPMUL - dec edi + dec ebp jz rx_finish xor r11, 07b512986h mov ecx, r11d - call rx_read_dataset_f + call rx_readfloat_l1 mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1460,12 +1396,11 @@ rx_i_106: ;FPMUL movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_107: ;CALL - dec edi + dec ebp jz rx_finish xor r12, 0f1d2e50h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l1 cmp r11d, 1917037441 jl short taken_call_107 mov rcx, rax @@ -1479,11 +1414,11 @@ taken_call_107: call rx_i_143 rx_i_108: ;FPDIV - dec edi + dec ebp jz rx_finish xor r9, 07327ba60h mov ecx, r9d - call rx_read_dataset_f + call rx_readfloat_l1 divpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1495,21 +1430,20 @@ rx_i_108: ;FPDIV movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_109: ;FPADD - dec edi + dec ebp jz rx_finish xor r15, 0594e37deh - mov eax, r15d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readfloat_l1 addpd xmm0, xmm2 movaps xmm3, xmm0 rx_i_110: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r9, 04cdf5ebah mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r9 rol rax, cl mov rcx, rax @@ -1519,12 +1453,12 @@ rx_i_110: ;ROL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_111: ;RET - dec edi + dec ebp jz rx_finish xor r8, 02e16c97ch mov ecx, r8d - call rx_read_dataset_r - cmp rsp, rbp + call 
rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_111 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -1541,12 +1475,11 @@ not_taken_ret_111: mov qword ptr [rsi + rax * 8], rcx rx_i_112: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r12, 0d42ddbd4h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l2 sub rax, r13 mov rcx, rax mov eax, r14d @@ -1555,42 +1488,42 @@ rx_i_112: ;SUB_64 mov qword ptr [rsi + rax * 8], rcx rx_i_113: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r10, 07a4f8cbbh mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r9 mul rcx mov rax, rdx mov r13, rax rx_i_114: ;IMULH_64 - dec edi + dec ebp jz rx_finish xor r13, 06e83e2cdh mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r15 imul rcx mov rax, rdx mov r14, rax rx_i_115: ;OR_64 - dec edi + dec ebp jz rx_finish xor r14, 0336c980eh mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l2 or rax, r10 mov r14, rax rx_i_116: ;IMULH_64 - dec edi + dec ebp jz rx_finish xor r10, 0d122702eh mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, -1850776691 imul rcx mov rax, rdx @@ -1601,12 +1534,11 @@ rx_i_116: ;IMULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_117: ;AND_64 - dec edi + dec ebp jz rx_finish xor r11, 015f2012bh - mov eax, r11d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readint_l1 and rax, -1205826972 mov rcx, rax mov eax, r15d @@ -1615,41 +1547,38 @@ rx_i_117: ;AND_64 mov qword ptr [rsi + rax * 8], rcx rx_i_118: ;FPSUB - dec edi + dec ebp jz rx_finish xor r9, 037ddf43dh - mov eax, r9d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readfloat_l2 subpd xmm0, xmm5 movaps xmm6, xmm0 rx_i_119: ;FPSUB - dec edi + dec ebp jz rx_finish xor r9, 0bba475f3h - mov eax, r9d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readfloat_l1 subpd xmm0, xmm3 movaps xmm5, xmm0 rx_i_120: ;FPADD - dec edi 
+ dec ebp jz rx_finish xor r12, 0e5561e3eh - mov eax, r12d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l1 addpd xmm0, xmm4 movaps xmm8, xmm0 rx_i_121: ;FPMUL - dec edi + dec ebp jz rx_finish xor r9, 03ab8f73h mov ecx, r9d - call rx_read_dataset_f + call rx_readfloat_l1 mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1657,12 +1586,12 @@ rx_i_121: ;FPMUL movaps xmm8, xmm0 rx_i_122: ;RET - dec edi + dec ebp jz rx_finish xor r10, 04e0dbd40h mov ecx, r10d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_122 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -1679,21 +1608,20 @@ not_taken_ret_122: mov qword ptr [rsi + rax * 8], rcx rx_i_123: ;ADD_32 - dec edi + dec ebp jz rx_finish xor r13, 073e9f58ah - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l1 add eax, r15d mov r13, rax rx_i_124: ;CALL - dec edi + dec ebp jz rx_finish xor r12, 0e3fa3670h mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l2 cmp r11d, 1719505436 jns short taken_call_124 mov rcx, rax @@ -1707,23 +1635,22 @@ taken_call_124: call rx_i_237 rx_i_125: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r8, 0ebec27cdh mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l2 mov ecx, eax mov eax, r14d imul rax, rcx mov r14, rax rx_i_126: ;FPDIV - dec edi + dec ebp jz rx_finish xor r8, 01feb5264h - mov eax, r8d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l1 divpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1731,32 +1658,31 @@ rx_i_126: ;FPDIV movaps xmm2, xmm0 rx_i_127: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r9, 0405f500fh mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l1 movsxd rcx, eax movsxd rax, r10d imul rax, rcx mov r8, rax rx_i_128: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r13, 0459f1154h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l2 imul rax, r9 
mov r9, rax rx_i_129: ;CALL - dec edi + dec ebp jz rx_finish xor r9, 081918b4ch - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 cmp r13d, -590624856 jge short taken_call_129 mov r9, rax @@ -1766,12 +1692,11 @@ taken_call_129: call rx_i_154 rx_i_130: ;OR_64 - dec edi + dec ebp jz rx_finish xor r9, 077c3b332h - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 or rax, -281794782 mov rcx, rax mov eax, r11d @@ -1780,13 +1705,12 @@ rx_i_130: ;OR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_131: ;RET - dec edi + dec ebp jz rx_finish xor r12, 05792310bh - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r12d + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_131 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -1803,20 +1727,20 @@ not_taken_ret_131: mov qword ptr [rsi + rax * 8], rcx rx_i_132: ;FPADD - dec edi + dec ebp jz rx_finish xor r10, 0ebc6e10h mov ecx, r10d - call rx_read_dataset_f + call rx_readfloat_l1 addpd xmm0, xmm6 movaps xmm7, xmm0 rx_i_133: ;XOR_64 - dec edi + dec ebp jz rx_finish xor r14, 0822f8b60h mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l1 xor rax, -1000526796 mov rcx, rax mov eax, r15d @@ -1825,20 +1749,20 @@ rx_i_133: ;XOR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_134: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r10, 0d0f18593h mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 add rax, 1516102347 mov r13, rax rx_i_135: ;FPMUL - dec edi + dec ebp jz rx_finish xor r11, 088212ef9h mov ecx, r11d - call rx_read_dataset_f + call rx_readfloat_l1 mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1846,11 +1770,11 @@ rx_i_135: ;FPMUL movaps xmm8, xmm0 rx_i_136: ;FPSQRT - dec edi + dec ebp jz rx_finish xor r8, 01ae56e03h mov ecx, r8d - call rx_read_dataset_f + call rx_readfloat_l1 andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm5, xmm0 @@ -1860,23 +1784,22 @@ rx_i_136: ;FPSQRT 
movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_137: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r11, 015a24231h - mov eax, r11d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readint_l1 mov rcx, r9 rol rax, cl mov r11, rax rx_i_138: ;RET - dec edi + dec ebp jz rx_finish xor r13, 02fd380c5h mov ecx, r13d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_138 xor rax, qword ptr [rsp + 8] mov r10, rax @@ -1885,12 +1808,11 @@ not_taken_ret_138: mov r10, rax rx_i_139: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r9, 093172470h - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 add rax, 515364082 mov rcx, rax mov eax, r11d @@ -1899,23 +1821,22 @@ rx_i_139: ;ADD_64 mov qword ptr [rsi + rax * 8], rcx rx_i_140: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r14, 052543553h - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l2 movsxd rcx, eax movsxd rax, r11d imul rax, rcx mov r14, rax rx_i_141: ;FPADD - dec edi + dec ebp jz rx_finish xor r8, 02f636da1h mov ecx, r8d - call rx_read_dataset_f + call rx_readfloat_l1 addpd xmm0, xmm2 movaps xmm9, xmm0 mov eax, r9d @@ -1924,11 +1845,11 @@ rx_i_141: ;FPADD movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_142: ;CALL - dec edi + dec ebp jz rx_finish xor r11, 0b11a4f2ch mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l2 cmp r12d, 1365939282 js short taken_call_142 mov rcx, rax @@ -1942,35 +1863,33 @@ taken_call_142: call rx_i_257 rx_i_143: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r15, 037f4b5d0h - mov eax, r15d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readint_l2 movsxd rcx, eax movsxd rax, r11d imul rax, rcx mov r9, rax rx_i_144: ;IMULH_64 - dec edi + dec ebp jz rx_finish xor r10, 02e59e00ah - mov eax, r10d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readint_l2 mov rcx, r11 
imul rcx mov rax, rdx mov r15, rax rx_i_145: ;IMULH_64 - dec edi + dec ebp jz rx_finish xor r13, 08d5c798h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r11 imul rcx mov rax, rdx @@ -1981,24 +1900,22 @@ rx_i_145: ;IMULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_146: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r13, 02327e6e2h - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l1 movsxd rcx, eax movsxd rax, r12d imul rax, rcx mov r10, rax rx_i_147: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r13, 03a7df043h - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l1 mov rcx, 1784404616 mul rcx mov rax, rdx @@ -2009,11 +1926,11 @@ rx_i_147: ;MULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_148: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r10, 0783e5c4eh mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 sub rax, r14 mov rcx, rax mov eax, r10d @@ -2022,12 +1939,11 @@ rx_i_148: ;SUB_64 mov qword ptr [rsi + rax * 8], rcx rx_i_149: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r12, 0aa0f5b2fh - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l1 mov ecx, eax mov eax, r14d imul rax, rcx @@ -2038,12 +1954,11 @@ rx_i_149: ;MUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_150: ;DIV_64 - dec edi + dec ebp jz rx_finish xor r9, 01504ca7ah - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 mov ecx, 1 mov edx, r8d test edx, edx @@ -2057,12 +1972,11 @@ rx_i_150: ;DIV_64 mov qword ptr [rsi + rax * 8], rcx rx_i_151: ;OR_32 - dec edi + dec ebp jz rx_finish xor r9, 0ea72a7cfh - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 or eax, r13d mov rcx, rax mov eax, r11d @@ -2071,21 +1985,21 @@ rx_i_151: ;OR_32 mov qword ptr [rsi + rax * 8], rcx rx_i_152: ;ROR_64 - dec edi + dec ebp jz rx_finish xor r13, 
0ad0e7a88h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r10 ror rax, cl mov r10, rax rx_i_153: ;FPDIV - dec edi + dec ebp jz rx_finish xor r15, 0fd95ab87h mov ecx, r15d - call rx_read_dataset_f + call rx_readfloat_l1 divpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2097,23 +2011,22 @@ rx_i_153: ;FPDIV movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_154: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r10, 0256697b0h - mov eax, r10d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readint_l2 mov ecx, eax mov eax, r13d imul rax, rcx mov r10, rax rx_i_155: ;ROR_64 - dec edi + dec ebp jz rx_finish xor r11, 0d23f3b78h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r10 ror rax, cl mov rcx, rax @@ -2123,42 +2036,41 @@ rx_i_155: ;ROR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_156: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r10, 098917533h mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l2 movsxd rcx, eax movsxd rax, r15d imul rax, rcx mov r15, rax rx_i_157: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r10, 0dfac3efch mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 add rax, r12 mov r14, rax rx_i_158: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r15, 0a64de090h - mov eax, r15d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readint_l1 add rax, 1233402159 mov r10, rax rx_i_159: ;RET - dec edi + dec ebp jz rx_finish xor r13, 0952a3abbh mov ecx, r13d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_159 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -2175,11 +2087,11 @@ not_taken_ret_159: mov qword ptr [rsi + rax * 8], rcx rx_i_160: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r14, 0b1685b90h mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l1 sub rax, 1518778665 mov rcx, rax mov eax, r10d @@ -2188,31 +2100,30 @@ rx_i_160: ;SUB_64 mov qword ptr [rsi + rax * 8], rcx rx_i_161: ;OR_64 - dec edi + 
dec ebp jz rx_finish xor r15, 0ea992531h mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l1 or rax, r14 mov r8, rax rx_i_162: ;SAR_64 - dec edi + dec ebp jz rx_finish xor r9, 01fd57a4ah - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 mov rcx, r10 sar rax, cl mov r13, rax rx_i_163: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r12, 0e3486c0ah mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l2 sub rax, -2101130488 mov rcx, rax mov eax, r14d @@ -2221,12 +2132,11 @@ rx_i_163: ;SUB_64 mov qword ptr [rsi + rax * 8], rcx rx_i_164: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r12, 01f0c2737h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l1 mov ecx, eax mov eax, r9d imul rax, rcx @@ -2237,13 +2147,12 @@ rx_i_164: ;MUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_165: ;RET - dec edi + dec ebp jz rx_finish xor r12, 0debb493eh - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r12d + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_165 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -2260,12 +2169,11 @@ not_taken_ret_165: mov qword ptr [rsi + rax * 8], rcx rx_i_166: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r9, 0fe684081h - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l2 mov rcx, r8 rol rax, cl mov rcx, rax @@ -2275,12 +2183,11 @@ rx_i_166: ;ROL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_167: ;FPMUL - dec edi + dec ebp jz rx_finish xor r11, 0d10371ch - mov eax, r11d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readfloat_l1 mulpd xmm0, xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2292,23 +2199,22 @@ rx_i_167: ;FPMUL movlpd qword ptr [rsi + rax * 8], xmm2 rx_i_168: ;FPSQRT - dec edi + dec ebp jz rx_finish xor r12, 071b15effh - mov eax, r12d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, 
r12d + call rx_readfloat_l1 andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm7, xmm0 rx_i_169: ;RET - dec edi + dec ebp jz rx_finish xor r11, 072790347h mov ecx, r11d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_169 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -2325,12 +2231,11 @@ not_taken_ret_169: mov qword ptr [rsi + rax * 8], rcx rx_i_170: ;CALL - dec edi + dec ebp jz rx_finish xor r8, 04ae8a020h - mov eax, r8d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readint_l2 cmp r10d, -1541051751 jl short taken_call_170 mov r14, rax @@ -2340,34 +2245,31 @@ taken_call_170: call rx_i_204 rx_i_171: ;IMULH_64 - dec edi + dec ebp jz rx_finish xor r15, 09901e05bh - mov eax, r15d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readint_l1 mov rcx, r12 imul rcx mov rax, rdx mov r12, rax rx_i_172: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r13, 050e8c510h - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l1 sub rax, r11 mov r12, rax rx_i_173: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r14, 05422cf8fh - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l1 imul rax, r12 mov rcx, rax mov eax, r12d @@ -2376,50 +2278,42 @@ rx_i_173: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_174: ;FPROUND - dec edi + dec ebp jz rx_finish xor r12, 0a025c3dbh - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - mov rcx, rax + mov ecx, r12d + call rx_readint_l1 shl eax, 13 - and rcx, -2048 and eax, 24576 - cvtsi2sd xmm6, rcx or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] - mov eax, r14d - xor eax, 02be6989fh - and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_175: ;SAR_64 - dec edi + dec ebp jz rx_finish xor r13, 08f74c11h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r8 sar rax, cl mov r8, rax rx_i_176: 
;SUB_64 - dec edi + dec ebp jz rx_finish xor r9, 01f2ed5f1h mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l2 sub rax, r14 mov r10, rax rx_i_177: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r10, 0d2072c79h mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l2 add rax, r10 mov rcx, rax mov eax, r13d @@ -2428,12 +2322,12 @@ rx_i_177: ;ADD_64 mov qword ptr [rsi + rax * 8], rcx rx_i_178: ;RET - dec edi + dec ebp jz rx_finish xor r15, 0a8e51933h mov ecx, r15d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_178 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -2450,22 +2344,20 @@ not_taken_ret_178: mov qword ptr [rsi + rax * 8], rcx rx_i_179: ;FPADD - dec edi + dec ebp jz rx_finish xor r12, 0934ad492h - mov eax, r12d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l1 addpd xmm0, xmm2 movaps xmm8, xmm0 rx_i_180: ;XOR_64 - dec edi + dec ebp jz rx_finish xor r15, 01cb3ce1fh - mov eax, r15d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readint_l2 xor rax, 1995308563 mov rcx, rax mov eax, r9d @@ -2474,12 +2366,12 @@ rx_i_180: ;XOR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_181: ;RET - dec edi + dec ebp jz rx_finish xor r10, 023c7845fh mov ecx, r10d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l2 + cmp rsp, rbx je short not_taken_ret_181 xor rax, qword ptr [rsp + 8] mov r10, rax @@ -2488,40 +2380,38 @@ not_taken_ret_181: mov r10, rax rx_i_182: ;FPSUB - dec edi + dec ebp jz rx_finish xor r8, 0f8884327h mov ecx, r8d - call rx_read_dataset_f + call rx_readfloat_l1 subpd xmm0, xmm7 movaps xmm6, xmm0 rx_i_183: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r13, 013070461h - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l1 add rax, 137260710 mov r10, rax rx_i_184: ;SAR_64 - dec edi + dec ebp jz rx_finish xor r12, 04764cdf7h mov ecx, r12d - call rx_read_dataset_r + call 
rx_readint_l2 sar rax, 40 mov r12, rax rx_i_185: ;CALL - dec edi + dec ebp jz rx_finish xor r10, 03c41026fh - mov eax, r10d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readint_l1 cmp r15d, -1510284125 jbe short taken_call_185 mov rcx, rax @@ -2535,11 +2425,11 @@ taken_call_185: call rx_i_246 rx_i_186: ;XOR_32 - dec edi + dec ebp jz rx_finish xor r9, 0cded414bh mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l1 xor eax, r15d mov rcx, rax mov eax, r10d @@ -2548,11 +2438,11 @@ rx_i_186: ;XOR_32 mov qword ptr [rsi + rax * 8], rcx rx_i_187: ;FPDIV - dec edi + dec ebp jz rx_finish xor r13, 05c6d64a8h mov ecx, r13d - call rx_read_dataset_f + call rx_readfloat_l2 divpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2560,12 +2450,11 @@ rx_i_187: ;FPDIV movaps xmm5, xmm0 rx_i_188: ;FPMUL - dec edi + dec ebp jz rx_finish xor r9, 04659becbh - mov eax, r9d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readfloat_l1 mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2573,27 +2462,24 @@ rx_i_188: ;FPMUL movaps xmm4, xmm0 rx_i_189: ;FPROUND - dec edi + dec ebp jz rx_finish xor r11, 0c52741d5h mov ecx, r11d - call rx_read_dataset_r - mov rcx, rax + call rx_readint_l1 shl eax, 13 - and rcx, -2048 and eax, 24576 - cvtsi2sd xmm5, rcx or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] rx_i_190: ;RET - dec edi + dec ebp jz rx_finish xor r12, 0217bf5f3h mov ecx, r12d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l2 + cmp rsp, rbx je short not_taken_ret_190 xor rax, qword ptr [rsp + 8] mov r13, rax @@ -2602,11 +2488,11 @@ not_taken_ret_190: mov r13, rax rx_i_191: ;CALL - dec edi + dec ebp jz rx_finish xor r15, 0884f3526h mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l1 cmp r11d, 1687119072 jno short taken_call_191 mov rcx, rax @@ -2620,12 +2506,11 @@ taken_call_191: call rx_i_275 rx_i_192: ;CALL - dec edi + dec ebp jz rx_finish xor r8, 0d76edad3h - mov 
eax, r8d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readint_l1 cmp r14d, -117628864 jns short taken_call_192 mov r8, rax @@ -2635,11 +2520,11 @@ taken_call_192: call rx_i_305 rx_i_193: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r12, 0e9939ach mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l1 mov ecx, eax mov eax, r12d imul rax, rcx @@ -2650,12 +2535,11 @@ rx_i_193: ;MUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_194: ;FPMUL - dec edi + dec ebp jz rx_finish xor r12, 0f21ca520h - mov eax, r12d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l2 mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2667,21 +2551,21 @@ rx_i_194: ;FPMUL movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_195: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r10, 09405152ch mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r8 rol rax, cl mov r9, rax rx_i_196: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r8, 0c2a9f41bh mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l2 sub rax, -1907903895 mov rcx, rax mov eax, r13d @@ -2690,21 +2574,20 @@ rx_i_196: ;SUB_64 mov qword ptr [rsi + rax * 8], rcx rx_i_197: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r12, 0229208efh mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r15 mov r11, rax rx_i_198: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r14, 0c8d95bbbh - mov eax, r14d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l1 mov rcx, r14 mul rcx mov rax, rdx @@ -2715,11 +2598,11 @@ rx_i_198: ;MULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_199: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r13, 050049e2eh mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l2 mov rcx, r10 mul rcx mov rax, rdx @@ -2730,11 +2613,11 @@ rx_i_199: ;MULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_200: ;FPSUB - dec edi + dec ebp jz rx_finish xor r10, 0c63b99e8h mov ecx, r10d - call rx_read_dataset_f 
+ call rx_readfloat_l1 subpd xmm0, xmm2 movaps xmm4, xmm0 mov eax, r12d @@ -2743,12 +2626,11 @@ rx_i_200: ;FPSUB movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_201: ;FPADD - dec edi + dec ebp jz rx_finish xor r8, 0cdda801dh - mov eax, r8d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l1 addpd xmm0, xmm9 movaps xmm4, xmm0 mov eax, r12d @@ -2757,21 +2639,20 @@ rx_i_201: ;FPADD movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_202: ;FPSUB - dec edi + dec ebp jz rx_finish xor r13, 0fa44b04ah - mov eax, r13d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readfloat_l2 subpd xmm0, xmm9 movaps xmm5, xmm0 rx_i_203: ;FPSUB - dec edi + dec ebp jz rx_finish xor r10, 0d73e472ch mov ecx, r10d - call rx_read_dataset_f + call rx_readfloat_l1 subpd xmm0, xmm2 movaps xmm7, xmm0 mov eax, r15d @@ -2780,12 +2661,11 @@ rx_i_203: ;FPSUB movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_204: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r9, 01af8ab1dh - mov eax, r9d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 imul rax, r15 mov rcx, rax mov eax, r8d @@ -2794,12 +2674,11 @@ rx_i_204: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_205: ;FPDIV - dec edi + dec ebp jz rx_finish xor r14, 094e997c5h - mov eax, r14d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readfloat_l1 divpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2807,11 +2686,11 @@ rx_i_205: ;FPDIV movaps xmm5, xmm0 rx_i_206: ;FPMUL - dec edi + dec ebp jz rx_finish xor r11, 0e836a177h mov ecx, r11d - call rx_read_dataset_f + call rx_readfloat_l1 mulpd xmm0, xmm7 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2819,11 +2698,11 @@ rx_i_206: ;FPMUL movaps xmm4, xmm0 rx_i_207: ;AND_32 - dec edi + dec ebp jz rx_finish xor r9, 039ccdd30h mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l1 and eax, r12d mov rcx, rax mov eax, r9d @@ -2832,21 +2711,20 @@ rx_i_207: ;AND_32 mov qword 
ptr [rsi + rax * 8], rcx rx_i_208: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r9, 0f4f126c5h mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r12 mov r10, rax rx_i_209: ;SHR_64 - dec edi + dec ebp jz rx_finish xor r8, 0b84811f1h - mov eax, r8d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readint_l1 shr rax, 30 mov rcx, rax mov eax, r12d @@ -2855,11 +2733,11 @@ rx_i_209: ;SHR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_210: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r12, 0c5efc90ah mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l2 mov ecx, eax mov eax, -1027162400 imul rax, rcx @@ -2870,21 +2748,20 @@ rx_i_210: ;MUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_211: ;FPADD - dec edi + dec ebp jz rx_finish xor r12, 0ce533072h - mov eax, r12d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l2 addpd xmm0, xmm9 movaps xmm3, xmm0 rx_i_212: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r13, 06b465fdbh mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r13 mov rcx, rax mov eax, r15d @@ -2893,43 +2770,41 @@ rx_i_212: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_213: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r13, 02dd1d503h - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l1 movsxd rcx, eax mov rax, 129993589 imul rax, rcx mov r14, rax rx_i_214: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r9, 0a159f313h - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 mov rcx, r14 rol rax, cl mov r14, rax rx_i_215: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r15, 08359265eh mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l1 sub rax, r12 mov r10, rax rx_i_216: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r12, 080696de3h mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r13 mov rcx, rax mov eax, r15d @@ -2938,12 +2813,11 @@ 
rx_i_216: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_217: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r8, 040d5b526h - mov eax, r8d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readint_l1 movsxd rcx, eax movsxd rax, r9d imul rax, rcx @@ -2954,11 +2828,11 @@ rx_i_217: ;IMUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_218: ;CALL - dec edi + dec ebp jz rx_finish xor r11, 083c0bd93h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l2 cmp r8d, -585552250 jge short taken_call_218 mov r11, rax @@ -2968,11 +2842,11 @@ taken_call_218: call rx_i_240 rx_i_219: ;XOR_64 - dec edi + dec ebp jz rx_finish xor r8, 0ca37f668h mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 xor rax, -740915304 mov rcx, rax mov eax, r15d @@ -2981,11 +2855,11 @@ rx_i_219: ;XOR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_220: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r9, 0bb44c384h mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l1 movsxd rcx, eax movsxd rax, r11d imul rax, rcx @@ -2996,12 +2870,11 @@ rx_i_220: ;IMUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_221: ;IMULH_64 - dec edi + dec ebp jz rx_finish xor r9, 0a3deb512h - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 mov rcx, r15 imul rcx mov rax, rdx @@ -3012,12 +2885,11 @@ rx_i_221: ;IMULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_222: ;FPMUL - dec edi + dec ebp jz rx_finish xor r9, 084a02d64h - mov eax, r9d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readfloat_l2 mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3029,12 +2901,11 @@ rx_i_222: ;FPMUL movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_223: ;FPSUB - dec edi + dec ebp jz rx_finish xor r8, 01e5cc085h - mov eax, r8d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l1 subpd xmm0, xmm3 movaps xmm2, xmm0 mov eax, r10d @@ -3043,11 +2914,11 @@ rx_i_223: ;FPSUB movlpd qword ptr [rsi 
+ rax * 8], xmm2 rx_i_224: ;SAR_64 - dec edi + dec ebp jz rx_finish xor r12, 053982440h mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r14 sar rax, cl mov rcx, rax @@ -3057,11 +2928,11 @@ rx_i_224: ;SAR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_225: ;DIV_64 - dec edi + dec ebp jz rx_finish xor r13, 0c558367eh mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l2 mov ecx, 1 mov edx, r10d test edx, edx @@ -3075,11 +2946,11 @@ rx_i_225: ;DIV_64 mov qword ptr [rsi + rax * 8], rcx rx_i_226: ;CALL - dec edi + dec ebp jz rx_finish xor r10, 040139b65h mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 cmp r8d, -1752488808 jno short taken_call_226 mov rcx, rax @@ -3093,12 +2964,11 @@ taken_call_226: call rx_i_328 rx_i_227: ;FPDIV - dec edi + dec ebp jz rx_finish xor r11, 0fa312dbdh - mov eax, r11d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readfloat_l1 divpd xmm0, xmm7 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3110,12 +2980,11 @@ rx_i_227: ;FPDIV movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_228: ;CALL - dec edi + dec ebp jz rx_finish xor r11, 0b64246c0h - mov eax, r11d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readint_l1 cmp r10d, -2099304 jns short taken_call_228 mov rcx, rax @@ -3129,11 +2998,11 @@ taken_call_228: call rx_i_283 rx_i_229: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r11, 05c535836h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l2 movsxd rcx, eax movsxd rax, r12d imul rax, rcx @@ -3144,12 +3013,11 @@ rx_i_229: ;IMUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_230: ;FPMUL - dec edi + dec ebp jz rx_finish xor r15, 0f394972eh - mov eax, r15d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readfloat_l1 mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3161,12 +3029,12 @@ rx_i_230: ;FPMUL movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_231: ;RET - dec edi + dec ebp jz rx_finish xor r9, 0bb56428dh 
mov ecx, r9d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_231 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -3183,12 +3051,11 @@ not_taken_ret_231: mov qword ptr [rsi + rax * 8], rcx rx_i_232: ;FPDIV - dec edi + dec ebp jz rx_finish xor r15, 09ab46ab3h - mov eax, r15d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readfloat_l1 divpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3196,11 +3063,11 @@ rx_i_232: ;FPDIV movaps xmm7, xmm0 rx_i_233: ;CALL - dec edi + dec ebp jz rx_finish xor r13, 08eb2cd76h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 cmp r12d, 392389867 jo short taken_call_233 mov r14, rax @@ -3210,27 +3077,23 @@ taken_call_233: call rx_i_268 rx_i_234: ;FPROUND - dec edi + dec ebp jz rx_finish xor r15, 0ba687578h - mov eax, r15d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - mov rcx, rax + mov ecx, r15d + call rx_readint_l1 shl eax, 13 - and rcx, -2048 and eax, 24576 - cvtsi2sd xmm4, rcx or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] rx_i_235: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r13, 0b6cb9ff2h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l2 movsxd rcx, eax movsxd rax, r12d imul rax, rcx @@ -3241,20 +3104,20 @@ rx_i_235: ;IMUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_236: ;FPADD - dec edi + dec ebp jz rx_finish xor r15, 03ad196ach mov ecx, r15d - call rx_read_dataset_f + call rx_readfloat_l2 addpd xmm0, xmm4 movaps xmm3, xmm0 rx_i_237: ;CALL - dec edi + dec ebp jz rx_finish xor r15, 0fab4600h mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l2 cmp r12d, -121899164 jge short taken_call_237 mov r11, rax @@ -3264,11 +3127,11 @@ taken_call_237: call rx_i_295 rx_i_238: ;FPADD - dec edi + dec ebp jz rx_finish xor r8, 0158f119fh mov ecx, r8d - call rx_read_dataset_f + call rx_readfloat_l2 addpd xmm0, xmm6 movaps xmm7, xmm0 mov eax, r15d @@ -3277,32 +3140,31 @@ rx_i_238: ;FPADD movlpd qword 
ptr [rsi + rax * 8], xmm7 rx_i_239: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r13, 044f30b3fh - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l1 add rax, r10 mov r10, rax rx_i_240: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r9, 0d65d29f9h mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l1 movsxd rcx, eax mov rax, -423830277 imul rax, rcx mov r8, rax rx_i_241: ;FPADD - dec edi + dec ebp jz rx_finish xor r11, 0ce5260adh mov ecx, r11d - call rx_read_dataset_f + call rx_readfloat_l1 addpd xmm0, xmm3 movaps xmm7, xmm0 mov eax, r15d @@ -3311,12 +3173,11 @@ rx_i_241: ;FPADD movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_242: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r12, 01119b0f9h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l2 mov rcx, 319324914 mul rcx mov rax, rdx @@ -3327,31 +3188,29 @@ rx_i_242: ;MULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_243: ;XOR_64 - dec edi + dec ebp jz rx_finish xor r12, 0d6c2ce3dh - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l1 xor rax, 1198180774 mov r14, rax rx_i_244: ;FPADD - dec edi + dec ebp jz rx_finish xor r11, 0c6a6248h - mov eax, r11d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readfloat_l2 addpd xmm0, xmm6 movaps xmm9, xmm0 rx_i_245: ;XOR_64 - dec edi + dec ebp jz rx_finish xor r13, 084505739h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 xor rax, -1546539637 mov rcx, rax mov eax, r12d @@ -3360,21 +3219,20 @@ rx_i_245: ;XOR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_246: ;AND_64 - dec edi + dec ebp jz rx_finish xor r15, 027eeaa2eh - mov eax, r15d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readint_l2 and rax, r9 mov r12, rax rx_i_247: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r10, 0c4de0296h mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 
movsxd rcx, eax movsxd rax, r14d imul rax, rcx @@ -3385,12 +3243,11 @@ rx_i_247: ;IMUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_248: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r8, 0649df46fh - mov eax, r8d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readint_l2 mov ecx, eax mov eax, r15d imul rax, rcx @@ -3401,11 +3258,11 @@ rx_i_248: ;MUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_249: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r15, 0499552cch mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l2 movsxd rcx, eax movsxd rax, r11d imul rax, rcx @@ -3416,12 +3273,11 @@ rx_i_249: ;IMUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_250: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r13, 083eafe6fh - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l2 imul rax, r8 mov rcx, rax mov eax, r14d @@ -3430,12 +3286,11 @@ rx_i_250: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_251: ;FPMUL - dec edi + dec ebp jz rx_finish xor r13, 0a25a4d8ah - mov eax, r13d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readfloat_l2 mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3447,21 +3302,21 @@ rx_i_251: ;FPMUL movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_252: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r14, 08a75ad41h mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l2 mov rcx, r8 rol rax, cl mov r14, rax rx_i_253: ;CALL - dec edi + dec ebp jz rx_finish xor r14, 057f3f596h mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l1 cmp r15d, 1699431947 jns short taken_call_253 mov rcx, rax @@ -3475,11 +3330,11 @@ taken_call_253: call rx_i_367 rx_i_254: ;FPSUB - dec edi + dec ebp jz rx_finish xor r14, 04cfb709eh mov ecx, r14d - call rx_read_dataset_f + call rx_readfloat_l1 subpd xmm0, xmm4 movaps xmm8, xmm0 mov eax, r8d @@ -3488,11 +3343,11 @@ rx_i_254: ;FPSUB movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_255: ;FPADD - dec edi + dec ebp jz 
rx_finish xor r9, 0b96ec9ech mov ecx, r9d - call rx_read_dataset_f + call rx_readfloat_l1 addpd xmm0, xmm5 movaps xmm6, xmm0 mov eax, r14d @@ -3501,11 +3356,11 @@ rx_i_255: ;FPADD movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_256: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r8, 08375472ch mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r15 mul rcx mov rax, rdx @@ -3516,11 +3371,11 @@ rx_i_256: ;MULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_257: ;FPADD - dec edi + dec ebp jz rx_finish xor r12, 0d75a8c3fh mov ecx, r12d - call rx_read_dataset_f + call rx_readfloat_l2 addpd xmm0, xmm5 movaps xmm3, xmm0 mov eax, r11d @@ -3529,12 +3384,11 @@ rx_i_257: ;FPADD movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_258: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r11, 064fdbda0h - mov eax, r11d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readint_l1 mov ecx, eax mov eax, r14d imul rax, rcx @@ -3545,22 +3399,20 @@ rx_i_258: ;MUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_259: ;FPADD - dec edi + dec ebp jz rx_finish xor r11, 02e36a073h - mov eax, r11d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readfloat_l1 addpd xmm0, xmm9 movaps xmm3, xmm0 rx_i_260: ;FPMUL - dec edi + dec ebp jz rx_finish xor r13, 0f94e9fa9h - mov eax, r13d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readfloat_l2 mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3568,11 +3420,11 @@ rx_i_260: ;FPMUL movaps xmm9, xmm0 rx_i_261: ;FPSQRT - dec edi + dec ebp jz rx_finish xor r14, 02346171ch mov ecx, r14d - call rx_read_dataset_f + call rx_readfloat_l2 andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm3, xmm0 @@ -3582,12 +3434,11 @@ rx_i_261: ;FPSQRT movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_262: ;OR_32 - dec edi + dec ebp jz rx_finish xor r10, 01c42baa6h - mov eax, r10d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readint_l1 or eax, r13d mov rcx, 
rax mov eax, r11d @@ -3596,11 +3447,11 @@ rx_i_262: ;OR_32 mov qword ptr [rsi + rax * 8], rcx rx_i_263: ;FPDIV - dec edi + dec ebp jz rx_finish xor r11, 0b39b140h mov ecx, r11d - call rx_read_dataset_f + call rx_readfloat_l1 divpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3608,12 +3459,11 @@ rx_i_263: ;FPDIV movaps xmm6, xmm0 rx_i_264: ;FPMUL - dec edi + dec ebp jz rx_finish xor r11, 01a07d201h - mov eax, r11d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readfloat_l2 mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3621,12 +3471,11 @@ rx_i_264: ;FPMUL movaps xmm7, xmm0 rx_i_265: ;FPADD - dec edi + dec ebp jz rx_finish xor r13, 07a3eb340h - mov eax, r13d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readfloat_l2 addpd xmm0, xmm8 movaps xmm2, xmm0 mov eax, r10d @@ -3635,13 +3484,12 @@ rx_i_265: ;FPADD movhpd qword ptr [rsi + rax * 8], xmm2 rx_i_266: ;RET - dec edi + dec ebp jz rx_finish xor r13, 03d0a3a89h - mov eax, r13d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r13d + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_266 xor rax, qword ptr [rsp + 8] mov r10, rax @@ -3650,22 +3498,20 @@ not_taken_ret_266: mov r10, rax rx_i_267: ;ROR_64 - dec edi + dec ebp jz rx_finish xor r8, 0c6c7b37h - mov eax, r8d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readint_l1 ror rax, 56 mov r11, rax rx_i_268: ;CALL - dec edi + dec ebp jz rx_finish xor r12, 0c2510cebh - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l2 cmp r15d, -2062812966 jl short taken_call_268 mov r13, rax @@ -3675,11 +3521,11 @@ taken_call_268: call rx_i_381 rx_i_269: ;ROR_64 - dec edi + dec ebp jz rx_finish xor r11, 0c80cc899h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r8 ror rax, cl mov rcx, rax @@ -3689,11 +3535,11 @@ rx_i_269: ;ROR_64 mov qword ptr [rsi + rax 
* 8], rcx rx_i_270: ;FPMUL - dec edi + dec ebp jz rx_finish xor r11, 0eb355caah mov ecx, r11d - call rx_read_dataset_f + call rx_readfloat_l1 mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3701,11 +3547,11 @@ rx_i_270: ;FPMUL movaps xmm7, xmm0 rx_i_271: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r13, 0c6f12299h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 mov ecx, eax mov eax, -2032281772 imul rax, rcx @@ -3716,22 +3562,20 @@ rx_i_271: ;MUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_272: ;OR_32 - dec edi + dec ebp jz rx_finish xor r12, 0695a5dd2h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l2 or eax, r12d mov r13, rax rx_i_273: ;CALL - dec edi + dec ebp jz rx_finish xor r9, 0d315e4dch - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 cmp r12d, 1670848568 jl short taken_call_273 mov rcx, rax @@ -3745,11 +3589,11 @@ taken_call_273: call rx_i_372 rx_i_274: ;FPSUB - dec edi + dec ebp jz rx_finish xor r15, 0b66ca7e0h mov ecx, r15d - call rx_read_dataset_f + call rx_readfloat_l2 subpd xmm0, xmm4 movaps xmm6, xmm0 mov eax, r14d @@ -3758,20 +3602,20 @@ rx_i_274: ;FPSUB movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_275: ;OR_64 - dec edi + dec ebp jz rx_finish xor r10, 0788eceb7h mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l2 or rax, r11 mov r13, rax rx_i_276: ;CALL - dec edi + dec ebp jz rx_finish xor r9, 0c6ac5edah mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l1 cmp r11d, -1236180570 jns short taken_call_276 mov rcx, rax @@ -3785,11 +3629,11 @@ taken_call_276: call rx_i_404 rx_i_277: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r11, 0c9549789h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l2 movsxd rcx, eax movsxd rax, r10d imul rax, rcx @@ -3800,11 +3644,11 @@ rx_i_277: ;IMUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_278: ;FPSUB - dec edi + dec ebp jz rx_finish xor r9, 0a2bc66c9h mov ecx, r9d - call 
rx_read_dataset_f + call rx_readfloat_l1 subpd xmm0, xmm7 movaps xmm4, xmm0 mov eax, r12d @@ -3813,11 +3657,11 @@ rx_i_278: ;FPSUB movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_279: ;FPSUB - dec edi + dec ebp jz rx_finish xor r15, 0f1a91458h mov ecx, r15d - call rx_read_dataset_f + call rx_readfloat_l1 subpd xmm0, xmm5 movaps xmm9, xmm0 mov eax, r9d @@ -3826,12 +3670,11 @@ rx_i_279: ;FPSUB movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_280: ;AND_64 - dec edi + dec ebp jz rx_finish xor r12, 066246b43h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l2 and rax, r11 mov rcx, rax mov eax, r13d @@ -3840,11 +3683,11 @@ rx_i_280: ;AND_64 mov qword ptr [rsi + rax * 8], rcx rx_i_281: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r10, 05a762727h mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 sub rax, r10 mov rcx, rax mov eax, r11d @@ -3853,21 +3696,20 @@ rx_i_281: ;SUB_64 mov qword ptr [rsi + rax * 8], rcx rx_i_282: ;SUB_32 - dec edi + dec ebp jz rx_finish xor r15, 0de1ab603h mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l1 sub eax, 1367326224 mov r11, rax rx_i_283: ;ADD_32 - dec edi + dec ebp jz rx_finish xor r9, 0df4d084fh - mov eax, r9d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 add eax, -1156732976 mov rcx, rax mov eax, r12d @@ -3876,11 +3718,11 @@ rx_i_283: ;ADD_32 mov qword ptr [rsi + rax * 8], rcx rx_i_284: ;FPSUB - dec edi + dec ebp jz rx_finish xor r15, 0e68f36ach mov ecx, r15d - call rx_read_dataset_f + call rx_readfloat_l1 subpd xmm0, xmm6 movaps xmm9, xmm0 mov eax, r9d @@ -3889,33 +3731,31 @@ rx_i_284: ;FPSUB movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_285: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r8, 09adb333bh mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 movsxd rcx, eax movsxd rax, r8d imul rax, rcx mov r14, rax rx_i_286: ;FPADD - dec edi + dec ebp jz rx_finish xor r14, 082f5e36ch - mov eax, r14d - and eax, 2047 - cvtdq2pd 
xmm0, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readfloat_l1 addpd xmm0, xmm9 movaps xmm7, xmm0 rx_i_287: ;OR_64 - dec edi + dec ebp jz rx_finish xor r11, 049547c9ch - mov eax, r11d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readint_l1 or rax, r15 mov rcx, rax mov eax, r8d @@ -3924,12 +3764,11 @@ rx_i_287: ;OR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_288: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r10, 08716ac8bh - mov eax, r10d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readint_l1 imul rax, r8 mov rcx, rax mov eax, r9d @@ -3938,12 +3777,11 @@ rx_i_288: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_289: ;FPDIV - dec edi + dec ebp jz rx_finish xor r14, 0efef52b5h - mov eax, r14d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readfloat_l2 divpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3951,11 +3789,11 @@ rx_i_289: ;FPDIV movaps xmm8, xmm0 rx_i_290: ;FPMUL - dec edi + dec ebp jz rx_finish xor r15, 060665748h mov ecx, r15d - call rx_read_dataset_f + call rx_readfloat_l2 mulpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3963,13 +3801,12 @@ rx_i_290: ;FPMUL movaps xmm9, xmm0 rx_i_291: ;RET - dec edi + dec ebp jz rx_finish xor r13, 0ddf4bd1ah - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r13d + call rx_readint_l2 + cmp rsp, rbx je short not_taken_ret_291 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -3986,31 +3823,30 @@ not_taken_ret_291: mov qword ptr [rsi + rax * 8], rcx rx_i_292: ;ROR_64 - dec edi + dec ebp jz rx_finish xor r13, 05a87cc3dh - mov eax, r13d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l1 ror rax, 23 mov r10, rax rx_i_293: ;FPSUB - dec edi + dec ebp jz rx_finish xor r9, 0c61f4279h mov ecx, r9d - call rx_read_dataset_f + call rx_readfloat_l2 subpd xmm0, xmm5 movaps xmm8, xmm0 rx_i_294: ;RET - dec edi + dec ebp jz rx_finish 
xor r14, 0f3b9d85h mov ecx, r14d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l2 + cmp rsp, rbx je short not_taken_ret_294 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -4027,20 +3863,20 @@ not_taken_ret_294: mov qword ptr [rsi + rax * 8], rcx rx_i_295: ;FPSUB - dec edi + dec ebp jz rx_finish xor r9, 0f42798fdh mov ecx, r9d - call rx_read_dataset_f + call rx_readfloat_l1 subpd xmm0, xmm8 movaps xmm7, xmm0 rx_i_296: ;CALL - dec edi + dec ebp jz rx_finish xor r14, 018738758h mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l1 cmp r9d, -207252278 jns short taken_call_296 mov rcx, rax @@ -4054,32 +3890,29 @@ taken_call_296: call rx_i_395 rx_i_297: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r15, 0de3b9d9bh - mov eax, r15d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readint_l1 add rax, r10 mov r14, rax rx_i_298: ;FPSUB - dec edi + dec ebp jz rx_finish xor r14, 084f53637h - mov eax, r14d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readfloat_l1 subpd xmm0, xmm7 movaps xmm6, xmm0 rx_i_299: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r12, 042f4897h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l1 add rax, 21400308 mov rcx, rax mov eax, r12d @@ -4088,21 +3921,20 @@ rx_i_299: ;ADD_64 mov qword ptr [rsi + rax * 8], rcx rx_i_300: ;FPSUB - dec edi + dec ebp jz rx_finish xor r12, 095765693h - mov eax, r12d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l2 subpd xmm0, xmm8 movaps xmm2, xmm0 rx_i_301: ;FPMUL - dec edi + dec ebp jz rx_finish xor r8, 0a0ec5eech mov ecx, r8d - call rx_read_dataset_f + call rx_readfloat_l1 mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4114,21 +3946,20 @@ rx_i_301: ;FPMUL movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_302: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r15, 0f6f8c345h mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l1 
add rax, r10 mov r11, rax rx_i_303: ;FPADD - dec edi + dec ebp jz rx_finish xor r14, 082a3e965h - mov eax, r14d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readfloat_l1 addpd xmm0, xmm3 movaps xmm9, xmm0 mov eax, r9d @@ -4137,60 +3968,57 @@ rx_i_303: ;FPADD movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_304: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r12, 04940c652h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l1 imul rax, r15 mov r13, rax rx_i_305: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r11, 03c6c62b8h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l2 imul rax, rax, -65873120 mov r10, rax rx_i_306: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r15, 08b34cdfch mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l2 add rax, r15 mov r13, rax rx_i_307: ;SAR_64 - dec edi + dec ebp jz rx_finish xor r15, 04c36adb1h mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r8 sar rax, cl mov r10, rax rx_i_308: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r11, 0a4213b21h - mov eax, r11d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readint_l2 imul rax, r13 mov r15, rax rx_i_309: ;IMULH_64 - dec edi + dec ebp jz rx_finish xor r9, 090c42304h - mov eax, r9d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 mov rcx, -1652850028 imul rcx mov rax, rdx @@ -4201,12 +4029,11 @@ rx_i_309: ;IMULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_310: ;FPMUL - dec edi + dec ebp jz rx_finish xor r9, 0f78e1c8ch - mov eax, r9d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readfloat_l1 mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4218,11 +4045,11 @@ rx_i_310: ;FPMUL movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_311: ;FPMUL - dec edi + dec ebp jz rx_finish xor r8, 0ff8848cfh mov ecx, r8d - call rx_read_dataset_f + call rx_readfloat_l1 mulpd xmm0, 
xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4230,32 +4057,31 @@ rx_i_311: ;FPMUL movaps xmm4, xmm0 rx_i_312: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r13, 0b18904cdh mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 mov ecx, eax mov eax, -1147928648 imul rax, rcx mov r10, rax rx_i_313: ;FPADD - dec edi + dec ebp jz rx_finish xor r8, 0a0d0befh - mov eax, r8d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l1 addpd xmm0, xmm5 movaps xmm6, xmm0 rx_i_314: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r15, 01e3c65f7h mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l1 movsxd rcx, eax movsxd rax, r9d imul rax, rcx @@ -4266,23 +4092,22 @@ rx_i_314: ;IMUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_315: ;SHR_64 - dec edi + dec ebp jz rx_finish xor r9, 02e36ddafh - mov eax, r9d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 mov rcx, r15 shr rax, cl mov r9, rax rx_i_316: ;RET - dec edi + dec ebp jz rx_finish xor r14, 05b0cb5bbh mov ecx, r14d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_316 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -4299,29 +4124,29 @@ not_taken_ret_316: mov qword ptr [rsi + rax * 8], rcx rx_i_317: ;FPADD - dec edi + dec ebp jz rx_finish xor r9, 0c74e7415h mov ecx, r9d - call rx_read_dataset_f + call rx_readfloat_l1 addpd xmm0, xmm7 movaps xmm5, xmm0 rx_i_318: ;FPADD - dec edi + dec ebp jz rx_finish xor r9, 057621d9ah mov ecx, r9d - call rx_read_dataset_f + call rx_readfloat_l1 addpd xmm0, xmm3 movaps xmm7, xmm0 rx_i_319: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r13, 08ee02d99h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l2 mov rcx, r15 rol rax, cl mov rcx, rax @@ -4331,11 +4156,11 @@ rx_i_319: ;ROL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_320: ;FPADD - dec edi + dec ebp jz rx_finish xor r15, 013461188h mov ecx, r15d - call rx_read_dataset_f + call rx_readfloat_l1 addpd 
xmm0, xmm4 movaps xmm2, xmm0 mov eax, r10d @@ -4344,12 +4169,11 @@ rx_i_320: ;FPADD movlpd qword ptr [rsi + rax * 8], xmm2 rx_i_321: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r11, 0a7bae383h - mov eax, r11d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readint_l1 movsxd rcx, eax movsxd rax, r9d imul rax, rcx @@ -4360,12 +4184,12 @@ rx_i_321: ;IMUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_322: ;RET - dec edi + dec ebp jz rx_finish xor r14, 08215399bh mov ecx, r14d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_322 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -4382,12 +4206,11 @@ not_taken_ret_322: mov qword ptr [rsi + rax * 8], rcx rx_i_323: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r14, 07b07664bh - mov eax, r14d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l1 mov rcx, -696924877 mul rcx mov rax, rdx @@ -4398,12 +4221,11 @@ rx_i_323: ;MULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_324: ;FPSQRT - dec edi + dec ebp jz rx_finish xor r9, 0f956baffh - mov eax, r9d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readfloat_l1 andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm9, xmm0 @@ -4413,21 +4235,20 @@ rx_i_324: ;FPSQRT movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_325: ;SHL_64 - dec edi + dec ebp jz rx_finish xor r11, 0708ab9d1h - mov eax, r11d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readint_l1 shl rax, 24 mov r13, rax rx_i_326: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r11, 0d1b27540h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r8 mul rcx mov rax, rdx @@ -4438,33 +4259,31 @@ rx_i_326: ;MULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_327: ;AND_64 - dec edi + dec ebp jz rx_finish xor r9, 09665f98dh - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 and rax, r15 mov r12, rax 
rx_i_328: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r12, 0fb9c32adh - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l1 mov rcx, r13 rol rax, cl mov r9, rax rx_i_329: ;RET - dec edi + dec ebp jz rx_finish xor r11, 0e1110623h mov ecx, r11d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_329 xor rax, qword ptr [rsp + 8] mov r11, rax @@ -4473,12 +4292,11 @@ not_taken_ret_329: mov r11, rax rx_i_330: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r9, 0f6a93f19h - mov eax, r9d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 mov ecx, eax mov eax, r13d imul rax, rcx @@ -4489,22 +4307,20 @@ rx_i_330: ;MUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_331: ;FPADD - dec edi + dec ebp jz rx_finish xor r9, 0bc9bbe4ah - mov eax, r9d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readfloat_l1 addpd xmm0, xmm3 movaps xmm9, xmm0 rx_i_332: ;FPADD - dec edi + dec ebp jz rx_finish xor r12, 0f253cd4eh - mov eax, r12d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l1 addpd xmm0, xmm6 movaps xmm3, xmm0 mov eax, r11d @@ -4513,31 +4329,29 @@ rx_i_332: ;FPADD movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_333: ;XOR_64 - dec edi + dec ebp jz rx_finish xor r14, 0f009758bh - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l2 xor rax, -175125848 mov r11, rax rx_i_334: ;ADD_32 - dec edi + dec ebp jz rx_finish xor r8, 0dda04168h mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 add eax, r13d mov r8, rax rx_i_335: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r15, 03e6cfb73h - mov eax, r15d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readint_l1 sub rax, r8 mov rcx, rax mov eax, r12d @@ -4546,21 +4360,20 @@ rx_i_335: ;SUB_64 mov qword ptr [rsi + rax * 8], rcx rx_i_336: ;FPADD - 
dec edi + dec ebp jz rx_finish xor r15, 0aea0a435h - mov eax, r15d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readfloat_l1 addpd xmm0, xmm2 movaps xmm3, xmm0 rx_i_337: ;ADD_32 - dec edi + dec ebp jz rx_finish xor r8, 03d6c4ab2h mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 add eax, r12d mov rcx, rax mov eax, r13d @@ -4569,41 +4382,38 @@ rx_i_337: ;ADD_32 mov qword ptr [rsi + rax * 8], rcx rx_i_338: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r12, 0d428a742h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l2 imul rax, r12 mov r11, rax rx_i_339: ;FPADD - dec edi + dec ebp jz rx_finish xor r9, 04596ef73h - mov eax, r9d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readfloat_l1 addpd xmm0, xmm6 movaps xmm2, xmm0 rx_i_340: ;FPSUB - dec edi + dec ebp jz rx_finish xor r15, 0e51629cch mov ecx, r15d - call rx_read_dataset_f + call rx_readfloat_l1 subpd xmm0, xmm5 movaps xmm5, xmm0 rx_i_341: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r12, 019eb9ea5h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l1 mov ecx, eax mov eax, r15d imul rax, rcx @@ -4614,11 +4424,11 @@ rx_i_341: ;MUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_342: ;FPMUL - dec edi + dec ebp jz rx_finish xor r9, 09ccc7abah mov ecx, r9d - call rx_read_dataset_f + call rx_readfloat_l2 mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4626,12 +4436,11 @@ rx_i_342: ;FPMUL movaps xmm3, xmm0 rx_i_343: ;SHR_64 - dec edi + dec ebp jz rx_finish xor r14, 056f6cf0bh - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l1 shr rax, 48 mov rcx, rax mov eax, r15d @@ -4640,12 +4449,11 @@ rx_i_343: ;SHR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_344: ;FPMUL - dec edi + dec ebp jz rx_finish xor r10, 03ef9bcc4h - mov eax, r10d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi 
+ rax * 8] + mov ecx, r10d + call rx_readfloat_l2 mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4653,11 +4461,11 @@ rx_i_344: ;FPMUL movaps xmm5, xmm0 rx_i_345: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r12, 0bbbcdbach mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r13 mul rcx mov rax, rdx @@ -4668,11 +4476,11 @@ rx_i_345: ;MULH_64 mov qword ptr [rsi + rax * 8], rcx rx_i_346: ;XOR_64 - dec edi + dec ebp jz rx_finish xor r12, 0ae9d1e96h mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l1 xor rax, r15 mov rcx, rax mov eax, r13d @@ -4681,22 +4489,20 @@ rx_i_346: ;XOR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_347: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r14, 070c34d69h - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l1 add rax, r10 mov r13, rax rx_i_348: ;FPSUB - dec edi + dec ebp jz rx_finish xor r13, 0523ff904h - mov eax, r13d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readfloat_l1 subpd xmm0, xmm3 movaps xmm9, xmm0 mov eax, r9d @@ -4705,22 +4511,20 @@ rx_i_348: ;FPSUB movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_349: ;XOR_32 - dec edi + dec ebp jz rx_finish xor r8, 018e0e5ddh - mov eax, r8d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readint_l2 xor eax, r15d mov r13, rax rx_i_350: ;CALL - dec edi + dec ebp jz rx_finish xor r9, 09bd050f0h - mov eax, r9d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 cmp r9d, -980411581 jbe short taken_call_350 mov rcx, rax @@ -4734,20 +4538,20 @@ taken_call_350: call rx_i_352 rx_i_351: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r11, 0a3a5906fh mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l2 imul rax, r10 mov r13, rax rx_i_352: ;FPADD - dec edi + dec ebp jz rx_finish xor r10, 0afc9af2bh mov ecx, r10d - call rx_read_dataset_f + call rx_readfloat_l1 addpd xmm0, xmm6 movaps xmm2, xmm0 mov eax, r10d @@ 
-4756,12 +4560,11 @@ rx_i_352: ;FPADD movhpd qword ptr [rsi + rax * 8], xmm2 rx_i_353: ;FPMUL - dec edi + dec ebp jz rx_finish xor r13, 02e65278bh - mov eax, r13d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readfloat_l1 mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4773,22 +4576,22 @@ rx_i_353: ;FPMUL movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_354: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r13, 02412fc10h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l2 mov rcx, r13 mul rcx mov rax, rdx mov r13, rax rx_i_355: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r10, 06bd6e65fh mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r14 mov rcx, rax mov eax, r8d @@ -4797,31 +4600,29 @@ rx_i_355: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_356: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r10, 01cd85d80h mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l2 imul rax, r10 mov r11, rax rx_i_357: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r10, 0f7daed36h - mov eax, r10d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readint_l2 add rax, 820073637 mov r11, rax rx_i_358: ;DIV_64 - dec edi + dec ebp jz rx_finish xor r13, 088fa6e5ah - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l2 mov ecx, 1 mov edx, r11d test edx, edx @@ -4831,11 +4632,11 @@ rx_i_358: ;DIV_64 mov r9, rax rx_i_359: ;FPSUB - dec edi + dec ebp jz rx_finish xor r10, 0714fc2cdh mov ecx, r10d - call rx_read_dataset_f + call rx_readfloat_l2 subpd xmm0, xmm9 movaps xmm4, xmm0 mov eax, r12d @@ -4844,12 +4645,11 @@ rx_i_359: ;FPSUB movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_360: ;FPMUL - dec edi + dec ebp jz rx_finish xor r10, 0c2d110b5h - mov eax, r10d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readfloat_l1 mulpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4857,11 +4657,11 @@ rx_i_360: ;FPMUL 
movaps xmm8, xmm0 rx_i_361: ;FPSQRT - dec edi + dec ebp jz rx_finish xor r15, 01d125a7fh mov ecx, r15d - call rx_read_dataset_f + call rx_readfloat_l1 andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm6, xmm0 @@ -4871,12 +4671,11 @@ rx_i_361: ;FPSQRT movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_362: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r9, 0ed8954bdh - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 sub rax, 1082179469 mov rcx, rax mov eax, r15d @@ -4885,12 +4684,11 @@ rx_i_362: ;SUB_64 mov qword ptr [rsi + rax * 8], rcx rx_i_363: ;FPMUL - dec edi + dec ebp jz rx_finish xor r12, 09f75887bh - mov eax, r12d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l1 mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4898,23 +4696,22 @@ rx_i_363: ;FPMUL movaps xmm3, xmm0 rx_i_364: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r11, 0badaf867h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r8 mul rcx mov rax, rdx mov r8, rax rx_i_365: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r15, 02db4444ah - mov eax, r15d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readint_l2 movsxd rcx, eax movsxd rax, r9d imul rax, rcx @@ -4925,12 +4722,11 @@ rx_i_365: ;IMUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_366: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r12, 0bff7218fh - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l2 movsxd rcx, eax movsxd rax, r8d imul rax, rcx @@ -4941,11 +4737,11 @@ rx_i_366: ;IMUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_367: ;FPADD - dec edi + dec ebp jz rx_finish xor r9, 04d14cb3ah mov ecx, r9d - call rx_read_dataset_f + call rx_readfloat_l2 addpd xmm0, xmm9 movaps xmm4, xmm0 mov eax, r12d @@ -4954,31 +4750,29 @@ rx_i_367: ;FPADD movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_368: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r10, 0a14836bah mov 
ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r10 mov r8, rax rx_i_369: ;AND_64 - dec edi + dec ebp jz rx_finish xor r9, 053fe22e2h - mov eax, r9d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 and rax, r13 mov r9, rax rx_i_370: ;FPSUB - dec edi + dec ebp jz rx_finish xor r15, 010e1fb24h - mov eax, r15d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readfloat_l1 subpd xmm0, xmm6 movaps xmm6, xmm0 mov eax, r14d @@ -4987,11 +4781,11 @@ rx_i_370: ;FPSUB movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_371: ;FPADD - dec edi + dec ebp jz rx_finish xor r8, 0ebbd5cc9h mov ecx, r8d - call rx_read_dataset_f + call rx_readfloat_l1 addpd xmm0, xmm9 movaps xmm5, xmm0 mov eax, r13d @@ -5000,23 +4794,21 @@ rx_i_371: ;FPADD movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_372: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r10, 098ab79d7h - mov eax, r10d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readint_l2 mov rcx, r13 rol rax, cl mov r9, rax rx_i_373: ;FPDIV - dec edi + dec ebp jz rx_finish xor r15, 056438b3h - mov eax, r15d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readfloat_l2 divpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5024,11 +4816,11 @@ rx_i_373: ;FPDIV movaps xmm4, xmm0 rx_i_374: ;FPMUL - dec edi + dec ebp jz rx_finish xor r11, 0dbcce604h mov ecx, r11d - call rx_read_dataset_f + call rx_readfloat_l1 mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5036,11 +4828,11 @@ rx_i_374: ;FPMUL movaps xmm2, xmm0 rx_i_375: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r9, 0edea6200h mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l1 add rax, r15 mov rcx, rax mov eax, r12d @@ -5049,12 +4841,11 @@ rx_i_375: ;ADD_64 mov qword ptr [rsi + rax * 8], rcx rx_i_376: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r14, 05e61b279h - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + 
mov ecx, r14d + call rx_readint_l1 add rax, 476136066 mov rcx, rax mov eax, r8d @@ -5063,31 +4854,31 @@ rx_i_376: ;ADD_64 mov qword ptr [rsi + rax * 8], rcx rx_i_377: ;FPSUB - dec edi + dec ebp jz rx_finish xor r14, 0fc1fb433h mov ecx, r14d - call rx_read_dataset_f + call rx_readfloat_l1 subpd xmm0, xmm3 movaps xmm7, xmm0 rx_i_378: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r12, 082aa21ach mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l1 mov ecx, eax mov eax, 547725353 imul rax, rcx mov r15, rax rx_i_379: ;FPADD - dec edi + dec ebp jz rx_finish xor r10, 05dba41fbh mov ecx, r10d - call rx_read_dataset_f + call rx_readfloat_l2 addpd xmm0, xmm9 movaps xmm5, xmm0 mov eax, r13d @@ -5096,12 +4887,11 @@ rx_i_379: ;FPADD movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_380: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r11, 0229e3d6eh - mov eax, r11d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readint_l1 imul rax, rax, -1443002912 mov rcx, rax mov eax, r13d @@ -5110,21 +4900,21 @@ rx_i_380: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_381: ;SAR_64 - dec edi + dec ebp jz rx_finish xor r8, 019816ff9h mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l2 mov rcx, r14 sar rax, cl mov r9, rax rx_i_382: ;FPADD - dec edi + dec ebp jz rx_finish xor r14, 036b5b81fh mov ecx, r14d - call rx_read_dataset_f + call rx_readfloat_l1 addpd xmm0, xmm3 movaps xmm3, xmm0 mov eax, r11d @@ -5133,12 +4923,11 @@ rx_i_382: ;FPADD movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_383: ;FPSUB - dec edi + dec ebp jz rx_finish xor r15, 05f798ec3h - mov eax, r15d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readfloat_l1 subpd xmm0, xmm4 movaps xmm5, xmm0 mov eax, r13d @@ -5147,11 +4936,11 @@ rx_i_383: ;FPSUB movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_384: ;SHR_64 - dec edi + dec ebp jz rx_finish xor r10, 05b459fd7h mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r11 shr rax, cl mov rcx, rax @@ 
-5161,11 +4950,11 @@ rx_i_384: ;SHR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_385: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r15, 0c91749bbh mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r12 mov rcx, rax mov eax, r13d @@ -5174,31 +4963,30 @@ rx_i_385: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_386: ;FPADD - dec edi + dec ebp jz rx_finish xor r9, 0575b4bdch mov ecx, r9d - call rx_read_dataset_f + call rx_readfloat_l2 addpd xmm0, xmm8 movaps xmm9, xmm0 rx_i_387: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r9, 0d4f7bc6ah mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l2 imul rax, r15 mov r9, rax rx_i_388: ;RET - dec edi + dec ebp jz rx_finish xor r8, 08a949356h - mov eax, r8d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r8d + call rx_readint_l2 + cmp rsp, rbx je short not_taken_ret_388 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -5215,12 +5003,11 @@ not_taken_ret_388: mov qword ptr [rsi + rax * 8], rcx rx_i_389: ;CALL - dec edi + dec ebp jz rx_finish xor r11, 06531ad2eh - mov eax, r11d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readint_l1 cmp r9d, -350609584 jge short taken_call_389 mov r14, rax @@ -5230,32 +5017,29 @@ taken_call_389: call rx_i_421 rx_i_390: ;FPADD - dec edi + dec ebp jz rx_finish xor r15, 02914abeah - mov eax, r15d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readfloat_l1 addpd xmm0, xmm4 movaps xmm3, xmm0 rx_i_391: ;FPADD - dec edi + dec ebp jz rx_finish xor r8, 0473a41f0h - mov eax, r8d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l1 addpd xmm0, xmm3 movaps xmm6, xmm0 rx_i_392: ;ROR_64 - dec edi + dec ebp jz rx_finish xor r14, 01ebc1f0dh - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l2 ror rax, 0 mov rcx, rax mov eax, r13d @@ -5264,12 +5048,11 @@ rx_i_392: ;ROR_64 mov qword ptr [rsi + rax * 
8], rcx rx_i_393: ;OR_32 - dec edi + dec ebp jz rx_finish xor r14, 0742e95b1h - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l1 or eax, 552339548 mov rcx, rax mov eax, r13d @@ -5278,22 +5061,20 @@ rx_i_393: ;OR_32 mov qword ptr [rsi + rax * 8], rcx rx_i_394: ;FPADD - dec edi + dec ebp jz rx_finish xor r12, 0db885c2ch - mov eax, r12d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l2 addpd xmm0, xmm9 movaps xmm6, xmm0 rx_i_395: ;IDIV_64 - dec edi + dec ebp jz rx_finish xor r8, 04ae4fe8ch - mov eax, r8d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readint_l1 mov edx, r13d cmp edx, -1 jne short safe_idiv_395 @@ -5312,20 +5093,20 @@ result_idiv_395: mov r8, rax rx_i_396: ;FPADD - dec edi + dec ebp jz rx_finish xor r10, 07b41862bh mov ecx, r10d - call rx_read_dataset_f + call rx_readfloat_l1 addpd xmm0, xmm7 movaps xmm4, xmm0 rx_i_397: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r8, 0916f3819h mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r12 mov rcx, rax mov eax, r10d @@ -5334,12 +5115,11 @@ rx_i_397: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_398: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r8, 04eb6fd2ah - mov eax, r8d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readint_l1 rol rax, 44 mov rcx, rax mov eax, r11d @@ -5348,11 +5128,11 @@ rx_i_398: ;ROL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_399: ;FPDIV - dec edi + dec ebp jz rx_finish xor r11, 0899a98cfh mov ecx, r11d - call rx_read_dataset_f + call rx_readfloat_l1 divpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5360,12 +5140,11 @@ rx_i_399: ;FPDIV movaps xmm6, xmm0 rx_i_400: ;OR_32 - dec edi + dec ebp jz rx_finish xor r13, 0aae75db6h - mov eax, r13d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l1 or eax, r11d mov rcx, rax mov eax, r14d @@ -5374,12 +5153,11 @@ 
rx_i_400: ;OR_32 mov qword ptr [rsi + rax * 8], rcx rx_i_401: ;FPMUL - dec edi + dec ebp jz rx_finish xor r13, 032e81f25h - mov eax, r13d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readfloat_l1 mulpd xmm0, xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5391,12 +5169,12 @@ rx_i_401: ;FPMUL movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_402: ;RET - dec edi + dec ebp jz rx_finish xor r9, 0fa1a07ffh mov ecx, r9d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_402 xor rax, qword ptr [rsp + 8] mov r14, rax @@ -5405,12 +5183,11 @@ not_taken_ret_402: mov r14, rax rx_i_403: ;IDIV_64 - dec edi + dec ebp jz rx_finish xor r9, 0e59500f7h - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 mov edx, r12d cmp edx, -1 jne short safe_idiv_403 @@ -5433,23 +5210,23 @@ result_idiv_403: mov qword ptr [rsi + rax * 8], rcx rx_i_404: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r15, 05b8ceb2fh mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l1 mov ecx, eax mov eax, r8d imul rax, rcx mov r15, rax rx_i_405: ;RET - dec edi + dec ebp jz rx_finish xor r8, 0f61082a3h mov ecx, r8d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_405 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -5466,31 +5243,23 @@ not_taken_ret_405: mov qword ptr [rsi + rax * 8], rcx rx_i_406: ;FPROUND - dec edi + dec ebp jz rx_finish xor r9, 0af6886b7h - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - mov rcx, rax + mov ecx, r9d + call rx_readint_l2 shl eax, 13 - and rcx, -2048 and eax, 24576 - cvtsi2sd xmm9, rcx or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] - mov eax, r9d - xor eax, 09862adefh - and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_407: ;FPMUL - dec edi + dec ebp jz rx_finish xor r14, 09699566fh mov ecx, r14d - call rx_read_dataset_f + call rx_readfloat_l2 mulpd 
xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5498,11 +5267,11 @@ rx_i_407: ;FPMUL movaps xmm8, xmm0 rx_i_408: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r15, 066e79fa6h mov ecx, r15d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r9 mov rcx, rax mov eax, r10d @@ -5511,22 +5280,21 @@ rx_i_408: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_409: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r11, 04b6caa9ah mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r15 mov r8, rax rx_i_410: ;RET - dec edi + dec ebp jz rx_finish xor r15, 0d17f245eh - mov eax, r15d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r15d + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_410 xor rax, qword ptr [rsp + 8] mov r8, rax @@ -5535,13 +5303,12 @@ not_taken_ret_410: mov r8, rax rx_i_411: ;RET - dec edi + dec ebp jz rx_finish xor r12, 0364f10e7h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r12d + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_411 xor rax, qword ptr [rsp + 8] mov r12, rax @@ -5550,12 +5317,11 @@ not_taken_ret_411: mov r12, rax rx_i_412: ;FPSQRT - dec edi + dec ebp jz rx_finish xor r10, 0ac90e7ah - mov eax, r10d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readfloat_l1 andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm3, xmm0 @@ -5565,12 +5331,11 @@ rx_i_412: ;FPSQRT movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_413: ;FPDIV - dec edi + dec ebp jz rx_finish xor r11, 04b6037abh - mov eax, r11d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readfloat_l1 divpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5578,11 +5343,11 @@ rx_i_413: ;FPDIV movaps xmm4, xmm0 rx_i_414: ;OR_64 - dec edi + dec ebp jz rx_finish xor r14, 06c01554dh mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l1 or rax, r8 mov rcx, rax mov eax, r10d @@ -5591,23 +5356,22 @@ rx_i_414: ;OR_64 mov 
qword ptr [rsi + rax * 8], rcx rx_i_415: ;DIV_64 - dec edi + dec ebp jz rx_finish xor r8, 08c3e59a1h mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 mov ecx, -538093385 xor edx, edx div rcx mov r9, rax rx_i_416: ;FPSUB - dec edi + dec ebp jz rx_finish xor r12, 0f3fafde9h - mov eax, r12d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l1 subpd xmm0, xmm3 movaps xmm5, xmm0 mov eax, r13d @@ -5616,31 +5380,31 @@ rx_i_416: ;FPSUB movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_417: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r10, 03c6481fah mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 sub rax, r12 mov r10, rax rx_i_418: ;MULH_64 - dec edi + dec ebp jz rx_finish xor r10, 02bd61c5fh mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r11 mul rcx mov rax, rdx mov r10, rax rx_i_419: ;XOR_64 - dec edi + dec ebp jz rx_finish xor r9, 0b6ab9d32h mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l1 xor rax, r14 mov rcx, rax mov eax, r14d @@ -5649,12 +5413,11 @@ rx_i_419: ;XOR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_420: ;FPADD - dec edi + dec ebp jz rx_finish xor r9, 0f9690ceah - mov eax, r9d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readfloat_l1 addpd xmm0, xmm3 movaps xmm9, xmm0 mov eax, r9d @@ -5663,13 +5426,12 @@ rx_i_420: ;FPADD movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_421: ;RET - dec edi + dec ebp jz rx_finish xor r12, 01ada0f39h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r12d + call rx_readint_l2 + cmp rsp, rbx je short not_taken_ret_421 xor rax, qword ptr [rsp + 8] mov r10, rax @@ -5678,22 +5440,22 @@ not_taken_ret_421: mov r10, rax rx_i_422: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r11, 04dd16ca4h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l2 movsxd rcx, eax movsxd rax, r10d imul rax, rcx mov r13, rax rx_i_423: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r12, 
04df5ce05h mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r10 mov rcx, rax mov eax, r15d @@ -5702,11 +5464,11 @@ rx_i_423: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_424: ;FPADD - dec edi + dec ebp jz rx_finish xor r13, 01ad12ce2h mov ecx, r13d - call rx_read_dataset_f + call rx_readfloat_l2 addpd xmm0, xmm7 movaps xmm9, xmm0 mov eax, r9d @@ -5715,23 +5477,22 @@ rx_i_424: ;FPADD movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_425: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r8, 0a3c5391dh mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 movsxd rcx, eax movsxd rax, r10d imul rax, rcx mov r14, rax rx_i_426: ;AND_64 - dec edi + dec ebp jz rx_finish xor r12, 09dd55ba0h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l2 and rax, r9 mov rcx, rax mov eax, r14d @@ -5740,12 +5501,11 @@ rx_i_426: ;AND_64 mov qword ptr [rsi + rax * 8], rcx rx_i_427: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r11, 0d6cae9aeh - mov eax, r11d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readint_l1 mov ecx, eax mov eax, r11d imul rax, rcx @@ -5756,13 +5516,12 @@ rx_i_427: ;MUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_428: ;RET - dec edi + dec ebp jz rx_finish xor r11, 0f807a961h - mov eax, r11d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r11d + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_428 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -5779,21 +5538,20 @@ not_taken_ret_428: mov qword ptr [rsi + rax * 8], rcx rx_i_429: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r12, 0650a4102h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l2 imul rax, rax, 1990438276 mov r15, rax rx_i_430: ;FPADD - dec edi + dec ebp jz rx_finish xor r14, 019cc0e5h mov ecx, r14d - call rx_read_dataset_f + call rx_readfloat_l1 addpd xmm0, xmm8 movaps xmm5, xmm0 mov eax, r13d @@ -5802,11 +5560,11 @@ 
rx_i_430: ;FPADD movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_431: ;FPADD - dec edi + dec ebp jz rx_finish xor r12, 0ed17ab58h mov ecx, r12d - call rx_read_dataset_f + call rx_readfloat_l1 addpd xmm0, xmm5 movaps xmm5, xmm0 mov eax, r13d @@ -5815,20 +5573,20 @@ rx_i_431: ;FPADD movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_432: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r10, 01c3b321fh mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l2 sub rax, r10 mov r8, rax rx_i_433: ;ADD_32 - dec edi + dec ebp jz rx_finish xor r13, 0bbb88499h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 add eax, r12d mov rcx, rax mov eax, r12d @@ -5837,11 +5595,11 @@ rx_i_433: ;ADD_32 mov qword ptr [rsi + rax * 8], rcx rx_i_434: ;FPSQRT - dec edi + dec ebp jz rx_finish xor r13, 0167edabdh mov ecx, r13d - call rx_read_dataset_f + call rx_readfloat_l2 andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm9, xmm0 @@ -5851,12 +5609,11 @@ rx_i_434: ;FPSQRT movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_435: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r15, 0b940480ah - mov eax, r15d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readint_l1 imul rax, r15 mov rcx, rax mov eax, r9d @@ -5865,12 +5622,11 @@ rx_i_435: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_436: ;FPADD - dec edi + dec ebp jz rx_finish xor r15, 0bfc3ca8bh - mov eax, r15d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readfloat_l2 addpd xmm0, xmm2 movaps xmm7, xmm0 mov eax, r15d @@ -5879,11 +5635,11 @@ rx_i_436: ;FPADD movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_437: ;FPDIV - dec edi + dec ebp jz rx_finish xor r8, 098a6bcf7h mov ecx, r8d - call rx_read_dataset_f + call rx_readfloat_l1 divpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5891,11 +5647,11 @@ rx_i_437: ;FPDIV movaps xmm8, xmm0 rx_i_438: ;FPMUL - dec edi + dec ebp jz rx_finish xor r10, 0325b38ebh mov ecx, r10d - call rx_read_dataset_f + call rx_readfloat_l1 mulpd xmm0, xmm9 movaps 
xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5903,11 +5659,11 @@ rx_i_438: ;FPMUL movaps xmm4, xmm0 rx_i_439: ;XOR_32 - dec edi + dec ebp jz rx_finish xor r13, 05e807e81h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l2 xor eax, r15d mov rcx, rax mov eax, r10d @@ -5916,13 +5672,12 @@ rx_i_439: ;XOR_32 mov qword ptr [rsi + rax * 8], rcx rx_i_440: ;RET - dec edi + dec ebp jz rx_finish xor r10, 062f83728h - mov eax, r10d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r10d + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_440 xor rax, qword ptr [rsp + 8] mov r9, rax @@ -5931,11 +5686,11 @@ not_taken_ret_440: mov r9, rax rx_i_441: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r14, 0d18ec075h mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l1 add rax, 529736748 mov rcx, rax mov eax, r9d @@ -5944,11 +5699,11 @@ rx_i_441: ;ADD_64 mov qword ptr [rsi + rax * 8], rcx rx_i_442: ;CALL - dec edi + dec ebp jz rx_finish xor r14, 0a53dd1bh mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l1 cmp r15d, 799523062 jbe short taken_call_442 mov rcx, rax @@ -5962,13 +5717,12 @@ taken_call_442: call rx_i_9 rx_i_443: ;RET - dec edi + dec ebp jz rx_finish xor r14, 0232d1285h - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r14d + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_443 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -5985,12 +5739,11 @@ not_taken_ret_443: mov qword ptr [rsi + rax * 8], rcx rx_i_444: ;FPMUL - dec edi + dec ebp jz rx_finish xor r8, 042455dd8h - mov eax, r8d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l2 mulpd xmm0, xmm7 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6002,11 +5755,11 @@ rx_i_444: ;FPMUL movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_445: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r13, 09ae009b2h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l1 add rax, r11 mov rcx, rax mov eax, r9d @@ 
-6015,11 +5768,11 @@ rx_i_445: ;ADD_64 mov qword ptr [rsi + rax * 8], rcx rx_i_446: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r12, 01734708eh mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l1 mov ecx, eax mov eax, r15d imul rax, rcx @@ -6030,11 +5783,11 @@ rx_i_446: ;MUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_447: ;FPSUB - dec edi + dec ebp jz rx_finish xor r8, 01596d0e8h mov ecx, r8d - call rx_read_dataset_f + call rx_readfloat_l1 subpd xmm0, xmm7 movaps xmm5, xmm0 mov eax, r13d @@ -6043,31 +5796,29 @@ rx_i_447: ;FPSUB movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_448: ;FPSUB - dec edi + dec ebp jz rx_finish xor r9, 0390cfdb0h - mov eax, r9d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readfloat_l1 subpd xmm0, xmm3 movaps xmm9, xmm0 rx_i_449: ;ROR_64 - dec edi + dec ebp jz rx_finish xor r8, 04f27744bh - mov eax, r8d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readint_l1 ror rax, 28 mov r8, rax rx_i_450: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r8, 04e2c76ffh mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r12 rol rax, cl mov rcx, rax @@ -6077,21 +5828,21 @@ rx_i_450: ;ROL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_451: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r8, 0c4d99ac9h mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 add rax, -287502157 mov r8, rax rx_i_452: ;RET - dec edi + dec ebp jz rx_finish xor r13, 040130b88h mov ecx, r13d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_452 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -6108,23 +5859,22 @@ not_taken_ret_452: mov qword ptr [rsi + rax * 8], rcx rx_i_453: ;IMULH_64 - dec edi + dec ebp jz rx_finish xor r11, 0a2096aa4h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r14 imul rcx mov rax, rdx mov r8, rax rx_i_454: ;FPADD - dec edi + dec ebp jz rx_finish xor r13, 081314291h - mov eax, r13d - and eax, 32767 - cvtdq2pd 
xmm0, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readfloat_l1 addpd xmm0, xmm9 movaps xmm4, xmm0 mov eax, r12d @@ -6133,21 +5883,20 @@ rx_i_454: ;FPADD movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_455: ;XOR_64 - dec edi + dec ebp jz rx_finish xor r8, 059263cdbh - mov eax, r8d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readint_l1 xor rax, r9 mov r8, rax rx_i_456: ;OR_32 - dec edi + dec ebp jz rx_finish xor r9, 010e8fe6h mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l2 or eax, r11d mov rcx, rax mov eax, r9d @@ -6156,11 +5905,11 @@ rx_i_456: ;OR_32 mov qword ptr [rsi + rax * 8], rcx rx_i_457: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r9, 09de1a3efh mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l1 sub rax, r10 mov rcx, rax mov eax, r10d @@ -6169,20 +5918,20 @@ rx_i_457: ;SUB_64 mov qword ptr [rsi + rax * 8], rcx rx_i_458: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r11, 05c79df6eh mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l1 rol rax, 22 mov r14, rax rx_i_459: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r9, 0346f46adh mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l1 imul rax, rax, 381354340 mov rcx, rax mov eax, r13d @@ -6191,11 +5940,11 @@ rx_i_459: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_460: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r11, 098ab71fch mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l1 sub rax, r14 mov rcx, rax mov eax, r12d @@ -6204,11 +5953,11 @@ rx_i_460: ;SUB_64 mov qword ptr [rsi + rax * 8], rcx rx_i_461: ;SHR_64 - dec edi + dec ebp jz rx_finish xor r11, 0c814e926h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l2 mov rcx, r13 shr rax, cl mov rcx, rax @@ -6218,31 +5967,29 @@ rx_i_461: ;SHR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_462: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r10, 0c64b4a9eh - mov eax, r10d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readint_l2 add rax, -1734323376 mov 
r15, rax rx_i_463: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r9, 08c29341h - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l1 sub rax, r15 mov r10, rax rx_i_464: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r12, 06ff587fdh mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r15 mov rcx, rax mov eax, r13d @@ -6251,43 +5998,40 @@ rx_i_464: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_465: ;FPADD - dec edi + dec ebp jz rx_finish xor r12, 0b62c0003h - mov eax, r12d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l2 addpd xmm0, xmm5 movaps xmm2, xmm0 rx_i_466: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r13, 05c541c42h - mov eax, r13d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readint_l1 movsxd rcx, eax mov rax, 282682508 imul rax, rcx mov r9, rax rx_i_467: ;FPADD - dec edi + dec ebp jz rx_finish xor r8, 0cbb33f81h - mov eax, r8d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l1 addpd xmm0, xmm9 movaps xmm8, xmm0 rx_i_468: ;IDIV_64 - dec edi + dec ebp jz rx_finish xor r8, 091044dc3h mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 mov edx, -13394825 cmp edx, -1 jne short safe_idiv_468 @@ -6310,11 +6054,11 @@ result_idiv_468: mov qword ptr [rsi + rax * 8], rcx rx_i_469: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r9, 0c0186beh mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l1 mov ecx, eax mov eax, 294019485 imul rax, rcx @@ -6325,11 +6069,11 @@ rx_i_469: ;MUL_32 mov qword ptr [rsi + rax * 8], rcx rx_i_470: ;XOR_32 - dec edi + dec ebp jz rx_finish xor r14, 090849e3eh mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l1 xor eax, r11d mov rcx, rax mov eax, r14d @@ -6338,24 +6082,22 @@ rx_i_470: ;XOR_32 mov qword ptr [rsi + rax * 8], rcx rx_i_471: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r14, 0cedba9b6h - mov eax, r14d - and eax, 32767 - 
mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l1 movsxd rcx, eax movsxd rax, r13d imul rax, rcx mov r14, rax rx_i_472: ;CALL - dec edi + dec ebp jz rx_finish xor r9, 038f4b9d6h - mov eax, r9d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readint_l2 cmp r10d, 1738497427 jl short taken_call_472 mov r10, rax @@ -6365,21 +6107,20 @@ taken_call_472: call rx_i_8 rx_i_473: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r14, 01fb7637dh - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l1 imul rax, rax, -751043211 mov r12, rax rx_i_474: ;CALL - dec edi + dec ebp jz rx_finish xor r9, 0b5c0b4d4h mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l2 cmp r15d, -233120543 jo short taken_call_474 mov r15, rax @@ -6389,32 +6130,29 @@ taken_call_474: call rx_i_69 rx_i_475: ;FPSUB - dec edi + dec ebp jz rx_finish xor r10, 0910dcdeeh - mov eax, r10d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readfloat_l2 subpd xmm0, xmm9 movaps xmm7, xmm0 rx_i_476: ;FPSUB - dec edi + dec ebp jz rx_finish xor r8, 07ab3b5a4h - mov eax, r8d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l1 subpd xmm0, xmm2 movaps xmm9, xmm0 rx_i_477: ;FPADD - dec edi + dec ebp jz rx_finish xor r12, 07a29ec63h - mov eax, r12d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readfloat_l1 addpd xmm0, xmm9 movaps xmm6, xmm0 mov eax, r14d @@ -6423,21 +6161,20 @@ rx_i_477: ;FPADD movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_478: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r14, 02d3d7e7fh mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l1 imul rax, r10 mov r12, rax rx_i_479: ;MUL_64 - dec edi + dec ebp jz rx_finish xor r12, 09b49c793h - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l1 imul rax, r14 mov rcx, rax mov eax, r13d @@ 
-6446,22 +6183,20 @@ rx_i_479: ;MUL_64 mov qword ptr [rsi + rax * 8], rcx rx_i_480: ;FPSUB - dec edi + dec ebp jz rx_finish xor r9, 0a9cc4f01h - mov eax, r9d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r9d + call rx_readfloat_l1 subpd xmm0, xmm4 movaps xmm6, xmm0 rx_i_481: ;DIV_64 - dec edi + dec ebp jz rx_finish xor r14, 0225ba1f9h - mov eax, r14d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r14d + call rx_readint_l1 mov ecx, 1 mov edx, r13d test edx, edx @@ -6471,38 +6206,38 @@ rx_i_481: ;DIV_64 mov r12, rax rx_i_482: ;XOR_64 - dec edi + dec ebp jz rx_finish xor r14, 044a0f592h mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l2 xor rax, r12 mov r11, rax rx_i_483: ;FPADD - dec edi + dec ebp jz rx_finish xor r11, 07f71f219h mov ecx, r11d - call rx_read_dataset_f + call rx_readfloat_l1 addpd xmm0, xmm6 movaps xmm6, xmm0 rx_i_484: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r12, 07027bacdh mov ecx, r12d - call rx_read_dataset_r + call rx_readint_l1 rol rax, 37 mov r11, rax rx_i_485: ;CALL - dec edi + dec ebp jz rx_finish xor r13, 03a04647h mov ecx, r13d - call rx_read_dataset_r + call rx_readint_l2 cmp r8d, 554879918 jno short taken_call_485 mov rcx, rax @@ -6516,12 +6251,11 @@ taken_call_485: call rx_i_58 rx_i_486: ;ADD_64 - dec edi + dec ebp jz rx_finish xor r15, 0ad072937h - mov eax, r15d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readint_l1 add rax, 942846898 mov rcx, rax mov eax, r8d @@ -6530,33 +6264,31 @@ rx_i_486: ;ADD_64 mov qword ptr [rsi + rax * 8], rcx rx_i_487: ;SUB_64 - dec edi + dec ebp jz rx_finish xor r11, 07f78ad34h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l1 sub rax, -333279706 mov r11, rax rx_i_488: ;IMULH_64 - dec edi + dec ebp jz rx_finish xor r12, 0d8b1788eh - mov eax, r12d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l1 mov rcx, 297357073 imul rcx mov rax, rdx mov r12, rax rx_i_489: ;CALL - dec 
edi + dec ebp jz rx_finish xor r10, 0b2ec9f3ah - mov eax, r10d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readint_l1 cmp r15d, -1127175870 jge short taken_call_489 mov rcx, rax @@ -6570,20 +6302,20 @@ taken_call_489: call rx_i_75 rx_i_490: ;FPADD - dec edi + dec ebp jz rx_finish xor r11, 015c7f598h mov ecx, r11d - call rx_read_dataset_f + call rx_readfloat_l2 addpd xmm0, xmm9 movaps xmm7, xmm0 rx_i_491: ;FPADD - dec edi + dec ebp jz rx_finish xor r8, 0902da6bdh mov ecx, r8d - call rx_read_dataset_f + call rx_readfloat_l2 addpd xmm0, xmm9 movaps xmm7, xmm0 mov eax, r15d @@ -6592,42 +6324,40 @@ rx_i_491: ;FPADD movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_492: ;OR_64 - dec edi + dec ebp jz rx_finish xor r9, 0491090d9h mov ecx, r9d - call rx_read_dataset_r + call rx_readint_l2 or rax, r9 mov r12, rax rx_i_493: ;FPSUB - dec edi + dec ebp jz rx_finish xor r8, 09de81282h - mov eax, r8d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l1 subpd xmm0, xmm9 movaps xmm4, xmm0 rx_i_494: ;MUL_32 - dec edi + dec ebp jz rx_finish xor r10, 0b0d50e46h mov ecx, r10d - call rx_read_dataset_r + call rx_readint_l2 mov ecx, eax mov eax, r11d imul rax, rcx mov r14, rax rx_i_495: ;FPMUL - dec edi + dec ebp jz rx_finish xor r11, 0e276cad1h - mov eax, r11d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readfloat_l1 mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6635,21 +6365,20 @@ rx_i_495: ;FPMUL movaps xmm8, xmm0 rx_i_496: ;OR_64 - dec edi + dec ebp jz rx_finish xor r14, 0fe757b73h mov ecx, r14d - call rx_read_dataset_r + call rx_readint_l1 or rax, -359802064 mov r9, rax rx_i_497: ;FPDIV - dec edi + dec ebp jz rx_finish xor r8, 08d25742eh - mov eax, r8d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l1 divpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6657,12 +6386,11 @@ rx_i_497: ;FPDIV movaps xmm8, xmm0 
rx_i_498: ;FPMUL - dec edi + dec ebp jz rx_finish xor r15, 0e066fd15h - mov eax, r15d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r15d + call rx_readfloat_l1 mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6674,24 +6402,22 @@ rx_i_498: ;FPMUL movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_499: ;IMUL_32 - dec edi + dec ebp jz rx_finish xor r12, 08925556bh - mov eax, r12d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r12d + call rx_readint_l2 movsxd rcx, eax mov rax, -1795485757 imul rax, rcx mov r8, rax rx_i_500: ;CALL - dec edi + dec ebp jz rx_finish xor r10, 04bc870ebh - mov eax, r10d - and eax, 32767 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r10d + call rx_readint_l1 cmp r13d, 1243939650 jl short taken_call_500 mov rcx, rax @@ -6705,11 +6431,11 @@ taken_call_500: call rx_i_511 rx_i_501: ;SHR_64 - dec edi + dec ebp jz rx_finish xor r8, 07d46c503h mov ecx, r8d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r10 shr rax, cl mov rcx, rax @@ -6719,12 +6445,12 @@ rx_i_501: ;SHR_64 mov qword ptr [rsi + rax * 8], rcx rx_i_502: ;RET - dec edi + dec ebp jz rx_finish xor r10, 09e70b20ch mov ecx, r10d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_502 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -6741,12 +6467,11 @@ not_taken_ret_502: mov qword ptr [rsi + rax * 8], rcx rx_i_503: ;FPSUB - dec edi + dec ebp jz rx_finish xor r13, 0442e4850h - mov eax, r13d - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readfloat_l1 subpd xmm0, xmm2 movaps xmm9, xmm0 mov eax, r9d @@ -6755,12 +6480,11 @@ rx_i_503: ;FPSUB movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_504: ;FPADD - dec edi + dec ebp jz rx_finish xor r13, 099d48347h - mov eax, r13d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r13d + call rx_readfloat_l1 addpd xmm0, xmm9 movaps xmm4, xmm0 mov eax, r12d @@ -6769,11 +6493,11 @@ rx_i_504: ;FPADD movhpd qword ptr 
[rsi + rax * 8], xmm4 rx_i_505: ;FPMUL - dec edi + dec ebp jz rx_finish xor r12, 032c0a28ah mov ecx, r12d - call rx_read_dataset_f + call rx_readfloat_l2 mulpd xmm0, xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6785,11 +6509,11 @@ rx_i_505: ;FPMUL movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_506: ;FPMUL - dec edi + dec ebp jz rx_finish xor r9, 0a973d58ch mov ecx, r9d - call rx_read_dataset_f + call rx_readfloat_l1 mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6797,13 +6521,12 @@ rx_i_506: ;FPMUL movaps xmm3, xmm0 rx_i_507: ;RET - dec edi + dec ebp jz rx_finish xor r10, 0d3b7165ch - mov eax, r10d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] - cmp rsp, rbp + mov ecx, r10d + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_507 xor rax, qword ptr [rsp + 8] mov r14, rax @@ -6812,12 +6535,12 @@ not_taken_ret_507: mov r14, rax rx_i_508: ;RET - dec edi + dec ebp jz rx_finish xor r13, 0da34d818h mov ecx, r13d - call rx_read_dataset_r - cmp rsp, rbp + call rx_readint_l1 + cmp rsp, rbx je short not_taken_ret_508 xor rax, qword ptr [rsp + 8] mov r8, rax @@ -6826,12 +6549,11 @@ not_taken_ret_508: mov r8, rax rx_i_509: ;CALL - dec edi + dec ebp jz rx_finish xor r11, 01b2873f2h - mov eax, r11d - and eax, 2047 - mov rax, qword ptr [rsi + rax * 8] + mov ecx, r11d + call rx_readint_l1 cmp r8d, 1826115244 jno short taken_call_509 mov r10, rax @@ -6841,21 +6563,20 @@ taken_call_509: call rx_i_42 rx_i_510: ;FPSUB - dec edi + dec ebp jz rx_finish xor r8, 0db65513ch - mov eax, r8d - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi + rax * 8] + mov ecx, r8d + call rx_readfloat_l1 subpd xmm0, xmm2 movaps xmm9, xmm0 rx_i_511: ;ROL_64 - dec edi + dec ebp jz rx_finish xor r11, 02bd79286h mov ecx, r11d - call rx_read_dataset_r + call rx_readint_l1 mov rcx, r10 rol rax, cl mov r11, rax diff --git a/src/virtualMemory.cpp b/src/virtualMemory.cpp new file mode 100644 index 0000000..766fda3 --- /dev/null +++ b/src/virtualMemory.cpp @@ -0,0 +1,108 @@ +/* +Copyright (c) 2018 
tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "virtualMemory.hpp" + +#include <stdexcept> + +#ifdef _WIN32 +#include <windows.h> +#else +#ifdef __APPLE__ +#include <mach/vm_statistics.h> +#endif +#include <sys/types.h> +#include <sys/mman.h> +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif +#endif +/* getErrorMessage: returns "<function>: <system message text for GetLastError()>". */ +#ifdef _WIN32 +std::string getErrorMessage(const char* function) { + LPSTR messageBuffer = nullptr; + size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); + std::string message(messageBuffer, size); + LocalFree(messageBuffer); + return std::string(function) + std::string(": ") + message; +} +/* setPrivilege: enables (bEnable != 0) or disables the named privilege on the current process token; throws std::runtime_error when a Win32 call fails. The ANSI LookupPrivilegeValueA is called explicitly because pszPrivilege is a narrow string. */ +void setPrivilege(const char* pszPrivilege, BOOL bEnable) { + HANDLE hToken; + TOKEN_PRIVILEGES tp; + BOOL status; + DWORD error; + + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken)) + throw std::runtime_error(getErrorMessage("OpenProcessToken")); + + if (!LookupPrivilegeValueA(NULL, pszPrivilege, &tp.Privileges[0].Luid)) + throw std::runtime_error(getErrorMessage("LookupPrivilegeValue")); + + tp.PrivilegeCount = 1; + + if (bEnable) + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + else + tp.Privileges[0].Attributes = 0; + + status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); + + error = GetLastError(); + if (!status
|| (error != ERROR_SUCCESS)) + throw std::runtime_error(getErrorMessage("AdjustTokenPrivileges")); + + if (!CloseHandle(hToken)) + throw std::runtime_error(getErrorMessage("CloseHandle")); +} +#endif +/* allocExecutableMemory: returns `bytes` of newly mapped read/write/execute memory (VirtualAlloc on Windows, anonymous private mmap elsewhere); throws std::runtime_error if the mapping fails. The mmap length must be the `bytes` argument so both platforms honour the requested size. */ +void* allocExecutableMemory(size_t bytes) { + void* mem; +#ifdef _WIN32 + mem = VirtualAlloc(nullptr, bytes, MEM_COMMIT, PAGE_EXECUTE_READWRITE); + if (mem == nullptr) + throw std::runtime_error(getErrorMessage("allocExecutableMemory - VirtualAlloc")); +#else + mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (mem == MAP_FAILED) + throw std::runtime_error("allocExecutableMemory - mmap failed"); +#endif + return mem; +} +/* allocLargePagesMemory: returns `bytes` of read/write memory requested with large pages (MEM_LARGE_PAGES after enabling SeLockMemoryPrivilege on Windows, VM_FLAGS_SUPERPAGE_SIZE_2MB on Apple, MAP_HUGETLB | MAP_POPULATE otherwise); throws std::runtime_error if the allocation fails. */ +void* allocLargePagesMemory(size_t bytes) { + void* mem; +#ifdef _WIN32 + setPrivilege("SeLockMemoryPrivilege", 1); + mem = VirtualAlloc(NULL, bytes, MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE); + if (mem == nullptr) + throw std::runtime_error(getErrorMessage("allocLargePagesMemory - VirtualAlloc")); +#else +#ifdef __APPLE__ + mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); +#else + mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0); +#endif + if (mem == MAP_FAILED) + throw std::runtime_error("allocLargePagesMemory - mmap failed"); +#endif + return mem; +} \ No newline at end of file diff --git a/src/virtualMemory.hpp b/src/virtualMemory.hpp new file mode 100644 index 0000000..dd150d3 --- /dev/null +++ b/src/virtualMemory.hpp @@ -0,0 +1,23 @@ +/* +Copyright (c) 2018 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version.
+ +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +void* allocExecutableMemory(size_t); +void* allocLargePagesMemory(size_t); \ No newline at end of file From 4189e4ebc6821fe3fb40282f10c165e9cb87d427 Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 6 Jan 2019 17:23:05 +0100 Subject: [PATCH 02/35] Original number of VM instructions --- src/executeProgram-win64.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 1e7e7a4..99941b1 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -88,7 +88,7 @@ executeProgram PROC mov rsi, r8 ; convertible_t* scratchpad mov rbx, rsp ; beginning of VM stack - mov ebp, 524289 ; number of VM instructions to execute + 1 + mov ebp, 1048577 ; number of VM instructions to execute + 1 xorps xmm10, xmm10 cmpeqpd xmm10, xmm10 From 6519fed4d18d1819a6bf06d034f416a50b31f2e1 Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 6 Jan 2019 21:26:53 +0100 Subject: [PATCH 03/35] Combined prefetch + read into a single step --- src/CompiledVirtualMachine.hpp | 11 ++++++++++ src/executeProgram-win64.asm | 37 ++++++++++++++++------------------ 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp index 0932cfe..cf131d1 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -19,13 +19,24 @@ along with RandomX. If not, see <http://www.gnu.org/licenses/>.
#pragma once //#define TRACEVM +#include <new> #include "VirtualMachine.hpp" #include "JitCompilerX86.hpp" +#include "intrinPortable.h" namespace RandomX { class CompiledVirtualMachine : public VirtualMachine { public: + void* operator new(size_t size) { + void* ptr = _mm_malloc(size, 64); + if (ptr == nullptr) + throw std::bad_alloc(); + return ptr; + } + void operator delete(void* ptr) { + _mm_free(ptr); + } CompiledVirtualMachine(bool softAes); void setDataset(dataset_t ds, bool light = false) override; void initializeProgram(const void* seed) override; diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 99941b1..3bc161e 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -223,17 +223,12 @@ ReadMemoryRandom MACRO spmask, float ;# GLOBAL rbp = "ic" number of instructions until the end of the program ;# GLOBAL rbx = address of the dataset address ;# GLOBAL rsi = address of the scratchpad -;# GLOBAL rdi = "mx" random 32-bit dataset address +;# GLOBAL rdi = low 32 bits = "mx", high 32 bits = "ma" ;# MODIFY rcx, rdx -LOCAL L_prefetch, L_read, L_return - mov eax, ebp - and al, 63 - jz short L_prefetch ;# "ic" divisible by 64 -> prefetch - xor edx, edx - cmp al, 14 - je short L_read ;# "ic" = 14 (mod 64) -> random read - cmovb edx, ecx ;# "ic" < 14 (mod 64) -> modify random read address - xor edi, edx +LOCAL L_prefetch_read, L_return + test ebp, 63 + jz short L_prefetch_read ;# "ic" divisible by 64 -> prefetch + read + xor rdi, rcx ;# randomize "mx" L_return: and ecx, spmask ;# limit address to the specified scratchpad size IF float @@ -242,12 +237,15 @@ ELSE mov rax, qword ptr [rsi+rcx*8] ENDIF ret -L_prefetch: +L_prefetch_read: + ; prefetch cacheline "mx" mov rax, qword ptr [rbx] ;# load the dataset address - and edi, -64 ;# align "mx" to the start of a cache line - prefetchnta byte ptr [rax+rdi] - jmp short L_return -L_read: + and rdi, -64 ;# align "mx" to the start of a cache line + mov edx, edi ;# edx = mx +
prefetchnta byte ptr [rax+rdx] + ; read cacheline "ma" + ror rdi, 32 ;# swap "ma" and "mx" + mov edx, edi ;# edx = ma push rcx TransformAddress ecx, rcx ;# TransformAddress function and ecx, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8 @@ -274,14 +272,13 @@ ReadMemoryRandom 32767, 1 ALIGN 64 rx_read_dataset: -;# IN rcx = scratchpad index - must be divisible by 8 -;# GLOBAL rbx = address of the dataset address +;# IN rax = dataset address +;# IN ecx = scratchpad index - must be divisible by 8 +;# IN edx = dataset index - must be divisible by 64 ;# GLOBAL rsi = address of the scratchpad -;# GLOBAL rdi = "mx" random 32-bit dataset address ;# MODIFY rax, rcx, rdx - mov rax, qword ptr [rbx] ;# load the dataset address lea rcx, [rsi+rcx*8] ;# scratchpad cache line - lea rax, [rax+rdi] ;# dataset cache line + lea rax, [rax+rdx] ;# dataset cache line mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now) xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline mov rdx, qword ptr [rax+8] From 2f6a599ff6327934382cfb6af77f3d8d69277781 Mon Sep 17 00:00:00 2001 From: tevador Date: Mon, 7 Jan 2019 17:44:43 +0100 Subject: [PATCH 04/35] Inlined calls for memory read --- src/AssemblyGeneratorX86.cpp | 98 +- src/AssemblyGeneratorX86.hpp | 4 +- src/common.hpp | 2 +- src/executeProgram-win64.asm | 56 +- src/program.inc | 4096 +++++++++++++++++++++++++++++----- 5 files changed, 3658 insertions(+), 598 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 5c3f9a2..8a11ac3 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -59,37 +59,55 @@ namespace RandomX { (this->*generator)(instr, i); } - void AssemblyGeneratorX86::genar(Instruction& instr) { + void AssemblyGeneratorX86::genar(Instruction& instr, int i) { asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << 
instr.addra << "h" << std::dec << std::endl; asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; + asmCode << "\ttest ebp, 63" << std::endl; + asmCode << "\tjnz short rx_body_" << i << std::endl; switch (instr.loca & 3) { case 0: case 1: case 2: - asmCode << "\tcall rx_readint_l1" << std::endl; - return; + asmCode << "\tcall rx_read_l1" << std::endl; + asmCode << "rx_body_" << i << ":" << std::endl; + asmCode << "\txor rdi, rcx" << std::endl; + asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl; + break; default: //3 - asmCode << "\tcall rx_readint_l2" << std::endl; - return; + asmCode << "\tcall rx_read_l2" << std::endl; + asmCode << "rx_body_" << i << ":" << std::endl; + asmCode << "\txor rdi, rcx" << std::endl; + asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl; + break; } + asmCode << "\tmov rax, qword ptr [rsi+rcx*8]" << std::endl; } - void AssemblyGeneratorX86::genaf(Instruction& instr) { + void AssemblyGeneratorX86::genaf(Instruction& instr, int i) { asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; + asmCode << "\ttest ebp, 63" << std::endl; + asmCode << "\tjnz short rx_body_" << i << std::endl; switch (instr.loca & 3) { case 0: case 1: case 2: - asmCode << "\tcall rx_readfloat_l1" << std::endl; - return; + asmCode << "\tcall rx_read_l1" << std::endl; + asmCode << "rx_body_" << i << ":" << std::endl; + asmCode << "\txor rdi, rcx" << std::endl; + asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl; + break; default: //3 - asmCode << "\tcall rx_readfloat_l2" << std::endl; - return; + asmCode << "\tcall rx_read_l2" << std::endl; + asmCode << "rx_body_" << i << ":" << std::endl; + asmCode << "\txor rdi, rcx" << std::endl; + asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl; + break; } + asmCode << "\tcvtdq2pd xmm0, qword ptr [rsi+rcx*8]" << 
std::endl; } void AssemblyGeneratorX86::genbr0(Instruction& instr, const char* instrx86) { @@ -209,35 +227,35 @@ namespace RandomX { } void AssemblyGeneratorX86::h_ADD_64(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tadd rax, "; genbr1(instr); gencr(instr); } void AssemblyGeneratorX86::h_ADD_32(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tadd eax, "; genbr132(instr); gencr(instr); } void AssemblyGeneratorX86::h_SUB_64(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tsub rax, "; genbr1(instr); gencr(instr); } void AssemblyGeneratorX86::h_SUB_32(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tsub eax, "; genbr132(instr); gencr(instr); } void AssemblyGeneratorX86::h_MUL_64(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\timul rax, "; if ((instr.locb & 7) >= 6) { asmCode << "rax, "; @@ -247,7 +265,7 @@ namespace RandomX { } void AssemblyGeneratorX86::h_MULH_64(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tmov rcx, "; genbr1(instr); asmCode << "\tmul rcx" << std::endl; @@ -256,7 +274,7 @@ namespace RandomX { } void AssemblyGeneratorX86::h_MUL_32(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tmov ecx, eax" << std::endl; asmCode << "\tmov eax, "; genbr132(instr); @@ -265,7 +283,7 @@ namespace RandomX { } void AssemblyGeneratorX86::h_IMUL_32(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tmovsxd rcx, eax" << std::endl; if ((instr.locb & 7) >= 6) { asmCode << "\tmov rax, " << instr.imm32 << std::endl; @@ -278,7 +296,7 @@ namespace RandomX { } void AssemblyGeneratorX86::h_IMULH_64(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tmov rcx, "; genbr1(instr); asmCode << "\timul rcx" << std::endl; @@ -287,7 +305,7 @@ namespace RandomX { } void AssemblyGeneratorX86::h_DIV_64(Instruction& instr, int i) { - 
genar(instr); + genar(instr, i); if ((instr.locb & 7) >= 6) { if (instr.imm32 == 0) { asmCode << "\tmov ecx, 1" << std::endl; @@ -308,7 +326,7 @@ namespace RandomX { } void AssemblyGeneratorX86::h_IDIV_64(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tmov edx, "; genbr132(instr); asmCode << "\tcmp edx, -1" << std::endl; @@ -329,91 +347,91 @@ namespace RandomX { } void AssemblyGeneratorX86::h_AND_64(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tand rax, "; genbr1(instr); gencr(instr); } void AssemblyGeneratorX86::h_AND_32(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tand eax, "; genbr132(instr); gencr(instr); } void AssemblyGeneratorX86::h_OR_64(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tor rax, "; genbr1(instr); gencr(instr); } void AssemblyGeneratorX86::h_OR_32(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tor eax, "; genbr132(instr); gencr(instr); } void AssemblyGeneratorX86::h_XOR_64(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\txor rax, "; genbr1(instr); gencr(instr); } void AssemblyGeneratorX86::h_XOR_32(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\txor eax, "; genbr132(instr); gencr(instr); } void AssemblyGeneratorX86::h_SHL_64(Instruction& instr, int i) { - genar(instr); + genar(instr, i); genbr0(instr, "shl"); gencr(instr); } void AssemblyGeneratorX86::h_SHR_64(Instruction& instr, int i) { - genar(instr); + genar(instr, i); genbr0(instr, "shr"); gencr(instr); } void AssemblyGeneratorX86::h_SAR_64(Instruction& instr, int i) { - genar(instr); + genar(instr, i); genbr0(instr, "sar"); gencr(instr); } void AssemblyGeneratorX86::h_ROL_64(Instruction& instr, int i) { - genar(instr); + genar(instr, i); genbr0(instr, "rol"); gencr(instr); } void AssemblyGeneratorX86::h_ROR_64(Instruction& instr, int i) { - genar(instr); + genar(instr, i); 
genbr0(instr, "ror"); gencr(instr); } void AssemblyGeneratorX86::h_FPADD(Instruction& instr, int i) { - genaf(instr); + genaf(instr, i); genbf(instr, "addpd"); gencf(instr); } void AssemblyGeneratorX86::h_FPSUB(Instruction& instr, int i) { - genaf(instr); + genaf(instr, i); genbf(instr, "subpd"); gencf(instr); } void AssemblyGeneratorX86::h_FPMUL(Instruction& instr, int i) { - genaf(instr); + genaf(instr, i); genbf(instr, "mulpd"); asmCode << "\tmovaps xmm1, xmm0" << std::endl; asmCode << "\tcmpeqpd xmm1, xmm1" << std::endl; @@ -422,7 +440,7 @@ namespace RandomX { } void AssemblyGeneratorX86::h_FPDIV(Instruction& instr, int i) { - genaf(instr); + genaf(instr, i); genbf(instr, "divpd"); asmCode << "\tmovaps xmm1, xmm0" << std::endl; asmCode << "\tcmpeqpd xmm1, xmm1" << std::endl; @@ -431,14 +449,14 @@ namespace RandomX { } void AssemblyGeneratorX86::h_FPSQRT(Instruction& instr, int i) { - genaf(instr); + genaf(instr, i); asmCode << "\tandps xmm0, xmm10" << std::endl; asmCode << "\tsqrtpd xmm0, xmm0" << std::endl; gencf(instr); } void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) { - genar(instr); + genar(instr, i); //asmCode << "\tmov rcx, rax" << std::endl; asmCode << "\tshl eax, 13" << std::endl; //asmCode << "\tand rcx, -2048" << std::endl; @@ -472,7 +490,7 @@ namespace RandomX { } void AssemblyGeneratorX86::h_CALL(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm32 << std::endl; asmCode << "\t" << jumpCondition(instr); asmCode << " short taken_call_" << i << std::endl; @@ -487,7 +505,7 @@ namespace RandomX { } void AssemblyGeneratorX86::h_RET(Instruction& instr, int i) { - genar(instr); + genar(instr, i); asmCode << "\tcmp rsp, " << regStackBeginAddr << std::endl; asmCode << "\tje short not_taken_ret_" << i << std::endl; asmCode << "\txor rax, qword ptr [rsp + 8]" << std::endl; diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 
bdcbcec..92c7d31 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -38,8 +38,8 @@ namespace RandomX { static InstructionGenerator engine[256]; std::stringstream asmCode; - void genar(Instruction&); - void genaf(Instruction&); + void genar(Instruction&, int); + void genaf(Instruction&, int); void genbr0(Instruction&, const char*); void genbr1(Instruction&); void genbr132(Instruction&); diff --git a/src/common.hpp b/src/common.hpp index 0bfc834..12b74c1 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -98,7 +98,7 @@ namespace RandomX { }; struct MemoryRegisters { - addr_t ma, mx; + addr_t mx, ma; dataset_t ds; }; diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 3bc161e..05434f2 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -82,7 +82,7 @@ executeProgram PROC ; function arguments push rcx ; RegisterFile& registerFile - mov edi, dword ptr [rdx] ; "mx" + mov rdi, qword ptr [rdx] ; "mx", "ma" mov rax, qword ptr [rdx+8] ; uint8_t* dataset push rax mov rsi, r8 ; convertible_t* scratchpad @@ -216,7 +216,7 @@ TransformAddress MACRO reg32, reg64 ;xor reg32, -8 ;# C = all except 0 to 7 ENDM -ReadMemoryRandom MACRO spmask, float +ReadMemoryRandom MACRO spmask ;# IN ecx = random 32-bit address ;# OUT rax = 64-bit integer return value ;# OUT xmm0 = 128-bit floating point return value @@ -225,19 +225,6 @@ ReadMemoryRandom MACRO spmask, float ;# GLOBAL rsi = address of the scratchpad ;# GLOBAL rdi = low 32 bits = "mx", high 32 bits = "ma" ;# MODIFY rcx, rdx -LOCAL L_prefetch_read, L_return - test ebp, 63 - jz short L_prefetch_read ;# "ic" divisible by 64 -> prefetch + read - xor rdi, rcx ;# randomize "mx" -L_return: - and ecx, spmask ;# limit address to the specified scratchpad size -IF float - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] -ELSE - mov rax, qword ptr [rsi+rcx*8] -ENDIF - ret -L_prefetch_read: ; prefetch cacheline "mx" mov rax, qword ptr [rbx] ;# load the dataset address and rdi, 
-64 ;# align "mx" to the start of a cache line @@ -249,34 +236,6 @@ L_prefetch_read: push rcx TransformAddress ecx, rcx ;# TransformAddress function and ecx, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8 - call rx_read_dataset - pop rcx - jmp short L_return -ENDM - -ALIGN 64 -rx_readint_l1: -ReadMemoryRandom 2047, 0 - -ALIGN 64 -rx_readint_l2: -ReadMemoryRandom 32767, 0 - -ALIGN 64 -rx_readfloat_l1: -ReadMemoryRandom 2047, 1 - -ALIGN 64 -rx_readfloat_l2: -ReadMemoryRandom 32767, 1 - -ALIGN 64 -rx_read_dataset: -;# IN rax = dataset address -;# IN ecx = scratchpad index - must be divisible by 8 -;# IN edx = dataset index - must be divisible by 64 -;# GLOBAL rsi = address of the scratchpad -;# MODIFY rax, rcx, rdx lea rcx, [rsi+rcx*8] ;# scratchpad cache line lea rax, [rax+rdx] ;# dataset cache line mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now) @@ -295,7 +254,18 @@ rx_read_dataset: xor qword ptr [rcx+48], rdx mov rdx, qword ptr [rax+56] xor qword ptr [rcx+56], rdx + pop rcx ret +ENDM + +ALIGN 64 +rx_read_l1: +ReadMemoryRandom 2047 + +ALIGN 64 +rx_read_l2: +ReadMemoryRandom 32767 + executeProgram ENDP _RANDOMX_EXECUTE_PROGRAM ENDS diff --git a/src/program.inc b/src/program.inc index 8450044..a551edb 100644 --- a/src/program.inc +++ b/src/program.inc @@ -3,7 +3,13 @@ rx_i_0: ;RET jz rx_finish xor r9, 0ca9788ah mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_0 + call rx_read_l1 +rx_body_0: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_0 xor rax, qword ptr [rsp + 8] @@ -25,7 +31,13 @@ rx_i_1: ;AND_64 jz rx_finish xor r15, 06afc2fa4h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_1 + call rx_read_l1 +rx_body_1: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] and rax, r10 mov r12, rax @@ -34,7 +46,13 @@ rx_i_2: ;CALL jz rx_finish xor r15, 097210f7bh mov ecx, r15d - call 
rx_readint_l1 + test ebp, 63 + jnz short rx_body_2 + call rx_read_l1 +rx_body_2: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r11d, 1348521207 jno short taken_call_2 mov rcx, rax @@ -52,7 +70,13 @@ rx_i_3: ;FPROUND jz rx_finish xor r13, 082c73195h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_3 + call rx_read_l1 +rx_body_3: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] shl eax, 13 and eax, 24576 or eax, 40896 @@ -64,7 +88,13 @@ rx_i_4: ;MULH_64 jz rx_finish xor r14, 077daefb4h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_4 + call rx_read_l1 +rx_body_4: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 mul rcx mov rax, rdx @@ -79,7 +109,13 @@ rx_i_5: ;IMUL_32 jz rx_finish xor r15, 0379f9ee0h mov ecx, r15d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_5 + call rx_read_l2 +rx_body_5: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r12d imul rax, rcx @@ -90,7 +126,13 @@ rx_i_6: ;MUL_64 jz rx_finish xor r8, 03bae7272h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_6 + call rx_read_l1 +rx_body_6: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov rcx, rax mov eax, r9d @@ -103,7 +145,13 @@ rx_i_7: ;FPADD jz rx_finish xor r10, 0e264ed81h mov ecx, r10d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_7 + call rx_read_l1 +rx_body_7: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm6, xmm0 mov eax, r14d @@ -116,7 +164,13 @@ rx_i_8: ;SHL_64 jz rx_finish xor r13, 068c1e5d2h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_8 + call rx_read_l1 +rx_body_8: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] shl rax, 47 mov rcx, rax mov eax, r12d @@ -129,7 +183,13 @@ rx_i_9: ;AND_64 jz rx_finish xor r14, 085121c54h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz 
short rx_body_9 + call rx_read_l1 +rx_body_9: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] and rax, 565870810 mov r10, rax @@ -138,7 +198,13 @@ rx_i_10: ;OR_64 jz rx_finish xor r8, 052efde3eh mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_10 + call rx_read_l1 +rx_body_10: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] or rax, -727859809 mov r13, rax @@ -147,7 +213,13 @@ rx_i_11: ;FPADD jz rx_finish xor r10, 0a9bf8aa1h mov ecx, r10d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_11 + call rx_read_l2 +rx_body_11: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm4, xmm0 mov eax, r12d @@ -160,7 +232,13 @@ rx_i_12: ;CALL jz rx_finish xor r10, 0db2691ch mov ecx, r10d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_12 + call rx_read_l2 +rx_body_12: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp r8d, -1763940407 jge short taken_call_12 mov r8, rax @@ -174,7 +252,13 @@ rx_i_13: ;FPSUB jz rx_finish xor r12, 061c0d34dh mov ecx, r12d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_13 + call rx_read_l1 +rx_body_13: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 movaps xmm9, xmm0 @@ -183,7 +267,13 @@ rx_i_14: ;SHR_64 jz rx_finish xor r10, 0e761d1beh mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_14 + call rx_read_l1 +rx_body_14: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] shr rax, 4 mov rcx, rax mov eax, r10d @@ -196,7 +286,13 @@ rx_i_15: ;RET jz rx_finish xor r11, 074ddb688h mov ecx, r11d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_15 + call rx_read_l2 +rx_body_15: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_15 xor rax, qword ptr [rsp + 8] @@ -218,7 +314,13 @@ rx_i_16: ;ADD_64 jz rx_finish xor r14, 06be90627h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short 
rx_body_16 + call rx_read_l1 +rx_body_16: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, r10 mov rcx, rax mov eax, r9d @@ -231,7 +333,13 @@ rx_i_17: ;FPMUL jz rx_finish xor r11, 0fbc6fc35h mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_17 + call rx_read_l1 +rx_body_17: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -247,7 +355,13 @@ rx_i_18: ;FPSUB jz rx_finish xor r14, 0c28ca080h mov ecx, r14d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_18 + call rx_read_l1 +rx_body_18: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm4 movaps xmm3, xmm0 mov eax, r11d @@ -260,7 +374,13 @@ rx_i_19: ;FPSUB jz rx_finish xor r13, 0ac009c30h mov ecx, r13d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_19 + call rx_read_l1 +rx_body_19: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm8 movaps xmm7, xmm0 @@ -269,7 +389,13 @@ rx_i_20: ;FPMUL jz rx_finish xor r13, 0ecca967dh mov ecx, r13d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_20 + call rx_read_l1 +rx_body_20: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -285,7 +411,13 @@ rx_i_21: ;FPADD jz rx_finish xor r8, 0977f0284h mov ecx, r8d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_21 + call rx_read_l2 +rx_body_21: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm7, xmm0 @@ -294,7 +426,13 @@ rx_i_22: ;ADD_32 jz rx_finish xor r13, 080bdfefah mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_22 + call rx_read_l1 +rx_body_22: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add eax, r8d mov rcx, rax mov eax, r10d @@ -307,7 +445,13 @@ rx_i_23: ;MUL_64 jz rx_finish xor r15, 0e1e0d3c4h mov ecx, r15d - call rx_readint_l1 + test ebp, 
63 + jnz short rx_body_23 + call rx_read_l1 +rx_body_23: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r11 mov r8, rax @@ -316,7 +460,13 @@ rx_i_24: ;IMULH_64 jz rx_finish xor r8, 070d3b8c7h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_24 + call rx_read_l1 +rx_body_24: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 imul rcx mov rax, rdx @@ -331,7 +481,13 @@ rx_i_25: ;FPMUL jz rx_finish xor r12, 01cf77a04h mov ecx, r12d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_25 + call rx_read_l2 +rx_body_25: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -347,7 +503,13 @@ rx_i_26: ;IMUL_32 jz rx_finish xor r11, 0e311468ch mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_26 + call rx_read_l1 +rx_body_26: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r13d imul rax, rcx @@ -362,7 +524,13 @@ rx_i_27: ;FPMUL jz rx_finish xor r12, 01fd9911ah mov ecx, r12d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_27 + call rx_read_l2 +rx_body_27: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -374,7 +542,13 @@ rx_i_28: ;XOR_64 jz rx_finish xor r13, 067df757eh mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_28 + call rx_read_l1 +rx_body_28: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] xor rax, r13 mov r14, rax @@ -383,7 +557,13 @@ rx_i_29: ;SUB_64 jz rx_finish xor r12, 0be2e7c42h mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_29 + call rx_read_l2 +rx_body_29: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] sub rax, 1944166515 mov r14, rax @@ -392,7 +572,13 @@ rx_i_30: ;FPADD jz rx_finish xor r11, 084d067f7h mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_30 + call 
rx_read_l1 +rx_body_30: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm7, xmm0 @@ -401,7 +587,13 @@ rx_i_31: ;FPADD jz rx_finish xor r14, 0d352ce37h mov ecx, r14d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_31 + call rx_read_l2 +rx_body_31: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm6, xmm0 mov eax, r14d @@ -414,7 +606,13 @@ rx_i_32: ;XOR_64 jz rx_finish xor r12, 0a1f248dah mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_32 + call rx_read_l1 +rx_body_32: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] xor rax, -1936869641 mov r9, rax @@ -423,7 +621,13 @@ rx_i_33: ;MULH_64 jz rx_finish xor r9, 0554720fch mov ecx, r9d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_33 + call rx_read_l2 +rx_body_33: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 mul rcx mov rax, rdx @@ -434,7 +638,13 @@ rx_i_34: ;CALL jz rx_finish xor r13, 0665e91f1h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_34 + call rx_read_l1 +rx_body_34: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r14d, -380224718 js short taken_call_34 mov r15, rax @@ -448,7 +658,13 @@ rx_i_35: ;RET jz rx_finish xor r15, 05ef1be79h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_35 + call rx_read_l1 +rx_body_35: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_35 xor rax, qword ptr [rsp + 8] @@ -462,7 +678,13 @@ rx_i_36: ;FPMUL jz rx_finish xor r8, 012ec7e3ah mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_36 + call rx_read_l1 +rx_body_36: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -474,7 +696,13 @@ rx_i_37: ;FPMUL jz rx_finish xor r12, 0d0706601h mov ecx, r12d - call rx_readfloat_l1 + test ebp, 63 + jnz short 
rx_body_37 + call rx_read_l1 +rx_body_37: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -490,7 +718,13 @@ rx_i_38: ;SUB_64 jz rx_finish xor r9, 064056913h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_38 + call rx_read_l1 +rx_body_38: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, r14 mov r10, rax @@ -499,7 +733,13 @@ rx_i_39: ;ADD_32 jz rx_finish xor r14, 02c1f1eb0h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_39 + call rx_read_l1 +rx_body_39: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add eax, r14d mov r14, rax @@ -508,7 +748,13 @@ rx_i_40: ;RET jz rx_finish xor r10, 068fd9009h mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_40 + call rx_read_l1 +rx_body_40: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_40 xor rax, qword ptr [rsp + 8] @@ -530,7 +776,13 @@ rx_i_41: ;CALL jz rx_finish xor r9, 037a30933h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_41 + call rx_read_l1 +rx_body_41: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r14d, -1070581824 jo short taken_call_41 mov r9, rax @@ -544,7 +796,13 @@ rx_i_42: ;FPSUB jz rx_finish xor r15, 0bc1de9f6h mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_42 + call rx_read_l1 +rx_body_42: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm6 movaps xmm6, xmm0 @@ -553,7 +811,13 @@ rx_i_43: ;SUB_64 jz rx_finish xor r12, 02b2a2eech mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_43 + call rx_read_l1 +rx_body_43: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, 1693705407 mov rcx, rax mov eax, r11d @@ -566,7 +830,13 @@ rx_i_44: ;ROL_64 jz rx_finish xor r11, 0685817abh mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short 
rx_body_44 + call rx_read_l1 +rx_body_44: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 rol rax, cl mov r15, rax @@ -576,7 +846,13 @@ rx_i_45: ;FPSUB jz rx_finish xor r12, 08cd244ebh mov ecx, r12d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_45 + call rx_read_l2 +rx_body_45: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm5, xmm0 @@ -585,7 +861,13 @@ rx_i_46: ;ADD_64 jz rx_finish xor r8, 06d8f4254h mov ecx, r8d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_46 + call rx_read_l2 +rx_body_46: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] add rax, r9 mov rcx, rax mov eax, r8d @@ -598,7 +880,13 @@ rx_i_47: ;CALL jz rx_finish xor r12, 05ba232c6h mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_47 + call rx_read_l2 +rx_body_47: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp r10d, 119251505 jbe short taken_call_47 mov rcx, rax @@ -616,7 +904,13 @@ rx_i_48: ;FPSQRT jz rx_finish xor r8, 0aaed618fh mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_48 + call rx_read_l1 +rx_body_48: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm9, xmm0 @@ -630,7 +924,13 @@ rx_i_49: ;FPMUL jz rx_finish xor r8, 0f96c6a45h mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_49 + call rx_read_l1 +rx_body_49: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -642,7 +942,13 @@ rx_i_50: ;OR_32 jz rx_finish xor r9, 0da3e4842h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_50 + call rx_read_l1 +rx_body_50: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] or eax, r10d mov rcx, rax mov eax, r15d @@ -655,7 +961,13 @@ rx_i_51: ;SUB_64 jz rx_finish xor r10, 0302b676ah mov ecx, r10d - call rx_readint_l2 + test ebp, 63 + jnz 
short rx_body_51 + call rx_read_l2 +rx_body_51: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] sub rax, 419241919 mov r15, rax @@ -664,7 +976,13 @@ rx_i_52: ;CALL jz rx_finish xor r11, 0fa88f48bh mov ecx, r11d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_52 + call rx_read_l2 +rx_body_52: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp r13d, -534426193 js short taken_call_52 mov rcx, rax @@ -682,7 +1000,13 @@ rx_i_53: ;RET jz rx_finish xor r13, 03dff9b9eh mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_53 + call rx_read_l1 +rx_body_53: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_53 xor rax, qword ptr [rsp + 8] @@ -696,7 +1020,13 @@ rx_i_54: ;IMULH_64 jz rx_finish xor r11, 060638de0h mov ecx, r11d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_54 + call rx_read_l2 +rx_body_54: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov rcx, 282209221 imul rcx mov rax, rdx @@ -711,7 +1041,13 @@ rx_i_55: ;FPMUL jz rx_finish xor r10, 0dda983d4h mov ecx, r10d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_55 + call rx_read_l1 +rx_body_55: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -727,7 +1063,13 @@ rx_i_56: ;AND_64 jz rx_finish xor r14, 0f1456b8eh mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_56 + call rx_read_l1 +rx_body_56: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] and rax, r15 mov rcx, rax mov eax, r8d @@ -740,7 +1082,13 @@ rx_i_57: ;MUL_64 jz rx_finish xor r9, 010dc4571h mov ecx, r9d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_57 + call rx_read_l2 +rx_body_57: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] imul rax, r14 mov rcx, rax mov eax, r15d @@ -753,7 +1101,13 @@ rx_i_58: ;IDIV_64 jz rx_finish xor r14, 0bcec0ebah mov ecx, r14d - call rx_readint_l2 + 
test ebp, 63 + jnz short rx_body_58 + call rx_read_l2 +rx_body_58: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov edx, r13d cmp edx, -1 jne short safe_idiv_58 @@ -776,7 +1130,13 @@ rx_i_59: ;FPSUB jz rx_finish xor r11, 0980dd402h mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_59 + call rx_read_l1 +rx_body_59: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm8 movaps xmm7, xmm0 @@ -785,7 +1145,13 @@ rx_i_60: ;RET jz rx_finish xor r15, 03de14d1eh mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_60 + call rx_read_l1 +rx_body_60: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_60 xor rax, qword ptr [rsp + 8] @@ -807,7 +1173,13 @@ rx_i_61: ;CALL jz rx_finish xor r13, 05058ce64h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_61 + call rx_read_l1 +rx_body_61: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r15d, 1933164545 jns short taken_call_61 mov r11, rax @@ -821,7 +1193,13 @@ rx_i_62: ;FPMUL jz rx_finish xor r15, 0c3089414h mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_62 + call rx_read_l1 +rx_body_62: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -837,7 +1215,13 @@ rx_i_63: ;FPMUL jz rx_finish xor r9, 065cf272eh mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_63 + call rx_read_l1 +rx_body_63: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm7 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -849,7 +1233,13 @@ rx_i_64: ;SUB_64 jz rx_finish xor r13, 0ae54dfbfh mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_64 + call rx_read_l1 +rx_body_64: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, r15 mov r9, rax @@ -858,7 +1248,13 @@ rx_i_65: ;CALL jz rx_finish xor r13, 07b366ce6h 
mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_65 + call rx_read_l1 +rx_body_65: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r8d, 1498056607 js short taken_call_65 mov r11, rax @@ -872,7 +1268,13 @@ rx_i_66: ;FPSQRT jz rx_finish xor r15, 015a1b689h mov ecx, r15d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_66 + call rx_read_l2 +rx_body_66: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm9, xmm0 @@ -886,7 +1288,13 @@ rx_i_67: ;CALL jz rx_finish xor r14, 088393ba0h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_67 + call rx_read_l1 +rx_body_67: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r13d, 2031541081 jns short taken_call_67 mov r9, rax @@ -900,7 +1308,13 @@ rx_i_68: ;FPSUB jz rx_finish xor r13, 03aa5c3a4h mov ecx, r13d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_68 + call rx_read_l1 +rx_body_68: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm4, xmm0 mov eax, r12d @@ -913,7 +1327,13 @@ rx_i_69: ;FPADD jz rx_finish xor r15, 0376c9c27h mov ecx, r15d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_69 + call rx_read_l2 +rx_body_69: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm8, xmm0 @@ -922,7 +1342,13 @@ rx_i_70: ;MULH_64 jz rx_finish xor r8, 0bbbec3fah mov ecx, r8d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_70 + call rx_read_l2 +rx_body_70: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 mul rcx mov rax, rdx @@ -933,7 +1359,13 @@ rx_i_71: ;FPMUL jz rx_finish xor r14, 0e9efb350h mov ecx, r14d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_71 + call rx_read_l1 +rx_body_71: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -945,7 +1377,13 @@ 
rx_i_72: ;CALL jz rx_finish xor r13, 0f4e51e28h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_72 + call rx_read_l1 +rx_body_72: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r9d, -631091751 jno short taken_call_72 mov rcx, rax @@ -963,7 +1401,13 @@ rx_i_73: ;FPROUND jz rx_finish xor r12, 0c24ddbd4h mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_73 + call rx_read_l2 +rx_body_73: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] shl eax, 13 and eax, 24576 or eax, 40896 @@ -975,7 +1419,13 @@ rx_i_74: ;MUL_64 jz rx_finish xor r8, 04c4b0c7fh mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_74 + call rx_read_l1 +rx_body_74: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, rax, -1431647438 mov rcx, rax mov eax, r9d @@ -988,7 +1438,13 @@ rx_i_75: ;RET jz rx_finish xor r14, 03bcc02e3h mov ecx, r14d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_75 + call rx_read_l2 +rx_body_75: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_75 xor rax, qword ptr [rsp + 8] @@ -1002,7 +1458,13 @@ rx_i_76: ;FPADD jz rx_finish xor r11, 04b0ff63eh mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_76 + call rx_read_l1 +rx_body_76: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm7, xmm0 mov eax, r15d @@ -1015,7 +1477,13 @@ rx_i_77: ;RET jz rx_finish xor r14, 0b956b3e8h mov ecx, r14d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_77 + call rx_read_l2 +rx_body_77: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_77 xor rax, qword ptr [rsp + 8] @@ -1037,7 +1505,13 @@ rx_i_78: ;MUL_32 jz rx_finish xor r9, 0edeca680h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_78 + call rx_read_l1 +rx_body_78: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov 
ecx, eax mov eax, r8d imul rax, rcx @@ -1048,7 +1522,13 @@ rx_i_79: ;RET jz rx_finish xor r11, 0fbdddcb5h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_79 + call rx_read_l1 +rx_body_79: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_79 xor rax, qword ptr [rsp + 8] @@ -1070,7 +1550,13 @@ rx_i_80: ;FPADD jz rx_finish xor r13, 09cec97a1h mov ecx, r13d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_80 + call rx_read_l2 +rx_body_80: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm3, xmm0 @@ -1079,7 +1565,13 @@ rx_i_81: ;OR_64 jz rx_finish xor r15, 078228167h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_81 + call rx_read_l1 +rx_body_81: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] or rax, r13 mov r8, rax @@ -1088,7 +1580,13 @@ rx_i_82: ;CALL jz rx_finish xor r11, 078cae1ffh mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_82 + call rx_read_l1 +rx_body_82: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r12d, -68969733 jo short taken_call_82 mov rcx, rax @@ -1106,7 +1604,13 @@ rx_i_83: ;AND_64 jz rx_finish xor r10, 0d9b6a533h mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_83 + call rx_read_l1 +rx_body_83: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] and rax, r10 mov r12, rax @@ -1115,7 +1619,13 @@ rx_i_84: ;ROR_64 jz rx_finish xor r15, 0e9e75336h mov ecx, r15d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_84 + call rx_read_l2 +rx_body_84: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 ror rax, cl mov rcx, rax @@ -1129,7 +1639,13 @@ rx_i_85: ;MUL_64 jz rx_finish xor r13, 04c0d378ah mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_85 + call rx_read_l1 +rx_body_85: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r8 mov r10, rax 
@@ -1138,7 +1654,13 @@ rx_i_86: ;OR_64 jz rx_finish xor r11, 04386e368h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_86 + call rx_read_l1 +rx_body_86: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] or rax, r8 mov rcx, rax mov eax, r12d @@ -1151,7 +1673,13 @@ rx_i_87: ;SUB_64 jz rx_finish xor r9, 0d75a0ecfh mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_87 + call rx_read_l1 +rx_body_87: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, r12 mov r8, rax @@ -1160,7 +1688,13 @@ rx_i_88: ;FPADD jz rx_finish xor r9, 031bb7f7ah mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_88 + call rx_read_l1 +rx_body_88: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm9, xmm0 mov eax, r9d @@ -1173,7 +1707,13 @@ rx_i_89: ;MUL_64 jz rx_finish xor r9, 03b45ecebh mov ecx, r9d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_89 + call rx_read_l2 +rx_body_89: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] imul rax, r8 mov rcx, rax mov eax, r10d @@ -1186,7 +1726,13 @@ rx_i_90: ;FPADD jz rx_finish xor r12, 0ee08e76bh mov ecx, r12d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_90 + call rx_read_l1 +rx_body_90: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm6, xmm0 @@ -1195,7 +1741,13 @@ rx_i_91: ;FPMUL jz rx_finish xor r9, 042e28e94h mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_91 + call rx_read_l1 +rx_body_91: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1207,7 +1759,13 @@ rx_i_92: ;CALL jz rx_finish xor r8, 0729260e1h mov ecx, r8d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_92 + call rx_read_l2 +rx_body_92: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp r14d, 1288893603 jge short taken_call_92 mov r12, rax @@ 
-1221,7 +1779,13 @@ rx_i_93: ;FPADD jz rx_finish xor r8, 0bfcebaf4h mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_93 + call rx_read_l1 +rx_body_93: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 movaps xmm2, xmm0 mov eax, r10d @@ -1234,7 +1798,13 @@ rx_i_94: ;RET jz rx_finish xor r13, 0ea326630h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_94 + call rx_read_l1 +rx_body_94: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_94 xor rax, qword ptr [rsp + 8] @@ -1248,7 +1818,13 @@ rx_i_95: ;MUL_64 jz rx_finish xor r13, 0b5451a2dh mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_95 + call rx_read_l1 +rx_body_95: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r10 mov rcx, rax mov eax, r15d @@ -1261,7 +1837,13 @@ rx_i_96: ;IMUL_32 jz rx_finish xor r11, 04f912ef8h mov ecx, r11d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_96 + call rx_read_l2 +rx_body_96: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax mov rax, -1354397081 imul rax, rcx @@ -1272,7 +1854,13 @@ rx_i_97: ;FPSQRT jz rx_finish xor r15, 0acc45b3bh mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_97 + call rx_read_l1 +rx_body_97: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm5, xmm0 @@ -1286,7 +1874,13 @@ rx_i_98: ;SUB_64 jz rx_finish xor r14, 09900a4e8h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_98 + call rx_read_l1 +rx_body_98: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, r15 mov r14, rax @@ -1295,7 +1889,13 @@ rx_i_99: ;FPDIV jz rx_finish xor r9, 0841b2984h mov ecx, r9d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_99 + call rx_read_l2 +rx_body_99: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd 
xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1311,7 +1911,13 @@ rx_i_100: ;ADD_64 jz rx_finish xor r15, 07ebea48fh mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_100 + call rx_read_l1 +rx_body_100: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, r9 mov r14, rax @@ -1320,7 +1926,13 @@ rx_i_101: ;SUB_64 jz rx_finish xor r10, 0631209d3h mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_101 + call rx_read_l1 +rx_body_101: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, r8 mov r11, rax @@ -1329,7 +1941,13 @@ rx_i_102: ;FPDIV jz rx_finish xor r10, 0e50bf07ah mov ecx, r10d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_102 + call rx_read_l1 +rx_body_102: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1341,7 +1959,13 @@ rx_i_103: ;MUL_64 jz rx_finish xor r10, 02b7096f1h mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_103 + call rx_read_l1 +rx_body_103: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r13 mov rcx, rax mov eax, r15d @@ -1354,7 +1978,13 @@ rx_i_104: ;IMULH_64 jz rx_finish xor r11, 075deaf71h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_104 + call rx_read_l1 +rx_body_104: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, -1913070089 imul rcx mov rax, rdx @@ -1369,7 +1999,13 @@ rx_i_105: ;MUL_32 jz rx_finish xor r13, 036a51f72h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_105 + call rx_read_l1 +rx_body_105: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r15d imul rax, rcx @@ -1384,7 +2020,13 @@ rx_i_106: ;FPMUL jz rx_finish xor r11, 07b512986h mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_106 + call rx_read_l1 +rx_body_106: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr 
[rsi+rcx*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1400,7 +2042,13 @@ rx_i_107: ;CALL jz rx_finish xor r12, 0f1d2e50h mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_107 + call rx_read_l1 +rx_body_107: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r11d, 1917037441 jl short taken_call_107 mov rcx, rax @@ -1418,7 +2066,13 @@ rx_i_108: ;FPDIV jz rx_finish xor r9, 07327ba60h mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_108 + call rx_read_l1 +rx_body_108: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1434,7 +2088,13 @@ rx_i_109: ;FPADD jz rx_finish xor r15, 0594e37deh mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_109 + call rx_read_l1 +rx_body_109: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 movaps xmm3, xmm0 @@ -1443,7 +2103,13 @@ rx_i_110: ;ROL_64 jz rx_finish xor r9, 04cdf5ebah mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_110 + call rx_read_l1 +rx_body_110: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 rol rax, cl mov rcx, rax @@ -1457,7 +2123,13 @@ rx_i_111: ;RET jz rx_finish xor r8, 02e16c97ch mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_111 + call rx_read_l1 +rx_body_111: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_111 xor rax, qword ptr [rsp + 8] @@ -1479,7 +2151,13 @@ rx_i_112: ;SUB_64 jz rx_finish xor r12, 0d42ddbd4h mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_112 + call rx_read_l2 +rx_body_112: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] sub rax, r13 mov rcx, rax mov eax, r14d @@ -1492,7 +2170,13 @@ rx_i_113: ;MULH_64 jz rx_finish xor r10, 07a4f8cbbh mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_113 + call rx_read_l1 
+rx_body_113: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 mul rcx mov rax, rdx @@ -1503,7 +2187,13 @@ rx_i_114: ;IMULH_64 jz rx_finish xor r13, 06e83e2cdh mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_114 + call rx_read_l1 +rx_body_114: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 imul rcx mov rax, rdx @@ -1514,7 +2204,13 @@ rx_i_115: ;OR_64 jz rx_finish xor r14, 0336c980eh mov ecx, r14d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_115 + call rx_read_l2 +rx_body_115: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] or rax, r10 mov r14, rax @@ -1523,7 +2219,13 @@ rx_i_116: ;IMULH_64 jz rx_finish xor r10, 0d122702eh mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_116 + call rx_read_l1 +rx_body_116: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, -1850776691 imul rcx mov rax, rdx @@ -1538,7 +2240,13 @@ rx_i_117: ;AND_64 jz rx_finish xor r11, 015f2012bh mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_117 + call rx_read_l1 +rx_body_117: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] and rax, -1205826972 mov rcx, rax mov eax, r15d @@ -1551,7 +2259,13 @@ rx_i_118: ;FPSUB jz rx_finish xor r9, 037ddf43dh mov ecx, r9d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_118 + call rx_read_l2 +rx_body_118: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm5 movaps xmm6, xmm0 @@ -1560,7 +2274,13 @@ rx_i_119: ;FPSUB jz rx_finish xor r9, 0bba475f3h mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_119 + call rx_read_l1 +rx_body_119: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 movaps xmm5, xmm0 @@ -1569,7 +2289,13 @@ rx_i_120: ;FPADD jz rx_finish xor r12, 0e5561e3eh mov ecx, r12d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_120 + call rx_read_l1 +rx_body_120: + xor 
rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm4 movaps xmm8, xmm0 @@ -1578,7 +2304,13 @@ rx_i_121: ;FPMUL jz rx_finish xor r9, 03ab8f73h mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_121 + call rx_read_l1 +rx_body_121: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1590,7 +2322,13 @@ rx_i_122: ;RET jz rx_finish xor r10, 04e0dbd40h mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_122 + call rx_read_l1 +rx_body_122: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_122 xor rax, qword ptr [rsp + 8] @@ -1612,7 +2350,13 @@ rx_i_123: ;ADD_32 jz rx_finish xor r13, 073e9f58ah mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_123 + call rx_read_l1 +rx_body_123: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add eax, r15d mov r13, rax @@ -1621,7 +2365,13 @@ rx_i_124: ;CALL jz rx_finish xor r12, 0e3fa3670h mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_124 + call rx_read_l2 +rx_body_124: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp r11d, 1719505436 jns short taken_call_124 mov rcx, rax @@ -1639,7 +2389,13 @@ rx_i_125: ;MUL_32 jz rx_finish xor r8, 0ebec27cdh mov ecx, r8d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_125 + call rx_read_l2 +rx_body_125: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r14d imul rax, rcx @@ -1650,7 +2406,13 @@ rx_i_126: ;FPDIV jz rx_finish xor r8, 01feb5264h mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_126 + call rx_read_l1 +rx_body_126: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1662,7 +2424,13 @@ rx_i_127: ;IMUL_32 jz rx_finish xor r9, 0405f500fh mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz 
short rx_body_127 + call rx_read_l1 +rx_body_127: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r10d imul rax, rcx @@ -1673,7 +2441,13 @@ rx_i_128: ;MUL_64 jz rx_finish xor r13, 0459f1154h mov ecx, r13d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_128 + call rx_read_l2 +rx_body_128: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] imul rax, r9 mov r9, rax @@ -1682,7 +2456,13 @@ rx_i_129: ;CALL jz rx_finish xor r9, 081918b4ch mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_129 + call rx_read_l1 +rx_body_129: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r13d, -590624856 jge short taken_call_129 mov r9, rax @@ -1696,7 +2476,13 @@ rx_i_130: ;OR_64 jz rx_finish xor r9, 077c3b332h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_130 + call rx_read_l1 +rx_body_130: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] or rax, -281794782 mov rcx, rax mov eax, r11d @@ -1709,7 +2495,13 @@ rx_i_131: ;RET jz rx_finish xor r12, 05792310bh mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_131 + call rx_read_l1 +rx_body_131: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_131 xor rax, qword ptr [rsp + 8] @@ -1731,7 +2523,13 @@ rx_i_132: ;FPADD jz rx_finish xor r10, 0ebc6e10h mov ecx, r10d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_132 + call rx_read_l1 +rx_body_132: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm7, xmm0 @@ -1740,7 +2538,13 @@ rx_i_133: ;XOR_64 jz rx_finish xor r14, 0822f8b60h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_133 + call rx_read_l1 +rx_body_133: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] xor rax, -1000526796 mov rcx, rax mov eax, r15d @@ -1753,7 +2557,13 @@ rx_i_134: ;ADD_64 jz rx_finish xor r10, 0d0f18593h mov ecx, r10d - call 
rx_readint_l1 + test ebp, 63 + jnz short rx_body_134 + call rx_read_l1 +rx_body_134: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, 1516102347 mov r13, rax @@ -1762,7 +2572,13 @@ rx_i_135: ;FPMUL jz rx_finish xor r11, 088212ef9h mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_135 + call rx_read_l1 +rx_body_135: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1774,7 +2590,13 @@ rx_i_136: ;FPSQRT jz rx_finish xor r8, 01ae56e03h mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_136 + call rx_read_l1 +rx_body_136: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm5, xmm0 @@ -1788,7 +2610,13 @@ rx_i_137: ;ROL_64 jz rx_finish xor r11, 015a24231h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_137 + call rx_read_l1 +rx_body_137: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 rol rax, cl mov r11, rax @@ -1798,7 +2626,13 @@ rx_i_138: ;RET jz rx_finish xor r13, 02fd380c5h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_138 + call rx_read_l1 +rx_body_138: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_138 xor rax, qword ptr [rsp + 8] @@ -1812,7 +2646,13 @@ rx_i_139: ;ADD_64 jz rx_finish xor r9, 093172470h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_139 + call rx_read_l1 +rx_body_139: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, 515364082 mov rcx, rax mov eax, r11d @@ -1825,7 +2665,13 @@ rx_i_140: ;IMUL_32 jz rx_finish xor r14, 052543553h mov ecx, r14d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_140 + call rx_read_l2 +rx_body_140: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r11d imul rax, rcx @@ -1836,7 +2682,13 @@ rx_i_141: 
;FPADD jz rx_finish xor r8, 02f636da1h mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_141 + call rx_read_l1 +rx_body_141: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 movaps xmm9, xmm0 mov eax, r9d @@ -1849,7 +2701,13 @@ rx_i_142: ;CALL jz rx_finish xor r11, 0b11a4f2ch mov ecx, r11d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_142 + call rx_read_l2 +rx_body_142: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp r12d, 1365939282 js short taken_call_142 mov rcx, rax @@ -1867,7 +2725,13 @@ rx_i_143: ;IMUL_32 jz rx_finish xor r15, 037f4b5d0h mov ecx, r15d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_143 + call rx_read_l2 +rx_body_143: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r11d imul rax, rcx @@ -1878,7 +2742,13 @@ rx_i_144: ;IMULH_64 jz rx_finish xor r10, 02e59e00ah mov ecx, r10d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_144 + call rx_read_l2 +rx_body_144: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r11 imul rcx mov rax, rdx @@ -1889,7 +2759,13 @@ rx_i_145: ;IMULH_64 jz rx_finish xor r13, 08d5c798h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_145 + call rx_read_l1 +rx_body_145: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r11 imul rcx mov rax, rdx @@ -1904,7 +2780,13 @@ rx_i_146: ;IMUL_32 jz rx_finish xor r13, 02327e6e2h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_146 + call rx_read_l1 +rx_body_146: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r12d imul rax, rcx @@ -1915,7 +2797,13 @@ rx_i_147: ;MULH_64 jz rx_finish xor r13, 03a7df043h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_147 + call rx_read_l1 +rx_body_147: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, 1784404616 mul rcx mov rax, rdx 
@@ -1930,7 +2818,13 @@ rx_i_148: ;SUB_64 jz rx_finish xor r10, 0783e5c4eh mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_148 + call rx_read_l1 +rx_body_148: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, r14 mov rcx, rax mov eax, r10d @@ -1943,7 +2837,13 @@ rx_i_149: ;MUL_32 jz rx_finish xor r12, 0aa0f5b2fh mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_149 + call rx_read_l1 +rx_body_149: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r14d imul rax, rcx @@ -1958,7 +2858,13 @@ rx_i_150: ;DIV_64 jz rx_finish xor r9, 01504ca7ah mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_150 + call rx_read_l1 +rx_body_150: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, 1 mov edx, r8d test edx, edx @@ -1976,7 +2882,13 @@ rx_i_151: ;OR_32 jz rx_finish xor r9, 0ea72a7cfh mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_151 + call rx_read_l1 +rx_body_151: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] or eax, r13d mov rcx, rax mov eax, r11d @@ -1989,7 +2901,13 @@ rx_i_152: ;ROR_64 jz rx_finish xor r13, 0ad0e7a88h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_152 + call rx_read_l1 +rx_body_152: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 ror rax, cl mov r10, rax @@ -1999,7 +2917,13 @@ rx_i_153: ;FPDIV jz rx_finish xor r15, 0fd95ab87h mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_153 + call rx_read_l1 +rx_body_153: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2015,7 +2939,13 @@ rx_i_154: ;MUL_32 jz rx_finish xor r10, 0256697b0h mov ecx, r10d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_154 + call rx_read_l2 +rx_body_154: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r13d imul rax, 
rcx @@ -2026,7 +2956,13 @@ rx_i_155: ;ROR_64 jz rx_finish xor r11, 0d23f3b78h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_155 + call rx_read_l1 +rx_body_155: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 ror rax, cl mov rcx, rax @@ -2040,7 +2976,13 @@ rx_i_156: ;IMUL_32 jz rx_finish xor r10, 098917533h mov ecx, r10d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_156 + call rx_read_l2 +rx_body_156: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r15d imul rax, rcx @@ -2051,7 +2993,13 @@ rx_i_157: ;ADD_64 jz rx_finish xor r10, 0dfac3efch mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_157 + call rx_read_l1 +rx_body_157: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, r12 mov r14, rax @@ -2060,7 +3008,13 @@ rx_i_158: ;ADD_64 jz rx_finish xor r15, 0a64de090h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_158 + call rx_read_l1 +rx_body_158: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, 1233402159 mov r10, rax @@ -2069,7 +3023,13 @@ rx_i_159: ;RET jz rx_finish xor r13, 0952a3abbh mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_159 + call rx_read_l1 +rx_body_159: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_159 xor rax, qword ptr [rsp + 8] @@ -2091,7 +3051,13 @@ rx_i_160: ;SUB_64 jz rx_finish xor r14, 0b1685b90h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_160 + call rx_read_l1 +rx_body_160: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, 1518778665 mov rcx, rax mov eax, r10d @@ -2104,7 +3070,13 @@ rx_i_161: ;OR_64 jz rx_finish xor r15, 0ea992531h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_161 + call rx_read_l1 +rx_body_161: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] or rax, r14 mov r8, rax @@ 
-2113,7 +3085,13 @@ rx_i_162: ;SAR_64 jz rx_finish xor r9, 01fd57a4ah mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_162 + call rx_read_l1 +rx_body_162: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 sar rax, cl mov r13, rax @@ -2123,7 +3101,13 @@ rx_i_163: ;SUB_64 jz rx_finish xor r12, 0e3486c0ah mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_163 + call rx_read_l2 +rx_body_163: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] sub rax, -2101130488 mov rcx, rax mov eax, r14d @@ -2136,7 +3120,13 @@ rx_i_164: ;MUL_32 jz rx_finish xor r12, 01f0c2737h mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_164 + call rx_read_l1 +rx_body_164: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r9d imul rax, rcx @@ -2151,7 +3141,13 @@ rx_i_165: ;RET jz rx_finish xor r12, 0debb493eh mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_165 + call rx_read_l1 +rx_body_165: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_165 xor rax, qword ptr [rsp + 8] @@ -2173,7 +3169,13 @@ rx_i_166: ;ROL_64 jz rx_finish xor r9, 0fe684081h mov ecx, r9d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_166 + call rx_read_l2 +rx_body_166: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 rol rax, cl mov rcx, rax @@ -2187,7 +3189,13 @@ rx_i_167: ;FPMUL jz rx_finish xor r11, 0d10371ch mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_167 + call rx_read_l1 +rx_body_167: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2203,7 +3211,13 @@ rx_i_168: ;FPSQRT jz rx_finish xor r12, 071b15effh mov ecx, r12d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_168 + call rx_read_l1 +rx_body_168: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] 
andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm7, xmm0 @@ -2213,7 +3227,13 @@ rx_i_169: ;RET jz rx_finish xor r11, 072790347h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_169 + call rx_read_l1 +rx_body_169: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_169 xor rax, qword ptr [rsp + 8] @@ -2235,7 +3255,13 @@ rx_i_170: ;CALL jz rx_finish xor r8, 04ae8a020h mov ecx, r8d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_170 + call rx_read_l2 +rx_body_170: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp r10d, -1541051751 jl short taken_call_170 mov r14, rax @@ -2249,7 +3275,13 @@ rx_i_171: ;IMULH_64 jz rx_finish xor r15, 09901e05bh mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_171 + call rx_read_l1 +rx_body_171: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r12 imul rcx mov rax, rdx @@ -2260,7 +3292,13 @@ rx_i_172: ;SUB_64 jz rx_finish xor r13, 050e8c510h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_172 + call rx_read_l1 +rx_body_172: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, r11 mov r12, rax @@ -2269,7 +3307,13 @@ rx_i_173: ;MUL_64 jz rx_finish xor r14, 05422cf8fh mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_173 + call rx_read_l1 +rx_body_173: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r12 mov rcx, rax mov eax, r12d @@ -2282,7 +3326,13 @@ rx_i_174: ;FPROUND jz rx_finish xor r12, 0a025c3dbh mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_174 + call rx_read_l1 +rx_body_174: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] shl eax, 13 and eax, 24576 or eax, 40896 @@ -2294,7 +3344,13 @@ rx_i_175: ;SAR_64 jz rx_finish xor r13, 08f74c11h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_175 + call rx_read_l1 +rx_body_175: + xor rdi, rcx + and ecx, 2047 + 
mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 sar rax, cl mov r8, rax @@ -2304,7 +3360,13 @@ rx_i_176: ;SUB_64 jz rx_finish xor r9, 01f2ed5f1h mov ecx, r9d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_176 + call rx_read_l2 +rx_body_176: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] sub rax, r14 mov r10, rax @@ -2313,7 +3375,13 @@ rx_i_177: ;ADD_64 jz rx_finish xor r10, 0d2072c79h mov ecx, r10d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_177 + call rx_read_l2 +rx_body_177: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] add rax, r10 mov rcx, rax mov eax, r13d @@ -2326,7 +3394,13 @@ rx_i_178: ;RET jz rx_finish xor r15, 0a8e51933h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_178 + call rx_read_l1 +rx_body_178: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_178 xor rax, qword ptr [rsp + 8] @@ -2348,7 +3422,13 @@ rx_i_179: ;FPADD jz rx_finish xor r12, 0934ad492h mov ecx, r12d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_179 + call rx_read_l1 +rx_body_179: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 movaps xmm8, xmm0 @@ -2357,7 +3437,13 @@ rx_i_180: ;XOR_64 jz rx_finish xor r15, 01cb3ce1fh mov ecx, r15d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_180 + call rx_read_l2 +rx_body_180: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] xor rax, 1995308563 mov rcx, rax mov eax, r9d @@ -2370,7 +3456,13 @@ rx_i_181: ;RET jz rx_finish xor r10, 023c7845fh mov ecx, r10d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_181 + call rx_read_l2 +rx_body_181: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_181 xor rax, qword ptr [rsp + 8] @@ -2384,7 +3476,13 @@ rx_i_182: ;FPSUB jz rx_finish xor r8, 0f8884327h mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_182 + call rx_read_l1 +rx_body_182: + 
xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm7 movaps xmm6, xmm0 @@ -2393,7 +3491,13 @@ rx_i_183: ;ADD_64 jz rx_finish xor r13, 013070461h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_183 + call rx_read_l1 +rx_body_183: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, 137260710 mov r10, rax @@ -2402,7 +3506,13 @@ rx_i_184: ;SAR_64 jz rx_finish xor r12, 04764cdf7h mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_184 + call rx_read_l2 +rx_body_184: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] sar rax, 40 mov r12, rax @@ -2411,7 +3521,13 @@ rx_i_185: ;CALL jz rx_finish xor r10, 03c41026fh mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_185 + call rx_read_l1 +rx_body_185: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r15d, -1510284125 jbe short taken_call_185 mov rcx, rax @@ -2429,7 +3545,13 @@ rx_i_186: ;XOR_32 jz rx_finish xor r9, 0cded414bh mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_186 + call rx_read_l1 +rx_body_186: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] xor eax, r15d mov rcx, rax mov eax, r10d @@ -2442,7 +3564,13 @@ rx_i_187: ;FPDIV jz rx_finish xor r13, 05c6d64a8h mov ecx, r13d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_187 + call rx_read_l2 +rx_body_187: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2454,7 +3582,13 @@ rx_i_188: ;FPMUL jz rx_finish xor r9, 04659becbh mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_188 + call rx_read_l1 +rx_body_188: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2466,7 +3600,13 @@ rx_i_189: ;FPROUND jz rx_finish xor r11, 0c52741d5h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_189 + call 
rx_read_l1 +rx_body_189: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] shl eax, 13 and eax, 24576 or eax, 40896 @@ -2478,7 +3618,13 @@ rx_i_190: ;RET jz rx_finish xor r12, 0217bf5f3h mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_190 + call rx_read_l2 +rx_body_190: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_190 xor rax, qword ptr [rsp + 8] @@ -2492,7 +3638,13 @@ rx_i_191: ;CALL jz rx_finish xor r15, 0884f3526h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_191 + call rx_read_l1 +rx_body_191: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r11d, 1687119072 jno short taken_call_191 mov rcx, rax @@ -2510,7 +3662,13 @@ rx_i_192: ;CALL jz rx_finish xor r8, 0d76edad3h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_192 + call rx_read_l1 +rx_body_192: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r14d, -117628864 jns short taken_call_192 mov r8, rax @@ -2524,7 +3682,13 @@ rx_i_193: ;MUL_32 jz rx_finish xor r12, 0e9939ach mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_193 + call rx_read_l1 +rx_body_193: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r12d imul rax, rcx @@ -2539,7 +3703,13 @@ rx_i_194: ;FPMUL jz rx_finish xor r12, 0f21ca520h mov ecx, r12d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_194 + call rx_read_l2 +rx_body_194: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2555,7 +3725,13 @@ rx_i_195: ;ROL_64 jz rx_finish xor r10, 09405152ch mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_195 + call rx_read_l1 +rx_body_195: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 rol rax, cl mov r9, rax @@ -2565,7 +3741,13 @@ rx_i_196: ;SUB_64 jz rx_finish xor r8, 0c2a9f41bh mov ecx, r8d - 
call rx_readint_l2 + test ebp, 63 + jnz short rx_body_196 + call rx_read_l2 +rx_body_196: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] sub rax, -1907903895 mov rcx, rax mov eax, r13d @@ -2578,7 +3760,13 @@ rx_i_197: ;MUL_64 jz rx_finish xor r12, 0229208efh mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_197 + call rx_read_l1 +rx_body_197: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov r11, rax @@ -2587,7 +3775,13 @@ rx_i_198: ;MULH_64 jz rx_finish xor r14, 0c8d95bbbh mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_198 + call rx_read_l1 +rx_body_198: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 mul rcx mov rax, rdx @@ -2602,7 +3796,13 @@ rx_i_199: ;MULH_64 jz rx_finish xor r13, 050049e2eh mov ecx, r13d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_199 + call rx_read_l2 +rx_body_199: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 mul rcx mov rax, rdx @@ -2617,7 +3817,13 @@ rx_i_200: ;FPSUB jz rx_finish xor r10, 0c63b99e8h mov ecx, r10d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_200 + call rx_read_l1 +rx_body_200: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm4, xmm0 mov eax, r12d @@ -2630,7 +3836,13 @@ rx_i_201: ;FPADD jz rx_finish xor r8, 0cdda801dh mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_201 + call rx_read_l1 +rx_body_201: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm4, xmm0 mov eax, r12d @@ -2643,7 +3855,13 @@ rx_i_202: ;FPSUB jz rx_finish xor r13, 0fa44b04ah mov ecx, r13d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_202 + call rx_read_l2 +rx_body_202: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm9 movaps xmm5, xmm0 @@ -2652,7 +3870,13 @@ rx_i_203: ;FPSUB jz rx_finish xor r10, 0d73e472ch mov ecx, 
r10d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_203 + call rx_read_l1 +rx_body_203: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm7, xmm0 mov eax, r15d @@ -2665,7 +3889,13 @@ rx_i_204: ;MUL_64 jz rx_finish xor r9, 01af8ab1dh mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_204 + call rx_read_l1 +rx_body_204: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov rcx, rax mov eax, r8d @@ -2678,7 +3908,13 @@ rx_i_205: ;FPDIV jz rx_finish xor r14, 094e997c5h mov ecx, r14d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_205 + call rx_read_l1 +rx_body_205: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2690,7 +3926,13 @@ rx_i_206: ;FPMUL jz rx_finish xor r11, 0e836a177h mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_206 + call rx_read_l1 +rx_body_206: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm7 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2702,7 +3944,13 @@ rx_i_207: ;AND_32 jz rx_finish xor r9, 039ccdd30h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_207 + call rx_read_l1 +rx_body_207: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] and eax, r12d mov rcx, rax mov eax, r9d @@ -2715,7 +3963,13 @@ rx_i_208: ;MUL_64 jz rx_finish xor r9, 0f4f126c5h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_208 + call rx_read_l1 +rx_body_208: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r12 mov r10, rax @@ -2724,7 +3978,13 @@ rx_i_209: ;SHR_64 jz rx_finish xor r8, 0b84811f1h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_209 + call rx_read_l1 +rx_body_209: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] shr rax, 30 mov rcx, rax mov eax, r12d @@ -2737,7 +3997,13 @@ rx_i_210: ;MUL_32 jz rx_finish xor 
r12, 0c5efc90ah mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_210 + call rx_read_l2 +rx_body_210: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, -1027162400 imul rax, rcx @@ -2752,7 +4018,13 @@ rx_i_211: ;FPADD jz rx_finish xor r12, 0ce533072h mov ecx, r12d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_211 + call rx_read_l2 +rx_body_211: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm3, xmm0 @@ -2761,7 +4033,13 @@ rx_i_212: ;MUL_64 jz rx_finish xor r13, 06b465fdbh mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_212 + call rx_read_l1 +rx_body_212: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r13 mov rcx, rax mov eax, r15d @@ -2774,7 +4052,13 @@ rx_i_213: ;IMUL_32 jz rx_finish xor r13, 02dd1d503h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_213 + call rx_read_l1 +rx_body_213: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax mov rax, 129993589 imul rax, rcx @@ -2785,7 +4069,13 @@ rx_i_214: ;ROL_64 jz rx_finish xor r9, 0a159f313h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_214 + call rx_read_l1 +rx_body_214: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 rol rax, cl mov r14, rax @@ -2795,7 +4085,13 @@ rx_i_215: ;SUB_64 jz rx_finish xor r15, 08359265eh mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_215 + call rx_read_l1 +rx_body_215: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, r12 mov r10, rax @@ -2804,7 +4100,13 @@ rx_i_216: ;MUL_64 jz rx_finish xor r12, 080696de3h mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_216 + call rx_read_l1 +rx_body_216: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r13 mov rcx, rax mov eax, r15d @@ -2817,7 +4119,13 @@ rx_i_217: ;IMUL_32 jz rx_finish xor r8, 
040d5b526h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_217 + call rx_read_l1 +rx_body_217: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r9d imul rax, rcx @@ -2832,7 +4140,13 @@ rx_i_218: ;CALL jz rx_finish xor r11, 083c0bd93h mov ecx, r11d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_218 + call rx_read_l2 +rx_body_218: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp r8d, -585552250 jge short taken_call_218 mov r11, rax @@ -2846,7 +4160,13 @@ rx_i_219: ;XOR_64 jz rx_finish xor r8, 0ca37f668h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_219 + call rx_read_l1 +rx_body_219: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] xor rax, -740915304 mov rcx, rax mov eax, r15d @@ -2859,7 +4179,13 @@ rx_i_220: ;IMUL_32 jz rx_finish xor r9, 0bb44c384h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_220 + call rx_read_l1 +rx_body_220: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r11d imul rax, rcx @@ -2874,7 +4200,13 @@ rx_i_221: ;IMULH_64 jz rx_finish xor r9, 0a3deb512h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_221 + call rx_read_l1 +rx_body_221: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 imul rcx mov rax, rdx @@ -2889,7 +4221,13 @@ rx_i_222: ;FPMUL jz rx_finish xor r9, 084a02d64h mov ecx, r9d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_222 + call rx_read_l2 +rx_body_222: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2905,7 +4243,13 @@ rx_i_223: ;FPSUB jz rx_finish xor r8, 01e5cc085h mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_223 + call rx_read_l1 +rx_body_223: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 movaps xmm2, xmm0 mov eax, r10d @@ -2918,7 
+4262,13 @@ rx_i_224: ;SAR_64 jz rx_finish xor r12, 053982440h mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_224 + call rx_read_l1 +rx_body_224: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 sar rax, cl mov rcx, rax @@ -2932,7 +4282,13 @@ rx_i_225: ;DIV_64 jz rx_finish xor r13, 0c558367eh mov ecx, r13d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_225 + call rx_read_l2 +rx_body_225: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov ecx, 1 mov edx, r10d test edx, edx @@ -2950,7 +4306,13 @@ rx_i_226: ;CALL jz rx_finish xor r10, 040139b65h mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_226 + call rx_read_l1 +rx_body_226: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r8d, -1752488808 jno short taken_call_226 mov rcx, rax @@ -2968,7 +4330,13 @@ rx_i_227: ;FPDIV jz rx_finish xor r11, 0fa312dbdh mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_227 + call rx_read_l1 +rx_body_227: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm7 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2984,7 +4352,13 @@ rx_i_228: ;CALL jz rx_finish xor r11, 0b64246c0h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_228 + call rx_read_l1 +rx_body_228: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r10d, -2099304 jns short taken_call_228 mov rcx, rax @@ -3002,7 +4376,13 @@ rx_i_229: ;IMUL_32 jz rx_finish xor r11, 05c535836h mov ecx, r11d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_229 + call rx_read_l2 +rx_body_229: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r12d imul rax, rcx @@ -3017,7 +4397,13 @@ rx_i_230: ;FPMUL jz rx_finish xor r15, 0f394972eh mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_230 + call rx_read_l1 +rx_body_230: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr 
[rsi+rcx*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3033,7 +4419,13 @@ rx_i_231: ;RET jz rx_finish xor r9, 0bb56428dh mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_231 + call rx_read_l1 +rx_body_231: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_231 xor rax, qword ptr [rsp + 8] @@ -3055,7 +4447,13 @@ rx_i_232: ;FPDIV jz rx_finish xor r15, 09ab46ab3h mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_232 + call rx_read_l1 +rx_body_232: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3067,7 +4465,13 @@ rx_i_233: ;CALL jz rx_finish xor r13, 08eb2cd76h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_233 + call rx_read_l1 +rx_body_233: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r12d, 392389867 jo short taken_call_233 mov r14, rax @@ -3081,7 +4485,13 @@ rx_i_234: ;FPROUND jz rx_finish xor r15, 0ba687578h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_234 + call rx_read_l1 +rx_body_234: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] shl eax, 13 and eax, 24576 or eax, 40896 @@ -3093,7 +4503,13 @@ rx_i_235: ;IMUL_32 jz rx_finish xor r13, 0b6cb9ff2h mov ecx, r13d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_235 + call rx_read_l2 +rx_body_235: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r12d imul rax, rcx @@ -3108,7 +4524,13 @@ rx_i_236: ;FPADD jz rx_finish xor r15, 03ad196ach mov ecx, r15d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_236 + call rx_read_l2 +rx_body_236: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm4 movaps xmm3, xmm0 @@ -3117,7 +4539,13 @@ rx_i_237: ;CALL jz rx_finish xor r15, 0fab4600h mov ecx, r15d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_237 + call 
rx_read_l2 +rx_body_237: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp r12d, -121899164 jge short taken_call_237 mov r11, rax @@ -3131,7 +4559,13 @@ rx_i_238: ;FPADD jz rx_finish xor r8, 0158f119fh mov ecx, r8d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_238 + call rx_read_l2 +rx_body_238: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm7, xmm0 mov eax, r15d @@ -3144,7 +4578,13 @@ rx_i_239: ;ADD_64 jz rx_finish xor r13, 044f30b3fh mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_239 + call rx_read_l1 +rx_body_239: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, r10 mov r10, rax @@ -3153,7 +4593,13 @@ rx_i_240: ;IMUL_32 jz rx_finish xor r9, 0d65d29f9h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_240 + call rx_read_l1 +rx_body_240: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax mov rax, -423830277 imul rax, rcx @@ -3164,7 +4610,13 @@ rx_i_241: ;FPADD jz rx_finish xor r11, 0ce5260adh mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_241 + call rx_read_l1 +rx_body_241: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm7, xmm0 mov eax, r15d @@ -3177,7 +4629,13 @@ rx_i_242: ;MULH_64 jz rx_finish xor r12, 01119b0f9h mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_242 + call rx_read_l2 +rx_body_242: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov rcx, 319324914 mul rcx mov rax, rdx @@ -3192,7 +4650,13 @@ rx_i_243: ;XOR_64 jz rx_finish xor r12, 0d6c2ce3dh mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_243 + call rx_read_l1 +rx_body_243: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] xor rax, 1198180774 mov r14, rax @@ -3201,7 +4665,13 @@ rx_i_244: ;FPADD jz rx_finish xor r11, 0c6a6248h mov ecx, r11d - call rx_readfloat_l2 + test ebp, 63 + 
jnz short rx_body_244 + call rx_read_l2 +rx_body_244: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm9, xmm0 @@ -3210,7 +4680,13 @@ rx_i_245: ;XOR_64 jz rx_finish xor r13, 084505739h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_245 + call rx_read_l1 +rx_body_245: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] xor rax, -1546539637 mov rcx, rax mov eax, r12d @@ -3223,7 +4699,13 @@ rx_i_246: ;AND_64 jz rx_finish xor r15, 027eeaa2eh mov ecx, r15d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_246 + call rx_read_l2 +rx_body_246: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] and rax, r9 mov r12, rax @@ -3232,7 +4714,13 @@ rx_i_247: ;IMUL_32 jz rx_finish xor r10, 0c4de0296h mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_247 + call rx_read_l1 +rx_body_247: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r14d imul rax, rcx @@ -3247,7 +4735,13 @@ rx_i_248: ;MUL_32 jz rx_finish xor r8, 0649df46fh mov ecx, r8d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_248 + call rx_read_l2 +rx_body_248: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r15d imul rax, rcx @@ -3262,7 +4756,13 @@ rx_i_249: ;IMUL_32 jz rx_finish xor r15, 0499552cch mov ecx, r15d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_249 + call rx_read_l2 +rx_body_249: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r11d imul rax, rcx @@ -3277,7 +4777,13 @@ rx_i_250: ;MUL_64 jz rx_finish xor r13, 083eafe6fh mov ecx, r13d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_250 + call rx_read_l2 +rx_body_250: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] imul rax, r8 mov rcx, rax mov eax, r14d @@ -3290,7 +4796,13 @@ rx_i_251: ;FPMUL jz rx_finish xor r13, 0a25a4d8ah mov ecx, r13d - call rx_readfloat_l2 + test ebp, 
63 + jnz short rx_body_251 + call rx_read_l2 +rx_body_251: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3306,7 +4818,13 @@ rx_i_252: ;ROL_64 jz rx_finish xor r14, 08a75ad41h mov ecx, r14d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_252 + call rx_read_l2 +rx_body_252: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 rol rax, cl mov r14, rax @@ -3316,7 +4834,13 @@ rx_i_253: ;CALL jz rx_finish xor r14, 057f3f596h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_253 + call rx_read_l1 +rx_body_253: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r15d, 1699431947 jns short taken_call_253 mov rcx, rax @@ -3334,7 +4858,13 @@ rx_i_254: ;FPSUB jz rx_finish xor r14, 04cfb709eh mov ecx, r14d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_254 + call rx_read_l1 +rx_body_254: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm4 movaps xmm8, xmm0 mov eax, r8d @@ -3347,7 +4877,13 @@ rx_i_255: ;FPADD jz rx_finish xor r9, 0b96ec9ech mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_255 + call rx_read_l1 +rx_body_255: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm6, xmm0 mov eax, r14d @@ -3360,7 +4896,13 @@ rx_i_256: ;MULH_64 jz rx_finish xor r8, 08375472ch mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_256 + call rx_read_l1 +rx_body_256: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 mul rcx mov rax, rdx @@ -3375,7 +4917,13 @@ rx_i_257: ;FPADD jz rx_finish xor r12, 0d75a8c3fh mov ecx, r12d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_257 + call rx_read_l2 +rx_body_257: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm3, xmm0 mov eax, r11d @@ -3388,7 +4936,13 @@ rx_i_258: ;MUL_32 jz rx_finish xor 
r11, 064fdbda0h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_258 + call rx_read_l1 +rx_body_258: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r14d imul rax, rcx @@ -3403,7 +4957,13 @@ rx_i_259: ;FPADD jz rx_finish xor r11, 02e36a073h mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_259 + call rx_read_l1 +rx_body_259: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm3, xmm0 @@ -3412,7 +4972,13 @@ rx_i_260: ;FPMUL jz rx_finish xor r13, 0f94e9fa9h mov ecx, r13d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_260 + call rx_read_l2 +rx_body_260: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3424,7 +4990,13 @@ rx_i_261: ;FPSQRT jz rx_finish xor r14, 02346171ch mov ecx, r14d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_261 + call rx_read_l2 +rx_body_261: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm3, xmm0 @@ -3438,7 +5010,13 @@ rx_i_262: ;OR_32 jz rx_finish xor r10, 01c42baa6h mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_262 + call rx_read_l1 +rx_body_262: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] or eax, r13d mov rcx, rax mov eax, r11d @@ -3451,7 +5029,13 @@ rx_i_263: ;FPDIV jz rx_finish xor r11, 0b39b140h mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_263 + call rx_read_l1 +rx_body_263: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3463,7 +5047,13 @@ rx_i_264: ;FPMUL jz rx_finish xor r11, 01a07d201h mov ecx, r11d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_264 + call rx_read_l2 +rx_body_264: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 
cmpeqpd xmm1, xmm1 @@ -3475,7 +5065,13 @@ rx_i_265: ;FPADD jz rx_finish xor r13, 07a3eb340h mov ecx, r13d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_265 + call rx_read_l2 +rx_body_265: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm8 movaps xmm2, xmm0 mov eax, r10d @@ -3488,7 +5084,13 @@ rx_i_266: ;RET jz rx_finish xor r13, 03d0a3a89h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_266 + call rx_read_l1 +rx_body_266: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_266 xor rax, qword ptr [rsp + 8] @@ -3502,7 +5104,13 @@ rx_i_267: ;ROR_64 jz rx_finish xor r8, 0c6c7b37h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_267 + call rx_read_l1 +rx_body_267: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] ror rax, 56 mov r11, rax @@ -3511,7 +5119,13 @@ rx_i_268: ;CALL jz rx_finish xor r12, 0c2510cebh mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_268 + call rx_read_l2 +rx_body_268: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp r15d, -2062812966 jl short taken_call_268 mov r13, rax @@ -3525,7 +5139,13 @@ rx_i_269: ;ROR_64 jz rx_finish xor r11, 0c80cc899h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_269 + call rx_read_l1 +rx_body_269: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 ror rax, cl mov rcx, rax @@ -3539,7 +5159,13 @@ rx_i_270: ;FPMUL jz rx_finish xor r11, 0eb355caah mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_270 + call rx_read_l1 +rx_body_270: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3551,7 +5177,13 @@ rx_i_271: ;MUL_32 jz rx_finish xor r13, 0c6f12299h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_271 + call rx_read_l1 +rx_body_271: + xor rdi, rcx + and ecx, 2047 + 
mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, -2032281772 imul rax, rcx @@ -3566,7 +5198,13 @@ rx_i_272: ;OR_32 jz rx_finish xor r12, 0695a5dd2h mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_272 + call rx_read_l2 +rx_body_272: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] or eax, r12d mov r13, rax @@ -3575,7 +5213,13 @@ rx_i_273: ;CALL jz rx_finish xor r9, 0d315e4dch mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_273 + call rx_read_l1 +rx_body_273: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r12d, 1670848568 jl short taken_call_273 mov rcx, rax @@ -3593,7 +5237,13 @@ rx_i_274: ;FPSUB jz rx_finish xor r15, 0b66ca7e0h mov ecx, r15d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_274 + call rx_read_l2 +rx_body_274: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm4 movaps xmm6, xmm0 mov eax, r14d @@ -3606,7 +5256,13 @@ rx_i_275: ;OR_64 jz rx_finish xor r10, 0788eceb7h mov ecx, r10d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_275 + call rx_read_l2 +rx_body_275: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] or rax, r11 mov r13, rax @@ -3615,7 +5271,13 @@ rx_i_276: ;CALL jz rx_finish xor r9, 0c6ac5edah mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_276 + call rx_read_l1 +rx_body_276: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r11d, -1236180570 jns short taken_call_276 mov rcx, rax @@ -3633,7 +5295,13 @@ rx_i_277: ;IMUL_32 jz rx_finish xor r11, 0c9549789h mov ecx, r11d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_277 + call rx_read_l2 +rx_body_277: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r10d imul rax, rcx @@ -3648,7 +5316,13 @@ rx_i_278: ;FPSUB jz rx_finish xor r9, 0a2bc66c9h mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_278 + call rx_read_l1 +rx_body_278: + xor rdi, 
rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm7 movaps xmm4, xmm0 mov eax, r12d @@ -3661,7 +5335,13 @@ rx_i_279: ;FPSUB jz rx_finish xor r15, 0f1a91458h mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_279 + call rx_read_l1 +rx_body_279: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm5 movaps xmm9, xmm0 mov eax, r9d @@ -3674,7 +5354,13 @@ rx_i_280: ;AND_64 jz rx_finish xor r12, 066246b43h mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_280 + call rx_read_l2 +rx_body_280: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] and rax, r11 mov rcx, rax mov eax, r13d @@ -3687,7 +5373,13 @@ rx_i_281: ;SUB_64 jz rx_finish xor r10, 05a762727h mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_281 + call rx_read_l1 +rx_body_281: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, r10 mov rcx, rax mov eax, r11d @@ -3700,7 +5392,13 @@ rx_i_282: ;SUB_32 jz rx_finish xor r15, 0de1ab603h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_282 + call rx_read_l1 +rx_body_282: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub eax, 1367326224 mov r11, rax @@ -3709,7 +5407,13 @@ rx_i_283: ;ADD_32 jz rx_finish xor r9, 0df4d084fh mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_283 + call rx_read_l1 +rx_body_283: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add eax, -1156732976 mov rcx, rax mov eax, r12d @@ -3722,7 +5426,13 @@ rx_i_284: ;FPSUB jz rx_finish xor r15, 0e68f36ach mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_284 + call rx_read_l1 +rx_body_284: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm6 movaps xmm9, xmm0 mov eax, r9d @@ -3735,7 +5445,13 @@ rx_i_285: ;IMUL_32 jz rx_finish xor r8, 09adb333bh mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_285 + call 
rx_read_l1 +rx_body_285: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r8d imul rax, rcx @@ -3746,7 +5462,13 @@ rx_i_286: ;FPADD jz rx_finish xor r14, 082f5e36ch mov ecx, r14d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_286 + call rx_read_l1 +rx_body_286: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm7, xmm0 @@ -3755,7 +5477,13 @@ rx_i_287: ;OR_64 jz rx_finish xor r11, 049547c9ch mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_287 + call rx_read_l1 +rx_body_287: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] or rax, r15 mov rcx, rax mov eax, r8d @@ -3768,7 +5496,13 @@ rx_i_288: ;MUL_64 jz rx_finish xor r10, 08716ac8bh mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_288 + call rx_read_l1 +rx_body_288: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r8 mov rcx, rax mov eax, r9d @@ -3781,7 +5515,13 @@ rx_i_289: ;FPDIV jz rx_finish xor r14, 0efef52b5h mov ecx, r14d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_289 + call rx_read_l2 +rx_body_289: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3793,7 +5533,13 @@ rx_i_290: ;FPMUL jz rx_finish xor r15, 060665748h mov ecx, r15d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_290 + call rx_read_l2 +rx_body_290: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3805,7 +5551,13 @@ rx_i_291: ;RET jz rx_finish xor r13, 0ddf4bd1ah mov ecx, r13d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_291 + call rx_read_l2 +rx_body_291: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_291 xor rax, qword ptr [rsp + 8] @@ -3827,7 +5579,13 @@ rx_i_292: ;ROR_64 jz rx_finish xor r13, 05a87cc3dh mov ecx, r13d - 
call rx_readint_l1 + test ebp, 63 + jnz short rx_body_292 + call rx_read_l1 +rx_body_292: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] ror rax, 23 mov r10, rax @@ -3836,7 +5594,13 @@ rx_i_293: ;FPSUB jz rx_finish xor r9, 0c61f4279h mov ecx, r9d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_293 + call rx_read_l2 +rx_body_293: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm5 movaps xmm8, xmm0 @@ -3845,7 +5609,13 @@ rx_i_294: ;RET jz rx_finish xor r14, 0f3b9d85h mov ecx, r14d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_294 + call rx_read_l2 +rx_body_294: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_294 xor rax, qword ptr [rsp + 8] @@ -3867,7 +5637,13 @@ rx_i_295: ;FPSUB jz rx_finish xor r9, 0f42798fdh mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_295 + call rx_read_l1 +rx_body_295: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm8 movaps xmm7, xmm0 @@ -3876,7 +5652,13 @@ rx_i_296: ;CALL jz rx_finish xor r14, 018738758h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_296 + call rx_read_l1 +rx_body_296: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r9d, -207252278 jns short taken_call_296 mov rcx, rax @@ -3894,7 +5676,13 @@ rx_i_297: ;ADD_64 jz rx_finish xor r15, 0de3b9d9bh mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_297 + call rx_read_l1 +rx_body_297: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, r10 mov r14, rax @@ -3903,7 +5691,13 @@ rx_i_298: ;FPSUB jz rx_finish xor r14, 084f53637h mov ecx, r14d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_298 + call rx_read_l1 +rx_body_298: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm7 movaps xmm6, xmm0 @@ -3912,7 +5706,13 @@ rx_i_299: ;ADD_64 jz rx_finish xor r12, 042f4897h mov ecx, 
r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_299 + call rx_read_l1 +rx_body_299: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, 21400308 mov rcx, rax mov eax, r12d @@ -3925,7 +5725,13 @@ rx_i_300: ;FPSUB jz rx_finish xor r12, 095765693h mov ecx, r12d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_300 + call rx_read_l2 +rx_body_300: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm8 movaps xmm2, xmm0 @@ -3934,7 +5740,13 @@ rx_i_301: ;FPMUL jz rx_finish xor r8, 0a0ec5eech mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_301 + call rx_read_l1 +rx_body_301: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3950,7 +5762,13 @@ rx_i_302: ;ADD_64 jz rx_finish xor r15, 0f6f8c345h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_302 + call rx_read_l1 +rx_body_302: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, r10 mov r11, rax @@ -3959,7 +5777,13 @@ rx_i_303: ;FPADD jz rx_finish xor r14, 082a3e965h mov ecx, r14d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_303 + call rx_read_l1 +rx_body_303: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm9, xmm0 mov eax, r9d @@ -3972,7 +5796,13 @@ rx_i_304: ;MUL_64 jz rx_finish xor r12, 04940c652h mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_304 + call rx_read_l1 +rx_body_304: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov r13, rax @@ -3981,7 +5811,13 @@ rx_i_305: ;MUL_64 jz rx_finish xor r11, 03c6c62b8h mov ecx, r11d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_305 + call rx_read_l2 +rx_body_305: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] imul rax, rax, -65873120 mov r10, rax @@ -3990,7 +5826,13 @@ rx_i_306: ;ADD_64 jz rx_finish xor r15, 08b34cdfch mov 
ecx, r15d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_306 + call rx_read_l2 +rx_body_306: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] add rax, r15 mov r13, rax @@ -3999,7 +5841,13 @@ rx_i_307: ;SAR_64 jz rx_finish xor r15, 04c36adb1h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_307 + call rx_read_l1 +rx_body_307: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 sar rax, cl mov r10, rax @@ -4009,7 +5857,13 @@ rx_i_308: ;MUL_64 jz rx_finish xor r11, 0a4213b21h mov ecx, r11d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_308 + call rx_read_l2 +rx_body_308: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] imul rax, r13 mov r15, rax @@ -4018,7 +5872,13 @@ rx_i_309: ;IMULH_64 jz rx_finish xor r9, 090c42304h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_309 + call rx_read_l1 +rx_body_309: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, -1652850028 imul rcx mov rax, rdx @@ -4033,7 +5893,13 @@ rx_i_310: ;FPMUL jz rx_finish xor r9, 0f78e1c8ch mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_310 + call rx_read_l1 +rx_body_310: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4049,7 +5915,13 @@ rx_i_311: ;FPMUL jz rx_finish xor r8, 0ff8848cfh mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_311 + call rx_read_l1 +rx_body_311: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4061,7 +5933,13 @@ rx_i_312: ;MUL_32 jz rx_finish xor r13, 0b18904cdh mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_312 + call rx_read_l1 +rx_body_312: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, -1147928648 imul rax, rcx @@ -4072,7 +5950,13 @@ rx_i_313: ;FPADD jz rx_finish xor r8, 
0a0d0befh mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_313 + call rx_read_l1 +rx_body_313: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm6, xmm0 @@ -4081,7 +5965,13 @@ rx_i_314: ;IMUL_32 jz rx_finish xor r15, 01e3c65f7h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_314 + call rx_read_l1 +rx_body_314: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r9d imul rax, rcx @@ -4096,7 +5986,13 @@ rx_i_315: ;SHR_64 jz rx_finish xor r9, 02e36ddafh mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_315 + call rx_read_l1 +rx_body_315: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 shr rax, cl mov r9, rax @@ -4106,7 +6002,13 @@ rx_i_316: ;RET jz rx_finish xor r14, 05b0cb5bbh mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_316 + call rx_read_l1 +rx_body_316: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_316 xor rax, qword ptr [rsp + 8] @@ -4128,7 +6030,13 @@ rx_i_317: ;FPADD jz rx_finish xor r9, 0c74e7415h mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_317 + call rx_read_l1 +rx_body_317: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm7 movaps xmm5, xmm0 @@ -4137,7 +6045,13 @@ rx_i_318: ;FPADD jz rx_finish xor r9, 057621d9ah mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_318 + call rx_read_l1 +rx_body_318: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm7, xmm0 @@ -4146,7 +6060,13 @@ rx_i_319: ;ROL_64 jz rx_finish xor r13, 08ee02d99h mov ecx, r13d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_319 + call rx_read_l2 +rx_body_319: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 rol rax, cl mov rcx, rax @@ -4160,7 +6080,13 @@ rx_i_320: ;FPADD jz 
rx_finish xor r15, 013461188h mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_320 + call rx_read_l1 +rx_body_320: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm4 movaps xmm2, xmm0 mov eax, r10d @@ -4173,7 +6099,13 @@ rx_i_321: ;IMUL_32 jz rx_finish xor r11, 0a7bae383h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_321 + call rx_read_l1 +rx_body_321: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r9d imul rax, rcx @@ -4188,7 +6120,13 @@ rx_i_322: ;RET jz rx_finish xor r14, 08215399bh mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_322 + call rx_read_l1 +rx_body_322: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_322 xor rax, qword ptr [rsp + 8] @@ -4210,7 +6148,13 @@ rx_i_323: ;MULH_64 jz rx_finish xor r14, 07b07664bh mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_323 + call rx_read_l1 +rx_body_323: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, -696924877 mul rcx mov rax, rdx @@ -4225,7 +6169,13 @@ rx_i_324: ;FPSQRT jz rx_finish xor r9, 0f956baffh mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_324 + call rx_read_l1 +rx_body_324: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm9, xmm0 @@ -4239,7 +6189,13 @@ rx_i_325: ;SHL_64 jz rx_finish xor r11, 0708ab9d1h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_325 + call rx_read_l1 +rx_body_325: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] shl rax, 24 mov r13, rax @@ -4248,7 +6204,13 @@ rx_i_326: ;MULH_64 jz rx_finish xor r11, 0d1b27540h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_326 + call rx_read_l1 +rx_body_326: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 mul rcx mov rax, rdx @@ 
-4263,7 +6225,13 @@ rx_i_327: ;AND_64 jz rx_finish xor r9, 09665f98dh mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_327 + call rx_read_l1 +rx_body_327: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] and rax, r15 mov r12, rax @@ -4272,7 +6240,13 @@ rx_i_328: ;ROL_64 jz rx_finish xor r12, 0fb9c32adh mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_328 + call rx_read_l1 +rx_body_328: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r13 rol rax, cl mov r9, rax @@ -4282,7 +6256,13 @@ rx_i_329: ;RET jz rx_finish xor r11, 0e1110623h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_329 + call rx_read_l1 +rx_body_329: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_329 xor rax, qword ptr [rsp + 8] @@ -4296,7 +6276,13 @@ rx_i_330: ;MUL_32 jz rx_finish xor r9, 0f6a93f19h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_330 + call rx_read_l1 +rx_body_330: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r13d imul rax, rcx @@ -4311,7 +6297,13 @@ rx_i_331: ;FPADD jz rx_finish xor r9, 0bc9bbe4ah mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_331 + call rx_read_l1 +rx_body_331: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm9, xmm0 @@ -4320,7 +6312,13 @@ rx_i_332: ;FPADD jz rx_finish xor r12, 0f253cd4eh mov ecx, r12d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_332 + call rx_read_l1 +rx_body_332: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm3, xmm0 mov eax, r11d @@ -4333,7 +6331,13 @@ rx_i_333: ;XOR_64 jz rx_finish xor r14, 0f009758bh mov ecx, r14d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_333 + call rx_read_l2 +rx_body_333: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] xor rax, -175125848 mov r11, 
rax @@ -4342,7 +6346,13 @@ rx_i_334: ;ADD_32 jz rx_finish xor r8, 0dda04168h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_334 + call rx_read_l1 +rx_body_334: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add eax, r13d mov r8, rax @@ -4351,7 +6361,13 @@ rx_i_335: ;SUB_64 jz rx_finish xor r15, 03e6cfb73h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_335 + call rx_read_l1 +rx_body_335: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, r8 mov rcx, rax mov eax, r12d @@ -4364,7 +6380,13 @@ rx_i_336: ;FPADD jz rx_finish xor r15, 0aea0a435h mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_336 + call rx_read_l1 +rx_body_336: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 movaps xmm3, xmm0 @@ -4373,7 +6395,13 @@ rx_i_337: ;ADD_32 jz rx_finish xor r8, 03d6c4ab2h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_337 + call rx_read_l1 +rx_body_337: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add eax, r12d mov rcx, rax mov eax, r13d @@ -4386,7 +6414,13 @@ rx_i_338: ;MUL_64 jz rx_finish xor r12, 0d428a742h mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_338 + call rx_read_l2 +rx_body_338: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] imul rax, r12 mov r11, rax @@ -4395,7 +6429,13 @@ rx_i_339: ;FPADD jz rx_finish xor r9, 04596ef73h mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_339 + call rx_read_l1 +rx_body_339: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm2, xmm0 @@ -4404,7 +6444,13 @@ rx_i_340: ;FPSUB jz rx_finish xor r15, 0e51629cch mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_340 + call rx_read_l1 +rx_body_340: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm5 movaps xmm5, xmm0 @@ -4413,7 +6459,13 @@ rx_i_341: 
;MUL_32 jz rx_finish xor r12, 019eb9ea5h mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_341 + call rx_read_l1 +rx_body_341: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r15d imul rax, rcx @@ -4428,7 +6480,13 @@ rx_i_342: ;FPMUL jz rx_finish xor r9, 09ccc7abah mov ecx, r9d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_342 + call rx_read_l2 +rx_body_342: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4440,7 +6498,13 @@ rx_i_343: ;SHR_64 jz rx_finish xor r14, 056f6cf0bh mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_343 + call rx_read_l1 +rx_body_343: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] shr rax, 48 mov rcx, rax mov eax, r15d @@ -4453,7 +6517,13 @@ rx_i_344: ;FPMUL jz rx_finish xor r10, 03ef9bcc4h mov ecx, r10d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_344 + call rx_read_l2 +rx_body_344: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4465,7 +6535,13 @@ rx_i_345: ;MULH_64 jz rx_finish xor r12, 0bbbcdbach mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_345 + call rx_read_l1 +rx_body_345: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r13 mul rcx mov rax, rdx @@ -4480,7 +6556,13 @@ rx_i_346: ;XOR_64 jz rx_finish xor r12, 0ae9d1e96h mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_346 + call rx_read_l1 +rx_body_346: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] xor rax, r15 mov rcx, rax mov eax, r13d @@ -4493,7 +6575,13 @@ rx_i_347: ;ADD_64 jz rx_finish xor r14, 070c34d69h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_347 + call rx_read_l1 +rx_body_347: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, r10 mov r13, rax @@ -4502,7 +6590,13 
@@ rx_i_348: ;FPSUB jz rx_finish xor r13, 0523ff904h mov ecx, r13d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_348 + call rx_read_l1 +rx_body_348: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 movaps xmm9, xmm0 mov eax, r9d @@ -4515,7 +6609,13 @@ rx_i_349: ;XOR_32 jz rx_finish xor r8, 018e0e5ddh mov ecx, r8d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_349 + call rx_read_l2 +rx_body_349: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] xor eax, r15d mov r13, rax @@ -4524,7 +6624,13 @@ rx_i_350: ;CALL jz rx_finish xor r9, 09bd050f0h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_350 + call rx_read_l1 +rx_body_350: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r9d, -980411581 jbe short taken_call_350 mov rcx, rax @@ -4542,7 +6648,13 @@ rx_i_351: ;MUL_64 jz rx_finish xor r11, 0a3a5906fh mov ecx, r11d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_351 + call rx_read_l2 +rx_body_351: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] imul rax, r10 mov r13, rax @@ -4551,7 +6663,13 @@ rx_i_352: ;FPADD jz rx_finish xor r10, 0afc9af2bh mov ecx, r10d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_352 + call rx_read_l1 +rx_body_352: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm2, xmm0 mov eax, r10d @@ -4564,7 +6682,13 @@ rx_i_353: ;FPMUL jz rx_finish xor r13, 02e65278bh mov ecx, r13d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_353 + call rx_read_l1 +rx_body_353: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4580,7 +6704,13 @@ rx_i_354: ;MULH_64 jz rx_finish xor r13, 02412fc10h mov ecx, r13d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_354 + call rx_read_l2 +rx_body_354: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r13 mul rcx mov rax, 
rdx @@ -4591,7 +6721,13 @@ rx_i_355: ;MUL_64 jz rx_finish xor r10, 06bd6e65fh mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_355 + call rx_read_l1 +rx_body_355: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r14 mov rcx, rax mov eax, r8d @@ -4604,7 +6740,13 @@ rx_i_356: ;MUL_64 jz rx_finish xor r10, 01cd85d80h mov ecx, r10d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_356 + call rx_read_l2 +rx_body_356: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] imul rax, r10 mov r11, rax @@ -4613,7 +6755,13 @@ rx_i_357: ;ADD_64 jz rx_finish xor r10, 0f7daed36h mov ecx, r10d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_357 + call rx_read_l2 +rx_body_357: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] add rax, 820073637 mov r11, rax @@ -4622,7 +6770,13 @@ rx_i_358: ;DIV_64 jz rx_finish xor r13, 088fa6e5ah mov ecx, r13d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_358 + call rx_read_l2 +rx_body_358: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov ecx, 1 mov edx, r11d test edx, edx @@ -4636,7 +6790,13 @@ rx_i_359: ;FPSUB jz rx_finish xor r10, 0714fc2cdh mov ecx, r10d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_359 + call rx_read_l2 +rx_body_359: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm9 movaps xmm4, xmm0 mov eax, r12d @@ -4649,7 +6809,13 @@ rx_i_360: ;FPMUL jz rx_finish xor r10, 0c2d110b5h mov ecx, r10d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_360 + call rx_read_l1 +rx_body_360: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4661,7 +6827,13 @@ rx_i_361: ;FPSQRT jz rx_finish xor r15, 01d125a7fh mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_361 + call rx_read_l1 +rx_body_361: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 
sqrtpd xmm0, xmm0 movaps xmm6, xmm0 @@ -4675,7 +6847,13 @@ rx_i_362: ;SUB_64 jz rx_finish xor r9, 0ed8954bdh mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_362 + call rx_read_l1 +rx_body_362: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, 1082179469 mov rcx, rax mov eax, r15d @@ -4688,7 +6866,13 @@ rx_i_363: ;FPMUL jz rx_finish xor r12, 09f75887bh mov ecx, r12d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_363 + call rx_read_l1 +rx_body_363: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4700,7 +6884,13 @@ rx_i_364: ;MULH_64 jz rx_finish xor r11, 0badaf867h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_364 + call rx_read_l1 +rx_body_364: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 mul rcx mov rax, rdx @@ -4711,7 +6901,13 @@ rx_i_365: ;IMUL_32 jz rx_finish xor r15, 02db4444ah mov ecx, r15d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_365 + call rx_read_l2 +rx_body_365: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r9d imul rax, rcx @@ -4726,7 +6922,13 @@ rx_i_366: ;IMUL_32 jz rx_finish xor r12, 0bff7218fh mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_366 + call rx_read_l2 +rx_body_366: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r8d imul rax, rcx @@ -4741,7 +6943,13 @@ rx_i_367: ;FPADD jz rx_finish xor r9, 04d14cb3ah mov ecx, r9d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_367 + call rx_read_l2 +rx_body_367: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm4, xmm0 mov eax, r12d @@ -4754,7 +6962,13 @@ rx_i_368: ;MUL_64 jz rx_finish xor r10, 0a14836bah mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_368 + call rx_read_l1 +rx_body_368: + xor rdi, rcx + and ecx, 
2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r10 mov r8, rax @@ -4763,7 +6977,13 @@ rx_i_369: ;AND_64 jz rx_finish xor r9, 053fe22e2h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_369 + call rx_read_l1 +rx_body_369: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] and rax, r13 mov r9, rax @@ -4772,7 +6992,13 @@ rx_i_370: ;FPSUB jz rx_finish xor r15, 010e1fb24h mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_370 + call rx_read_l1 +rx_body_370: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm6 movaps xmm6, xmm0 mov eax, r14d @@ -4785,7 +7011,13 @@ rx_i_371: ;FPADD jz rx_finish xor r8, 0ebbd5cc9h mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_371 + call rx_read_l1 +rx_body_371: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm5, xmm0 mov eax, r13d @@ -4798,7 +7030,13 @@ rx_i_372: ;ROL_64 jz rx_finish xor r10, 098ab79d7h mov ecx, r10d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_372 + call rx_read_l2 +rx_body_372: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r13 rol rax, cl mov r9, rax @@ -4808,7 +7046,13 @@ rx_i_373: ;FPDIV jz rx_finish xor r15, 056438b3h mov ecx, r15d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_373 + call rx_read_l2 +rx_body_373: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4820,7 +7064,13 @@ rx_i_374: ;FPMUL jz rx_finish xor r11, 0dbcce604h mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_374 + call rx_read_l1 +rx_body_374: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4832,7 +7082,13 @@ rx_i_375: ;ADD_64 jz rx_finish xor r9, 0edea6200h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_375 + call rx_read_l1 +rx_body_375: 
+ xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, r15 mov rcx, rax mov eax, r12d @@ -4845,7 +7101,13 @@ rx_i_376: ;ADD_64 jz rx_finish xor r14, 05e61b279h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_376 + call rx_read_l1 +rx_body_376: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, 476136066 mov rcx, rax mov eax, r8d @@ -4858,7 +7120,13 @@ rx_i_377: ;FPSUB jz rx_finish xor r14, 0fc1fb433h mov ecx, r14d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_377 + call rx_read_l1 +rx_body_377: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 movaps xmm7, xmm0 @@ -4867,7 +7135,13 @@ rx_i_378: ;MUL_32 jz rx_finish xor r12, 082aa21ach mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_378 + call rx_read_l1 +rx_body_378: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, 547725353 imul rax, rcx @@ -4878,7 +7152,13 @@ rx_i_379: ;FPADD jz rx_finish xor r10, 05dba41fbh mov ecx, r10d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_379 + call rx_read_l2 +rx_body_379: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm5, xmm0 mov eax, r13d @@ -4891,7 +7171,13 @@ rx_i_380: ;MUL_64 jz rx_finish xor r11, 0229e3d6eh mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_380 + call rx_read_l1 +rx_body_380: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, rax, -1443002912 mov rcx, rax mov eax, r13d @@ -4904,7 +7190,13 @@ rx_i_381: ;SAR_64 jz rx_finish xor r8, 019816ff9h mov ecx, r8d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_381 + call rx_read_l2 +rx_body_381: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 sar rax, cl mov r9, rax @@ -4914,7 +7206,13 @@ rx_i_382: ;FPADD jz rx_finish xor r14, 036b5b81fh mov ecx, r14d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_382 + 
call rx_read_l1 +rx_body_382: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm3, xmm0 mov eax, r11d @@ -4927,7 +7225,13 @@ rx_i_383: ;FPSUB jz rx_finish xor r15, 05f798ec3h mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_383 + call rx_read_l1 +rx_body_383: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm4 movaps xmm5, xmm0 mov eax, r13d @@ -4940,7 +7244,13 @@ rx_i_384: ;SHR_64 jz rx_finish xor r10, 05b459fd7h mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_384 + call rx_read_l1 +rx_body_384: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r11 shr rax, cl mov rcx, rax @@ -4954,7 +7264,13 @@ rx_i_385: ;MUL_64 jz rx_finish xor r15, 0c91749bbh mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_385 + call rx_read_l1 +rx_body_385: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r12 mov rcx, rax mov eax, r13d @@ -4967,7 +7283,13 @@ rx_i_386: ;FPADD jz rx_finish xor r9, 0575b4bdch mov ecx, r9d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_386 + call rx_read_l2 +rx_body_386: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm8 movaps xmm9, xmm0 @@ -4976,7 +7298,13 @@ rx_i_387: ;MUL_64 jz rx_finish xor r9, 0d4f7bc6ah mov ecx, r9d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_387 + call rx_read_l2 +rx_body_387: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov r9, rax @@ -4985,7 +7313,13 @@ rx_i_388: ;RET jz rx_finish xor r8, 08a949356h mov ecx, r8d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_388 + call rx_read_l2 +rx_body_388: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_388 xor rax, qword ptr [rsp + 8] @@ -5007,7 +7341,13 @@ rx_i_389: ;CALL jz rx_finish xor r11, 06531ad2eh mov ecx, r11d - call rx_readint_l1 + test ebp, 
63 + jnz short rx_body_389 + call rx_read_l1 +rx_body_389: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r9d, -350609584 jge short taken_call_389 mov r14, rax @@ -5021,7 +7361,13 @@ rx_i_390: ;FPADD jz rx_finish xor r15, 02914abeah mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_390 + call rx_read_l1 +rx_body_390: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm4 movaps xmm3, xmm0 @@ -5030,7 +7376,13 @@ rx_i_391: ;FPADD jz rx_finish xor r8, 0473a41f0h mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_391 + call rx_read_l1 +rx_body_391: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm6, xmm0 @@ -5039,7 +7391,13 @@ rx_i_392: ;ROR_64 jz rx_finish xor r14, 01ebc1f0dh mov ecx, r14d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_392 + call rx_read_l2 +rx_body_392: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] ror rax, 0 mov rcx, rax mov eax, r13d @@ -5052,7 +7410,13 @@ rx_i_393: ;OR_32 jz rx_finish xor r14, 0742e95b1h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_393 + call rx_read_l1 +rx_body_393: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] or eax, 552339548 mov rcx, rax mov eax, r13d @@ -5065,7 +7429,13 @@ rx_i_394: ;FPADD jz rx_finish xor r12, 0db885c2ch mov ecx, r12d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_394 + call rx_read_l2 +rx_body_394: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm6, xmm0 @@ -5074,7 +7444,13 @@ rx_i_395: ;IDIV_64 jz rx_finish xor r8, 04ae4fe8ch mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_395 + call rx_read_l1 +rx_body_395: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov edx, r13d cmp edx, -1 jne short safe_idiv_395 @@ -5097,7 +7473,13 @@ rx_i_396: ;FPADD jz rx_finish xor r10, 07b41862bh mov ecx, r10d - call 
rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_396 + call rx_read_l1 +rx_body_396: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm7 movaps xmm4, xmm0 @@ -5106,7 +7488,13 @@ rx_i_397: ;MUL_64 jz rx_finish xor r8, 0916f3819h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_397 + call rx_read_l1 +rx_body_397: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r12 mov rcx, rax mov eax, r10d @@ -5119,7 +7507,13 @@ rx_i_398: ;ROL_64 jz rx_finish xor r8, 04eb6fd2ah mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_398 + call rx_read_l1 +rx_body_398: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] rol rax, 44 mov rcx, rax mov eax, r11d @@ -5132,7 +7526,13 @@ rx_i_399: ;FPDIV jz rx_finish xor r11, 0899a98cfh mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_399 + call rx_read_l1 +rx_body_399: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5144,7 +7544,13 @@ rx_i_400: ;OR_32 jz rx_finish xor r13, 0aae75db6h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_400 + call rx_read_l1 +rx_body_400: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] or eax, r11d mov rcx, rax mov eax, r14d @@ -5157,7 +7563,13 @@ rx_i_401: ;FPMUL jz rx_finish xor r13, 032e81f25h mov ecx, r13d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_401 + call rx_read_l1 +rx_body_401: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5173,7 +7585,13 @@ rx_i_402: ;RET jz rx_finish xor r9, 0fa1a07ffh mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_402 + call rx_read_l1 +rx_body_402: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_402 xor rax, qword ptr [rsp + 8] @@ -5187,7 +7605,13 @@ rx_i_403: ;IDIV_64 
jz rx_finish xor r9, 0e59500f7h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_403 + call rx_read_l1 +rx_body_403: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov edx, r12d cmp edx, -1 jne short safe_idiv_403 @@ -5214,7 +7638,13 @@ rx_i_404: ;MUL_32 jz rx_finish xor r15, 05b8ceb2fh mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_404 + call rx_read_l1 +rx_body_404: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r8d imul rax, rcx @@ -5225,7 +7655,13 @@ rx_i_405: ;RET jz rx_finish xor r8, 0f61082a3h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_405 + call rx_read_l1 +rx_body_405: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_405 xor rax, qword ptr [rsp + 8] @@ -5247,7 +7683,13 @@ rx_i_406: ;FPROUND jz rx_finish xor r9, 0af6886b7h mov ecx, r9d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_406 + call rx_read_l2 +rx_body_406: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] shl eax, 13 and eax, 24576 or eax, 40896 @@ -5259,7 +7701,13 @@ rx_i_407: ;FPMUL jz rx_finish xor r14, 09699566fh mov ecx, r14d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_407 + call rx_read_l2 +rx_body_407: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5271,7 +7719,13 @@ rx_i_408: ;MUL_64 jz rx_finish xor r15, 066e79fa6h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_408 + call rx_read_l1 +rx_body_408: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r9 mov rcx, rax mov eax, r10d @@ -5284,7 +7738,13 @@ rx_i_409: ;MUL_64 jz rx_finish xor r11, 04b6caa9ah mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_409 + call rx_read_l1 +rx_body_409: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov r8, rax @@ -5293,7 
+7753,13 @@ rx_i_410: ;RET jz rx_finish xor r15, 0d17f245eh mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_410 + call rx_read_l1 +rx_body_410: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_410 xor rax, qword ptr [rsp + 8] @@ -5307,7 +7773,13 @@ rx_i_411: ;RET jz rx_finish xor r12, 0364f10e7h mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_411 + call rx_read_l1 +rx_body_411: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_411 xor rax, qword ptr [rsp + 8] @@ -5321,7 +7793,13 @@ rx_i_412: ;FPSQRT jz rx_finish xor r10, 0ac90e7ah mov ecx, r10d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_412 + call rx_read_l1 +rx_body_412: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm3, xmm0 @@ -5335,7 +7813,13 @@ rx_i_413: ;FPDIV jz rx_finish xor r11, 04b6037abh mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_413 + call rx_read_l1 +rx_body_413: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5347,7 +7831,13 @@ rx_i_414: ;OR_64 jz rx_finish xor r14, 06c01554dh mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_414 + call rx_read_l1 +rx_body_414: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] or rax, r8 mov rcx, rax mov eax, r10d @@ -5360,7 +7850,13 @@ rx_i_415: ;DIV_64 jz rx_finish xor r8, 08c3e59a1h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_415 + call rx_read_l1 +rx_body_415: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, -538093385 xor edx, edx div rcx @@ -5371,7 +7867,13 @@ rx_i_416: ;FPSUB jz rx_finish xor r12, 0f3fafde9h mov ecx, r12d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_416 + call rx_read_l1 +rx_body_416: + xor rdi, rcx + and ecx, 2047 + 
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 movaps xmm5, xmm0 mov eax, r13d @@ -5384,7 +7886,13 @@ rx_i_417: ;SUB_64 jz rx_finish xor r10, 03c6481fah mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_417 + call rx_read_l1 +rx_body_417: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, r12 mov r10, rax @@ -5393,7 +7901,13 @@ rx_i_418: ;MULH_64 jz rx_finish xor r10, 02bd61c5fh mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_418 + call rx_read_l1 +rx_body_418: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r11 mul rcx mov rax, rdx @@ -5404,7 +7918,13 @@ rx_i_419: ;XOR_64 jz rx_finish xor r9, 0b6ab9d32h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_419 + call rx_read_l1 +rx_body_419: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] xor rax, r14 mov rcx, rax mov eax, r14d @@ -5417,7 +7937,13 @@ rx_i_420: ;FPADD jz rx_finish xor r9, 0f9690ceah mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_420 + call rx_read_l1 +rx_body_420: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm9, xmm0 mov eax, r9d @@ -5430,7 +7956,13 @@ rx_i_421: ;RET jz rx_finish xor r12, 01ada0f39h mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_421 + call rx_read_l2 +rx_body_421: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_421 xor rax, qword ptr [rsp + 8] @@ -5444,7 +7976,13 @@ rx_i_422: ;IMUL_32 jz rx_finish xor r11, 04dd16ca4h mov ecx, r11d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_422 + call rx_read_l2 +rx_body_422: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r10d imul rax, rcx @@ -5455,7 +7993,13 @@ rx_i_423: ;MUL_64 jz rx_finish xor r12, 04df5ce05h mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_423 + call rx_read_l1 +rx_body_423: + 
xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r10 mov rcx, rax mov eax, r15d @@ -5468,7 +8012,13 @@ rx_i_424: ;FPADD jz rx_finish xor r13, 01ad12ce2h mov ecx, r13d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_424 + call rx_read_l2 +rx_body_424: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm7 movaps xmm9, xmm0 mov eax, r9d @@ -5481,7 +8031,13 @@ rx_i_425: ;IMUL_32 jz rx_finish xor r8, 0a3c5391dh mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_425 + call rx_read_l1 +rx_body_425: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r10d imul rax, rcx @@ -5492,7 +8048,13 @@ rx_i_426: ;AND_64 jz rx_finish xor r12, 09dd55ba0h mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_426 + call rx_read_l2 +rx_body_426: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] and rax, r9 mov rcx, rax mov eax, r14d @@ -5505,7 +8067,13 @@ rx_i_427: ;MUL_32 jz rx_finish xor r11, 0d6cae9aeh mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_427 + call rx_read_l1 +rx_body_427: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r11d imul rax, rcx @@ -5520,7 +8088,13 @@ rx_i_428: ;RET jz rx_finish xor r11, 0f807a961h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_428 + call rx_read_l1 +rx_body_428: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_428 xor rax, qword ptr [rsp + 8] @@ -5542,7 +8116,13 @@ rx_i_429: ;MUL_64 jz rx_finish xor r12, 0650a4102h mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_429 + call rx_read_l2 +rx_body_429: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] imul rax, rax, 1990438276 mov r15, rax @@ -5551,7 +8131,13 @@ rx_i_430: ;FPADD jz rx_finish xor r14, 019cc0e5h mov ecx, r14d - call rx_readfloat_l1 + test ebp, 63 + jnz short 
rx_body_430 + call rx_read_l1 +rx_body_430: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm8 movaps xmm5, xmm0 mov eax, r13d @@ -5564,7 +8150,13 @@ rx_i_431: ;FPADD jz rx_finish xor r12, 0ed17ab58h mov ecx, r12d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_431 + call rx_read_l1 +rx_body_431: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm5, xmm0 mov eax, r13d @@ -5577,7 +8169,13 @@ rx_i_432: ;SUB_64 jz rx_finish xor r10, 01c3b321fh mov ecx, r10d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_432 + call rx_read_l2 +rx_body_432: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] sub rax, r10 mov r8, rax @@ -5586,7 +8184,13 @@ rx_i_433: ;ADD_32 jz rx_finish xor r13, 0bbb88499h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_433 + call rx_read_l1 +rx_body_433: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add eax, r12d mov rcx, rax mov eax, r12d @@ -5599,7 +8203,13 @@ rx_i_434: ;FPSQRT jz rx_finish xor r13, 0167edabdh mov ecx, r13d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_434 + call rx_read_l2 +rx_body_434: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm0, xmm0 movaps xmm9, xmm0 @@ -5613,7 +8223,13 @@ rx_i_435: ;MUL_64 jz rx_finish xor r15, 0b940480ah mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_435 + call rx_read_l1 +rx_body_435: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov rcx, rax mov eax, r9d @@ -5626,7 +8242,13 @@ rx_i_436: ;FPADD jz rx_finish xor r15, 0bfc3ca8bh mov ecx, r15d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_436 + call rx_read_l2 +rx_body_436: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 movaps xmm7, xmm0 mov eax, r15d @@ -5639,7 +8261,13 @@ rx_i_437: ;FPDIV jz rx_finish xor r8, 098a6bcf7h mov ecx, r8d - 
call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_437 + call rx_read_l1 +rx_body_437: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5651,7 +8279,13 @@ rx_i_438: ;FPMUL jz rx_finish xor r10, 0325b38ebh mov ecx, r10d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_438 + call rx_read_l1 +rx_body_438: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5663,7 +8297,13 @@ rx_i_439: ;XOR_32 jz rx_finish xor r13, 05e807e81h mov ecx, r13d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_439 + call rx_read_l2 +rx_body_439: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] xor eax, r15d mov rcx, rax mov eax, r10d @@ -5676,7 +8316,13 @@ rx_i_440: ;RET jz rx_finish xor r10, 062f83728h mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_440 + call rx_read_l1 +rx_body_440: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_440 xor rax, qword ptr [rsp + 8] @@ -5690,7 +8336,13 @@ rx_i_441: ;ADD_64 jz rx_finish xor r14, 0d18ec075h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_441 + call rx_read_l1 +rx_body_441: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, 529736748 mov rcx, rax mov eax, r9d @@ -5703,7 +8355,13 @@ rx_i_442: ;CALL jz rx_finish xor r14, 0a53dd1bh mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_442 + call rx_read_l1 +rx_body_442: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r15d, 799523062 jbe short taken_call_442 mov rcx, rax @@ -5721,7 +8379,13 @@ rx_i_443: ;RET jz rx_finish xor r14, 0232d1285h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_443 + call rx_read_l1 +rx_body_443: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_443 xor rax, 
qword ptr [rsp + 8] @@ -5743,7 +8407,13 @@ rx_i_444: ;FPMUL jz rx_finish xor r8, 042455dd8h mov ecx, r8d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_444 + call rx_read_l2 +rx_body_444: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm7 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5759,7 +8429,13 @@ rx_i_445: ;ADD_64 jz rx_finish xor r13, 09ae009b2h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_445 + call rx_read_l1 +rx_body_445: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, r11 mov rcx, rax mov eax, r9d @@ -5772,7 +8448,13 @@ rx_i_446: ;MUL_32 jz rx_finish xor r12, 01734708eh mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_446 + call rx_read_l1 +rx_body_446: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r15d imul rax, rcx @@ -5787,7 +8469,13 @@ rx_i_447: ;FPSUB jz rx_finish xor r8, 01596d0e8h mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_447 + call rx_read_l1 +rx_body_447: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm7 movaps xmm5, xmm0 mov eax, r13d @@ -5800,7 +8488,13 @@ rx_i_448: ;FPSUB jz rx_finish xor r9, 0390cfdb0h mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_448 + call rx_read_l1 +rx_body_448: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 movaps xmm9, xmm0 @@ -5809,7 +8503,13 @@ rx_i_449: ;ROR_64 jz rx_finish xor r8, 04f27744bh mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_449 + call rx_read_l1 +rx_body_449: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] ror rax, 28 mov r8, rax @@ -5818,7 +8518,13 @@ rx_i_450: ;ROL_64 jz rx_finish xor r8, 04e2c76ffh mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_450 + call rx_read_l1 +rx_body_450: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r12 rol 
rax, cl mov rcx, rax @@ -5832,7 +8538,13 @@ rx_i_451: ;ADD_64 jz rx_finish xor r8, 0c4d99ac9h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_451 + call rx_read_l1 +rx_body_451: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, -287502157 mov r8, rax @@ -5841,7 +8553,13 @@ rx_i_452: ;RET jz rx_finish xor r13, 040130b88h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_452 + call rx_read_l1 +rx_body_452: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_452 xor rax, qword ptr [rsp + 8] @@ -5863,7 +8581,13 @@ rx_i_453: ;IMULH_64 jz rx_finish xor r11, 0a2096aa4h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_453 + call rx_read_l1 +rx_body_453: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 imul rcx mov rax, rdx @@ -5874,7 +8598,13 @@ rx_i_454: ;FPADD jz rx_finish xor r13, 081314291h mov ecx, r13d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_454 + call rx_read_l1 +rx_body_454: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm4, xmm0 mov eax, r12d @@ -5887,7 +8617,13 @@ rx_i_455: ;XOR_64 jz rx_finish xor r8, 059263cdbh mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_455 + call rx_read_l1 +rx_body_455: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] xor rax, r9 mov r8, rax @@ -5896,7 +8632,13 @@ rx_i_456: ;OR_32 jz rx_finish xor r9, 010e8fe6h mov ecx, r9d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_456 + call rx_read_l2 +rx_body_456: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] or eax, r11d mov rcx, rax mov eax, r9d @@ -5909,7 +8651,13 @@ rx_i_457: ;SUB_64 jz rx_finish xor r9, 09de1a3efh mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_457 + call rx_read_l1 +rx_body_457: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, r10 mov rcx, rax 
mov eax, r10d @@ -5922,7 +8670,13 @@ rx_i_458: ;ROL_64 jz rx_finish xor r11, 05c79df6eh mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_458 + call rx_read_l1 +rx_body_458: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] rol rax, 22 mov r14, rax @@ -5931,7 +8685,13 @@ rx_i_459: ;MUL_64 jz rx_finish xor r9, 0346f46adh mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_459 + call rx_read_l1 +rx_body_459: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, rax, 381354340 mov rcx, rax mov eax, r13d @@ -5944,7 +8704,13 @@ rx_i_460: ;SUB_64 jz rx_finish xor r11, 098ab71fch mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_460 + call rx_read_l1 +rx_body_460: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, r14 mov rcx, rax mov eax, r12d @@ -5957,7 +8723,13 @@ rx_i_461: ;SHR_64 jz rx_finish xor r11, 0c814e926h mov ecx, r11d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_461 + call rx_read_l2 +rx_body_461: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r13 shr rax, cl mov rcx, rax @@ -5971,7 +8743,13 @@ rx_i_462: ;ADD_64 jz rx_finish xor r10, 0c64b4a9eh mov ecx, r10d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_462 + call rx_read_l2 +rx_body_462: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] add rax, -1734323376 mov r15, rax @@ -5980,7 +8758,13 @@ rx_i_463: ;SUB_64 jz rx_finish xor r9, 08c29341h mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_463 + call rx_read_l1 +rx_body_463: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, r15 mov r10, rax @@ -5989,7 +8773,13 @@ rx_i_464: ;MUL_64 jz rx_finish xor r12, 06ff587fdh mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_464 + call rx_read_l1 +rx_body_464: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov rcx, rax mov eax, r13d @@ -6002,7 +8792,13 @@ 
rx_i_465: ;FPADD jz rx_finish xor r12, 0b62c0003h mov ecx, r12d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_465 + call rx_read_l2 +rx_body_465: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm2, xmm0 @@ -6011,7 +8807,13 @@ rx_i_466: ;IMUL_32 jz rx_finish xor r13, 05c541c42h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_466 + call rx_read_l1 +rx_body_466: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax mov rax, 282682508 imul rax, rcx @@ -6022,7 +8824,13 @@ rx_i_467: ;FPADD jz rx_finish xor r8, 0cbb33f81h mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_467 + call rx_read_l1 +rx_body_467: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm8, xmm0 @@ -6031,7 +8839,13 @@ rx_i_468: ;IDIV_64 jz rx_finish xor r8, 091044dc3h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_468 + call rx_read_l1 +rx_body_468: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov edx, -13394825 cmp edx, -1 jne short safe_idiv_468 @@ -6058,7 +8872,13 @@ rx_i_469: ;MUL_32 jz rx_finish xor r9, 0c0186beh mov ecx, r9d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_469 + call rx_read_l1 +rx_body_469: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, 294019485 imul rax, rcx @@ -6073,7 +8893,13 @@ rx_i_470: ;XOR_32 jz rx_finish xor r14, 090849e3eh mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_470 + call rx_read_l1 +rx_body_470: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] xor eax, r11d mov rcx, rax mov eax, r14d @@ -6086,7 +8912,13 @@ rx_i_471: ;IMUL_32 jz rx_finish xor r14, 0cedba9b6h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_471 + call rx_read_l1 +rx_body_471: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r13d imul rax, 
rcx @@ -6097,7 +8929,13 @@ rx_i_472: ;CALL jz rx_finish xor r9, 038f4b9d6h mov ecx, r9d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_472 + call rx_read_l2 +rx_body_472: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp r10d, 1738497427 jl short taken_call_472 mov r10, rax @@ -6111,7 +8949,13 @@ rx_i_473: ;MUL_64 jz rx_finish xor r14, 01fb7637dh mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_473 + call rx_read_l1 +rx_body_473: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, rax, -751043211 mov r12, rax @@ -6120,7 +8964,13 @@ rx_i_474: ;CALL jz rx_finish xor r9, 0b5c0b4d4h mov ecx, r9d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_474 + call rx_read_l2 +rx_body_474: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp r15d, -233120543 jo short taken_call_474 mov r15, rax @@ -6134,7 +8984,13 @@ rx_i_475: ;FPSUB jz rx_finish xor r10, 0910dcdeeh mov ecx, r10d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_475 + call rx_read_l2 +rx_body_475: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm9 movaps xmm7, xmm0 @@ -6143,7 +8999,13 @@ rx_i_476: ;FPSUB jz rx_finish xor r8, 07ab3b5a4h mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_476 + call rx_read_l1 +rx_body_476: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm9, xmm0 @@ -6152,7 +9014,13 @@ rx_i_477: ;FPADD jz rx_finish xor r12, 07a29ec63h mov ecx, r12d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_477 + call rx_read_l1 +rx_body_477: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm6, xmm0 mov eax, r14d @@ -6165,7 +9033,13 @@ rx_i_478: ;MUL_64 jz rx_finish xor r14, 02d3d7e7fh mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_478 + call rx_read_l1 +rx_body_478: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] 
imul rax, r10 mov r12, rax @@ -6174,7 +9048,13 @@ rx_i_479: ;MUL_64 jz rx_finish xor r12, 09b49c793h mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_479 + call rx_read_l1 +rx_body_479: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] imul rax, r14 mov rcx, rax mov eax, r13d @@ -6187,7 +9067,13 @@ rx_i_480: ;FPSUB jz rx_finish xor r9, 0a9cc4f01h mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_480 + call rx_read_l1 +rx_body_480: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm4 movaps xmm6, xmm0 @@ -6196,7 +9082,13 @@ rx_i_481: ;DIV_64 jz rx_finish xor r14, 0225ba1f9h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_481 + call rx_read_l1 +rx_body_481: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov ecx, 1 mov edx, r13d test edx, edx @@ -6210,7 +9102,13 @@ rx_i_482: ;XOR_64 jz rx_finish xor r14, 044a0f592h mov ecx, r14d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_482 + call rx_read_l2 +rx_body_482: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] xor rax, r12 mov r11, rax @@ -6219,7 +9117,13 @@ rx_i_483: ;FPADD jz rx_finish xor r11, 07f71f219h mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_483 + call rx_read_l1 +rx_body_483: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm6, xmm0 @@ -6228,7 +9132,13 @@ rx_i_484: ;ROL_64 jz rx_finish xor r12, 07027bacdh mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_484 + call rx_read_l1 +rx_body_484: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] rol rax, 37 mov r11, rax @@ -6237,7 +9147,13 @@ rx_i_485: ;CALL jz rx_finish xor r13, 03a04647h mov ecx, r13d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_485 + call rx_read_l2 +rx_body_485: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] cmp r8d, 554879918 jno short taken_call_485 mov rcx, 
rax @@ -6255,7 +9171,13 @@ rx_i_486: ;ADD_64 jz rx_finish xor r15, 0ad072937h mov ecx, r15d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_486 + call rx_read_l1 +rx_body_486: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] add rax, 942846898 mov rcx, rax mov eax, r8d @@ -6268,7 +9190,13 @@ rx_i_487: ;SUB_64 jz rx_finish xor r11, 07f78ad34h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_487 + call rx_read_l1 +rx_body_487: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] sub rax, -333279706 mov r11, rax @@ -6277,7 +9205,13 @@ rx_i_488: ;IMULH_64 jz rx_finish xor r12, 0d8b1788eh mov ecx, r12d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_488 + call rx_read_l1 +rx_body_488: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, 297357073 imul rcx mov rax, rdx @@ -6288,7 +9222,13 @@ rx_i_489: ;CALL jz rx_finish xor r10, 0b2ec9f3ah mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_489 + call rx_read_l1 +rx_body_489: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r15d, -1127175870 jge short taken_call_489 mov rcx, rax @@ -6306,7 +9246,13 @@ rx_i_490: ;FPADD jz rx_finish xor r11, 015c7f598h mov ecx, r11d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_490 + call rx_read_l2 +rx_body_490: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm7, xmm0 @@ -6315,7 +9261,13 @@ rx_i_491: ;FPADD jz rx_finish xor r8, 0902da6bdh mov ecx, r8d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_491 + call rx_read_l2 +rx_body_491: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm7, xmm0 mov eax, r15d @@ -6328,7 +9280,13 @@ rx_i_492: ;OR_64 jz rx_finish xor r9, 0491090d9h mov ecx, r9d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_492 + call rx_read_l2 +rx_body_492: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] or rax, r9 
mov r12, rax @@ -6337,7 +9295,13 @@ rx_i_493: ;FPSUB jz rx_finish xor r8, 09de81282h mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_493 + call rx_read_l1 +rx_body_493: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm9 movaps xmm4, xmm0 @@ -6346,7 +9310,13 @@ rx_i_494: ;MUL_32 jz rx_finish xor r10, 0b0d50e46h mov ecx, r10d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_494 + call rx_read_l2 +rx_body_494: + xor rdi, rcx + and ecx, 32767 + mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r11d imul rax, rcx @@ -6357,7 +9327,13 @@ rx_i_495: ;FPMUL jz rx_finish xor r11, 0e276cad1h mov ecx, r11d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_495 + call rx_read_l1 +rx_body_495: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6369,7 +9345,13 @@ rx_i_496: ;OR_64 jz rx_finish xor r14, 0fe757b73h mov ecx, r14d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_496 + call rx_read_l1 +rx_body_496: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] or rax, -359802064 mov r9, rax @@ -6378,7 +9360,13 @@ rx_i_497: ;FPDIV jz rx_finish xor r8, 08d25742eh mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_497 + call rx_read_l1 +rx_body_497: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6390,7 +9378,13 @@ rx_i_498: ;FPMUL jz rx_finish xor r15, 0e066fd15h mov ecx, r15d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_498 + call rx_read_l1 +rx_body_498: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6406,7 +9400,13 @@ rx_i_499: ;IMUL_32 jz rx_finish xor r12, 08925556bh mov ecx, r12d - call rx_readint_l2 + test ebp, 63 + jnz short rx_body_499 + call rx_read_l2 +rx_body_499: + xor rdi, rcx + and ecx, 32767 + mov rax, qword 
ptr [rsi+rcx*8] movsxd rcx, eax mov rax, -1795485757 imul rax, rcx @@ -6417,7 +9417,13 @@ rx_i_500: ;CALL jz rx_finish xor r10, 04bc870ebh mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_500 + call rx_read_l1 +rx_body_500: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r13d, 1243939650 jl short taken_call_500 mov rcx, rax @@ -6435,7 +9441,13 @@ rx_i_501: ;SHR_64 jz rx_finish xor r8, 07d46c503h mov ecx, r8d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_501 + call rx_read_l1 +rx_body_501: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 shr rax, cl mov rcx, rax @@ -6449,7 +9461,13 @@ rx_i_502: ;RET jz rx_finish xor r10, 09e70b20ch mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_502 + call rx_read_l1 +rx_body_502: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_502 xor rax, qword ptr [rsp + 8] @@ -6471,7 +9489,13 @@ rx_i_503: ;FPSUB jz rx_finish xor r13, 0442e4850h mov ecx, r13d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_503 + call rx_read_l1 +rx_body_503: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm9, xmm0 mov eax, r9d @@ -6484,7 +9508,13 @@ rx_i_504: ;FPADD jz rx_finish xor r13, 099d48347h mov ecx, r13d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_504 + call rx_read_l1 +rx_body_504: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm4, xmm0 mov eax, r12d @@ -6497,7 +9527,13 @@ rx_i_505: ;FPMUL jz rx_finish xor r12, 032c0a28ah mov ecx, r12d - call rx_readfloat_l2 + test ebp, 63 + jnz short rx_body_505 + call rx_read_l2 +rx_body_505: + xor rdi, rcx + and ecx, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6513,7 +9549,13 @@ rx_i_506: ;FPMUL jz rx_finish xor r9, 0a973d58ch mov ecx, r9d - call rx_readfloat_l1 + test ebp, 63 + jnz short 
rx_body_506 + call rx_read_l1 +rx_body_506: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6525,7 +9567,13 @@ rx_i_507: ;RET jz rx_finish xor r10, 0d3b7165ch mov ecx, r10d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_507 + call rx_read_l1 +rx_body_507: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_507 xor rax, qword ptr [rsp + 8] @@ -6539,7 +9587,13 @@ rx_i_508: ;RET jz rx_finish xor r13, 0da34d818h mov ecx, r13d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_508 + call rx_read_l1 +rx_body_508: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp rsp, rbx je short not_taken_ret_508 xor rax, qword ptr [rsp + 8] @@ -6553,7 +9607,13 @@ rx_i_509: ;CALL jz rx_finish xor r11, 01b2873f2h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_509 + call rx_read_l1 +rx_body_509: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] cmp r8d, 1826115244 jno short taken_call_509 mov r10, rax @@ -6567,7 +9627,13 @@ rx_i_510: ;FPSUB jz rx_finish xor r8, 0db65513ch mov ecx, r8d - call rx_readfloat_l1 + test ebp, 63 + jnz short rx_body_510 + call rx_read_l1 +rx_body_510: + xor rdi, rcx + and ecx, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm9, xmm0 @@ -6576,7 +9642,13 @@ rx_i_511: ;ROL_64 jz rx_finish xor r11, 02bd79286h mov ecx, r11d - call rx_readint_l1 + test ebp, 63 + jnz short rx_body_511 + call rx_read_l1 +rx_body_511: + xor rdi, rcx + and ecx, 2047 + mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 rol rax, cl mov r11, rax From b6d654291f45a2707d3a226476cbc8572966f3dd Mon Sep 17 00:00:00 2001 From: tevador Date: Tue, 8 Jan 2019 12:19:19 +0100 Subject: [PATCH 05/35] 90 address transformations --- src/AssemblyGeneratorX86.cpp | 28 ++--- src/asm/program_transform_address.inc | 154 ++++++++++++++++++++++++++ 2 files changed, 168 insertions(+), 14 deletions(-) create 
mode 100644 src/asm/program_transform_address.inc diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 8a11ac3..7cac04d 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -28,7 +28,7 @@ namespace RandomX { static const char* regR32[8] = { "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d" }; static const char* regF[8] = { "xmm8", "xmm9", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" }; - static const char* regMx = "edi"; + static const char* regMx = "rdi"; static const char* regIc = "ebp"; static const char* regStackBeginAddr = "rbx"; static const char* regScratchpadAddr = "rsi"; @@ -62,7 +62,7 @@ namespace RandomX { void AssemblyGeneratorX86::genar(Instruction& instr, int i) { asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; - asmCode << "\ttest ebp, 63" << std::endl; + asmCode << "\ttest " << regIc << ", 63" << std::endl; asmCode << "\tjnz short rx_body_" << i << std::endl; switch (instr.loca & 3) { @@ -71,24 +71,24 @@ namespace RandomX { case 2: asmCode << "\tcall rx_read_l1" << std::endl; asmCode << "rx_body_" << i << ":" << std::endl; - asmCode << "\txor rdi, rcx" << std::endl; + asmCode << "\txor " << regMx << ", rcx" << std::endl; asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl; break; default: //3 asmCode << "\tcall rx_read_l2" << std::endl; asmCode << "rx_body_" << i << ":" << std::endl; - asmCode << "\txor rdi, rcx" << std::endl; + asmCode << "\txor " << regMx << ", rcx" << std::endl; asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl; break; } - asmCode << "\tmov rax, qword ptr [rsi+rcx*8]" << std::endl; + asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl; } void AssemblyGeneratorX86::genaf(Instruction& instr, int i) { asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" 
<< std::hex << instr.addra << "h" << std::dec << std::endl; asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; - asmCode << "\ttest ebp, 63" << std::endl; + asmCode << "\ttest " << regIc << ", 63" << std::endl; asmCode << "\tjnz short rx_body_" << i << std::endl; switch (instr.loca & 3) { @@ -97,17 +97,17 @@ namespace RandomX { case 2: asmCode << "\tcall rx_read_l1" << std::endl; asmCode << "rx_body_" << i << ":" << std::endl; - asmCode << "\txor rdi, rcx" << std::endl; + asmCode << "\txor " << regMx << ", rcx" << std::endl; asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl; break; default: //3 asmCode << "\tcall rx_read_l2" << std::endl; asmCode << "rx_body_" << i << ":" << std::endl; - asmCode << "\txor rdi, rcx" << std::endl; + asmCode << "\txor " << regMx << ", rcx" << std::endl; asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl; break; } - asmCode << "\tcvtdq2pd xmm0, qword ptr [rsi+rcx*8]" << std::endl; + asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl; } void AssemblyGeneratorX86::genbr0(Instruction& instr, const char* instrx86) { @@ -174,7 +174,7 @@ namespace RandomX { asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rax * 8], rcx" << std::endl; if (trace) { - asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rdi * 8 + 262136], rcx" << std::endl; + asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rcx" << std::endl; } return; @@ -187,14 +187,14 @@ namespace RandomX { asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rax * 8], rcx" << std::endl; if (trace) { - asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rdi * 8 + 262136], rcx" << std::endl; + asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rcx" << std::endl; } return; default: 
asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", rax" << std::endl; if (trace) { - asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rdi * 8 + 262136], rax" << std::endl; + asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rax" << std::endl; } return; } @@ -222,7 +222,7 @@ namespace RandomX { break; } if (trace) { - asmCode << "\t" << store << " qword ptr [" << regScratchpadAddr << " + rdi * 8 + 262136], " << regF[instr.regc % RegistersCount] << std::endl; + asmCode << "\t" << store << " qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], " << regF[instr.regc % RegistersCount] << std::endl; } } @@ -498,7 +498,7 @@ namespace RandomX { asmCode << "\tjmp rx_i_" << wrapInstr(i + 1) << std::endl; asmCode << "taken_call_" << i << ":" << std::endl; if (trace) { - asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rdi * 8 + 262136], rax" << std::endl; + asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rax" << std::endl; } asmCode << "\tpush rax" << std::endl; asmCode << "\tcall rx_i_" << wrapInstr(i + (instr.imm8 & 127) + 2) << std::endl; diff --git a/src/asm/program_transform_address.inc b/src/asm/program_transform_address.inc new file mode 100644 index 0000000..8d2a79f --- /dev/null +++ b/src/asm/program_transform_address.inc @@ -0,0 +1,154 @@ + ;# 90 address transformations + ;# forced REX prefix is used to make all transformations 4 bytes long + lea ecx, [rcx+rcx*8+109] + db 64 + xor ecx, 96 + lea ecx, [rcx+rcx*8-19] + db 64 + add ecx, -98 + db 64 + add ecx, -21 + db 64 + xor ecx, -80 + lea ecx, [rcx+rcx*8-92] + db 64 + add ecx, 113 + lea ecx, [rcx+rcx*8+100] + db 64 + add ecx, -39 + db 64 + xor ecx, 120 + lea ecx, [rcx+rcx*8-119] + db 64 + add ecx, -113 + db 64 + add ecx, 111 + db 64 + xor ecx, 104 + lea ecx, [rcx+rcx*8-83] + lea ecx, [rcx+rcx*8+127] + db 64 + xor ecx, -112 + db 64 + add ecx, 89 + db 64 + add ecx, -32 + db 64 + add ecx, 
104 + db 64 + xor ecx, -120 + db 64 + xor ecx, 24 + lea ecx, [rcx+rcx*8+9] + db 64 + add ecx, -31 + db 64 + xor ecx, -16 + db 64 + add ecx, 68 + lea ecx, [rcx+rcx*8-110] + db 64 + xor ecx, 64 + db 64 + xor ecx, -40 + db 64 + xor ecx, -8 + db 64 + add ecx, -10 + db 64 + xor ecx, -32 + db 64 + add ecx, 14 + lea ecx, [rcx+rcx*8-46] + db 64 + xor ecx, -104 + lea ecx, [rcx+rcx*8+36] + db 64 + add ecx, 100 + lea ecx, [rcx+rcx*8-65] + lea ecx, [rcx+rcx*8+27] + lea ecx, [rcx+rcx*8+91] + db 64 + add ecx, -101 + db 64 + add ecx, -94 + lea ecx, [rcx+rcx*8-10] + db 64 + xor ecx, 80 + db 64 + add ecx, -108 + db 64 + add ecx, -58 + db 64 + xor ecx, 48 + lea ecx, [rcx+rcx*8+73] + db 64 + xor ecx, -48 + db 64 + xor ecx, 32 + db 64 + xor ecx, -96 + db 64 + add ecx, 118 + db 64 + add ecx, 91 + lea ecx, [rcx+rcx*8+18] + db 64 + add ecx, -11 + lea ecx, [rcx+rcx*8+63] + db 64 + add ecx, 114 + lea ecx, [rcx+rcx*8+45] + db 64 + add ecx, -67 + db 64 + add ecx, 53 + lea ecx, [rcx+rcx*8-101] + lea ecx, [rcx+rcx*8-1] + db 64 + xor ecx, 16 + lea ecx, [rcx+rcx*8-37] + lea ecx, [rcx+rcx*8-28] + lea ecx, [rcx+rcx*8-55] + db 64 + xor ecx, -88 + db 64 + xor ecx, -72 + db 64 + add ecx, 36 + db 64 + xor ecx, -56 + db 64 + add ecx, 116 + db 64 + xor ecx, 88 + db 64 + xor ecx, -128 + db 64 + add ecx, 50 + db 64 + add ecx, 105 + db 64 + add ecx, -37 + db 64 + xor ecx, 112 + db 64 + xor ecx, 8 + db 64 + xor ecx, -24 + lea ecx, [rcx+rcx*8+118] + db 64 + xor ecx, 72 + db 64 + xor ecx, -64 + db 64 + add ecx, 40 + lea ecx, [rcx+rcx*8-74] + lea ecx, [rcx+rcx*8+82] + lea ecx, [rcx+rcx*8+54] + db 64 + xor ecx, 56 + db 64 + xor ecx, 40 + db 64 + add ecx, 87 \ No newline at end of file From b71e0eec65b73ece018ae6bd21c2002ceef9619c Mon Sep 17 00:00:00 2001 From: tevador Date: Tue, 8 Jan 2019 14:50:31 +0100 Subject: [PATCH 06/35] Optimizations to reduce code size under 32K --- src/AssemblyGeneratorX86.cpp | 32 +- src/AssemblyGeneratorX86.hpp | 2 +- src/executeProgram-win64.asm | 45 +- src/program.inc | 2806 
+++++++++++++++------------------- 4 files changed, 1252 insertions(+), 1633 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 7cac04d..21b39c8 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -28,9 +28,10 @@ namespace RandomX { static const char* regR32[8] = { "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d" }; static const char* regF[8] = { "xmm8", "xmm9", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" }; - static const char* regMx = "rdi"; - static const char* regIc = "ebp"; - static const char* regStackBeginAddr = "rbx"; + static const char* regMx = "rbp"; + static const char* regIc = "ebx"; + static const char* regIc8 = "bl"; + static const char* regStackBeginAddr = "rdi"; static const char* regScratchpadAddr = "rsi"; void AssemblyGeneratorX86::generateProgram(const void* seed) { @@ -62,7 +63,7 @@ namespace RandomX { void AssemblyGeneratorX86::genar(Instruction& instr, int i) { asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; - asmCode << "\ttest " << regIc << ", 63" << std::endl; + asmCode << "\ttest " << regIc8 << ", 63" << std::endl; asmCode << "\tjnz short rx_body_" << i << std::endl; switch (instr.loca & 3) { @@ -71,13 +72,15 @@ namespace RandomX { case 2: asmCode << "\tcall rx_read_l1" << std::endl; asmCode << "rx_body_" << i << ":" << std::endl; - asmCode << "\txor " << regMx << ", rcx" << std::endl; + if ((instr.loca & 192) == 0) + asmCode << "\txor " << regMx << ", rcx" << std::endl; asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl; break; default: //3 asmCode << "\tcall rx_read_l2" << std::endl; asmCode << "rx_body_" << i << ":" << std::endl; - asmCode << "\txor " << regMx << ", rcx" << std::endl; + if ((instr.loca & 192) == 0) + asmCode << "\txor " << regMx << ", rcx" << std::endl; asmCode << "\tand 
ecx, " << (ScratchpadL2 - 1) << std::endl; break; } @@ -88,7 +91,7 @@ namespace RandomX { void AssemblyGeneratorX86::genaf(Instruction& instr, int i) { asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; - asmCode << "\ttest " << regIc << ", 63" << std::endl; + asmCode << "\ttest " << regIc8 << ", 63" << std::endl; asmCode << "\tjnz short rx_body_" << i << std::endl; switch (instr.loca & 3) { @@ -97,13 +100,15 @@ namespace RandomX { case 2: asmCode << "\tcall rx_read_l1" << std::endl; asmCode << "rx_body_" << i << ":" << std::endl; - asmCode << "\txor " << regMx << ", rcx" << std::endl; + if((instr.loca & 192) == 0) + asmCode << "\txor " << regMx << ", rcx" << std::endl; asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl; break; default: //3 asmCode << "\tcall rx_read_l2" << std::endl; asmCode << "rx_body_" << i << ":" << std::endl; - asmCode << "\txor " << regMx << ", rcx" << std::endl; + if ((instr.loca & 192) == 0) + asmCode << "\txor " << regMx << ", rcx" << std::endl; asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl; break; } @@ -200,8 +205,9 @@ namespace RandomX { } } - void AssemblyGeneratorX86::gencf(Instruction& instr) { - asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; + void AssemblyGeneratorX86::gencf(Instruction& instr, bool move = true) { + if(move) + asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; const char* store = (instr.locc & 8) ? 
"movhpd" : "movlpd"; switch (instr.locc & 7) { @@ -451,8 +457,8 @@ namespace RandomX { void AssemblyGeneratorX86::h_FPSQRT(Instruction& instr, int i) { genaf(instr, i); asmCode << "\tandps xmm0, xmm10" << std::endl; - asmCode << "\tsqrtpd xmm0, xmm0" << std::endl; - gencf(instr); + asmCode << "\tsqrtpd " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; + gencf(instr, false); } void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) { diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 92c7d31..6ffa2f9 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -45,7 +45,7 @@ namespace RandomX { void genbr132(Instruction&); void genbf(Instruction&, const char*); void gencr(Instruction&); - void gencf(Instruction&); + void gencf(Instruction&, bool); void generateCode(Instruction&, int); diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 05434f2..ec39c60 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -22,12 +22,12 @@ PUBLIC executeProgram executeProgram PROC ; REGISTER ALLOCATION: ; rax -> temporary - ; rbx -> beginning of VM stack + ; rbx -> "ic" ; rcx -> temporary ; rdx -> temporary ; rsi -> convertible_t& scratchpad - ; rdi -> "mx" - ; rbp -> "ic" + ; rdi -> beginning of VM stack + ; rbp -> "ma", "mx" ; rsp -> end of VM stack ; r8 -> "r0" ; r9 -> "r1" @@ -82,13 +82,13 @@ executeProgram PROC ; function arguments push rcx ; RegisterFile& registerFile - mov rdi, qword ptr [rdx] ; "mx", "ma" + mov rbp, qword ptr [rdx] ; "mx", "ma" mov rax, qword ptr [rdx+8] ; uint8_t* dataset push rax mov rsi, r8 ; convertible_t* scratchpad - mov rbx, rsp ; beginning of VM stack - mov ebp, 1048577 ; number of VM instructions to execute + 1 + mov rdi, rsp ; beginning of VM stack + mov ebx, 1048577 ; number of VM instructions to execute + 1 xorps xmm10, xmm10 cmpeqpd xmm10, xmm10 @@ -164,7 +164,7 @@ executeProgram PROC rx_finish: ; unroll the stack - mov rsp, 
rbx + mov rsp, rdi ; save VM register values pop rcx @@ -211,30 +211,29 @@ TransformAddress MACRO reg32, reg64 ;# lies in a different cache line than the original address (mod 2^N). ;# This is done to prevent a load-store dependency. ;# There are 3 different transformations that can be used: x -> 9*x+C, x -> x+C, x -> x^C - lea reg32, [reg64+reg64*8+127] ;# C = -119 -110 -101 -92 -83 -74 -65 -55 -46 -37 -28 -19 -10 -1 9 18 27 36 45 54 63 73 82 91 100 109 118 127 - ;lea reg32, [reg64-128] ;# C = all except -7 to +7 - ;xor reg32, -8 ;# C = all except 0 to 7 + ;lea reg32, [reg64+reg64*8+127] ;# C = -119 -110 -101 -92 -83 -74 -65 -55 -46 -37 -28 -19 -10 -1 9 18 27 36 45 54 63 73 82 91 100 109 118 127 + db 64 + add reg32, -39 ;# C = all except -7 to +7 + ;xor reg32, -8 ;# C = all except 0 to 7 ENDM ReadMemoryRandom MACRO spmask ;# IN ecx = random 32-bit address -;# OUT rax = 64-bit integer return value -;# OUT xmm0 = 128-bit floating point return value -;# GLOBAL rbp = "ic" number of instructions until the end of the program -;# GLOBAL rbx = address of the dataset address +;# GLOBAL rdi = address of the dataset address ;# GLOBAL rsi = address of the scratchpad -;# GLOBAL rdi = low 32 bits = "mx", high 32 bits = "ma" +;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma" ;# MODIFY rcx, rdx + push rcx ;# preserve ecx + TransformAddress ecx, rcx ;# TransformAddress function + mov rax, qword ptr [rdi] ;# load the dataset address + xor rbp, rcx ;# modify "mx" ; prefetch cacheline "mx" - mov rax, qword ptr [rbx] ;# load the dataset address - and rdi, -64 ;# align "mx" to the start of a cache line - mov edx, edi ;# edx = mx + and rbp, -64 ;# align "mx" to the start of a cache line + mov edx, ebp ;# edx = mx prefetchnta byte ptr [rax+rdx] ; read cacheline "ma" - ror rdi, 32 ;# swap "ma" and "mx" - mov edx, edi ;# edx = ma - push rcx - TransformAddress ecx, rcx ;# TransformAddress function + ror rbp, 32 ;# swap "ma" and "mx" + mov edx, ebp ;# edx = ma and ecx, spmask-7 ;# 
limit address to the specified scratchpad size aligned to multiple of 8 lea rcx, [rsi+rcx*8] ;# scratchpad cache line lea rax, [rax+rdx] ;# dataset cache line @@ -254,7 +253,7 @@ ReadMemoryRandom MACRO spmask xor qword ptr [rcx+48], rdx mov rdx, qword ptr [rax+56] xor qword ptr [rcx+56], rdx - pop rcx + pop rcx ;# restore ecx ret ENDM diff --git a/src/program.inc b/src/program.inc index a551edb..5dd1b4e 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,16 +1,16 @@ rx_i_0: ;RET - dec ebp + dec ebx jz rx_finish xor r9, 0ca9788ah mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_0 call rx_read_l1 rx_body_0: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_0 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -27,30 +27,29 @@ not_taken_ret_0: mov qword ptr [rsi + rax * 8], rcx rx_i_1: ;AND_64 - dec ebp + dec ebx jz rx_finish xor r15, 06afc2fa4h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_1 call rx_read_l1 rx_body_1: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] and rax, r10 mov r12, rax rx_i_2: ;CALL - dec ebp + dec ebx jz rx_finish xor r15, 097210f7bh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_2 call rx_read_l1 rx_body_2: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r11d, 1348521207 @@ -66,15 +65,15 @@ taken_call_2: call rx_i_47 rx_i_3: ;FPROUND - dec ebp + dec ebx jz rx_finish xor r13, 082c73195h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_3 call rx_read_l1 rx_body_3: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] shl eax, 13 @@ -84,15 +83,14 @@ rx_body_3: ldmxcsr dword ptr [rsp - 8] rx_i_4: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r14, 077daefb4h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_4 call rx_read_l1 rx_body_4: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 @@ -105,15 +103,15 @@ rx_body_4: mov qword 
ptr [rsi + rax * 8], rcx rx_i_5: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r15, 0379f9ee0h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_5 call rx_read_l2 rx_body_5: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -122,15 +120,14 @@ rx_body_5: mov r12, rax rx_i_6: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r8, 03bae7272h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_6 call rx_read_l1 rx_body_6: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r15 @@ -141,15 +138,14 @@ rx_body_6: mov qword ptr [rsi + rax * 8], rcx rx_i_7: ;FPADD - dec ebp + dec ebx jz rx_finish xor r10, 0e264ed81h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_7 call rx_read_l1 rx_body_7: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 @@ -160,15 +156,14 @@ rx_body_7: movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_8: ;SHL_64 - dec ebp + dec ebx jz rx_finish xor r13, 068c1e5d2h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_8 call rx_read_l1 rx_body_8: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] shl rax, 47 @@ -179,45 +174,43 @@ rx_body_8: mov qword ptr [rsi + rax * 8], rcx rx_i_9: ;AND_64 - dec ebp + dec ebx jz rx_finish xor r14, 085121c54h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_9 call rx_read_l1 rx_body_9: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] and rax, 565870810 mov r10, rax rx_i_10: ;OR_64 - dec ebp + dec ebx jz rx_finish xor r8, 052efde3eh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_10 call rx_read_l1 rx_body_10: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] or rax, -727859809 mov r13, rax rx_i_11: ;FPADD - dec ebp + dec ebx jz rx_finish xor r10, 0a9bf8aa1h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_11 call rx_read_l2 rx_body_11: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 @@ 
-228,15 +221,14 @@ rx_body_11: movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_12: ;CALL - dec ebp + dec ebx jz rx_finish xor r10, 0db2691ch mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_12 call rx_read_l2 rx_body_12: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] cmp r8d, -1763940407 @@ -248,30 +240,30 @@ taken_call_12: call rx_i_35 rx_i_13: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r12, 061c0d34dh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_13 call rx_read_l1 rx_body_13: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 movaps xmm9, xmm0 rx_i_14: ;SHR_64 - dec ebp + dec ebx jz rx_finish xor r10, 0e761d1beh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_14 call rx_read_l1 rx_body_14: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] shr rax, 4 @@ -282,18 +274,17 @@ rx_body_14: mov qword ptr [rsi + rax * 8], rcx rx_i_15: ;RET - dec ebp + dec ebx jz rx_finish xor r11, 074ddb688h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_15 call rx_read_l2 rx_body_15: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_15 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -310,15 +301,14 @@ not_taken_ret_15: mov qword ptr [rsi + rax * 8], rcx rx_i_16: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r14, 06be90627h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_16 call rx_read_l1 rx_body_16: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, r10 @@ -329,15 +319,14 @@ rx_body_16: mov qword ptr [rsi + rax * 8], rcx rx_i_17: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r11, 0fbc6fc35h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_17 call rx_read_l1 rx_body_17: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm4 @@ -351,15 +340,14 @@ rx_body_17: movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_18: ;FPSUB - dec ebp + 
dec ebx jz rx_finish xor r14, 0c28ca080h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_18 call rx_read_l1 rx_body_18: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm4 @@ -370,30 +358,29 @@ rx_body_18: movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_19: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r13, 0ac009c30h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_19 call rx_read_l1 rx_body_19: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm8 movaps xmm7, xmm0 rx_i_20: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r13, 0ecca967dh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_20 call rx_read_l1 rx_body_20: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 @@ -407,30 +394,30 @@ rx_body_20: movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_21: ;FPADD - dec ebp + dec ebx jz rx_finish xor r8, 0977f0284h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_21 call rx_read_l2 rx_body_21: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm7, xmm0 rx_i_22: ;ADD_32 - dec ebp + dec ebx jz rx_finish xor r13, 080bdfefah mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_22 call rx_read_l1 rx_body_22: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add eax, r8d @@ -441,30 +428,29 @@ rx_body_22: mov qword ptr [rsi + rax * 8], rcx rx_i_23: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r15, 0e1e0d3c4h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_23 call rx_read_l1 rx_body_23: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r11 mov r8, rax rx_i_24: ;IMULH_64 - dec ebp + dec ebx jz rx_finish xor r8, 070d3b8c7h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_24 call rx_read_l1 rx_body_24: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 @@ -477,15 +463,15 @@ 
rx_body_24: mov qword ptr [rsi + rax * 8], rcx rx_i_25: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r12, 01cf77a04h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_25 call rx_read_l2 rx_body_25: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm9 @@ -499,15 +485,14 @@ rx_body_25: movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_26: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r11, 0e311468ch mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_26 call rx_read_l1 rx_body_26: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -520,15 +505,14 @@ rx_body_26: mov qword ptr [rsi + rax * 8], rcx rx_i_27: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r12, 01fd9911ah mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_27 call rx_read_l2 rx_body_27: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm3 @@ -538,60 +522,57 @@ rx_body_27: movaps xmm6, xmm0 rx_i_28: ;XOR_64 - dec ebp + dec ebx jz rx_finish xor r13, 067df757eh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_28 call rx_read_l1 rx_body_28: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] xor rax, r13 mov r14, rax rx_i_29: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r12, 0be2e7c42h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_29 call rx_read_l2 rx_body_29: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] sub rax, 1944166515 mov r14, rax rx_i_30: ;FPADD - dec ebp + dec ebx jz rx_finish xor r11, 084d067f7h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_30 call rx_read_l1 rx_body_30: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm7, xmm0 rx_i_31: ;FPADD - dec ebp + dec ebx jz rx_finish xor r14, 0d352ce37h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_31 call rx_read_l2 rx_body_31: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd 
xmm0, xmm3 @@ -602,30 +583,28 @@ rx_body_31: movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_32: ;XOR_64 - dec ebp + dec ebx jz rx_finish xor r12, 0a1f248dah mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_32 call rx_read_l1 rx_body_32: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] xor rax, -1936869641 mov r9, rax rx_i_33: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r9, 0554720fch mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_33 call rx_read_l2 rx_body_33: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 @@ -634,15 +613,15 @@ rx_body_33: mov r12, rax rx_i_34: ;CALL - dec ebp + dec ebx jz rx_finish xor r13, 0665e91f1h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_34 call rx_read_l1 rx_body_34: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r14d, -380224718 @@ -654,18 +633,17 @@ taken_call_34: call rx_i_108 rx_i_35: ;RET - dec ebp + dec ebx jz rx_finish xor r15, 05ef1be79h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_35 call rx_read_l1 rx_body_35: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_35 xor rax, qword ptr [rsp + 8] mov r8, rax @@ -674,15 +652,14 @@ not_taken_ret_35: mov r8, rax rx_i_36: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r8, 012ec7e3ah mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_36 call rx_read_l1 rx_body_36: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm6 @@ -692,15 +669,14 @@ rx_body_36: movaps xmm7, xmm0 rx_i_37: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r12, 0d0706601h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_37 call rx_read_l1 rx_body_37: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 @@ -714,48 +690,46 @@ rx_body_37: movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_38: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r9, 064056913h mov ecx, r9d - test 
ebp, 63 + test bl, 63 jnz short rx_body_38 call rx_read_l1 rx_body_38: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, r14 mov r10, rax rx_i_39: ;ADD_32 - dec ebp + dec ebx jz rx_finish xor r14, 02c1f1eb0h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_39 call rx_read_l1 rx_body_39: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add eax, r14d mov r14, rax rx_i_40: ;RET - dec ebp + dec ebx jz rx_finish xor r10, 068fd9009h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_40 call rx_read_l1 rx_body_40: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_40 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -772,15 +746,14 @@ not_taken_ret_40: mov qword ptr [rsi + rax * 8], rcx rx_i_41: ;CALL - dec ebp + dec ebx jz rx_finish xor r9, 037a30933h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_41 call rx_read_l1 rx_body_41: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r14d, -1070581824 @@ -792,30 +765,29 @@ taken_call_41: call rx_i_127 rx_i_42: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r15, 0bc1de9f6h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_42 call rx_read_l1 rx_body_42: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm6 movaps xmm6, xmm0 rx_i_43: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r12, 02b2a2eech mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_43 call rx_read_l1 rx_body_43: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, 1693705407 @@ -826,15 +798,14 @@ rx_body_43: mov qword ptr [rsi + rax * 8], rcx rx_i_44: ;ROL_64 - dec ebp + dec ebx jz rx_finish xor r11, 0685817abh mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_44 call rx_read_l1 rx_body_44: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 @@ -842,30 +813,29 @@ rx_body_44: mov r15, rax rx_i_45: ;FPSUB - dec ebp 
+ dec ebx jz rx_finish xor r12, 08cd244ebh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_45 call rx_read_l2 rx_body_45: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm5, xmm0 rx_i_46: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r8, 06d8f4254h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_46 call rx_read_l2 rx_body_46: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] add rax, r9 @@ -876,15 +846,15 @@ rx_body_46: mov qword ptr [rsi + rax * 8], rcx rx_i_47: ;CALL - dec ebp + dec ebx jz rx_finish xor r12, 05ba232c6h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_47 call rx_read_l2 rx_body_47: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] cmp r10d, 119251505 @@ -900,35 +870,32 @@ taken_call_47: call rx_i_131 rx_i_48: ;FPSQRT - dec ebp + dec ebx jz rx_finish xor r8, 0aaed618fh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_48 call rx_read_l1 rx_body_48: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 - sqrtpd xmm0, xmm0 - movaps xmm9, xmm0 + sqrtpd xmm9, xmm0 mov eax, r9d xor eax, 020e5d9e9h and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_49: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r8, 0f96c6a45h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_49 call rx_read_l1 rx_body_49: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm3 @@ -938,15 +905,15 @@ rx_body_49: movaps xmm5, xmm0 rx_i_50: ;OR_32 - dec ebp + dec ebx jz rx_finish xor r9, 0da3e4842h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_50 call rx_read_l1 rx_body_50: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] or eax, r10d @@ -957,30 +924,28 @@ rx_body_50: mov qword ptr [rsi + rax * 8], rcx rx_i_51: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r10, 0302b676ah mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_51 call 
rx_read_l2 rx_body_51: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] sub rax, 419241919 mov r15, rax rx_i_52: ;CALL - dec ebp + dec ebx jz rx_finish xor r11, 0fa88f48bh mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_52 call rx_read_l2 rx_body_52: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] cmp r13d, -534426193 @@ -996,18 +961,17 @@ taken_call_52: call rx_i_94 rx_i_53: ;RET - dec ebp + dec ebx jz rx_finish xor r13, 03dff9b9eh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_53 call rx_read_l1 rx_body_53: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_53 xor rax, qword ptr [rsp + 8] mov r13, rax @@ -1016,15 +980,14 @@ not_taken_ret_53: mov r13, rax rx_i_54: ;IMULH_64 - dec ebp + dec ebx jz rx_finish xor r11, 060638de0h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_54 call rx_read_l2 rx_body_54: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, 282209221 @@ -1037,15 +1000,14 @@ rx_body_54: mov qword ptr [rsi + rax * 8], rcx rx_i_55: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r10, 0dda983d4h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_55 call rx_read_l1 rx_body_55: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm5 @@ -1059,15 +1021,14 @@ rx_body_55: movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_56: ;AND_64 - dec ebp + dec ebx jz rx_finish xor r14, 0f1456b8eh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_56 call rx_read_l1 rx_body_56: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] and rax, r15 @@ -1078,15 +1039,14 @@ rx_body_56: mov qword ptr [rsi + rax * 8], rcx rx_i_57: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r9, 010dc4571h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_57 call rx_read_l2 rx_body_57: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r14 @@ -1097,15 +1057,14 @@ rx_body_57: mov qword 
ptr [rsi + rax * 8], rcx rx_i_58: ;IDIV_64 - dec ebp + dec ebx jz rx_finish xor r14, 0bcec0ebah mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_58 call rx_read_l2 rx_body_58: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov edx, r13d @@ -1126,33 +1085,31 @@ result_idiv_58: mov r8, rax rx_i_59: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r11, 0980dd402h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_59 call rx_read_l1 rx_body_59: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm8 movaps xmm7, xmm0 rx_i_60: ;RET - dec ebp + dec ebx jz rx_finish xor r15, 03de14d1eh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_60 call rx_read_l1 rx_body_60: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_60 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -1169,15 +1126,14 @@ not_taken_ret_60: mov qword ptr [rsi + rax * 8], rcx rx_i_61: ;CALL - dec ebp + dec ebx jz rx_finish xor r13, 05058ce64h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_61 call rx_read_l1 rx_body_61: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r15d, 1933164545 @@ -1189,15 +1145,14 @@ taken_call_61: call rx_i_120 rx_i_62: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r15, 0c3089414h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_62 call rx_read_l1 rx_body_62: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm8 @@ -1211,15 +1166,14 @@ rx_body_62: movlpd qword ptr [rsi + rax * 8], xmm2 rx_i_63: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r9, 065cf272eh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_63 call rx_read_l1 rx_body_63: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm7 @@ -1229,30 +1183,28 @@ rx_body_63: movaps xmm8, xmm0 rx_i_64: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r13, 0ae54dfbfh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz 
short rx_body_64 call rx_read_l1 rx_body_64: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, r15 mov r9, rax rx_i_65: ;CALL - dec ebp + dec ebx jz rx_finish xor r13, 07b366ce6h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_65 call rx_read_l1 rx_body_65: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r8d, 1498056607 @@ -1264,35 +1216,33 @@ taken_call_65: call rx_i_129 rx_i_66: ;FPSQRT - dec ebp + dec ebx jz rx_finish xor r15, 015a1b689h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_66 call rx_read_l2 rx_body_66: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 - sqrtpd xmm0, xmm0 - movaps xmm9, xmm0 + sqrtpd xmm9, xmm0 mov eax, r9d xor eax, 07305e78h and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_67: ;CALL - dec ebp + dec ebx jz rx_finish xor r14, 088393ba0h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_67 call rx_read_l1 rx_body_67: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r13d, 2031541081 @@ -1304,15 +1254,14 @@ taken_call_67: call rx_i_79 rx_i_68: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r13, 03aa5c3a4h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_68 call rx_read_l1 rx_body_68: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 @@ -1323,30 +1272,30 @@ rx_body_68: movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_69: ;FPADD - dec ebp + dec ebx jz rx_finish xor r15, 0376c9c27h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_69 call rx_read_l2 rx_body_69: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm8, xmm0 rx_i_70: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r8, 0bbbec3fah mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_70 call rx_read_l2 rx_body_70: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 @@ -1355,15 +1304,14 @@ 
rx_body_70: mov r13, rax rx_i_71: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r14, 0e9efb350h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_71 call rx_read_l1 rx_body_71: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm5 @@ -1373,15 +1321,14 @@ rx_body_71: movaps xmm7, xmm0 rx_i_72: ;CALL - dec ebp + dec ebx jz rx_finish xor r13, 0f4e51e28h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_72 call rx_read_l1 rx_body_72: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r9d, -631091751 @@ -1397,15 +1344,14 @@ taken_call_72: call rx_i_191 rx_i_73: ;FPROUND - dec ebp + dec ebx jz rx_finish xor r12, 0c24ddbd4h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_73 call rx_read_l2 rx_body_73: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] shl eax, 13 @@ -1415,15 +1361,15 @@ rx_body_73: ldmxcsr dword ptr [rsp - 8] rx_i_74: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r8, 04c4b0c7fh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_74 call rx_read_l1 rx_body_74: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, rax, -1431647438 @@ -1434,18 +1380,17 @@ rx_body_74: mov qword ptr [rsi + rax * 8], rcx rx_i_75: ;RET - dec ebp + dec ebx jz rx_finish xor r14, 03bcc02e3h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_75 call rx_read_l2 rx_body_75: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_75 xor rax, qword ptr [rsp + 8] mov r13, rax @@ -1454,15 +1399,14 @@ not_taken_ret_75: mov r13, rax rx_i_76: ;FPADD - dec ebp + dec ebx jz rx_finish xor r11, 04b0ff63eh mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_76 call rx_read_l1 rx_body_76: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 @@ -1473,18 +1417,17 @@ rx_body_76: movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_77: ;RET - dec ebp + dec ebx jz rx_finish xor r14, 
0b956b3e8h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_77 call rx_read_l2 rx_body_77: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_77 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -1501,15 +1444,14 @@ not_taken_ret_77: mov qword ptr [rsi + rax * 8], rcx rx_i_78: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r9, 0edeca680h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_78 call rx_read_l1 rx_body_78: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -1518,18 +1460,17 @@ rx_body_78: mov r15, rax rx_i_79: ;RET - dec ebp + dec ebx jz rx_finish xor r11, 0fbdddcb5h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_79 call rx_read_l1 rx_body_79: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_79 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -1546,45 +1487,42 @@ not_taken_ret_79: mov qword ptr [rsi + rax * 8], rcx rx_i_80: ;FPADD - dec ebp + dec ebx jz rx_finish xor r13, 09cec97a1h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_80 call rx_read_l2 rx_body_80: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm3, xmm0 rx_i_81: ;OR_64 - dec ebp + dec ebx jz rx_finish xor r15, 078228167h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_81 call rx_read_l1 rx_body_81: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] or rax, r13 mov r8, rax rx_i_82: ;CALL - dec ebp + dec ebx jz rx_finish xor r11, 078cae1ffh mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_82 call rx_read_l1 rx_body_82: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r12d, -68969733 @@ -1600,30 +1538,28 @@ taken_call_82: call rx_i_145 rx_i_83: ;AND_64 - dec ebp + dec ebx jz rx_finish xor r10, 0d9b6a533h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_83 call rx_read_l1 rx_body_83: - xor rdi, rcx and 
ecx, 2047 mov rax, qword ptr [rsi+rcx*8] and rax, r10 mov r12, rax rx_i_84: ;ROR_64 - dec ebp + dec ebx jz rx_finish xor r15, 0e9e75336h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_84 call rx_read_l2 rx_body_84: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 @@ -1635,30 +1571,28 @@ rx_body_84: mov qword ptr [rsi + rax * 8], rcx rx_i_85: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r13, 04c0d378ah mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_85 call rx_read_l1 rx_body_85: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r8 mov r10, rax rx_i_86: ;OR_64 - dec ebp + dec ebx jz rx_finish xor r11, 04386e368h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_86 call rx_read_l1 rx_body_86: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] or rax, r8 @@ -1669,30 +1603,30 @@ rx_body_86: mov qword ptr [rsi + rax * 8], rcx rx_i_87: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r9, 0d75a0ecfh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_87 call rx_read_l1 rx_body_87: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, r12 mov r8, rax rx_i_88: ;FPADD - dec ebp + dec ebx jz rx_finish xor r9, 031bb7f7ah mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_88 call rx_read_l1 rx_body_88: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 @@ -1703,15 +1637,14 @@ rx_body_88: movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_89: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r9, 03b45ecebh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_89 call rx_read_l2 rx_body_89: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r8 @@ -1722,30 +1655,29 @@ rx_body_89: mov qword ptr [rsi + rax * 8], rcx rx_i_90: ;FPADD - dec ebp + dec ebx jz rx_finish xor r12, 0ee08e76bh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_90 call rx_read_l1 rx_body_90: - xor rdi, rcx 
and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm6, xmm0 rx_i_91: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r9, 042e28e94h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_91 call rx_read_l1 rx_body_91: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 @@ -1755,15 +1687,14 @@ rx_body_91: movaps xmm4, xmm0 rx_i_92: ;CALL - dec ebp + dec ebx jz rx_finish xor r8, 0729260e1h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_92 call rx_read_l2 rx_body_92: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] cmp r14d, 1288893603 @@ -1775,15 +1706,15 @@ taken_call_92: call rx_i_170 rx_i_93: ;FPADD - dec ebp + dec ebx jz rx_finish xor r8, 0bfcebaf4h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_93 call rx_read_l1 rx_body_93: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 @@ -1794,18 +1725,18 @@ rx_body_93: movlpd qword ptr [rsi + rax * 8], xmm2 rx_i_94: ;RET - dec ebp + dec ebx jz rx_finish xor r13, 0ea326630h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_94 call rx_read_l1 rx_body_94: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_94 xor rax, qword ptr [rsp + 8] mov r8, rax @@ -1814,15 +1745,14 @@ not_taken_ret_94: mov r8, rax rx_i_95: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r13, 0b5451a2dh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_95 call rx_read_l1 rx_body_95: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r10 @@ -1833,15 +1763,15 @@ rx_body_95: mov qword ptr [rsi + rax * 8], rcx rx_i_96: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r11, 04f912ef8h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_96 call rx_read_l2 rx_body_96: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -1850,50 +1780,46 @@ 
rx_body_96: mov r11, rax rx_i_97: ;FPSQRT - dec ebp + dec ebx jz rx_finish xor r15, 0acc45b3bh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_97 call rx_read_l1 rx_body_97: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 - sqrtpd xmm0, xmm0 - movaps xmm5, xmm0 + sqrtpd xmm5, xmm0 mov eax, r13d xor eax, 0c477e850h and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_98: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r14, 09900a4e8h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_98 call rx_read_l1 rx_body_98: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, r15 mov r14, rax rx_i_99: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r9, 0841b2984h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_99 call rx_read_l2 rx_body_99: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm6 @@ -1907,45 +1833,42 @@ rx_body_99: movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_100: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r15, 07ebea48fh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_100 call rx_read_l1 rx_body_100: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, r9 mov r14, rax rx_i_101: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r10, 0631209d3h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_101 call rx_read_l1 rx_body_101: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, r8 mov r11, rax rx_i_102: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r10, 0e50bf07ah mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_102 call rx_read_l1 rx_body_102: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm3 @@ -1955,15 +1878,14 @@ rx_body_102: movaps xmm7, xmm0 rx_i_103: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r10, 02b7096f1h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_103 call rx_read_l1 rx_body_103: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr 
[rsi+rcx*8] imul rax, r13 @@ -1974,15 +1896,14 @@ rx_body_103: mov qword ptr [rsi + rax * 8], rcx rx_i_104: ;IMULH_64 - dec ebp + dec ebx jz rx_finish xor r11, 075deaf71h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_104 call rx_read_l1 rx_body_104: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, -1913070089 @@ -1995,15 +1916,15 @@ rx_body_104: mov qword ptr [rsi + rax * 8], rcx rx_i_105: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r13, 036a51f72h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_105 call rx_read_l1 rx_body_105: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -2016,15 +1937,14 @@ rx_body_105: mov qword ptr [rsi + rax * 8], rcx rx_i_106: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r11, 07b512986h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_106 call rx_read_l1 rx_body_106: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm3 @@ -2038,15 +1958,14 @@ rx_body_106: movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_107: ;CALL - dec ebp + dec ebx jz rx_finish xor r12, 0f1d2e50h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_107 call rx_read_l1 rx_body_107: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r11d, 1917037441 @@ -2062,15 +1981,14 @@ taken_call_107: call rx_i_143 rx_i_108: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r9, 07327ba60h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_108 call rx_read_l1 rx_body_108: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm5 @@ -2084,30 +2002,29 @@ rx_body_108: movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_109: ;FPADD - dec ebp + dec ebx jz rx_finish xor r15, 0594e37deh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_109 call rx_read_l1 rx_body_109: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 movaps xmm3, xmm0 rx_i_110: ;ROL_64 - dec ebp + dec 
ebx jz rx_finish xor r9, 04cdf5ebah mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_110 call rx_read_l1 rx_body_110: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 @@ -2119,18 +2036,17 @@ rx_body_110: mov qword ptr [rsi + rax * 8], rcx rx_i_111: ;RET - dec ebp + dec ebx jz rx_finish xor r8, 02e16c97ch mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_111 call rx_read_l1 rx_body_111: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_111 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -2147,15 +2063,14 @@ not_taken_ret_111: mov qword ptr [rsi + rax * 8], rcx rx_i_112: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r12, 0d42ddbd4h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_112 call rx_read_l2 rx_body_112: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] sub rax, r13 @@ -2166,15 +2081,14 @@ rx_body_112: mov qword ptr [rsi + rax * 8], rcx rx_i_113: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r10, 07a4f8cbbh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_113 call rx_read_l1 rx_body_113: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 @@ -2183,15 +2097,14 @@ rx_body_113: mov r13, rax rx_i_114: ;IMULH_64 - dec ebp + dec ebx jz rx_finish xor r13, 06e83e2cdh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_114 call rx_read_l1 rx_body_114: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 @@ -2200,30 +2113,28 @@ rx_body_114: mov r14, rax rx_i_115: ;OR_64 - dec ebp + dec ebx jz rx_finish xor r14, 0336c980eh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_115 call rx_read_l2 rx_body_115: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] or rax, r10 mov r14, rax rx_i_116: ;IMULH_64 - dec ebp + dec ebx jz rx_finish xor r10, 0d122702eh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_116 call rx_read_l1 rx_body_116: - xor rdi, rcx and ecx, 
2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, -1850776691 @@ -2236,15 +2147,14 @@ rx_body_116: mov qword ptr [rsi + rax * 8], rcx rx_i_117: ;AND_64 - dec ebp + dec ebx jz rx_finish xor r11, 015f2012bh mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_117 call rx_read_l1 rx_body_117: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] and rax, -1205826972 @@ -2255,60 +2165,57 @@ rx_body_117: mov qword ptr [rsi + rax * 8], rcx rx_i_118: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r9, 037ddf43dh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_118 call rx_read_l2 rx_body_118: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm5 movaps xmm6, xmm0 rx_i_119: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r9, 0bba475f3h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_119 call rx_read_l1 rx_body_119: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 movaps xmm5, xmm0 rx_i_120: ;FPADD - dec ebp + dec ebx jz rx_finish xor r12, 0e5561e3eh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_120 call rx_read_l1 rx_body_120: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm4 movaps xmm8, xmm0 rx_i_121: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r9, 03ab8f73h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_121 call rx_read_l1 rx_body_121: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm5 @@ -2318,18 +2225,17 @@ rx_body_121: movaps xmm8, xmm0 rx_i_122: ;RET - dec ebp + dec ebx jz rx_finish xor r10, 04e0dbd40h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_122 call rx_read_l1 rx_body_122: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_122 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -2346,30 +2252,28 @@ not_taken_ret_122: mov qword ptr [rsi + rax * 8], rcx rx_i_123: ;ADD_32 - dec ebp + dec ebx jz 
rx_finish xor r13, 073e9f58ah mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_123 call rx_read_l1 rx_body_123: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add eax, r15d mov r13, rax rx_i_124: ;CALL - dec ebp + dec ebx jz rx_finish xor r12, 0e3fa3670h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_124 call rx_read_l2 rx_body_124: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] cmp r11d, 1719505436 @@ -2385,15 +2289,15 @@ taken_call_124: call rx_i_237 rx_i_125: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r8, 0ebec27cdh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_125 call rx_read_l2 rx_body_125: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -2402,15 +2306,14 @@ rx_body_125: mov r14, rax rx_i_126: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r8, 01feb5264h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_126 call rx_read_l1 rx_body_126: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm6 @@ -2420,15 +2323,14 @@ rx_body_126: movaps xmm2, xmm0 rx_i_127: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r9, 0405f500fh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_127 call rx_read_l1 rx_body_127: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -2437,30 +2339,29 @@ rx_body_127: mov r8, rax rx_i_128: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r13, 0459f1154h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_128 call rx_read_l2 rx_body_128: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r9 mov r9, rax rx_i_129: ;CALL - dec ebp + dec ebx jz rx_finish xor r9, 081918b4ch mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_129 call rx_read_l1 rx_body_129: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r13d, -590624856 @@ -2472,15 +2373,14 @@ taken_call_129: call rx_i_154 rx_i_130: ;OR_64 - dec ebp + dec ebx 
jz rx_finish xor r9, 077c3b332h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_130 call rx_read_l1 rx_body_130: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] or rax, -281794782 @@ -2491,18 +2391,17 @@ rx_body_130: mov qword ptr [rsi + rax * 8], rcx rx_i_131: ;RET - dec ebp + dec ebx jz rx_finish xor r12, 05792310bh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_131 call rx_read_l1 rx_body_131: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_131 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -2519,30 +2418,28 @@ not_taken_ret_131: mov qword ptr [rsi + rax * 8], rcx rx_i_132: ;FPADD - dec ebp + dec ebx jz rx_finish xor r10, 0ebc6e10h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_132 call rx_read_l1 rx_body_132: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm7, xmm0 rx_i_133: ;XOR_64 - dec ebp + dec ebx jz rx_finish xor r14, 0822f8b60h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_133 call rx_read_l1 rx_body_133: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] xor rax, -1000526796 @@ -2553,30 +2450,28 @@ rx_body_133: mov qword ptr [rsi + rax * 8], rcx rx_i_134: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r10, 0d0f18593h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_134 call rx_read_l1 rx_body_134: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, 1516102347 mov r13, rax rx_i_135: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r11, 088212ef9h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_135 call rx_read_l1 rx_body_135: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm3 @@ -2586,35 +2481,33 @@ rx_body_135: movaps xmm8, xmm0 rx_i_136: ;FPSQRT - dec ebp + dec ebx jz rx_finish xor r8, 01ae56e03h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_136 call rx_read_l1 rx_body_136: - xor rdi, rcx and 
ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 - sqrtpd xmm0, xmm0 - movaps xmm5, xmm0 + sqrtpd xmm5, xmm0 mov eax, r13d xor eax, 0efd7799dh and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_137: ;ROL_64 - dec ebp + dec ebx jz rx_finish xor r11, 015a24231h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_137 call rx_read_l1 rx_body_137: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 @@ -2622,18 +2515,17 @@ rx_body_137: mov r11, rax rx_i_138: ;RET - dec ebp + dec ebx jz rx_finish xor r13, 02fd380c5h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_138 call rx_read_l1 rx_body_138: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_138 xor rax, qword ptr [rsp + 8] mov r10, rax @@ -2642,15 +2534,15 @@ not_taken_ret_138: mov r10, rax rx_i_139: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r9, 093172470h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_139 call rx_read_l1 rx_body_139: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, 515364082 @@ -2661,15 +2553,14 @@ rx_body_139: mov qword ptr [rsi + rax * 8], rcx rx_i_140: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r14, 052543553h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_140 call rx_read_l2 rx_body_140: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -2678,15 +2569,15 @@ rx_body_140: mov r14, rax rx_i_141: ;FPADD - dec ebp + dec ebx jz rx_finish xor r8, 02f636da1h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_141 call rx_read_l1 rx_body_141: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 @@ -2697,15 +2588,14 @@ rx_body_141: movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_142: ;CALL - dec ebp + dec ebx jz rx_finish xor r11, 0b11a4f2ch mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_142 call rx_read_l2 
rx_body_142: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] cmp r12d, 1365939282 @@ -2721,15 +2611,14 @@ taken_call_142: call rx_i_257 rx_i_143: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r15, 037f4b5d0h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_143 call rx_read_l2 rx_body_143: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -2738,15 +2627,14 @@ rx_body_143: mov r9, rax rx_i_144: ;IMULH_64 - dec ebp + dec ebx jz rx_finish xor r10, 02e59e00ah mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_144 call rx_read_l2 rx_body_144: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r11 @@ -2755,15 +2643,14 @@ rx_body_144: mov r15, rax rx_i_145: ;IMULH_64 - dec ebp + dec ebx jz rx_finish xor r13, 08d5c798h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_145 call rx_read_l1 rx_body_145: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r11 @@ -2776,15 +2663,15 @@ rx_body_145: mov qword ptr [rsi + rax * 8], rcx rx_i_146: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r13, 02327e6e2h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_146 call rx_read_l1 rx_body_146: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -2793,15 +2680,15 @@ rx_body_146: mov r10, rax rx_i_147: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r13, 03a7df043h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_147 call rx_read_l1 rx_body_147: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, 1784404616 @@ -2814,15 +2701,14 @@ rx_body_147: mov qword ptr [rsi + rax * 8], rcx rx_i_148: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r10, 0783e5c4eh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_148 call rx_read_l1 rx_body_148: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, r14 @@ -2833,15 +2719,14 @@ rx_body_148: mov qword ptr [rsi + rax * 8], rcx 
rx_i_149: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r12, 0aa0f5b2fh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_149 call rx_read_l1 rx_body_149: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -2854,15 +2739,14 @@ rx_body_149: mov qword ptr [rsi + rax * 8], rcx rx_i_150: ;DIV_64 - dec ebp + dec ebx jz rx_finish xor r9, 01504ca7ah mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_150 call rx_read_l1 rx_body_150: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, 1 @@ -2878,15 +2762,14 @@ rx_body_150: mov qword ptr [rsi + rax * 8], rcx rx_i_151: ;OR_32 - dec ebp + dec ebx jz rx_finish xor r9, 0ea72a7cfh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_151 call rx_read_l1 rx_body_151: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] or eax, r13d @@ -2897,15 +2780,14 @@ rx_body_151: mov qword ptr [rsi + rax * 8], rcx rx_i_152: ;ROR_64 - dec ebp + dec ebx jz rx_finish xor r13, 0ad0e7a88h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_152 call rx_read_l1 rx_body_152: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 @@ -2913,15 +2795,14 @@ rx_body_152: mov r10, rax rx_i_153: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r15, 0fd95ab87h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_153 call rx_read_l1 rx_body_153: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm2 @@ -2935,15 +2816,14 @@ rx_body_153: movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_154: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r10, 0256697b0h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_154 call rx_read_l2 rx_body_154: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -2952,15 +2832,14 @@ rx_body_154: mov r10, rax rx_i_155: ;ROR_64 - dec ebp + dec ebx jz rx_finish xor r11, 0d23f3b78h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_155 call rx_read_l1 rx_body_155: - xor rdi, 
rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 @@ -2972,15 +2851,14 @@ rx_body_155: mov qword ptr [rsi + rax * 8], rcx rx_i_156: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r10, 098917533h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_156 call rx_read_l2 rx_body_156: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -2989,48 +2867,45 @@ rx_body_156: mov r15, rax rx_i_157: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r10, 0dfac3efch mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_157 call rx_read_l1 rx_body_157: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, r12 mov r14, rax rx_i_158: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r15, 0a64de090h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_158 call rx_read_l1 rx_body_158: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, 1233402159 mov r10, rax rx_i_159: ;RET - dec ebp + dec ebx jz rx_finish xor r13, 0952a3abbh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_159 call rx_read_l1 rx_body_159: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_159 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -3047,15 +2922,15 @@ not_taken_ret_159: mov qword ptr [rsi + rax * 8], rcx rx_i_160: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r14, 0b1685b90h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_160 call rx_read_l1 rx_body_160: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, 1518778665 @@ -3066,30 +2941,28 @@ rx_body_160: mov qword ptr [rsi + rax * 8], rcx rx_i_161: ;OR_64 - dec ebp + dec ebx jz rx_finish xor r15, 0ea992531h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_161 call rx_read_l1 rx_body_161: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] or rax, r14 mov r8, rax rx_i_162: ;SAR_64 - dec ebp + dec ebx jz rx_finish xor r9, 01fd57a4ah mov ecx, r9d - 
test ebp, 63 + test bl, 63 jnz short rx_body_162 call rx_read_l1 rx_body_162: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 @@ -3097,15 +2970,15 @@ rx_body_162: mov r13, rax rx_i_163: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r12, 0e3486c0ah mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_163 call rx_read_l2 rx_body_163: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] sub rax, -2101130488 @@ -3116,15 +2989,15 @@ rx_body_163: mov qword ptr [rsi + rax * 8], rcx rx_i_164: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r12, 01f0c2737h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_164 call rx_read_l1 rx_body_164: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -3137,18 +3010,17 @@ rx_body_164: mov qword ptr [rsi + rax * 8], rcx rx_i_165: ;RET - dec ebp + dec ebx jz rx_finish xor r12, 0debb493eh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_165 call rx_read_l1 rx_body_165: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_165 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -3165,15 +3037,14 @@ not_taken_ret_165: mov qword ptr [rsi + rax * 8], rcx rx_i_166: ;ROL_64 - dec ebp + dec ebx jz rx_finish xor r9, 0fe684081h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_166 call rx_read_l2 rx_body_166: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 @@ -3185,15 +3056,14 @@ rx_body_166: mov qword ptr [rsi + rax * 8], rcx rx_i_167: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r11, 0d10371ch mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_167 call rx_read_l1 rx_body_167: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm4 @@ -3207,34 +3077,31 @@ rx_body_167: movlpd qword ptr [rsi + rax * 8], xmm2 rx_i_168: ;FPSQRT - dec ebp + dec ebx jz rx_finish xor r12, 071b15effh mov ecx, r12d - test ebp, 63 + test bl, 
63 jnz short rx_body_168 call rx_read_l1 rx_body_168: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 - sqrtpd xmm0, xmm0 - movaps xmm7, xmm0 + sqrtpd xmm7, xmm0 rx_i_169: ;RET - dec ebp + dec ebx jz rx_finish xor r11, 072790347h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_169 call rx_read_l1 rx_body_169: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_169 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -3251,15 +3118,14 @@ not_taken_ret_169: mov qword ptr [rsi + rax * 8], rcx rx_i_170: ;CALL - dec ebp + dec ebx jz rx_finish xor r8, 04ae8a020h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_170 call rx_read_l2 rx_body_170: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] cmp r10d, -1541051751 @@ -3271,15 +3137,14 @@ taken_call_170: call rx_i_204 rx_i_171: ;IMULH_64 - dec ebp + dec ebx jz rx_finish xor r15, 09901e05bh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_171 call rx_read_l1 rx_body_171: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r12 @@ -3288,30 +3153,29 @@ rx_body_171: mov r12, rax rx_i_172: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r13, 050e8c510h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_172 call rx_read_l1 rx_body_172: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, r11 mov r12, rax rx_i_173: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r14, 05422cf8fh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_173 call rx_read_l1 rx_body_173: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r12 @@ -3322,15 +3186,14 @@ rx_body_173: mov qword ptr [rsi + rax * 8], rcx rx_i_174: ;FPROUND - dec ebp + dec ebx jz rx_finish xor r12, 0a025c3dbh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_174 call rx_read_l1 rx_body_174: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] shl eax, 13 
@@ -3340,15 +3203,14 @@ rx_body_174: ldmxcsr dword ptr [rsp - 8] rx_i_175: ;SAR_64 - dec ebp + dec ebx jz rx_finish xor r13, 08f74c11h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_175 call rx_read_l1 rx_body_175: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 @@ -3356,30 +3218,28 @@ rx_body_175: mov r8, rax rx_i_176: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r9, 01f2ed5f1h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_176 call rx_read_l2 rx_body_176: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] sub rax, r14 mov r10, rax rx_i_177: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r10, 0d2072c79h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_177 call rx_read_l2 rx_body_177: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] add rax, r10 @@ -3390,18 +3250,17 @@ rx_body_177: mov qword ptr [rsi + rax * 8], rcx rx_i_178: ;RET - dec ebp + dec ebx jz rx_finish xor r15, 0a8e51933h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_178 call rx_read_l1 rx_body_178: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_178 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -3418,30 +3277,28 @@ not_taken_ret_178: mov qword ptr [rsi + rax * 8], rcx rx_i_179: ;FPADD - dec ebp + dec ebx jz rx_finish xor r12, 0934ad492h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_179 call rx_read_l1 rx_body_179: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 movaps xmm8, xmm0 rx_i_180: ;XOR_64 - dec ebp + dec ebx jz rx_finish xor r15, 01cb3ce1fh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_180 call rx_read_l2 rx_body_180: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] xor rax, 1995308563 @@ -3452,18 +3309,18 @@ rx_body_180: mov qword ptr [rsi + rax * 8], rcx rx_i_181: ;RET - dec ebp + dec ebx jz rx_finish xor r10, 023c7845fh mov ecx, r10d - test ebp, 63 + test 
bl, 63 jnz short rx_body_181 call rx_read_l2 rx_body_181: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_181 xor rax, qword ptr [rsp + 8] mov r10, rax @@ -3472,60 +3329,58 @@ not_taken_ret_181: mov r10, rax rx_i_182: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r8, 0f8884327h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_182 call rx_read_l1 rx_body_182: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm7 movaps xmm6, xmm0 rx_i_183: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r13, 013070461h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_183 call rx_read_l1 rx_body_183: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, 137260710 mov r10, rax rx_i_184: ;SAR_64 - dec ebp + dec ebx jz rx_finish xor r12, 04764cdf7h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_184 call rx_read_l2 rx_body_184: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] sar rax, 40 mov r12, rax rx_i_185: ;CALL - dec ebp + dec ebx jz rx_finish xor r10, 03c41026fh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_185 call rx_read_l1 rx_body_185: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r15d, -1510284125 @@ -3541,15 +3396,15 @@ taken_call_185: call rx_i_246 rx_i_186: ;XOR_32 - dec ebp + dec ebx jz rx_finish xor r9, 0cded414bh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_186 call rx_read_l1 rx_body_186: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] xor eax, r15d @@ -3560,15 +3415,14 @@ rx_body_186: mov qword ptr [rsi + rax * 8], rcx rx_i_187: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r13, 05c6d64a8h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_187 call rx_read_l2 rx_body_187: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm6 @@ -3578,15 +3432,15 @@ rx_body_187: 
movaps xmm5, xmm0 rx_i_188: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r9, 04659becbh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_188 call rx_read_l1 rx_body_188: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm3 @@ -3596,15 +3450,14 @@ rx_body_188: movaps xmm4, xmm0 rx_i_189: ;FPROUND - dec ebp + dec ebx jz rx_finish xor r11, 0c52741d5h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_189 call rx_read_l1 rx_body_189: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] shl eax, 13 @@ -3614,18 +3467,17 @@ rx_body_189: ldmxcsr dword ptr [rsp - 8] rx_i_190: ;RET - dec ebp + dec ebx jz rx_finish xor r12, 0217bf5f3h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_190 call rx_read_l2 rx_body_190: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_190 xor rax, qword ptr [rsp + 8] mov r13, rax @@ -3634,15 +3486,14 @@ not_taken_ret_190: mov r13, rax rx_i_191: ;CALL - dec ebp + dec ebx jz rx_finish xor r15, 0884f3526h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_191 call rx_read_l1 rx_body_191: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r11d, 1687119072 @@ -3658,15 +3509,14 @@ taken_call_191: call rx_i_275 rx_i_192: ;CALL - dec ebp + dec ebx jz rx_finish xor r8, 0d76edad3h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_192 call rx_read_l1 rx_body_192: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r14d, -117628864 @@ -3678,15 +3528,14 @@ taken_call_192: call rx_i_305 rx_i_193: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r12, 0e9939ach mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_193 call rx_read_l1 rx_body_193: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -3699,15 +3548,14 @@ rx_body_193: mov qword ptr [rsi + rax * 8], rcx rx_i_194: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r12, 0f21ca520h mov ecx, 
r12d - test ebp, 63 + test bl, 63 jnz short rx_body_194 call rx_read_l2 rx_body_194: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 @@ -3721,15 +3569,14 @@ rx_body_194: movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_195: ;ROL_64 - dec ebp + dec ebx jz rx_finish xor r10, 09405152ch mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_195 call rx_read_l1 rx_body_195: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 @@ -3737,15 +3584,14 @@ rx_body_195: mov r9, rax rx_i_196: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r8, 0c2a9f41bh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_196 call rx_read_l2 rx_body_196: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] sub rax, -1907903895 @@ -3756,30 +3602,29 @@ rx_body_196: mov qword ptr [rsi + rax * 8], rcx rx_i_197: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r12, 0229208efh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_197 call rx_read_l1 rx_body_197: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov r11, rax rx_i_198: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r14, 0c8d95bbbh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_198 call rx_read_l1 rx_body_198: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 @@ -3792,15 +3637,14 @@ rx_body_198: mov qword ptr [rsi + rax * 8], rcx rx_i_199: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r13, 050049e2eh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_199 call rx_read_l2 rx_body_199: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 @@ -3813,15 +3657,14 @@ rx_body_199: mov qword ptr [rsi + rax * 8], rcx rx_i_200: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r10, 0c63b99e8h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_200 call rx_read_l1 rx_body_200: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 @@ -3832,15 
+3675,15 @@ rx_body_200: movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_201: ;FPADD - dec ebp + dec ebx jz rx_finish xor r8, 0cdda801dh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_201 call rx_read_l1 rx_body_201: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 @@ -3851,30 +3694,28 @@ rx_body_201: movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_202: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r13, 0fa44b04ah mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_202 call rx_read_l2 rx_body_202: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm9 movaps xmm5, xmm0 rx_i_203: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r10, 0d73e472ch mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_203 call rx_read_l1 rx_body_203: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 @@ -3885,15 +3726,14 @@ rx_body_203: movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_204: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r9, 01af8ab1dh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_204 call rx_read_l1 rx_body_204: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r15 @@ -3904,15 +3744,15 @@ rx_body_204: mov qword ptr [rsi + rax * 8], rcx rx_i_205: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r14, 094e997c5h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_205 call rx_read_l1 rx_body_205: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm8 @@ -3922,15 +3762,15 @@ rx_body_205: movaps xmm5, xmm0 rx_i_206: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r11, 0e836a177h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_206 call rx_read_l1 rx_body_206: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm7 @@ -3940,15 +3780,15 @@ rx_body_206: movaps xmm4, xmm0 rx_i_207: ;AND_32 - dec ebp + dec ebx jz rx_finish xor r9, 
039ccdd30h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_207 call rx_read_l1 rx_body_207: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] and eax, r12d @@ -3959,30 +3799,29 @@ rx_body_207: mov qword ptr [rsi + rax * 8], rcx rx_i_208: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r9, 0f4f126c5h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_208 call rx_read_l1 rx_body_208: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r12 mov r10, rax rx_i_209: ;SHR_64 - dec ebp + dec ebx jz rx_finish xor r8, 0b84811f1h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_209 call rx_read_l1 rx_body_209: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] shr rax, 30 @@ -3993,15 +3832,15 @@ rx_body_209: mov qword ptr [rsi + rax * 8], rcx rx_i_210: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r12, 0c5efc90ah mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_210 call rx_read_l2 rx_body_210: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -4014,30 +3853,29 @@ rx_body_210: mov qword ptr [rsi + rax * 8], rcx rx_i_211: ;FPADD - dec ebp + dec ebx jz rx_finish xor r12, 0ce533072h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_211 call rx_read_l2 rx_body_211: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm3, xmm0 rx_i_212: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r13, 06b465fdbh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_212 call rx_read_l1 rx_body_212: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r13 @@ -4048,15 +3886,15 @@ rx_body_212: mov qword ptr [rsi + rax * 8], rcx rx_i_213: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r13, 02dd1d503h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_213 call rx_read_l1 rx_body_213: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] 
movsxd rcx, eax @@ -4065,15 +3903,14 @@ rx_body_213: mov r14, rax rx_i_214: ;ROL_64 - dec ebp + dec ebx jz rx_finish xor r9, 0a159f313h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_214 call rx_read_l1 rx_body_214: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 @@ -4081,30 +3918,29 @@ rx_body_214: mov r14, rax rx_i_215: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r15, 08359265eh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_215 call rx_read_l1 rx_body_215: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, r12 mov r10, rax rx_i_216: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r12, 080696de3h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_216 call rx_read_l1 rx_body_216: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r13 @@ -4115,15 +3951,14 @@ rx_body_216: mov qword ptr [rsi + rax * 8], rcx rx_i_217: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r8, 040d5b526h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_217 call rx_read_l1 rx_body_217: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -4136,15 +3971,14 @@ rx_body_217: mov qword ptr [rsi + rax * 8], rcx rx_i_218: ;CALL - dec ebp + dec ebx jz rx_finish xor r11, 083c0bd93h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_218 call rx_read_l2 rx_body_218: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] cmp r8d, -585552250 @@ -4156,15 +3990,14 @@ taken_call_218: call rx_i_240 rx_i_219: ;XOR_64 - dec ebp + dec ebx jz rx_finish xor r8, 0ca37f668h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_219 call rx_read_l1 rx_body_219: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] xor rax, -740915304 @@ -4175,15 +4008,14 @@ rx_body_219: mov qword ptr [rsi + rax * 8], rcx rx_i_220: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r9, 0bb44c384h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_220 call 
rx_read_l1 rx_body_220: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -4196,15 +4028,14 @@ rx_body_220: mov qword ptr [rsi + rax * 8], rcx rx_i_221: ;IMULH_64 - dec ebp + dec ebx jz rx_finish xor r9, 0a3deb512h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_221 call rx_read_l1 rx_body_221: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 @@ -4217,15 +4048,14 @@ rx_body_221: mov qword ptr [rsi + rax * 8], rcx rx_i_222: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r9, 084a02d64h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_222 call rx_read_l2 rx_body_222: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm5 @@ -4239,15 +4069,15 @@ rx_body_222: movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_223: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r8, 01e5cc085h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_223 call rx_read_l1 rx_body_223: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 @@ -4258,15 +4088,14 @@ rx_body_223: movlpd qword ptr [rsi + rax * 8], xmm2 rx_i_224: ;SAR_64 - dec ebp + dec ebx jz rx_finish xor r12, 053982440h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_224 call rx_read_l1 rx_body_224: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 @@ -4278,15 +4107,14 @@ rx_body_224: mov qword ptr [rsi + rax * 8], rcx rx_i_225: ;DIV_64 - dec ebp + dec ebx jz rx_finish xor r13, 0c558367eh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_225 call rx_read_l2 rx_body_225: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov ecx, 1 @@ -4302,15 +4130,14 @@ rx_body_225: mov qword ptr [rsi + rax * 8], rcx rx_i_226: ;CALL - dec ebp + dec ebx jz rx_finish xor r10, 040139b65h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_226 call rx_read_l1 rx_body_226: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r8d, 
-1752488808 @@ -4326,15 +4153,14 @@ taken_call_226: call rx_i_328 rx_i_227: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r11, 0fa312dbdh mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_227 call rx_read_l1 rx_body_227: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm7 @@ -4348,15 +4174,14 @@ rx_body_227: movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_228: ;CALL - dec ebp + dec ebx jz rx_finish xor r11, 0b64246c0h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_228 call rx_read_l1 rx_body_228: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r10d, -2099304 @@ -4372,15 +4197,14 @@ taken_call_228: call rx_i_283 rx_i_229: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r11, 05c535836h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_229 call rx_read_l2 rx_body_229: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -4393,15 +4217,14 @@ rx_body_229: mov qword ptr [rsi + rax * 8], rcx rx_i_230: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r15, 0f394972eh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_230 call rx_read_l1 rx_body_230: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm6 @@ -4415,18 +4238,18 @@ rx_body_230: movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_231: ;RET - dec ebp + dec ebx jz rx_finish xor r9, 0bb56428dh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_231 call rx_read_l1 rx_body_231: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_231 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -4443,15 +4266,14 @@ not_taken_ret_231: mov qword ptr [rsi + rax * 8], rcx rx_i_232: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r15, 09ab46ab3h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_232 call rx_read_l1 rx_body_232: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm3 @@ -4461,15 
+4283,15 @@ rx_body_232: movaps xmm7, xmm0 rx_i_233: ;CALL - dec ebp + dec ebx jz rx_finish xor r13, 08eb2cd76h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_233 call rx_read_l1 rx_body_233: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r12d, 392389867 @@ -4481,15 +4303,14 @@ taken_call_233: call rx_i_268 rx_i_234: ;FPROUND - dec ebp + dec ebx jz rx_finish xor r15, 0ba687578h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_234 call rx_read_l1 rx_body_234: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] shl eax, 13 @@ -4499,15 +4320,14 @@ rx_body_234: ldmxcsr dword ptr [rsp - 8] rx_i_235: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r13, 0b6cb9ff2h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_235 call rx_read_l2 rx_body_235: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -4520,30 +4340,28 @@ rx_body_235: mov qword ptr [rsi + rax * 8], rcx rx_i_236: ;FPADD - dec ebp + dec ebx jz rx_finish xor r15, 03ad196ach mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_236 call rx_read_l2 rx_body_236: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm4 movaps xmm3, xmm0 rx_i_237: ;CALL - dec ebp + dec ebx jz rx_finish xor r15, 0fab4600h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_237 call rx_read_l2 rx_body_237: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] cmp r12d, -121899164 @@ -4555,15 +4373,15 @@ taken_call_237: call rx_i_295 rx_i_238: ;FPADD - dec ebp + dec ebx jz rx_finish xor r8, 0158f119fh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_238 call rx_read_l2 rx_body_238: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 @@ -4574,30 +4392,29 @@ rx_body_238: movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_239: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r13, 044f30b3fh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short 
rx_body_239 call rx_read_l1 rx_body_239: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, r10 mov r10, rax rx_i_240: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r9, 0d65d29f9h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_240 call rx_read_l1 rx_body_240: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -4606,15 +4423,14 @@ rx_body_240: mov r8, rax rx_i_241: ;FPADD - dec ebp + dec ebx jz rx_finish xor r11, 0ce5260adh mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_241 call rx_read_l1 rx_body_241: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 @@ -4625,15 +4441,14 @@ rx_body_241: movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_242: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r12, 01119b0f9h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_242 call rx_read_l2 rx_body_242: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, 319324914 @@ -4646,45 +4461,43 @@ rx_body_242: mov qword ptr [rsi + rax * 8], rcx rx_i_243: ;XOR_64 - dec ebp + dec ebx jz rx_finish xor r12, 0d6c2ce3dh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_243 call rx_read_l1 rx_body_243: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] xor rax, 1198180774 mov r14, rax rx_i_244: ;FPADD - dec ebp + dec ebx jz rx_finish xor r11, 0c6a6248h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_244 call rx_read_l2 rx_body_244: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm9, xmm0 rx_i_245: ;XOR_64 - dec ebp + dec ebx jz rx_finish xor r13, 084505739h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_245 call rx_read_l1 rx_body_245: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] xor rax, -1546539637 @@ -4695,30 +4508,29 @@ rx_body_245: mov qword ptr [rsi + rax * 8], rcx rx_i_246: ;AND_64 - dec ebp + dec ebx jz rx_finish xor r15, 027eeaa2eh 
mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_246 call rx_read_l2 rx_body_246: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] and rax, r9 mov r12, rax rx_i_247: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r10, 0c4de0296h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_247 call rx_read_l1 rx_body_247: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -4731,15 +4543,15 @@ rx_body_247: mov qword ptr [rsi + rax * 8], rcx rx_i_248: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r8, 0649df46fh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_248 call rx_read_l2 rx_body_248: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -4752,15 +4564,15 @@ rx_body_248: mov qword ptr [rsi + rax * 8], rcx rx_i_249: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r15, 0499552cch mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_249 call rx_read_l2 rx_body_249: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -4773,15 +4585,14 @@ rx_body_249: mov qword ptr [rsi + rax * 8], rcx rx_i_250: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r13, 083eafe6fh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_250 call rx_read_l2 rx_body_250: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r8 @@ -4792,15 +4603,14 @@ rx_body_250: mov qword ptr [rsi + rax * 8], rcx rx_i_251: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r13, 0a25a4d8ah mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_251 call rx_read_l2 rx_body_251: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 @@ -4814,15 +4624,14 @@ rx_body_251: movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_252: ;ROL_64 - dec ebp + dec ebx jz rx_finish xor r14, 08a75ad41h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_252 call rx_read_l2 rx_body_252: - xor rdi, rcx and ecx, 32767 mov 
rax, qword ptr [rsi+rcx*8] mov rcx, r8 @@ -4830,15 +4639,14 @@ rx_body_252: mov r14, rax rx_i_253: ;CALL - dec ebp + dec ebx jz rx_finish xor r14, 057f3f596h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_253 call rx_read_l1 rx_body_253: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r15d, 1699431947 @@ -4854,15 +4662,14 @@ taken_call_253: call rx_i_367 rx_i_254: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r14, 04cfb709eh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_254 call rx_read_l1 rx_body_254: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm4 @@ -4873,15 +4680,14 @@ rx_body_254: movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_255: ;FPADD - dec ebp + dec ebx jz rx_finish xor r9, 0b96ec9ech mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_255 call rx_read_l1 rx_body_255: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 @@ -4892,15 +4698,15 @@ rx_body_255: movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_256: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r8, 08375472ch mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_256 call rx_read_l1 rx_body_256: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 @@ -4913,15 +4719,14 @@ rx_body_256: mov qword ptr [rsi + rax * 8], rcx rx_i_257: ;FPADD - dec ebp + dec ebx jz rx_finish xor r12, 0d75a8c3fh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_257 call rx_read_l2 rx_body_257: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 @@ -4932,15 +4737,15 @@ rx_body_257: movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_258: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r11, 064fdbda0h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_258 call rx_read_l1 rx_body_258: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -4953,30 +4758,29 @@ rx_body_258: mov qword ptr [rsi + rax * 8], 
rcx rx_i_259: ;FPADD - dec ebp + dec ebx jz rx_finish xor r11, 02e36a073h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_259 call rx_read_l1 rx_body_259: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm3, xmm0 rx_i_260: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r13, 0f94e9fa9h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_260 call rx_read_l2 rx_body_260: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm5 @@ -4986,35 +4790,34 @@ rx_body_260: movaps xmm9, xmm0 rx_i_261: ;FPSQRT - dec ebp + dec ebx jz rx_finish xor r14, 02346171ch mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_261 call rx_read_l2 rx_body_261: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 - sqrtpd xmm0, xmm0 - movaps xmm3, xmm0 + sqrtpd xmm3, xmm0 mov eax, r11d xor eax, 0745a48e9h and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_262: ;OR_32 - dec ebp + dec ebx jz rx_finish xor r10, 01c42baa6h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_262 call rx_read_l1 rx_body_262: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] or eax, r13d @@ -5025,15 +4828,15 @@ rx_body_262: mov qword ptr [rsi + rax * 8], rcx rx_i_263: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r11, 0b39b140h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_263 call rx_read_l1 rx_body_263: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm8 @@ -5043,15 +4846,14 @@ rx_body_263: movaps xmm6, xmm0 rx_i_264: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r11, 01a07d201h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_264 call rx_read_l2 rx_body_264: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm3 @@ -5061,15 +4863,15 @@ rx_body_264: movaps xmm7, xmm0 rx_i_265: ;FPADD - dec ebp + dec ebx jz rx_finish xor 
r13, 07a3eb340h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_265 call rx_read_l2 rx_body_265: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm8 @@ -5080,18 +4882,17 @@ rx_body_265: movhpd qword ptr [rsi + rax * 8], xmm2 rx_i_266: ;RET - dec ebp + dec ebx jz rx_finish xor r13, 03d0a3a89h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_266 call rx_read_l1 rx_body_266: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_266 xor rax, qword ptr [rsp + 8] mov r10, rax @@ -5100,30 +4901,29 @@ not_taken_ret_266: mov r10, rax rx_i_267: ;ROR_64 - dec ebp + dec ebx jz rx_finish xor r8, 0c6c7b37h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_267 call rx_read_l1 rx_body_267: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] ror rax, 56 mov r11, rax rx_i_268: ;CALL - dec ebp + dec ebx jz rx_finish xor r12, 0c2510cebh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_268 call rx_read_l2 rx_body_268: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] cmp r15d, -2062812966 @@ -5135,15 +4935,14 @@ taken_call_268: call rx_i_381 rx_i_269: ;ROR_64 - dec ebp + dec ebx jz rx_finish xor r11, 0c80cc899h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_269 call rx_read_l1 rx_body_269: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 @@ -5155,15 +4954,14 @@ rx_body_269: mov qword ptr [rsi + rax * 8], rcx rx_i_270: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r11, 0eb355caah mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_270 call rx_read_l1 rx_body_270: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm9 @@ -5173,15 +4971,14 @@ rx_body_270: movaps xmm7, xmm0 rx_i_271: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r13, 0c6f12299h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_271 call rx_read_l1 rx_body_271: 
- xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -5194,30 +4991,29 @@ rx_body_271: mov qword ptr [rsi + rax * 8], rcx rx_i_272: ;OR_32 - dec ebp + dec ebx jz rx_finish xor r12, 0695a5dd2h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_272 call rx_read_l2 rx_body_272: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] or eax, r12d mov r13, rax rx_i_273: ;CALL - dec ebp + dec ebx jz rx_finish xor r9, 0d315e4dch mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_273 call rx_read_l1 rx_body_273: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r12d, 1670848568 @@ -5233,15 +5029,14 @@ taken_call_273: call rx_i_372 rx_i_274: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r15, 0b66ca7e0h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_274 call rx_read_l2 rx_body_274: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm4 @@ -5252,30 +5047,30 @@ rx_body_274: movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_275: ;OR_64 - dec ebp + dec ebx jz rx_finish xor r10, 0788eceb7h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_275 call rx_read_l2 rx_body_275: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] or rax, r11 mov r13, rax rx_i_276: ;CALL - dec ebp + dec ebx jz rx_finish xor r9, 0c6ac5edah mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_276 call rx_read_l1 rx_body_276: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r11d, -1236180570 @@ -5291,15 +5086,14 @@ taken_call_276: call rx_i_404 rx_i_277: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r11, 0c9549789h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_277 call rx_read_l2 rx_body_277: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -5312,15 +5106,14 @@ rx_body_277: mov qword ptr [rsi + rax * 8], rcx rx_i_278: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r9, 0a2bc66c9h mov 
ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_278 call rx_read_l1 rx_body_278: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm7 @@ -5331,15 +5124,14 @@ rx_body_278: movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_279: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r15, 0f1a91458h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_279 call rx_read_l1 rx_body_279: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm5 @@ -5350,15 +5142,14 @@ rx_body_279: movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_280: ;AND_64 - dec ebp + dec ebx jz rx_finish xor r12, 066246b43h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_280 call rx_read_l2 rx_body_280: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] and rax, r11 @@ -5369,15 +5160,14 @@ rx_body_280: mov qword ptr [rsi + rax * 8], rcx rx_i_281: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r10, 05a762727h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_281 call rx_read_l1 rx_body_281: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, r10 @@ -5388,30 +5178,29 @@ rx_body_281: mov qword ptr [rsi + rax * 8], rcx rx_i_282: ;SUB_32 - dec ebp + dec ebx jz rx_finish xor r15, 0de1ab603h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_282 call rx_read_l1 rx_body_282: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub eax, 1367326224 mov r11, rax rx_i_283: ;ADD_32 - dec ebp + dec ebx jz rx_finish xor r9, 0df4d084fh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_283 call rx_read_l1 rx_body_283: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add eax, -1156732976 @@ -5422,15 +5211,15 @@ rx_body_283: mov qword ptr [rsi + rax * 8], rcx rx_i_284: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r15, 0e68f36ach mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_284 call rx_read_l1 rx_body_284: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd 
xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm6 @@ -5441,15 +5230,14 @@ rx_body_284: movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_285: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r8, 09adb333bh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_285 call rx_read_l1 rx_body_285: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -5458,30 +5246,29 @@ rx_body_285: mov r14, rax rx_i_286: ;FPADD - dec ebp + dec ebx jz rx_finish xor r14, 082f5e36ch mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_286 call rx_read_l1 rx_body_286: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm7, xmm0 rx_i_287: ;OR_64 - dec ebp + dec ebx jz rx_finish xor r11, 049547c9ch mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_287 call rx_read_l1 rx_body_287: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] or rax, r15 @@ -5492,15 +5279,14 @@ rx_body_287: mov qword ptr [rsi + rax * 8], rcx rx_i_288: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r10, 08716ac8bh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_288 call rx_read_l1 rx_body_288: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r8 @@ -5511,15 +5297,14 @@ rx_body_288: mov qword ptr [rsi + rax * 8], rcx rx_i_289: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r14, 0efef52b5h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_289 call rx_read_l2 rx_body_289: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm9 @@ -5529,15 +5314,14 @@ rx_body_289: movaps xmm8, xmm0 rx_i_290: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r15, 060665748h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_290 call rx_read_l2 rx_body_290: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm8 @@ -5547,18 +5331,17 @@ rx_body_290: movaps xmm9, xmm0 rx_i_291: ;RET - dec ebp + dec ebx jz rx_finish xor r13, 0ddf4bd1ah mov ecx, r13d - 
test ebp, 63 + test bl, 63 jnz short rx_body_291 call rx_read_l2 rx_body_291: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_291 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -5575,48 +5358,45 @@ not_taken_ret_291: mov qword ptr [rsi + rax * 8], rcx rx_i_292: ;ROR_64 - dec ebp + dec ebx jz rx_finish xor r13, 05a87cc3dh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_292 call rx_read_l1 rx_body_292: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] ror rax, 23 mov r10, rax rx_i_293: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r9, 0c61f4279h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_293 call rx_read_l2 rx_body_293: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm5 movaps xmm8, xmm0 rx_i_294: ;RET - dec ebp + dec ebx jz rx_finish xor r14, 0f3b9d85h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_294 call rx_read_l2 rx_body_294: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_294 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -5633,30 +5413,29 @@ not_taken_ret_294: mov qword ptr [rsi + rax * 8], rcx rx_i_295: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r9, 0f42798fdh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_295 call rx_read_l1 rx_body_295: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm8 movaps xmm7, xmm0 rx_i_296: ;CALL - dec ebp + dec ebx jz rx_finish xor r14, 018738758h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_296 call rx_read_l1 rx_body_296: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r9d, -207252278 @@ -5672,45 +5451,43 @@ taken_call_296: call rx_i_395 rx_i_297: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r15, 0de3b9d9bh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_297 call rx_read_l1 rx_body_297: - xor rdi, rcx and ecx, 2047 
mov rax, qword ptr [rsi+rcx*8] add rax, r10 mov r14, rax rx_i_298: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r14, 084f53637h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_298 call rx_read_l1 rx_body_298: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm7 movaps xmm6, xmm0 rx_i_299: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r12, 042f4897h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_299 call rx_read_l1 rx_body_299: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, 21400308 @@ -5721,30 +5498,29 @@ rx_body_299: mov qword ptr [rsi + rax * 8], rcx rx_i_300: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r12, 095765693h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_300 call rx_read_l2 rx_body_300: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm8 movaps xmm2, xmm0 rx_i_301: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r8, 0a0ec5eech mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_301 call rx_read_l1 rx_body_301: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm5 @@ -5758,30 +5534,29 @@ rx_body_301: movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_302: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r15, 0f6f8c345h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_302 call rx_read_l1 rx_body_302: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, r10 mov r11, rax rx_i_303: ;FPADD - dec ebp + dec ebx jz rx_finish xor r14, 082a3e965h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_303 call rx_read_l1 rx_body_303: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 @@ -5792,60 +5567,56 @@ rx_body_303: movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_304: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r12, 04940c652h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_304 call rx_read_l1 
rx_body_304: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov r13, rax rx_i_305: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r11, 03c6c62b8h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_305 call rx_read_l2 rx_body_305: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, rax, -65873120 mov r10, rax rx_i_306: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r15, 08b34cdfch mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_306 call rx_read_l2 rx_body_306: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] add rax, r15 mov r13, rax rx_i_307: ;SAR_64 - dec ebp + dec ebx jz rx_finish xor r15, 04c36adb1h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_307 call rx_read_l1 rx_body_307: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 @@ -5853,30 +5624,28 @@ rx_body_307: mov r10, rax rx_i_308: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r11, 0a4213b21h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_308 call rx_read_l2 rx_body_308: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r13 mov r15, rax rx_i_309: ;IMULH_64 - dec ebp + dec ebx jz rx_finish xor r9, 090c42304h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_309 call rx_read_l1 rx_body_309: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, -1652850028 @@ -5889,15 +5658,14 @@ rx_body_309: mov qword ptr [rsi + rax * 8], rcx rx_i_310: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r9, 0f78e1c8ch mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_310 call rx_read_l1 rx_body_310: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm6 @@ -5911,15 +5679,14 @@ rx_body_310: movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_311: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r8, 0ff8848cfh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_311 call rx_read_l1 rx_body_311: - xor rdi, rcx and ecx, 2047 cvtdq2pd 
xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm4 @@ -5929,15 +5696,14 @@ rx_body_311: movaps xmm4, xmm0 rx_i_312: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r13, 0b18904cdh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_312 call rx_read_l1 rx_body_312: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -5946,30 +5712,28 @@ rx_body_312: mov r10, rax rx_i_313: ;FPADD - dec ebp + dec ebx jz rx_finish xor r8, 0a0d0befh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_313 call rx_read_l1 rx_body_313: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm6, xmm0 rx_i_314: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r15, 01e3c65f7h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_314 call rx_read_l1 rx_body_314: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -5982,15 +5746,14 @@ rx_body_314: mov qword ptr [rsi + rax * 8], rcx rx_i_315: ;SHR_64 - dec ebp + dec ebx jz rx_finish xor r9, 02e36ddafh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_315 call rx_read_l1 rx_body_315: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 @@ -5998,18 +5761,18 @@ rx_body_315: mov r9, rax rx_i_316: ;RET - dec ebp + dec ebx jz rx_finish xor r14, 05b0cb5bbh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_316 call rx_read_l1 rx_body_316: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_316 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -6026,45 +5789,42 @@ not_taken_ret_316: mov qword ptr [rsi + rax * 8], rcx rx_i_317: ;FPADD - dec ebp + dec ebx jz rx_finish xor r9, 0c74e7415h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_317 call rx_read_l1 rx_body_317: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm7 movaps xmm5, xmm0 rx_i_318: ;FPADD - dec ebp + dec ebx jz rx_finish xor r9, 057621d9ah mov 
ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_318 call rx_read_l1 rx_body_318: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm7, xmm0 rx_i_319: ;ROL_64 - dec ebp + dec ebx jz rx_finish xor r13, 08ee02d99h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_319 call rx_read_l2 rx_body_319: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 @@ -6076,15 +5836,14 @@ rx_body_319: mov qword ptr [rsi + rax * 8], rcx rx_i_320: ;FPADD - dec ebp + dec ebx jz rx_finish xor r15, 013461188h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_320 call rx_read_l1 rx_body_320: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm4 @@ -6095,15 +5854,14 @@ rx_body_320: movlpd qword ptr [rsi + rax * 8], xmm2 rx_i_321: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r11, 0a7bae383h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_321 call rx_read_l1 rx_body_321: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -6116,18 +5874,17 @@ rx_body_321: mov qword ptr [rsi + rax * 8], rcx rx_i_322: ;RET - dec ebp + dec ebx jz rx_finish xor r14, 08215399bh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_322 call rx_read_l1 rx_body_322: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_322 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -6144,15 +5901,15 @@ not_taken_ret_322: mov qword ptr [rsi + rax * 8], rcx rx_i_323: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r14, 07b07664bh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_323 call rx_read_l1 rx_body_323: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, -696924877 @@ -6165,50 +5922,47 @@ rx_body_323: mov qword ptr [rsi + rax * 8], rcx rx_i_324: ;FPSQRT - dec ebp + dec ebx jz rx_finish xor r9, 0f956baffh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short 
rx_body_324 call rx_read_l1 rx_body_324: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 - sqrtpd xmm0, xmm0 - movaps xmm9, xmm0 + sqrtpd xmm9, xmm0 mov eax, r9d xor eax, 0944856d4h and eax, 32767 movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_325: ;SHL_64 - dec ebp + dec ebx jz rx_finish xor r11, 0708ab9d1h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_325 call rx_read_l1 rx_body_325: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] shl rax, 24 mov r13, rax rx_i_326: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r11, 0d1b27540h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_326 call rx_read_l1 rx_body_326: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 @@ -6221,30 +5975,29 @@ rx_body_326: mov qword ptr [rsi + rax * 8], rcx rx_i_327: ;AND_64 - dec ebp + dec ebx jz rx_finish xor r9, 09665f98dh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_327 call rx_read_l1 rx_body_327: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] and rax, r15 mov r12, rax rx_i_328: ;ROL_64 - dec ebp + dec ebx jz rx_finish xor r12, 0fb9c32adh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_328 call rx_read_l1 rx_body_328: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r13 @@ -6252,18 +6005,17 @@ rx_body_328: mov r9, rax rx_i_329: ;RET - dec ebp + dec ebx jz rx_finish xor r11, 0e1110623h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_329 call rx_read_l1 rx_body_329: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_329 xor rax, qword ptr [rsp + 8] mov r11, rax @@ -6272,15 +6024,15 @@ not_taken_ret_329: mov r11, rax rx_i_330: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r9, 0f6a93f19h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_330 call rx_read_l1 rx_body_330: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword 
ptr [rsi+rcx*8] mov ecx, eax @@ -6293,30 +6045,30 @@ rx_body_330: mov qword ptr [rsi + rax * 8], rcx rx_i_331: ;FPADD - dec ebp + dec ebx jz rx_finish xor r9, 0bc9bbe4ah mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_331 call rx_read_l1 rx_body_331: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm9, xmm0 rx_i_332: ;FPADD - dec ebp + dec ebx jz rx_finish xor r12, 0f253cd4eh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_332 call rx_read_l1 rx_body_332: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 @@ -6327,45 +6079,44 @@ rx_body_332: movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_333: ;XOR_64 - dec ebp + dec ebx jz rx_finish xor r14, 0f009758bh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_333 call rx_read_l2 rx_body_333: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] xor rax, -175125848 mov r11, rax rx_i_334: ;ADD_32 - dec ebp + dec ebx jz rx_finish xor r8, 0dda04168h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_334 call rx_read_l1 rx_body_334: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add eax, r13d mov r8, rax rx_i_335: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r15, 03e6cfb73h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_335 call rx_read_l1 rx_body_335: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, r8 @@ -6376,30 +6127,29 @@ rx_body_335: mov qword ptr [rsi + rax * 8], rcx rx_i_336: ;FPADD - dec ebp + dec ebx jz rx_finish xor r15, 0aea0a435h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_336 call rx_read_l1 rx_body_336: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 movaps xmm3, xmm0 rx_i_337: ;ADD_32 - dec ebp + dec ebx jz rx_finish xor r8, 03d6c4ab2h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_337 call rx_read_l1 
rx_body_337: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add eax, r12d @@ -6410,60 +6160,57 @@ rx_body_337: mov qword ptr [rsi + rax * 8], rcx rx_i_338: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r12, 0d428a742h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_338 call rx_read_l2 rx_body_338: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r12 mov r11, rax rx_i_339: ;FPADD - dec ebp + dec ebx jz rx_finish xor r9, 04596ef73h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_339 call rx_read_l1 rx_body_339: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm2, xmm0 rx_i_340: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r15, 0e51629cch mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_340 call rx_read_l1 rx_body_340: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm5 movaps xmm5, xmm0 rx_i_341: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r12, 019eb9ea5h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_341 call rx_read_l1 rx_body_341: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -6476,15 +6223,14 @@ rx_body_341: mov qword ptr [rsi + rax * 8], rcx rx_i_342: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r9, 09ccc7abah mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_342 call rx_read_l2 rx_body_342: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 @@ -6494,15 +6240,15 @@ rx_body_342: movaps xmm3, xmm0 rx_i_343: ;SHR_64 - dec ebp + dec ebx jz rx_finish xor r14, 056f6cf0bh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_343 call rx_read_l1 rx_body_343: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] shr rax, 48 @@ -6513,15 +6259,14 @@ rx_body_343: mov qword ptr [rsi + rax * 8], rcx rx_i_344: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r10, 03ef9bcc4h mov ecx, r10d - test ebp, 63 + test bl, 
63 jnz short rx_body_344 call rx_read_l2 rx_body_344: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm6 @@ -6531,15 +6276,14 @@ rx_body_344: movaps xmm5, xmm0 rx_i_345: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r12, 0bbbcdbach mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_345 call rx_read_l1 rx_body_345: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r13 @@ -6552,15 +6296,14 @@ rx_body_345: mov qword ptr [rsi + rax * 8], rcx rx_i_346: ;XOR_64 - dec ebp + dec ebx jz rx_finish xor r12, 0ae9d1e96h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_346 call rx_read_l1 rx_body_346: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] xor rax, r15 @@ -6571,30 +6314,28 @@ rx_body_346: mov qword ptr [rsi + rax * 8], rcx rx_i_347: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r14, 070c34d69h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_347 call rx_read_l1 rx_body_347: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, r10 mov r13, rax rx_i_348: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r13, 0523ff904h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_348 call rx_read_l1 rx_body_348: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 @@ -6605,30 +6346,28 @@ rx_body_348: movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_349: ;XOR_32 - dec ebp + dec ebx jz rx_finish xor r8, 018e0e5ddh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_349 call rx_read_l2 rx_body_349: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] xor eax, r15d mov r13, rax rx_i_350: ;CALL - dec ebp + dec ebx jz rx_finish xor r9, 09bd050f0h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_350 call rx_read_l1 rx_body_350: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r9d, -980411581 @@ -6644,30 +6383,29 @@ taken_call_350: call rx_i_352 rx_i_351: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r11, 
0a3a5906fh mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_351 call rx_read_l2 rx_body_351: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r10 mov r13, rax rx_i_352: ;FPADD - dec ebp + dec ebx jz rx_finish xor r10, 0afc9af2bh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_352 call rx_read_l1 rx_body_352: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 @@ -6678,15 +6416,14 @@ rx_body_352: movhpd qword ptr [rsi + rax * 8], xmm2 rx_i_353: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r13, 02e65278bh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_353 call rx_read_l1 rx_body_353: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 @@ -6700,15 +6437,14 @@ rx_body_353: movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_354: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r13, 02412fc10h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_354 call rx_read_l2 rx_body_354: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r13 @@ -6717,15 +6453,14 @@ rx_body_354: mov r13, rax rx_i_355: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r10, 06bd6e65fh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_355 call rx_read_l1 rx_body_355: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r14 @@ -6736,45 +6471,43 @@ rx_body_355: mov qword ptr [rsi + rax * 8], rcx rx_i_356: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r10, 01cd85d80h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_356 call rx_read_l2 rx_body_356: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r10 mov r11, rax rx_i_357: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r10, 0f7daed36h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_357 call rx_read_l2 rx_body_357: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] add rax, 820073637 mov r11, rax rx_i_358: ;DIV_64 
- dec ebp + dec ebx jz rx_finish xor r13, 088fa6e5ah mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_358 call rx_read_l2 rx_body_358: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov ecx, 1 @@ -6786,15 +6519,15 @@ rx_body_358: mov r9, rax rx_i_359: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r10, 0714fc2cdh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_359 call rx_read_l2 rx_body_359: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm9 @@ -6805,15 +6538,14 @@ rx_body_359: movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_360: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r10, 0c2d110b5h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_360 call rx_read_l1 rx_body_360: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm8 @@ -6823,35 +6555,32 @@ rx_body_360: movaps xmm8, xmm0 rx_i_361: ;FPSQRT - dec ebp + dec ebx jz rx_finish xor r15, 01d125a7fh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_361 call rx_read_l1 rx_body_361: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 - sqrtpd xmm0, xmm0 - movaps xmm6, xmm0 + sqrtpd xmm6, xmm0 mov eax, r14d xor eax, 0ad0b81f5h and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_362: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r9, 0ed8954bdh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_362 call rx_read_l1 rx_body_362: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, 1082179469 @@ -6862,15 +6591,14 @@ rx_body_362: mov qword ptr [rsi + rax * 8], rcx rx_i_363: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r12, 09f75887bh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_363 call rx_read_l1 rx_body_363: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm6 @@ -6880,15 +6608,14 @@ rx_body_363: movaps xmm3, xmm0 rx_i_364: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r11, 0badaf867h mov 
ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_364 call rx_read_l1 rx_body_364: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 @@ -6897,15 +6624,14 @@ rx_body_364: mov r8, rax rx_i_365: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r15, 02db4444ah mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_365 call rx_read_l2 rx_body_365: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -6918,15 +6644,14 @@ rx_body_365: mov qword ptr [rsi + rax * 8], rcx rx_i_366: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r12, 0bff7218fh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_366 call rx_read_l2 rx_body_366: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -6939,15 +6664,15 @@ rx_body_366: mov qword ptr [rsi + rax * 8], rcx rx_i_367: ;FPADD - dec ebp + dec ebx jz rx_finish xor r9, 04d14cb3ah mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_367 call rx_read_l2 rx_body_367: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 @@ -6958,45 +6683,42 @@ rx_body_367: movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_368: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r10, 0a14836bah mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_368 call rx_read_l1 rx_body_368: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r10 mov r8, rax rx_i_369: ;AND_64 - dec ebp + dec ebx jz rx_finish xor r9, 053fe22e2h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_369 call rx_read_l1 rx_body_369: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] and rax, r13 mov r9, rax rx_i_370: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r15, 010e1fb24h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_370 call rx_read_l1 rx_body_370: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm6 @@ -7007,15 +6729,15 @@ rx_body_370: movhpd qword ptr [rsi + rax * 8], xmm6 
rx_i_371: ;FPADD - dec ebp + dec ebx jz rx_finish xor r8, 0ebbd5cc9h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_371 call rx_read_l1 rx_body_371: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 @@ -7026,15 +6748,14 @@ rx_body_371: movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_372: ;ROL_64 - dec ebp + dec ebx jz rx_finish xor r10, 098ab79d7h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_372 call rx_read_l2 rx_body_372: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r13 @@ -7042,15 +6763,14 @@ rx_body_372: mov r9, rax rx_i_373: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r15, 056438b3h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_373 call rx_read_l2 rx_body_373: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm8 @@ -7060,15 +6780,14 @@ rx_body_373: movaps xmm4, xmm0 rx_i_374: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r11, 0dbcce604h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_374 call rx_read_l1 rx_body_374: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 @@ -7078,15 +6797,14 @@ rx_body_374: movaps xmm2, xmm0 rx_i_375: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r9, 0edea6200h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_375 call rx_read_l1 rx_body_375: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, r15 @@ -7097,15 +6815,14 @@ rx_body_375: mov qword ptr [rsi + rax * 8], rcx rx_i_376: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r14, 05e61b279h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_376 call rx_read_l1 rx_body_376: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, 476136066 @@ -7116,30 +6833,28 @@ rx_body_376: mov qword ptr [rsi + rax * 8], rcx rx_i_377: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r14, 0fc1fb433h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_377 call 
rx_read_l1 rx_body_377: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 movaps xmm7, xmm0 rx_i_378: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r12, 082aa21ach mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_378 call rx_read_l1 rx_body_378: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -7148,15 +6863,14 @@ rx_body_378: mov r15, rax rx_i_379: ;FPADD - dec ebp + dec ebx jz rx_finish xor r10, 05dba41fbh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_379 call rx_read_l2 rx_body_379: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 @@ -7167,15 +6881,14 @@ rx_body_379: movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_380: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r11, 0229e3d6eh mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_380 call rx_read_l1 rx_body_380: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, rax, -1443002912 @@ -7186,15 +6899,15 @@ rx_body_380: mov qword ptr [rsi + rax * 8], rcx rx_i_381: ;SAR_64 - dec ebp + dec ebx jz rx_finish xor r8, 019816ff9h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_381 call rx_read_l2 rx_body_381: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 @@ -7202,15 +6915,14 @@ rx_body_381: mov r9, rax rx_i_382: ;FPADD - dec ebp + dec ebx jz rx_finish xor r14, 036b5b81fh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_382 call rx_read_l1 rx_body_382: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 @@ -7221,15 +6933,14 @@ rx_body_382: movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_383: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r15, 05f798ec3h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_383 call rx_read_l1 rx_body_383: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm4 @@ -7240,15 +6951,14 @@ rx_body_383: movhpd qword ptr [rsi + rax * 
8], xmm5 rx_i_384: ;SHR_64 - dec ebp + dec ebx jz rx_finish xor r10, 05b459fd7h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_384 call rx_read_l1 rx_body_384: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r11 @@ -7260,15 +6970,14 @@ rx_body_384: mov qword ptr [rsi + rax * 8], rcx rx_i_385: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r15, 0c91749bbh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_385 call rx_read_l1 rx_body_385: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r12 @@ -7279,48 +6988,46 @@ rx_body_385: mov qword ptr [rsi + rax * 8], rcx rx_i_386: ;FPADD - dec ebp + dec ebx jz rx_finish xor r9, 0575b4bdch mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_386 call rx_read_l2 rx_body_386: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm8 movaps xmm9, xmm0 rx_i_387: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r9, 0d4f7bc6ah mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_387 call rx_read_l2 rx_body_387: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov r9, rax rx_i_388: ;RET - dec ebp + dec ebx jz rx_finish xor r8, 08a949356h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_388 call rx_read_l2 rx_body_388: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_388 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -7337,15 +7044,14 @@ not_taken_ret_388: mov qword ptr [rsi + rax * 8], rcx rx_i_389: ;CALL - dec ebp + dec ebx jz rx_finish xor r11, 06531ad2eh mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_389 call rx_read_l1 rx_body_389: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r9d, -350609584 @@ -7357,45 +7063,42 @@ taken_call_389: call rx_i_421 rx_i_390: ;FPADD - dec ebp + dec ebx jz rx_finish xor r15, 02914abeah mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_390 call 
rx_read_l1 rx_body_390: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm4 movaps xmm3, xmm0 rx_i_391: ;FPADD - dec ebp + dec ebx jz rx_finish xor r8, 0473a41f0h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_391 call rx_read_l1 rx_body_391: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm6, xmm0 rx_i_392: ;ROR_64 - dec ebp + dec ebx jz rx_finish xor r14, 01ebc1f0dh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_392 call rx_read_l2 rx_body_392: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] ror rax, 0 @@ -7406,15 +7109,14 @@ rx_body_392: mov qword ptr [rsi + rax * 8], rcx rx_i_393: ;OR_32 - dec ebp + dec ebx jz rx_finish xor r14, 0742e95b1h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_393 call rx_read_l1 rx_body_393: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] or eax, 552339548 @@ -7425,30 +7127,29 @@ rx_body_393: mov qword ptr [rsi + rax * 8], rcx rx_i_394: ;FPADD - dec ebp + dec ebx jz rx_finish xor r12, 0db885c2ch mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_394 call rx_read_l2 rx_body_394: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm6, xmm0 rx_i_395: ;IDIV_64 - dec ebp + dec ebx jz rx_finish xor r8, 04ae4fe8ch mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_395 call rx_read_l1 rx_body_395: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov edx, r13d @@ -7469,30 +7170,29 @@ result_idiv_395: mov r8, rax rx_i_396: ;FPADD - dec ebp + dec ebx jz rx_finish xor r10, 07b41862bh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_396 call rx_read_l1 rx_body_396: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm7 movaps xmm4, xmm0 rx_i_397: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r8, 0916f3819h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_397 call rx_read_l1 
rx_body_397: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r12 @@ -7503,15 +7203,14 @@ rx_body_397: mov qword ptr [rsi + rax * 8], rcx rx_i_398: ;ROL_64 - dec ebp + dec ebx jz rx_finish xor r8, 04eb6fd2ah mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_398 call rx_read_l1 rx_body_398: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] rol rax, 44 @@ -7522,15 +7221,15 @@ rx_body_398: mov qword ptr [rsi + rax * 8], rcx rx_i_399: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r11, 0899a98cfh mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_399 call rx_read_l1 rx_body_399: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm2 @@ -7540,15 +7239,14 @@ rx_body_399: movaps xmm6, xmm0 rx_i_400: ;OR_32 - dec ebp + dec ebx jz rx_finish xor r13, 0aae75db6h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_400 call rx_read_l1 rx_body_400: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] or eax, r11d @@ -7559,15 +7257,14 @@ rx_body_400: mov qword ptr [rsi + rax * 8], rcx rx_i_401: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r13, 032e81f25h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_401 call rx_read_l1 rx_body_401: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm4 @@ -7581,18 +7278,17 @@ rx_body_401: movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_402: ;RET - dec ebp + dec ebx jz rx_finish xor r9, 0fa1a07ffh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_402 call rx_read_l1 rx_body_402: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_402 xor rax, qword ptr [rsp + 8] mov r14, rax @@ -7601,15 +7297,14 @@ not_taken_ret_402: mov r14, rax rx_i_403: ;IDIV_64 - dec ebp + dec ebx jz rx_finish xor r9, 0e59500f7h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_403 call rx_read_l1 rx_body_403: - xor rdi, rcx and ecx, 2047 
mov rax, qword ptr [rsi+rcx*8] mov edx, r12d @@ -7634,15 +7329,14 @@ result_idiv_403: mov qword ptr [rsi + rax * 8], rcx rx_i_404: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r15, 05b8ceb2fh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_404 call rx_read_l1 rx_body_404: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -7651,18 +7345,17 @@ rx_body_404: mov r15, rax rx_i_405: ;RET - dec ebp + dec ebx jz rx_finish xor r8, 0f61082a3h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_405 call rx_read_l1 rx_body_405: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_405 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -7679,15 +7372,14 @@ not_taken_ret_405: mov qword ptr [rsi + rax * 8], rcx rx_i_406: ;FPROUND - dec ebp + dec ebx jz rx_finish xor r9, 0af6886b7h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_406 call rx_read_l2 rx_body_406: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] shl eax, 13 @@ -7697,15 +7389,15 @@ rx_body_406: ldmxcsr dword ptr [rsp - 8] rx_i_407: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r14, 09699566fh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_407 call rx_read_l2 rx_body_407: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm9 @@ -7715,15 +7407,14 @@ rx_body_407: movaps xmm8, xmm0 rx_i_408: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r15, 066e79fa6h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_408 call rx_read_l1 rx_body_408: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r9 @@ -7734,33 +7425,32 @@ rx_body_408: mov qword ptr [rsi + rax * 8], rcx rx_i_409: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r11, 04b6caa9ah mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_409 call rx_read_l1 rx_body_409: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov r8, 
rax rx_i_410: ;RET - dec ebp + dec ebx jz rx_finish xor r15, 0d17f245eh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_410 call rx_read_l1 rx_body_410: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_410 xor rax, qword ptr [rsp + 8] mov r8, rax @@ -7769,18 +7459,17 @@ not_taken_ret_410: mov r8, rax rx_i_411: ;RET - dec ebp + dec ebx jz rx_finish xor r12, 0364f10e7h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_411 call rx_read_l1 rx_body_411: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_411 xor rax, qword ptr [rsp + 8] mov r12, rax @@ -7789,35 +7478,32 @@ not_taken_ret_411: mov r12, rax rx_i_412: ;FPSQRT - dec ebp + dec ebx jz rx_finish xor r10, 0ac90e7ah mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_412 call rx_read_l1 rx_body_412: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 - sqrtpd xmm0, xmm0 - movaps xmm3, xmm0 + sqrtpd xmm3, xmm0 mov eax, r11d xor eax, 0bbd2640ah and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_413: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r11, 04b6037abh mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_413 call rx_read_l1 rx_body_413: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm2 @@ -7827,15 +7513,15 @@ rx_body_413: movaps xmm4, xmm0 rx_i_414: ;OR_64 - dec ebp + dec ebx jz rx_finish xor r14, 06c01554dh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_414 call rx_read_l1 rx_body_414: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] or rax, r8 @@ -7846,15 +7532,15 @@ rx_body_414: mov qword ptr [rsi + rax * 8], rcx rx_i_415: ;DIV_64 - dec ebp + dec ebx jz rx_finish xor r8, 08c3e59a1h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_415 call rx_read_l1 rx_body_415: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword 
ptr [rsi+rcx*8] mov ecx, -538093385 @@ -7863,15 +7549,15 @@ rx_body_415: mov r9, rax rx_i_416: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r12, 0f3fafde9h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_416 call rx_read_l1 rx_body_416: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 @@ -7882,30 +7568,28 @@ rx_body_416: movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_417: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r10, 03c6481fah mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_417 call rx_read_l1 rx_body_417: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, r12 mov r10, rax rx_i_418: ;MULH_64 - dec ebp + dec ebx jz rx_finish xor r10, 02bd61c5fh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_418 call rx_read_l1 rx_body_418: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r11 @@ -7914,15 +7598,14 @@ rx_body_418: mov r10, rax rx_i_419: ;XOR_64 - dec ebp + dec ebx jz rx_finish xor r9, 0b6ab9d32h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_419 call rx_read_l1 rx_body_419: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] xor rax, r14 @@ -7933,15 +7616,14 @@ rx_body_419: mov qword ptr [rsi + rax * 8], rcx rx_i_420: ;FPADD - dec ebp + dec ebx jz rx_finish xor r9, 0f9690ceah mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_420 call rx_read_l1 rx_body_420: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 @@ -7952,18 +7634,18 @@ rx_body_420: movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_421: ;RET - dec ebp + dec ebx jz rx_finish xor r12, 01ada0f39h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_421 call rx_read_l2 rx_body_421: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_421 xor rax, qword ptr [rsp + 8] mov r10, rax @@ -7972,15 +7654,14 @@ not_taken_ret_421: mov r10, rax rx_i_422: 
;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r11, 04dd16ca4h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_422 call rx_read_l2 rx_body_422: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -7989,15 +7670,14 @@ rx_body_422: mov r13, rax rx_i_423: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r12, 04df5ce05h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_423 call rx_read_l1 rx_body_423: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r10 @@ -8008,15 +7688,15 @@ rx_body_423: mov qword ptr [rsi + rax * 8], rcx rx_i_424: ;FPADD - dec ebp + dec ebx jz rx_finish xor r13, 01ad12ce2h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_424 call rx_read_l2 rx_body_424: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm7 @@ -8027,15 +7707,14 @@ rx_body_424: movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_425: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r8, 0a3c5391dh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_425 call rx_read_l1 rx_body_425: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -8044,15 +7723,14 @@ rx_body_425: mov r14, rax rx_i_426: ;AND_64 - dec ebp + dec ebx jz rx_finish xor r12, 09dd55ba0h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_426 call rx_read_l2 rx_body_426: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] and rax, r9 @@ -8063,15 +7741,15 @@ rx_body_426: mov qword ptr [rsi + rax * 8], rcx rx_i_427: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r11, 0d6cae9aeh mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_427 call rx_read_l1 rx_body_427: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -8084,18 +7762,17 @@ rx_body_427: mov qword ptr [rsi + rax * 8], rcx rx_i_428: ;RET - dec ebp + dec ebx jz rx_finish xor r11, 0f807a961h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_428 call 
rx_read_l1 rx_body_428: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_428 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -8112,30 +7789,29 @@ not_taken_ret_428: mov qword ptr [rsi + rax * 8], rcx rx_i_429: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r12, 0650a4102h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_429 call rx_read_l2 rx_body_429: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, rax, 1990438276 mov r15, rax rx_i_430: ;FPADD - dec ebp + dec ebx jz rx_finish xor r14, 019cc0e5h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_430 call rx_read_l1 rx_body_430: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm8 @@ -8146,15 +7822,14 @@ rx_body_430: movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_431: ;FPADD - dec ebp + dec ebx jz rx_finish xor r12, 0ed17ab58h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_431 call rx_read_l1 rx_body_431: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 @@ -8165,30 +7840,28 @@ rx_body_431: movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_432: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r10, 01c3b321fh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_432 call rx_read_l2 rx_body_432: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] sub rax, r10 mov r8, rax rx_i_433: ;ADD_32 - dec ebp + dec ebx jz rx_finish xor r13, 0bbb88499h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_433 call rx_read_l1 rx_body_433: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add eax, r12d @@ -8199,35 +7872,33 @@ rx_body_433: mov qword ptr [rsi + rax * 8], rcx rx_i_434: ;FPSQRT - dec ebp + dec ebx jz rx_finish xor r13, 0167edabdh mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_434 call rx_read_l2 rx_body_434: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 - 
sqrtpd xmm0, xmm0 - movaps xmm9, xmm0 + sqrtpd xmm9, xmm0 mov eax, r9d xor eax, 08c1cfc74h and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_435: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r15, 0b940480ah mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_435 call rx_read_l1 rx_body_435: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r15 @@ -8238,15 +7909,14 @@ rx_body_435: mov qword ptr [rsi + rax * 8], rcx rx_i_436: ;FPADD - dec ebp + dec ebx jz rx_finish xor r15, 0bfc3ca8bh mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_436 call rx_read_l2 rx_body_436: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 @@ -8257,15 +7927,15 @@ rx_body_436: movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_437: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r8, 098a6bcf7h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_437 call rx_read_l1 rx_body_437: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm3 @@ -8275,15 +7945,14 @@ rx_body_437: movaps xmm8, xmm0 rx_i_438: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r10, 0325b38ebh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_438 call rx_read_l1 rx_body_438: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm9 @@ -8293,15 +7962,14 @@ rx_body_438: movaps xmm4, xmm0 rx_i_439: ;XOR_32 - dec ebp + dec ebx jz rx_finish xor r13, 05e807e81h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_439 call rx_read_l2 rx_body_439: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] xor eax, r15d @@ -8312,18 +7980,18 @@ rx_body_439: mov qword ptr [rsi + rax * 8], rcx rx_i_440: ;RET - dec ebp + dec ebx jz rx_finish xor r10, 062f83728h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_440 call rx_read_l1 rx_body_440: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short 
not_taken_ret_440 xor rax, qword ptr [rsp + 8] mov r9, rax @@ -8332,15 +8000,15 @@ not_taken_ret_440: mov r9, rax rx_i_441: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r14, 0d18ec075h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_441 call rx_read_l1 rx_body_441: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, 529736748 @@ -8351,15 +8019,14 @@ rx_body_441: mov qword ptr [rsi + rax * 8], rcx rx_i_442: ;CALL - dec ebp + dec ebx jz rx_finish xor r14, 0a53dd1bh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_442 call rx_read_l1 rx_body_442: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r15d, 799523062 @@ -8375,18 +8042,17 @@ taken_call_442: call rx_i_9 rx_i_443: ;RET - dec ebp + dec ebx jz rx_finish xor r14, 0232d1285h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_443 call rx_read_l1 rx_body_443: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_443 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -8403,15 +8069,14 @@ not_taken_ret_443: mov qword ptr [rsi + rax * 8], rcx rx_i_444: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r8, 042455dd8h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_444 call rx_read_l2 rx_body_444: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm7 @@ -8425,15 +8090,14 @@ rx_body_444: movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_445: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r13, 09ae009b2h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_445 call rx_read_l1 rx_body_445: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, r11 @@ -8444,15 +8108,15 @@ rx_body_445: mov qword ptr [rsi + rax * 8], rcx rx_i_446: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r12, 01734708eh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_446 call rx_read_l1 rx_body_446: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr 
[rsi+rcx*8] mov ecx, eax @@ -8465,15 +8129,14 @@ rx_body_446: mov qword ptr [rsi + rax * 8], rcx rx_i_447: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r8, 01596d0e8h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_447 call rx_read_l1 rx_body_447: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm7 @@ -8484,45 +8147,43 @@ rx_body_447: movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_448: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r9, 0390cfdb0h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_448 call rx_read_l1 rx_body_448: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 movaps xmm9, xmm0 rx_i_449: ;ROR_64 - dec ebp + dec ebx jz rx_finish xor r8, 04f27744bh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_449 call rx_read_l1 rx_body_449: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] ror rax, 28 mov r8, rax rx_i_450: ;ROL_64 - dec ebp + dec ebx jz rx_finish xor r8, 04e2c76ffh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_450 call rx_read_l1 rx_body_450: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r12 @@ -8534,33 +8195,32 @@ rx_body_450: mov qword ptr [rsi + rax * 8], rcx rx_i_451: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r8, 0c4d99ac9h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_451 call rx_read_l1 rx_body_451: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, -287502157 mov r8, rax rx_i_452: ;RET - dec ebp + dec ebx jz rx_finish xor r13, 040130b88h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_452 call rx_read_l1 rx_body_452: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_452 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -8577,15 +8237,14 @@ not_taken_ret_452: mov qword ptr [rsi + rax * 8], rcx rx_i_453: ;IMULH_64 - dec ebp + dec ebx jz rx_finish xor r11, 0a2096aa4h mov 
ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_453 call rx_read_l1 rx_body_453: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 @@ -8594,15 +8253,14 @@ rx_body_453: mov r8, rax rx_i_454: ;FPADD - dec ebp + dec ebx jz rx_finish xor r13, 081314291h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_454 call rx_read_l1 rx_body_454: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 @@ -8613,30 +8271,29 @@ rx_body_454: movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_455: ;XOR_64 - dec ebp + dec ebx jz rx_finish xor r8, 059263cdbh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_455 call rx_read_l1 rx_body_455: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] xor rax, r9 mov r8, rax rx_i_456: ;OR_32 - dec ebp + dec ebx jz rx_finish xor r9, 010e8fe6h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_456 call rx_read_l2 rx_body_456: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] or eax, r11d @@ -8647,15 +8304,15 @@ rx_body_456: mov qword ptr [rsi + rax * 8], rcx rx_i_457: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r9, 09de1a3efh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_457 call rx_read_l1 rx_body_457: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, r10 @@ -8666,30 +8323,28 @@ rx_body_457: mov qword ptr [rsi + rax * 8], rcx rx_i_458: ;ROL_64 - dec ebp + dec ebx jz rx_finish xor r11, 05c79df6eh mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_458 call rx_read_l1 rx_body_458: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] rol rax, 22 mov r14, rax rx_i_459: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r9, 0346f46adh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_459 call rx_read_l1 rx_body_459: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, rax, 381354340 @@ -8700,15 +8355,14 @@ rx_body_459: mov qword ptr [rsi + rax * 8], rcx 
rx_i_460: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r11, 098ab71fch mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_460 call rx_read_l1 rx_body_460: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, r14 @@ -8719,15 +8373,14 @@ rx_body_460: mov qword ptr [rsi + rax * 8], rcx rx_i_461: ;SHR_64 - dec ebp + dec ebx jz rx_finish xor r11, 0c814e926h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_461 call rx_read_l2 rx_body_461: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r13 @@ -8739,45 +8392,42 @@ rx_body_461: mov qword ptr [rsi + rax * 8], rcx rx_i_462: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r10, 0c64b4a9eh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_462 call rx_read_l2 rx_body_462: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] add rax, -1734323376 mov r15, rax rx_i_463: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r9, 08c29341h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_463 call rx_read_l1 rx_body_463: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] sub rax, r15 mov r10, rax rx_i_464: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r12, 06ff587fdh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_464 call rx_read_l1 rx_body_464: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r15 @@ -8788,30 +8438,29 @@ rx_body_464: mov qword ptr [rsi + rax * 8], rcx rx_i_465: ;FPADD - dec ebp + dec ebx jz rx_finish xor r12, 0b62c0003h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_465 call rx_read_l2 rx_body_465: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm2, xmm0 rx_i_466: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r13, 05c541c42h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_466 call rx_read_l1 rx_body_466: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -8820,30 +8469,30 @@ rx_body_466: 
mov r9, rax rx_i_467: ;FPADD - dec ebp + dec ebx jz rx_finish xor r8, 0cbb33f81h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_467 call rx_read_l1 rx_body_467: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm8, xmm0 rx_i_468: ;IDIV_64 - dec ebp + dec ebx jz rx_finish xor r8, 091044dc3h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_468 call rx_read_l1 rx_body_468: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov edx, -13394825 @@ -8868,15 +8517,14 @@ result_idiv_468: mov qword ptr [rsi + rax * 8], rcx rx_i_469: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r9, 0c0186beh mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_469 call rx_read_l1 rx_body_469: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax @@ -8889,15 +8537,14 @@ rx_body_469: mov qword ptr [rsi + rax * 8], rcx rx_i_470: ;XOR_32 - dec ebp + dec ebx jz rx_finish xor r14, 090849e3eh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_470 call rx_read_l1 rx_body_470: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] xor eax, r11d @@ -8908,15 +8555,14 @@ rx_body_470: mov qword ptr [rsi + rax * 8], rcx rx_i_471: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r14, 0cedba9b6h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_471 call rx_read_l1 rx_body_471: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -8925,15 +8571,15 @@ rx_body_471: mov r14, rax rx_i_472: ;CALL - dec ebp + dec ebx jz rx_finish xor r9, 038f4b9d6h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_472 call rx_read_l2 rx_body_472: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] cmp r10d, 1738497427 @@ -8945,30 +8591,29 @@ taken_call_472: call rx_i_8 rx_i_473: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r14, 01fb7637dh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_473 call rx_read_l1 
rx_body_473: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, rax, -751043211 mov r12, rax rx_i_474: ;CALL - dec ebp + dec ebx jz rx_finish xor r9, 0b5c0b4d4h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_474 call rx_read_l2 rx_body_474: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] cmp r15d, -233120543 @@ -8980,45 +8625,42 @@ taken_call_474: call rx_i_69 rx_i_475: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r10, 0910dcdeeh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_475 call rx_read_l2 rx_body_475: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm9 movaps xmm7, xmm0 rx_i_476: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r8, 07ab3b5a4h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_476 call rx_read_l1 rx_body_476: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm9, xmm0 rx_i_477: ;FPADD - dec ebp + dec ebx jz rx_finish xor r12, 07a29ec63h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_477 call rx_read_l1 rx_body_477: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 @@ -9029,30 +8671,28 @@ rx_body_477: movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_478: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r14, 02d3d7e7fh mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_478 call rx_read_l1 rx_body_478: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r10 mov r12, rax rx_i_479: ;MUL_64 - dec ebp + dec ebx jz rx_finish xor r12, 09b49c793h mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_479 call rx_read_l1 rx_body_479: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r14 @@ -9063,30 +8703,28 @@ rx_body_479: mov qword ptr [rsi + rax * 8], rcx rx_i_480: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r9, 0a9cc4f01h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_480 call rx_read_l1 rx_body_480: 
- xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm4 movaps xmm6, xmm0 rx_i_481: ;DIV_64 - dec ebp + dec ebx jz rx_finish xor r14, 0225ba1f9h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_481 call rx_read_l1 rx_body_481: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, 1 @@ -9098,60 +8736,58 @@ rx_body_481: mov r12, rax rx_i_482: ;XOR_64 - dec ebp + dec ebx jz rx_finish xor r14, 044a0f592h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_482 call rx_read_l2 rx_body_482: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] xor rax, r12 mov r11, rax rx_i_483: ;FPADD - dec ebp + dec ebx jz rx_finish xor r11, 07f71f219h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_483 call rx_read_l1 rx_body_483: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm6, xmm0 rx_i_484: ;ROL_64 - dec ebp + dec ebx jz rx_finish xor r12, 07027bacdh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_484 call rx_read_l1 rx_body_484: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] rol rax, 37 mov r11, rax rx_i_485: ;CALL - dec ebp + dec ebx jz rx_finish xor r13, 03a04647h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_485 call rx_read_l2 rx_body_485: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] cmp r8d, 554879918 @@ -9167,15 +8803,14 @@ taken_call_485: call rx_i_58 rx_i_486: ;ADD_64 - dec ebp + dec ebx jz rx_finish xor r15, 0ad072937h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_486 call rx_read_l1 rx_body_486: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, 942846898 @@ -9186,30 +8821,28 @@ rx_body_486: mov qword ptr [rsi + rax * 8], rcx rx_i_487: ;SUB_64 - dec ebp + dec ebx jz rx_finish xor r11, 07f78ad34h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_487 call rx_read_l1 rx_body_487: - xor rdi, rcx and ecx, 2047 mov rax, 
qword ptr [rsi+rcx*8] sub rax, -333279706 mov r11, rax rx_i_488: ;IMULH_64 - dec ebp + dec ebx jz rx_finish xor r12, 0d8b1788eh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_488 call rx_read_l1 rx_body_488: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, 297357073 @@ -9218,15 +8851,15 @@ rx_body_488: mov r12, rax rx_i_489: ;CALL - dec ebp + dec ebx jz rx_finish xor r10, 0b2ec9f3ah mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_489 call rx_read_l1 rx_body_489: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r15d, -1127175870 @@ -9242,30 +8875,29 @@ taken_call_489: call rx_i_75 rx_i_490: ;FPADD - dec ebp + dec ebx jz rx_finish xor r11, 015c7f598h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_490 call rx_read_l2 rx_body_490: - xor rdi, rcx + xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm7, xmm0 rx_i_491: ;FPADD - dec ebp + dec ebx jz rx_finish xor r8, 0902da6bdh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_491 call rx_read_l2 rx_body_491: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 @@ -9276,45 +8908,42 @@ rx_body_491: movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_492: ;OR_64 - dec ebp + dec ebx jz rx_finish xor r9, 0491090d9h mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_492 call rx_read_l2 rx_body_492: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] or rax, r9 mov r12, rax rx_i_493: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r8, 09de81282h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_493 call rx_read_l1 rx_body_493: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm9 movaps xmm4, xmm0 rx_i_494: ;MUL_32 - dec ebp + dec ebx jz rx_finish xor r10, 0b0d50e46h mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_494 call rx_read_l2 rx_body_494: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr 
[rsi+rcx*8] mov ecx, eax @@ -9323,15 +8952,14 @@ rx_body_494: mov r14, rax rx_i_495: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r11, 0e276cad1h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_495 call rx_read_l1 rx_body_495: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 @@ -9341,30 +8969,29 @@ rx_body_495: movaps xmm8, xmm0 rx_i_496: ;OR_64 - dec ebp + dec ebx jz rx_finish xor r14, 0fe757b73h mov ecx, r14d - test ebp, 63 + test bl, 63 jnz short rx_body_496 call rx_read_l1 rx_body_496: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] or rax, -359802064 mov r9, rax rx_i_497: ;FPDIV - dec ebp + dec ebx jz rx_finish xor r8, 08d25742eh mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_497 call rx_read_l1 rx_body_497: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm3 @@ -9374,15 +9001,14 @@ rx_body_497: movaps xmm8, xmm0 rx_i_498: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r15, 0e066fd15h mov ecx, r15d - test ebp, 63 + test bl, 63 jnz short rx_body_498 call rx_read_l1 rx_body_498: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm9 @@ -9396,15 +9022,14 @@ rx_body_498: movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_499: ;IMUL_32 - dec ebp + dec ebx jz rx_finish xor r12, 08925556bh mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_499 call rx_read_l2 rx_body_499: - xor rdi, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax @@ -9413,15 +9038,14 @@ rx_body_499: mov r8, rax rx_i_500: ;CALL - dec ebp + dec ebx jz rx_finish xor r10, 04bc870ebh mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_500 call rx_read_l1 rx_body_500: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r13d, 1243939650 @@ -9437,15 +9061,14 @@ taken_call_500: call rx_i_511 rx_i_501: ;SHR_64 - dec ebp + dec ebx jz rx_finish xor r8, 07d46c503h mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_501 call 
rx_read_l1 rx_body_501: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 @@ -9457,18 +9080,18 @@ rx_body_501: mov qword ptr [rsi + rax * 8], rcx rx_i_502: ;RET - dec ebp + dec ebx jz rx_finish xor r10, 09e70b20ch mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_502 call rx_read_l1 rx_body_502: - xor rdi, rcx + xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_502 xor rax, qword ptr [rsp + 8] mov rcx, rax @@ -9485,15 +9108,14 @@ not_taken_ret_502: mov qword ptr [rsi + rax * 8], rcx rx_i_503: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r13, 0442e4850h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_503 call rx_read_l1 rx_body_503: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 @@ -9504,15 +9126,14 @@ rx_body_503: movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_504: ;FPADD - dec ebp + dec ebx jz rx_finish xor r13, 099d48347h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_504 call rx_read_l1 rx_body_504: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 @@ -9523,15 +9144,14 @@ rx_body_504: movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_505: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r12, 032c0a28ah mov ecx, r12d - test ebp, 63 + test bl, 63 jnz short rx_body_505 call rx_read_l2 rx_body_505: - xor rdi, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm4 @@ -9545,15 +9165,14 @@ rx_body_505: movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_506: ;FPMUL - dec ebp + dec ebx jz rx_finish xor r9, 0a973d58ch mov ecx, r9d - test ebp, 63 + test bl, 63 jnz short rx_body_506 call rx_read_l1 rx_body_506: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm9 @@ -9563,18 +9182,17 @@ rx_body_506: movaps xmm3, xmm0 rx_i_507: ;RET - dec ebp + dec ebx jz rx_finish xor r10, 0d3b7165ch mov ecx, r10d - test ebp, 63 + test bl, 63 jnz short rx_body_507 call 
rx_read_l1 rx_body_507: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_507 xor rax, qword ptr [rsp + 8] mov r14, rax @@ -9583,18 +9201,17 @@ not_taken_ret_507: mov r14, rax rx_i_508: ;RET - dec ebp + dec ebx jz rx_finish xor r13, 0da34d818h mov ecx, r13d - test ebp, 63 + test bl, 63 jnz short rx_body_508 call rx_read_l1 rx_body_508: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rbx + cmp rsp, rdi je short not_taken_ret_508 xor rax, qword ptr [rsp + 8] mov r8, rax @@ -9603,15 +9220,14 @@ not_taken_ret_508: mov r8, rax rx_i_509: ;CALL - dec ebp + dec ebx jz rx_finish xor r11, 01b2873f2h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_509 call rx_read_l1 rx_body_509: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] cmp r8d, 1826115244 @@ -9623,30 +9239,28 @@ taken_call_509: call rx_i_42 rx_i_510: ;FPSUB - dec ebp + dec ebx jz rx_finish xor r8, 0db65513ch mov ecx, r8d - test ebp, 63 + test bl, 63 jnz short rx_body_510 call rx_read_l1 rx_body_510: - xor rdi, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm9, xmm0 rx_i_511: ;ROL_64 - dec ebp + dec ebx jz rx_finish xor r11, 02bd79286h mov ecx, r11d - test ebp, 63 + test bl, 63 jnz short rx_body_511 call rx_read_l1 rx_body_511: - xor rdi, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 From d1a808643d478385c75ac5d1fe1ac7a24e6c04a8 Mon Sep 17 00:00:00 2001 From: tevador Date: Thu, 10 Jan 2019 22:04:55 +0100 Subject: [PATCH 07/35] Random accesses - JIT compiler --- makefile | 7 +- src/AssemblyGeneratorX86.cpp | 19 +-- src/AssemblyGeneratorX86.hpp | 2 +- src/CompiledVirtualMachine.cpp | 4 +- src/InterpretedVirtualMachine.cpp | 13 ++ src/InterpretedVirtualMachine.hpp | 6 + src/JitCompilerX86-static.S | 31 +++- src/JitCompilerX86-static.asm | 33 ++-- src/JitCompilerX86-static.hpp | 5 +- src/JitCompilerX86.cpp | 207 ++++++++++-------------- src/JitCompilerX86.hpp | 6 
+- src/asm/program_epilogue_store.inc | 3 +- src/asm/program_prologue_linux.inc | 8 +- src/asm/program_prologue_load.inc | 4 +- src/asm/program_prologue_win64.inc | 8 +- src/asm/program_read.inc | 32 ++++ src/asm/program_read_f.inc | 13 -- src/asm/program_read_r.inc | 13 -- src/common.hpp | 1 + src/executeProgram-win64.asm | 6 +- src/main.cpp | 4 - src/program.inc | 245 +++++++++++++---------------- src/virtualMemory.cpp | 6 +- src/virtualMemory.hpp | 6 +- 24 files changed, 341 insertions(+), 341 deletions(-) create mode 100644 src/asm/program_read.inc delete mode 100644 src/asm/program_read_f.inc delete mode 100644 src/asm/program_read_r.inc diff --git a/makefile b/makefile index 21584cb..55e1abd 100644 --- a/makefile +++ b/makefile @@ -11,7 +11,7 @@ SRCDIR=src OBJDIR=obj LDFLAGS=-lpthread TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) -ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o) +ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o) ifeq ($(PLATFORM),x86_64) ROBJS += $(OBJDIR)/JitCompilerX86-static.o endif @@ -60,7 +60,7 @@ $(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp) | $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@ -$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_r.inc read_f.inc)) 
| $(OBJDIR) +$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read.inc)) | $(OBJDIR) $(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@ $(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR) @@ -87,6 +87,9 @@ $(OBJDIR)/softAes.o: $(addprefix $(SRCDIR)/,softAes.cpp softAes.h) | $(OBJDIR) $(OBJDIR)/VirtualMachine.o: $(addprefix $(SRCDIR)/,VirtualMachine.cpp VirtualMachine.hpp common.hpp dataset.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/VirtualMachine.cpp -o $@ +$(OBJDIR)/virtualMemory.o: $(addprefix $(SRCDIR)/,virtualMemory.cpp virtualMemory.hpp) | $(OBJDIR) + $(CXX) $(CXXFLAGS) -c $(SRCDIR)/virtualMemory.cpp -o $@ + $(OBJDIR)/t1ha2.o: $(addprefix $(SRCDIR)/t1ha/,t1ha2.c t1ha.h t1ha_bits.h) | $(OBJDIR) $(CC) $(CCFLAGS) -c $(SRCDIR)/t1ha/t1ha2.c -o $@ diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 21b39c8..c2394c9 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -169,11 +169,12 @@ namespace RandomX { asmCode << "\t" << instrx86 << " xmm0, " << regF[instr.regb % RegistersCount] << std::endl; } - void AssemblyGeneratorX86::gencr(Instruction& instr) { + void AssemblyGeneratorX86::gencr(Instruction& instr, bool rax = true) { switch (instr.locc & 7) { case 0: - asmCode << "\tmov rcx, rax" << std::endl; + if(rax) + asmCode << "\tmov rcx, rax" << std::endl; asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; @@ -186,7 +187,8 @@ namespace RandomX { case 1: case 2: case 3: - asmCode << "\tmov rcx, rax" << std::endl; + if (rax) + asmCode << "\tmov rcx, rax" << std::endl; asmCode << "\tmov eax, " << 
regR32[instr.regc % RegistersCount] << std::endl; asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; @@ -197,9 +199,9 @@ namespace RandomX { return; default: - asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", rax" << std::endl; + asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", " << (rax ? "rax" : "rcx") << std::endl; if (trace) { - asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rax" << std::endl; + asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], " << (rax ? "rax" : "rcx") << std::endl; } return; } @@ -208,7 +210,7 @@ namespace RandomX { void AssemblyGeneratorX86::gencf(Instruction& instr, bool move = true) { if(move) asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; - const char* store = (instr.locc & 8) ? "movhpd" : "movlpd"; + const char* store = (instr.locc & 128) ? 
"movhpd" : "movlpd"; switch (instr.locc & 7) { case 4: @@ -463,14 +465,13 @@ namespace RandomX { void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) { genar(instr, i); - //asmCode << "\tmov rcx, rax" << std::endl; + asmCode << "\tmov rcx, rax" << std::endl; asmCode << "\tshl eax, 13" << std::endl; - //asmCode << "\tand rcx, -2048" << std::endl; asmCode << "\tand eax, 24576" << std::endl; - //asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; asmCode << "\tor eax, 40896" << std::endl; asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl; asmCode << "\tldmxcsr dword ptr [rsp - 8]" << std::endl; + gencr(instr, false); } static inline const char* jumpCondition(Instruction& instr, bool invert = false) { diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 6ffa2f9..bf5238a 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -44,7 +44,7 @@ namespace RandomX { void genbr1(Instruction&); void genbr132(Instruction&); void genbf(Instruction&, const char*); - void gencr(Instruction&); + void gencr(Instruction&, bool); void gencf(Instruction&, bool); void generateCode(Instruction&, int); diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index 8ae2f83..7803003 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -47,8 +47,8 @@ namespace RandomX { } void CompiledVirtualMachine::execute() { - executeProgram(reg, mem, scratchpad, readDataset); - //compiler.getProgramFunc()(reg, mem, scratchpad); + //executeProgram(reg, mem, scratchpad, readDataset); + compiler.getProgramFunc()(reg, mem, scratchpad); #ifdef TRACEVM for (int32_t i = InstructionCount - 1; i >= 0; --i) { std::cout << std::hex << tracepad[i].u64 << std::endl; diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index c436ef7..a6a3a0c 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -197,6 
+197,17 @@ namespace RandomX { #define ALU_RETIRE(x) x(a, b, c); \ if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl; +#define CHECK_NOP_FPDIV(b, c) +#ifndef STATS +#define CHECK_NOP_FPADD(b, c) +#define CHECK_NOP_FPSUB(b, c) +#define CHECK_NOP_FPMUL(b, c) +#else +#define CHECK_NOP_FPADD(b, c) bool loeq = (b.lo.u64 == c.lo.u64); bool hieq = (b.hi.u64 == c.hi.u64); count_FPADD_nop += loeq + hieq; if(loeq && hieq) count_FPADD_nop2++; +#define CHECK_NOP_FPSUB(b, c) bool loeq = ((b.lo.u64 & INT64_MAX) == (c.lo.u64 & INT64_MAX)); bool hieq = ((b.hi.u64 & INT64_MAX) == (c.hi.u64 & INT64_MAX)); count_FPSUB_nop += loeq + hieq; if(loeq && hieq) count_FPSUB_nop2++; +#define CHECK_NOP_FPMUL(b, c) bool loeq = (b.lo.u64 == c.lo.u64); bool hieq = (b.hi.u64 == c.hi.u64); count_FPMUL_nop += loeq + hieq; if(loeq && hieq) count_FPMUL_nop2++; +#endif + #define FPU_RETIRE(x) x(a, b, c); \ writecf(inst, c); \ if(trace) { \ @@ -248,8 +259,10 @@ namespace RandomX { INC_COUNT(x) \ convertible_t a = loada(inst); \ fpu_reg_t& b = reg.f[inst.regb % RegistersCount]; \ + fpu_reg_t btemp = b; \ fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \ FPU_RETIRE(x) \ + CHECK_NOP_##x(btemp, c) \ } #define FPU_INST_NB(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index b8fd98f..8c34936 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -83,6 +83,12 @@ namespace RandomX { int count_retdepth_max = 0; int count_endstack = 0; int count_instructions[ProgramLength] = { 0 }; + int count_FPADD_nop = 0; + int count_FPADD_nop2 = 0; + int count_FPSUB_nop = 0; + int count_FPSUB_nop2 = 0; + int count_FPMUL_nop = 0; + int count_FPMUL_nop2 = 0; #endif convertible_t loada(Instruction&); diff --git a/src/JitCompilerX86-static.S b/src/JitCompilerX86-static.S index be156ef..fdc32b1 100644 --- a/src/JitCompilerX86-static.S +++ 
b/src/JitCompilerX86-static.S @@ -29,9 +29,12 @@ .global DECL(randomx_program_prologue) .global DECL(randomx_program_begin) .global DECL(randomx_program_epilogue) -.global DECL(randomx_program_read_r) -.global DECL(randomx_program_read_f) +.global DECL(randomx_program_read_l1) +.global DECL(randomx_program_read_l2) .global DECL(randomx_program_end) +.global DECL(randomx_program_transform) + +#define db .byte .align 64 DECL(randomx_program_prologue): @@ -45,14 +48,26 @@ DECL(randomx_program_begin): DECL(randomx_program_epilogue): #include "asm/program_epilogue_linux.inc" -.align 64 -DECL(randomx_program_read_r): - #include "asm/program_read_r.inc" +#define scratchpad_mask and ecx, 2040 .align 64 -DECL(randomx_program_read_f): - #include "asm/program_read_f.inc" +DECL(randomx_program_read_l1): + #include "asm/program_read.inc" + +#undef scratchpad_mask + +#define scratchpad_mask and ecx, 32760 + +.align 64 +DECL(randomx_program_read_l2): + #include "asm/program_read.inc" + +#undef scratchpad_mask .align 64 DECL(randomx_program_end): - nop \ No newline at end of file + nop + +.align 8 +DECL(randomx_program_transform): + #include "asm/program_transform_address.inc" diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index d7d3d4b..7a2b3c4 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -20,9 +20,11 @@ _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE PUBLIC randomx_program_prologue PUBLIC randomx_program_begin PUBLIC randomx_program_epilogue -PUBLIC randomx_program_read_r -PUBLIC randomx_program_read_f +PUBLIC randomx_program_read_l1 +PUBLIC randomx_program_read_l2 PUBLIC randomx_program_end +PUBLIC randomx_program_transform + ALIGN 64 randomx_program_prologue PROC @@ -39,21 +41,34 @@ randomx_program_epilogue PROC include asm/program_epilogue_win64.inc randomx_program_epilogue ENDP -ALIGN 64 -randomx_program_read_r PROC - include asm/program_read_r.inc -randomx_program_read_r ENDP +scratchpad_mask MACRO + and ecx, 
2040 +ENDM ALIGN 64 -randomx_program_read_f PROC - include asm/program_read_f.inc -randomx_program_read_f ENDP +randomx_program_read_l1 PROC + include asm/program_read.inc +randomx_program_read_l1 ENDP + +scratchpad_mask MACRO + and ecx, 32760 +ENDM + +ALIGN 64 +randomx_program_read_l2 PROC + include asm/program_read.inc +randomx_program_read_l2 ENDP ALIGN 64 randomx_program_end PROC nop randomx_program_end ENDP +ALIGN 8 +randomx_program_transform PROC + include asm/program_transform_address.inc +randomx_program_transform ENDP + _RANDOMX_JITX86_STATIC ENDS END \ No newline at end of file diff --git a/src/JitCompilerX86-static.hpp b/src/JitCompilerX86-static.hpp index 6052283..f5904ad 100644 --- a/src/JitCompilerX86-static.hpp +++ b/src/JitCompilerX86-static.hpp @@ -21,7 +21,8 @@ extern "C" { void randomx_program_prologue(); void randomx_program_begin(); void randomx_program_epilogue(); - void randomx_program_read_r(); - void randomx_program_read_f(); + void randomx_program_transform(); + void randomx_program_read_l1(); + void randomx_program_read_l2(); void randomx_program_end(); } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index b03a330..fda3746 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -48,12 +48,12 @@ namespace RandomX { REGISTER ALLOCATION: rax -> temporary - rbx -> MemoryRegisters& memory + rbx -> "ic" rcx -> temporary rdx -> temporary rsi -> convertible_t* scratchpad - rdi -> "ic" (instruction counter) - rbp -> beginning of VM stack + rdi -> beginning of VM stack + rbp -> "ma", "mx" rsp -> end of VM stack r8 -> "r0" r9 -> "r1" @@ -82,7 +82,8 @@ namespace RandomX { | saved registers | v - [rbp] RegisterFile& registerFile + [rdi+8] RegisterFile& registerFile + [rdi] uint8_t* dataset | | | VM stack @@ -97,18 +98,19 @@ namespace RandomX { const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin; const uint8_t* 
codeEpilogue = (uint8_t*)&randomx_program_epilogue; - const uint8_t* codeReadDatasetR = (uint8_t*)&randomx_program_read_r; - const uint8_t* codeReadDatasetF = (uint8_t*)&randomx_program_read_f; + const uint8_t* codeReadDatasetL1 = (uint8_t*)&randomx_program_read_l1; + const uint8_t* codeReadDatasetL2 = (uint8_t*)&randomx_program_read_l2; const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; + const uint32_t* addressTransformations = (uint32_t*)&randomx_program_transform; const int32_t prologueSize = codeProgramBegin - codePrologue; - const int32_t epilogueSize = codeReadDatasetR - codeEpilogue; - const int32_t readDatasetRSize = codeReadDatasetF - codeReadDatasetR; - const int32_t readDatasetFSize = codeProgramEnd - codeReadDatasetF; + const int32_t epilogueSize = codeReadDatasetL1 - codeEpilogue; + const int32_t readDatasetL1Size = codeReadDatasetL2 - codeReadDatasetL1; + const int32_t readDatasetL2Size = codeProgramEnd - codeReadDatasetL2; - const int32_t readDatasetFOffset = CodeSize - readDatasetFSize; - const int32_t readDatasetROffset = readDatasetFOffset - readDatasetRSize; - const int32_t epilogueOffset = readDatasetROffset - epilogueSize; + const int32_t readDatasetL2Offset = CodeSize - readDatasetL2Size; + const int32_t readDatasetL1Offset = readDatasetL2Offset - readDatasetL1Size; + const int32_t epilogueOffset = readDatasetL1Offset - epilogueSize; JitCompilerX86::JitCompilerX86() { #ifdef _WIN32 @@ -121,9 +123,9 @@ namespace RandomX { throw std::runtime_error("mmap failed"); #endif memcpy(code, codePrologue, prologueSize); - memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize - epilogueSize, codeEpilogue, epilogueSize); - memcpy(code + CodeSize - readDatasetRSize - readDatasetFSize, codeReadDatasetR, readDatasetRSize); - memcpy(code + CodeSize - readDatasetFSize, codeReadDatasetF, readDatasetFSize); + memcpy(code + CodeSize - epilogueSize - readDatasetL1Size - readDatasetL2Size, codeEpilogue, epilogueSize); + memcpy(code + CodeSize - 
readDatasetL1Size - readDatasetL2Size, codeReadDatasetL1, readDatasetL1Size); + memcpy(code + CodeSize - readDatasetL2Size, codeReadDatasetL2, readDatasetL2Size); } void JitCompilerX86::generateProgram(Pcg32& gen) { @@ -140,12 +142,33 @@ namespace RandomX { emitByte(0xe9); emit(instructionOffsets[0] - (codePos + 4)); fixCallOffsets(); + uint32_t transformL1 = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; + uint32_t transformL2 = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; + *reinterpret_cast(code + readDatasetL1Offset + 1) = transformL1; + *reinterpret_cast(code + readDatasetL2Offset + 1) = transformL2; } void JitCompilerX86::generateCode(Instruction& instr, int i) { instructionOffsets.push_back(codePos); - emit(0x840fcfff); //dec edx; jz + emit(0x840fcbff); //dec ebx; jz emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative) + emit(uint16_t(0x8149)); //xor + emitByte(0xf0 + (instr.rega % RegistersCount)); + emit(instr.addra); + emit(uint16_t(0x8b41)); //mov + emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega + emit(0x753fc3f6); //test bl,0x3f; jne + emit(uint16_t(0xe805)); + if (instr.loca & 3) { //A.LOC.W + emit(readDatasetL1Offset - (codePos + 4)); + } + else { + emit(readDatasetL2Offset - (codePos + 4)); + } + if ((instr.loca & 192) == 0) { //A.LOC.X + emit(uint16_t(0x3348)); + emitByte(0xe9); //xor rbp, rcx + } auto generator = engine[instr.opcode]; (this->*generator)(instr, i); } @@ -157,73 +180,26 @@ namespace RandomX { } void JitCompilerX86::genar(Instruction& instr) { - emit(uint16_t(0x8149)); //xor - emitByte(0xf0 + (instr.rega % RegistersCount)); - emit(instr.addra); - switch (instr.loca & 7) - { - case 0: - case 1: - case 2: - case 3: - emit(uint16_t(0x8b41)); //mov - emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega - emitByte(0xe8); //call - emit(readDatasetROffset - (codePos + 4)); - return; - - case 4: - emit(uint16_t(0x8b41)); //mov - emitByte(0xc0 + (instr.rega % 
RegistersCount)); //eax, rega - emitByte(0x25); //and - emit(ScratchpadL2 - 1); //whole scratchpad - emit(0xc6048b48); // mov rax,QWORD PTR [rsi+rax*8] - return; - - default: - emit(uint16_t(0x8b41)); //mov - emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega - emitByte(0x25); //and - emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad - emit(0xc6048b48); // mov rax,QWORD PTR [rsi+rax*8] - return; + emit(uint16_t(0xe181)); //and ecx, + if (instr.loca & 3) { + emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad } + else { + emit(ScratchpadL2 - 1); //whole scratchpad + } + emit(0xce048b48); //mov rax,QWORD PTR [rsi+rcx*8] } void JitCompilerX86::genaf(Instruction& instr) { - emit(uint16_t(0x8149)); //xor - emitByte(0xf0 + (instr.rega % RegistersCount)); - emit(instr.addra); - switch (instr.loca & 7) - { - case 0: - case 1: - case 2: - case 3: - emit(uint16_t(0x8b41)); //mov - emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega - emitByte(0xe8); //call - emit(readDatasetFOffset - (codePos + 4)); - return; - - case 4: - emit(uint16_t(0x8b41)); //mov - emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega - emitByte(0x25); //and - emit(ScratchpadL2 - 1); //whole scratchpad - emitByte(0xf3); - emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8] - return; - - default: - emit(uint16_t(0x8b41)); //mov - emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega - emitByte(0x25); //and + emit(uint16_t(0xe181)); //and ecx, + if (instr.loca & 3) { emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad - emitByte(0xf3); - emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8] - return; } + else { + emit(ScratchpadL2 - 1); //whole scratchpad + } + emitByte(0xf3); + emit(0xce04e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rcx*8] } void JitCompilerX86::genbr0(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { @@ -274,8 +250,13 @@ namespace RandomX { } - void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize) { - 
emit(0x41c88b48); //mov rcx, rax; REX + void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize, bool rax) { + if (rax) { + emit(0x41c88b48); //mov rcx, rax; REX + } + else { + emitByte(0x41); + } emitByte(0x8b); // mov emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc emitByte(0x35); // xor eax @@ -285,22 +266,27 @@ namespace RandomX { emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx } - void JitCompilerX86::gencr(Instruction& instr) { + void JitCompilerX86::gencr(Instruction& instr, bool rax = true) { switch (instr.locc & 7) { case 0: - scratchpadStoreR(instr, ScratchpadL2); + scratchpadStoreR(instr, ScratchpadL2, rax); break; case 1: case 2: case 3: - scratchpadStoreR(instr, ScratchpadL1); + scratchpadStoreR(instr, ScratchpadL1, rax); break; default: emit(uint16_t(0x8b4c)); //mov - emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax + if (rax) { + emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax + } + else { + emitByte(0xc1 + 8 * (instr.regc % RegistersCount)); //regc, rcx + } break; } } @@ -322,29 +308,21 @@ namespace RandomX { emitByte(0xc6); } - void JitCompilerX86::gencf(Instruction& instr, bool alwaysLow = false) { + void JitCompilerX86::gencf(Instruction& instr) { int regc = (instr.regc % RegistersCount); - if (!alwaysLow) { - if (regc <= 1) { - emitByte(0x44); //REX - } - emit(uint16_t(0x280f)); //movaps - emitByte(0xc0 + 8 * regc); // regc, xmm0 + if (regc <= 1) { + emitByte(0x44); //REX } - switch (instr.locc & 7) + emit(uint16_t(0x280f)); //movaps + emitByte(0xc0 + 8 * regc); // regc, xmm0 + if (instr.locc & 4) //C.LOC.R { - case 4: - scratchpadStoreF(instr, regc, ScratchpadL2, !alwaysLow && (instr.locc & 8)); - break; - - case 5: - case 6: - case 7: - scratchpadStoreF(instr, regc, ScratchpadL1, !alwaysLow && (instr.locc & 8)); - break; - - default: - break; + if (instr.locc & 3) { //C.LOC.W + scratchpadStoreF(instr, regc, ScratchpadL1, (instr.locc & 128)); //first 16 KiB of scratchpad + 
} + else { + scratchpadStoreF(instr, regc, ScratchpadL2, (instr.locc & 128)); //whole scratchpad + } } } @@ -596,24 +574,11 @@ namespace RandomX { void JitCompilerX86::h_FPROUND(Instruction& instr, int i) { genar(instr); - emit(0x81480de0c1c88b48); - emit(0x600025fffff800e1); - emit(uint16_t(0x0000)); - emitByte(0xf2); - int regc = (instr.regc % RegistersCount); - if (regc <= 1) { - emitByte(0x4c); //REX - } - else { - emitByte(0x48); //REX - } - emit(uint16_t(0x2a0f)); - emitByte(0xc1 + 8 * regc); - emitByte(0x0d); - emit(0xf824448900009fc0); - emit(0x2454ae0f); //ldmxcsr DWORD PTR [rsp-0x8] + emit(0x00250de0c1c88b48); //mov rcx,rax; shl eax,0xd + emit(0x00009fc00d000060); //and eax,0x6000; or eax,0x9fc0 + emit(0x2454ae0ff8244489); //ldmxcsr DWORD PTR [rsp-0x8] emitByte(0xf8); - gencf(instr, true); + gencr(instr, false); //result in rcx } static inline uint8_t jumpCondition(Instruction& instr, bool invert = false) { @@ -670,7 +635,7 @@ namespace RandomX { if ((instr.locc & 7) <= 3) { crlen = 17; } - emit(0x74e53b48); //cmp rsp, rbp; je + emit(0x74e73b48); //cmp rsp, rdi; je emitByte(11 + crlen); emitByte(0x48); emit(0x08244433); //xor rax,QWORD PTR [rsp+0x8] diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index e2c432c..cea067c 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -64,10 +64,10 @@ namespace RandomX { void genbr1(Instruction&, uint16_t, uint16_t); void genbr132(Instruction&, uint16_t, uint8_t); void genbf(Instruction&, uint8_t); - void scratchpadStoreR(Instruction&, uint32_t); + void scratchpadStoreR(Instruction&, uint32_t, bool); void scratchpadStoreF(Instruction&, int, uint32_t, bool); - void gencr(Instruction&); - void gencf(Instruction&, bool); + void gencr(Instruction&, bool); + void gencf(Instruction&); void generateCode(Instruction&, int); void fixCallOffsets(); diff --git a/src/asm/program_epilogue_store.inc b/src/asm/program_epilogue_store.inc index b7b779b..90b26ce 100644 --- a/src/asm/program_epilogue_store.inc 
+++ b/src/asm/program_epilogue_store.inc @@ -1,8 +1,9 @@ ;# unroll VM stack - mov rsp, rbp + mov rsp, rdi ;# save VM register values pop rcx + pop rcx mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 mov qword ptr [rcx+16], r10 diff --git a/src/asm/program_prologue_linux.inc b/src/asm/program_prologue_linux.inc index 8d09d88..6bc3bd2 100644 --- a/src/asm/program_prologue_linux.inc +++ b/src/asm/program_prologue_linux.inc @@ -7,9 +7,11 @@ push r15 ;# function arguments - push rdi ;# RegisterFile& registerFile - mov rbx, rsi ;# MemoryRegisters& memory - mov rsi, rdx ;# convertible_t* scratchpad + push rdi ;# RegisterFile& registerFile + mov rbp, qword ptr [rsi] ;# "mx", "ma" + mov rax, qword ptr [rsi+8] ;# uint8_t* dataset + push rax + mov rsi, rdx ;# convertible_t* scratchpad mov rcx, rdi #include "program_prologue_load.inc" diff --git a/src/asm/program_prologue_load.inc b/src/asm/program_prologue_load.inc index df44c08..ef4f96e 100644 --- a/src/asm/program_prologue_load.inc +++ b/src/asm/program_prologue_load.inc @@ -1,5 +1,5 @@ - mov rbp, rsp ;# beginning of VM stack - mov rdi, 1048577 ;# number of VM instructions to execute + 1 + mov rdi, rsp ;# beginning of VM stack + mov ebx, 1048577 ;# number of VM instructions to execute + 1 xorps xmm10, xmm10 cmpeqpd xmm10, xmm10 diff --git a/src/asm/program_prologue_win64.inc b/src/asm/program_prologue_win64.inc index 6059904..bbf7851 100644 --- a/src/asm/program_prologue_win64.inc +++ b/src/asm/program_prologue_win64.inc @@ -15,9 +15,11 @@ movdqu xmmword ptr [rsp+0], xmm10 ;# function arguments - push rcx ;# RegisterFile& registerFile - mov rbx, rdx ;# MemoryRegisters& memory - mov rsi, r8 ;# convertible_t* scratchpad + push rcx ;# RegisterFile& registerFile + mov rbp, qword ptr [rdx] ;# "mx", "ma" + mov rax, qword ptr [rdx+8] ;# uint8_t* dataset + push rax + mov rsi, r8 ;# convertible_t* scratchpad include program_prologue_load.inc diff --git a/src/asm/program_read.inc b/src/asm/program_read.inc new file mode 100644 
index 0000000..adf8e92 --- /dev/null +++ b/src/asm/program_read.inc @@ -0,0 +1,32 @@ + push rcx ;# preserve ecx + db 0, 0, 0, 0 ;# TransformAddress placeholder + mov rax, qword ptr [rdi] ;# load the dataset address + xor rbp, rcx ;# modify "mx" + ;# prefetch cacheline "mx" + and rbp, -64 ;# align "mx" to the start of a cache line + mov edx, ebp ;# edx = mx + prefetchnta byte ptr [rax+rdx] + ;# read cacheline "ma" + ror rbp, 32 ;# swap "ma" and "mx" + mov edx, ebp ;# edx = ma + scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8 + lea rcx, [rsi+rcx*8] ;# scratchpad cache line + lea rax, [rax+rdx] ;# dataset cache line + mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now) + xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline + mov rdx, qword ptr [rax+8] + xor qword ptr [rcx+8], rdx + mov rdx, qword ptr [rax+16] + xor qword ptr [rcx+16], rdx + mov rdx, qword ptr [rax+24] + xor qword ptr [rcx+24], rdx + mov rdx, qword ptr [rax+32] + xor qword ptr [rcx+32], rdx + mov rdx, qword ptr [rax+40] + xor qword ptr [rcx+40], rdx + mov rdx, qword ptr [rax+48] + xor qword ptr [rcx+48], rdx + mov rdx, qword ptr [rax+56] + xor qword ptr [rcx+56], rdx + pop rcx ;# restore ecx + ret \ No newline at end of file diff --git a/src/asm/program_read_f.inc b/src/asm/program_read_f.inc deleted file mode 100644 index 1d70dab..0000000 --- a/src/asm/program_read_f.inc +++ /dev/null @@ -1,13 +0,0 @@ - mov edx, dword ptr [rbx] ;# ma - mov rax, qword ptr [rbx+8] ;# dataset - cvtdq2pd xmm0, qword ptr [rax+rdx] - add dword ptr [rbx], 8 - xor ecx, dword ptr [rbx+4] ;# mx - mov dword ptr [rbx+4], ecx - test ecx, 65528 - jne short rx_read_dataset_f_ret - and ecx, -8 - mov dword ptr [rbx], ecx - prefetcht0 byte ptr [rax+rcx] -rx_read_dataset_f_ret: - ret 0 \ No newline at end of file diff --git a/src/asm/program_read_r.inc b/src/asm/program_read_r.inc deleted file 
mode 100644 index b3102dc..0000000 --- a/src/asm/program_read_r.inc +++ /dev/null @@ -1,13 +0,0 @@ - mov eax, dword ptr [rbx] ;# ma - mov rdx, qword ptr [rbx+8] ;# dataset - mov rax, qword ptr [rdx+rax] - add dword ptr [rbx], 8 - xor ecx, dword ptr [rbx+4] ;# mx - mov dword ptr [rbx+4], ecx - test ecx, 65528 - jne short rx_read_dataset_r_ret - and ecx, -8 - mov dword ptr [rbx], ecx - prefetcht0 byte ptr [rdx+rcx] -rx_read_dataset_r_ret: - ret 0 \ No newline at end of file diff --git a/src/common.hpp b/src/common.hpp index 12b74c1..acda52a 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -77,6 +77,7 @@ namespace RandomX { constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t); constexpr uint32_t ScratchpadL1 = ScratchpadSize / 16 / sizeof(convertible_t); constexpr uint32_t ScratchpadL2 = ScratchpadSize / sizeof(convertible_t); + constexpr uint32_t TransformationCount = 90; constexpr int RegistersCount = 8; class Cache; diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index ec39c60..841bb16 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -158,10 +158,14 @@ executeProgram PROC pslldq xmm7, 8 cvtsi2sd xmm7, qword ptr [rcx+112] - ; program body + jmp program_begin + ; program body +ALIGN 64 +program_begin: include program.inc +ALIGN 64 rx_finish: ; unroll the stack mov rsp, rdi diff --git a/src/main.cpp b/src/main.cpp index 81d49ec..a0ffc0a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -277,10 +277,6 @@ int main(int argc, char** argv) { if(programCount == 1000) std::cout << "Reference result: 3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl; std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl; - /*if (threadCount == 1 && !compiled) { - auto ivm = (RandomX::InterpretedVirtualMachine*)vms[0]; - std::cout << ivm->getProgam(); - }*/ } catch (std::exception& e) { std::cout << "ERROR: " << e.what() << std::endl; diff --git 
a/src/program.inc b/src/program.inc index 5dd1b4e..e078cc3 100644 --- a/src/program.inc +++ b/src/program.inc @@ -76,11 +76,13 @@ rx_body_3: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax shl eax, 13 and eax, 24576 or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] + mov r8, rcx rx_i_4: ;MULH_64 dec ebx @@ -153,7 +155,7 @@ rx_body_7: mov eax, r14d xor eax, 057c8c41bh and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm6 + movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_8: ;SHL_64 dec ebx @@ -218,7 +220,7 @@ rx_body_11: mov eax, r12d xor eax, 0852d40d8h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm4 + movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_12: ;CALL dec ebx @@ -355,7 +357,7 @@ rx_body_18: mov eax, r11d xor eax, 0869baa81h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm3 + movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_19: ;FPSUB dec ebx @@ -372,7 +374,7 @@ rx_body_19: subpd xmm0, xmm8 movaps xmm7, xmm0 -rx_i_20: ;FPMUL +rx_i_20: ;FPSUB dec ebx jz rx_finish xor r13, 0ecca967dh @@ -383,15 +385,12 @@ rx_i_20: ;FPMUL rx_body_20: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm2 movaps xmm7, xmm0 mov eax, r15d xor eax, 0aad81365h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm7 + movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_21: ;FPADD dec ebx @@ -482,7 +481,7 @@ rx_body_25: mov eax, r14d xor eax, 0baf5c2d4h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm6 + movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_26: ;IMUL_32 dec ebx @@ -580,7 +579,7 @@ rx_body_31: mov eax, r14d xor eax, 01e2da792h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm6 + movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_32: ;XOR_64 dec ebx @@ -668,7 +667,7 @@ rx_body_36: andps xmm0, xmm1 movaps xmm7, xmm0 -rx_i_37: ;FPMUL +rx_i_37: ;FPSUB dec ebx jz rx_finish xor r12, 0d0706601h @@ -679,10 +678,7 @@ rx_i_37: ;FPMUL rx_body_37: and ecx, 
2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm2 movaps xmm9, xmm0 mov eax, r9d xor eax, 0bca81c78h @@ -764,7 +760,7 @@ taken_call_41: push rax call rx_i_127 -rx_i_42: ;FPSUB +rx_i_42: ;FPADD dec ebx jz rx_finish xor r15, 0bc1de9f6h @@ -776,7 +772,7 @@ rx_body_42: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm6 + addpd xmm0, xmm6 movaps xmm6, xmm0 rx_i_43: ;SUB_64 @@ -887,7 +883,7 @@ rx_body_48: and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm9 -rx_i_49: ;FPMUL +rx_i_49: ;FPSUB dec ebx jz rx_finish xor r8, 0f96c6a45h @@ -898,10 +894,7 @@ rx_i_49: ;FPMUL rx_body_49: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm3 movaps xmm5, xmm0 rx_i_50: ;OR_32 @@ -1018,7 +1011,7 @@ rx_body_55: mov eax, r11d xor eax, 07c79cddh and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm3 + movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_56: ;AND_64 dec ebx @@ -1144,7 +1137,7 @@ taken_call_61: push rax call rx_i_120 -rx_i_62: ;FPMUL +rx_i_62: ;FPSUB dec ebx jz rx_finish xor r15, 0c3089414h @@ -1155,17 +1148,14 @@ rx_i_62: ;FPMUL rx_body_62: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm8 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm8 movaps xmm2, xmm0 mov eax, r10d xor eax, 05c4789e3h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm2 + movhpd qword ptr [rsi + rax * 8], xmm2 -rx_i_63: ;FPMUL +rx_i_63: ;FPSUB dec ebx jz rx_finish xor r9, 065cf272eh @@ -1176,10 +1166,7 @@ rx_i_63: ;FPMUL rx_body_63: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm7 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm7 movaps xmm8, xmm0 rx_i_64: ;SUB_64 @@ -1253,7 +1240,7 @@ taken_call_67: push rax call rx_i_79 -rx_i_68: ;FPSUB +rx_i_68: ;FPADD dec ebx jz rx_finish xor r13, 03aa5c3a4h 
@@ -1264,7 +1251,7 @@ rx_i_68: ;FPSUB rx_body_68: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm2 + addpd xmm0, xmm2 movaps xmm4, xmm0 mov eax, r12d xor eax, 03c51ef39h @@ -1354,11 +1341,16 @@ rx_i_73: ;FPROUND rx_body_73: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax shl eax, 13 and eax, 24576 or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] + mov eax, r10d + xor eax, 040624270h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_74: ;MUL_64 dec ebx @@ -1722,7 +1714,7 @@ rx_body_93: mov eax, r10d xor eax, 07e48a0d8h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm2 + movhpd qword ptr [rsi + rax * 8], xmm2 rx_i_94: ;RET dec ebx @@ -1830,7 +1822,7 @@ rx_body_99: mov eax, r12d xor eax, 04c21df83h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm4 + movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_100: ;ADD_64 dec ebx @@ -1955,7 +1947,7 @@ rx_body_106: mov eax, r12d xor eax, 03cb2505h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm4 + movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_107: ;CALL dec ebx @@ -1999,7 +1991,7 @@ rx_body_108: mov eax, r9d xor eax, 0678b65beh and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm9 + movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_109: ;FPADD dec ebx @@ -2207,7 +2199,7 @@ rx_body_120: addpd xmm0, xmm4 movaps xmm8, xmm0 -rx_i_121: ;FPMUL +rx_i_121: ;FPSUB dec ebx jz rx_finish xor r9, 03ab8f73h @@ -2218,10 +2210,7 @@ rx_i_121: ;FPMUL rx_body_121: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm5 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm5 movaps xmm8, xmm0 rx_i_122: ;RET @@ -2813,7 +2802,7 @@ rx_body_153: mov eax, r8d xor eax, 09111c981h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm8 + movhpd qword ptr [rsi + rax * 8], xmm8 rx_i_154: ;MUL_32 dec ebx @@ -3196,11 +3185,13 @@ rx_i_174: ;FPROUND rx_body_174: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax shl eax, 13 and eax, 24576 or eax, 
40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] + mov r14, rcx rx_i_175: ;SAR_64 dec ebx @@ -3431,7 +3422,7 @@ rx_body_187: andps xmm0, xmm1 movaps xmm5, xmm0 -rx_i_188: ;FPMUL +rx_i_188: ;FPSUB dec ebx jz rx_finish xor r9, 04659becbh @@ -3443,10 +3434,7 @@ rx_body_188: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm3 movaps xmm4, xmm0 rx_i_189: ;FPROUND @@ -3460,11 +3448,16 @@ rx_i_189: ;FPROUND rx_body_189: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax shl eax, 13 and eax, 24576 or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] + mov eax, r13d + xor eax, 0e6f1a3b7h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_190: ;RET dec ebx @@ -3761,7 +3754,7 @@ rx_body_205: andps xmm0, xmm1 movaps xmm5, xmm0 -rx_i_206: ;FPMUL +rx_i_206: ;FPSUB dec ebx jz rx_finish xor r11, 0e836a177h @@ -3773,10 +3766,7 @@ rx_body_206: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm7 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm7 movaps xmm4, xmm0 rx_i_207: ;AND_32 @@ -4085,7 +4075,7 @@ rx_body_223: mov eax, r10d xor eax, 07fca59eeh and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm2 + movhpd qword ptr [rsi + rax * 8], xmm2 rx_i_224: ;SAR_64 dec ebx @@ -4171,7 +4161,7 @@ rx_body_227: mov eax, r11d xor eax, 0aabe2a0ah and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm3 + movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_228: ;CALL dec ebx @@ -4313,11 +4303,16 @@ rx_i_234: ;FPROUND rx_body_234: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax shl eax, 13 and eax, 24576 or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] + mov eax, r12d + xor eax, 04d2e9e7dh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_235: ;IMUL_32 dec ebx @@ -4438,7 +4433,7 @@ rx_body_241: mov eax, r15d xor eax, 0bc2423ebh and eax, 2047 - movhpd 
qword ptr [rsi + rax * 8], xmm7 + movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_242: ;MULH_64 dec ebx @@ -4734,7 +4729,7 @@ rx_body_257: mov eax, r11d xor eax, 0373b1b6fh and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm3 + movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_258: ;MUL_32 dec ebx @@ -4771,7 +4766,7 @@ rx_body_259: addpd xmm0, xmm9 movaps xmm3, xmm0 -rx_i_260: ;FPMUL +rx_i_260: ;FPSUB dec ebx jz rx_finish xor r13, 0f94e9fa9h @@ -4783,10 +4778,7 @@ rx_body_260: xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm5 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm5 movaps xmm9, xmm0 rx_i_261: ;FPSQRT @@ -4806,7 +4798,7 @@ rx_body_261: mov eax, r11d xor eax, 0745a48e9h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm3 + movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_262: ;OR_32 dec ebx @@ -5044,7 +5036,7 @@ rx_body_274: mov eax, r14d xor eax, 06a2b2b5bh and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm6 + movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_275: ;OR_64 dec ebx @@ -5121,7 +5113,7 @@ rx_body_278: mov eax, r12d xor eax, 02d00ad10h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm4 + movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_279: ;FPSUB dec ebx @@ -5139,7 +5131,7 @@ rx_body_279: mov eax, r9d xor eax, 0475ade01h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm9 + movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_280: ;AND_64 dec ebx @@ -5210,7 +5202,7 @@ rx_body_283: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_284: ;FPSUB +rx_i_284: ;FPADD dec ebx jz rx_finish xor r15, 0e68f36ach @@ -5222,7 +5214,7 @@ rx_body_284: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm6 + addpd xmm0, xmm6 movaps xmm9, xmm0 mov eax, r9d xor eax, 0936f2960h @@ -5313,7 +5305,7 @@ rx_body_289: andps xmm0, xmm1 movaps xmm8, xmm0 -rx_i_290: ;FPMUL +rx_i_290: ;FPSUB dec ebx jz rx_finish xor r15, 060665748h @@ -5324,10 +5316,7 @@ rx_i_290: ;FPMUL rx_body_290: and ecx, 32767 
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm8 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm8 movaps xmm9, xmm0 rx_i_291: ;RET @@ -5531,7 +5520,7 @@ rx_body_301: mov eax, r15d xor eax, 0433cf2d6h and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm7 + movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_302: ;ADD_64 dec ebx @@ -5937,7 +5926,7 @@ rx_body_324: mov eax, r9d xor eax, 0944856d4h and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm9 + movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_325: ;SHL_64 dec ebx @@ -6076,7 +6065,7 @@ rx_body_332: mov eax, r11d xor eax, 0116c919eh and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm3 + movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_333: ;XOR_64 dec ebx @@ -6222,7 +6211,7 @@ rx_body_341: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_342: ;FPMUL +rx_i_342: ;FPSUB dec ebx jz rx_finish xor r9, 09ccc7abah @@ -6233,10 +6222,7 @@ rx_i_342: ;FPMUL rx_body_342: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm2 movaps xmm3, xmm0 rx_i_343: ;SHR_64 @@ -6258,7 +6244,7 @@ rx_body_343: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_344: ;FPMUL +rx_i_344: ;FPSUB dec ebx jz rx_finish xor r10, 03ef9bcc4h @@ -6269,10 +6255,7 @@ rx_i_344: ;FPMUL rx_body_344: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm6 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm6 movaps xmm5, xmm0 rx_i_345: ;MULH_64 @@ -6343,7 +6326,7 @@ rx_body_348: mov eax, r9d xor eax, 039c35461h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm9 + movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_349: ;XOR_32 dec ebx @@ -6413,9 +6396,9 @@ rx_body_352: mov eax, r10d xor eax, 03bf686f2h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm2 + movlpd qword ptr [rsi + rax * 8], xmm2 -rx_i_353: ;FPMUL +rx_i_353: ;FPSUB dec ebx jz rx_finish xor r13, 02e65278bh @@ -6426,15 +6409,12 @@ 
rx_i_353: ;FPMUL rx_body_353: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm2 movaps xmm7, xmm0 mov eax, r15d xor eax, 0b3c9f7aeh and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm7 + movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_354: ;MULH_64 dec ebx @@ -6535,7 +6515,7 @@ rx_body_359: mov eax, r12d xor eax, 0f16b9be3h and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm4 + movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_360: ;FPMUL dec ebx @@ -6570,7 +6550,7 @@ rx_body_361: mov eax, r14d xor eax, 0ad0b81f5h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm6 + movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_362: ;SUB_64 dec ebx @@ -6726,7 +6706,7 @@ rx_body_370: mov eax, r14d xor eax, 0a120e0edh and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm6 + movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_371: ;FPADD dec ebx @@ -6948,7 +6928,7 @@ rx_body_383: mov eax, r13d xor eax, 0c9f5cc22h and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm5 + movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_384: ;SHR_64 dec ebx @@ -7256,7 +7236,7 @@ rx_body_400: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_401: ;FPMUL +rx_i_401: ;FPSUB dec ebx jz rx_finish xor r13, 032e81f25h @@ -7267,15 +7247,12 @@ rx_i_401: ;FPMUL rx_body_401: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm4 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm4 movaps xmm6, xmm0 mov eax, r14d xor eax, 03ea60344h and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm6 + movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_402: ;RET dec ebx @@ -7382,13 +7359,15 @@ rx_i_406: ;FPROUND rx_body_406: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax shl eax, 13 and eax, 24576 or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] + mov r9, rcx -rx_i_407: ;FPMUL +rx_i_407: ;FPSUB dec ebx jz rx_finish xor r14, 09699566fh @@ -7400,10 +7379,7 @@ rx_body_407: xor 
rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm9 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm9 movaps xmm8, xmm0 rx_i_408: ;MUL_64 @@ -7493,7 +7469,7 @@ rx_body_412: mov eax, r11d xor eax, 0bbd2640ah and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm3 + movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_413: ;FPDIV dec ebx @@ -7704,7 +7680,7 @@ rx_body_424: mov eax, r9d xor eax, 0565ae8aah and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm9 + movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_425: ;IMUL_32 dec ebx @@ -7887,7 +7863,7 @@ rx_body_434: mov eax, r9d xor eax, 08c1cfc74h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm9 + movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_435: ;MUL_64 dec ebx @@ -8068,7 +8044,7 @@ not_taken_ret_443: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_444: ;FPMUL +rx_i_444: ;FPSUB dec ebx jz rx_finish xor r8, 042455dd8h @@ -8079,15 +8055,12 @@ rx_i_444: ;FPMUL rx_body_444: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm7 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm7 movaps xmm5, xmm0 mov eax, r13d xor eax, 0ce416070h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm5 + movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_445: ;ADD_64 dec ebx @@ -8128,7 +8101,7 @@ rx_body_446: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_447: ;FPSUB +rx_i_447: ;FPADD dec ebx jz rx_finish xor r8, 01596d0e8h @@ -8139,12 +8112,12 @@ rx_i_447: ;FPSUB rx_body_447: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm7 + addpd xmm0, xmm7 movaps xmm5, xmm0 mov eax, r13d xor eax, 0b384d4afh and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm5 + movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_448: ;FPSUB dec ebx @@ -8668,7 +8641,7 @@ rx_body_477: mov eax, r14d xor eax, 0e81fc7a6h and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm6 + movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_478: ;MUL_64 dec ebx @@ -9143,7 +9116,7 @@ 
rx_body_504: and eax, 32767 movhpd qword ptr [rsi + rax * 8], xmm4 -rx_i_505: ;FPMUL +rx_i_505: ;FPSUB dec ebx jz rx_finish xor r12, 032c0a28ah @@ -9154,17 +9127,14 @@ rx_i_505: ;FPMUL rx_body_505: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm4 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm4 movaps xmm8, xmm0 mov eax, r8d xor eax, 021b54eaeh and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm8 + movhpd qword ptr [rsi + rax * 8], xmm8 -rx_i_506: ;FPMUL +rx_i_506: ;FPSUB dec ebx jz rx_finish xor r9, 0a973d58ch @@ -9175,10 +9145,7 @@ rx_i_506: ;FPMUL rx_body_506: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - mulpd xmm0, xmm9 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 + subpd xmm0, xmm9 movaps xmm3, xmm0 rx_i_507: ;RET @@ -9238,7 +9205,7 @@ taken_call_509: push rax call rx_i_42 -rx_i_510: ;FPSUB +rx_i_510: ;FPADD dec ebx jz rx_finish xor r8, 0db65513ch @@ -9249,7 +9216,7 @@ rx_i_510: ;FPSUB rx_body_510: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm2 + addpd xmm0, xmm2 movaps xmm9, xmm0 rx_i_511: ;ROL_64 diff --git a/src/virtualMemory.cpp b/src/virtualMemory.cpp index 766fda3..e6e44fc 100644 --- a/src/virtualMemory.cpp +++ b/src/virtualMemory.cpp @@ -74,21 +74,21 @@ void setPrivilege(const char* pszPrivilege, BOOL bEnable) { } #endif -void* allocExecutableMemory(size_t bytes) { +void* allocExecutableMemory(std::size_t bytes) { void* mem; #ifdef _WIN32 mem = VirtualAlloc(nullptr, bytes, MEM_COMMIT, PAGE_EXECUTE_READWRITE); if (mem == nullptr) throw std::runtime_error(getErrorMessage("allocExecutableMemory - VirtualAlloc")); #else - mem = mmap(nullptr, CodeSize, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (mem == MAP_FAILED) throw std::runtime_error("allocExecutableMemory - mmap failed"); #endif return mem; } -void* 
allocLargePagesMemory(std::size_t bytes) { void* mem; #ifdef _WIN32 setPrivilege("SeLockMemoryPrivilege", 1); diff --git a/src/virtualMemory.hpp b/src/virtualMemory.hpp index dd150d3..c80d33e 100644 --- a/src/virtualMemory.hpp +++ b/src/virtualMemory.hpp @@ -19,5 +19,7 @@ along with RandomX. If not, see. #pragma once -void* allocExecutableMemory(size_t); -void* allocLargePagesMemory(size_t); \ No newline at end of file +#include <cstddef> + +void* allocExecutableMemory(std::size_t); +void* allocLargePagesMemory(std::size_t); \ No newline at end of file From 6941b2cb697d4ae917111b850cd34d98fd8bab87 Mon Sep 17 00:00:00 2001 From: tevador Date: Thu, 10 Jan 2019 23:36:53 +0100 Subject: [PATCH 08/35] Reworked instruction set documentation --- doc/isa-ops.md | 129 ++++++++++++++++++ doc/isa.md | 362 ++++++++++++++++++++++--------------------------- 2 files changed, 294 insertions(+), 197 deletions(-) create mode 100644 doc/isa-ops.md diff --git a/doc/isa-ops.md b/doc/isa-ops.md new file mode 100644 index 0000000..fd5f286 --- /dev/null +++ b/doc/isa-ops.md @@ -0,0 +1,129 @@ + +# RandomX instruction listing +There are 31 unique instructions divided into 3 groups: + +|group|# operations|# opcodes|| +|---------|-----------------|----|-| +|integer (IA)|22|144|56.3%| +|floating point (FP)|5|76|29.7%| +|control (CL)|4|36|14.0%| +||**31**|**256**|**100%**| + + +## Integer instructions +There are 22 integer instructions. They are divided into 3 classes (MATH, DIV, SHIFT) with different B operand selection rules. 
+|# opcodes|instruction|class|signed|A width|B width|C|C width| +|-|-|-|-|-|-|-|-| +|12|ADD_64|MATH|no|64|64|`A + B`|64| +|2|ADD_32|MATH|no|32|32|`A + B`|32| +|12|SUB_64|MATH|no|64|64|`A - B`|64| +|2|SUB_32|MATH|no|32|32|`A - B`|32| +|21|MUL_64|MATH|no|64|64|`A * B`|64| +|10|MULH_64|MATH|no|64|64|`A * B`|64| +|15|MUL_32|MATH|no|32|32|`A * B`|64| +|15|IMUL_32|MATH|yes|32|32|`A * B`|64| +|10|IMULH_64|MATH|yes|64|64|`A * B`|64| +|4|DIV_64|DIV|no|64|32|`A / B`|64| +|4|IDIV_64|DIV|yes|64|32|`A / B`|64| +|4|AND_64|MATH|no|64|64|`A & B`|64| +|2|AND_32|MATH|no|32|32|`A & B`|32| +|4|OR_64|MATH|no|64|64|`A | B`|64| +|2|OR_32|MATH|no|32|32|`A | B`|32| +|4|XOR_64|MATH|no|64|64|`A ^ B`|64| +|2|XOR_32|MATH|no|32|32|`A ^ B`|32| +|3|SHL_64|SHIFT|no|64|6|`A << B`|64| +|3|SHR_64|SHIFT|no|64|6|`A >> B`|64| +|3|SAR_64|SHIFT|yes|64|6|`A >> B`|64| +|6|ROL_64|SHIFT|no|64|6|`A <<< B`|64| +|6|ROR_64|SHIFT|no|64|6|`A >>> B`|64| + +#### 32-bit operations +Instructions ADD_32, SUB_32, AND_32, OR_32, XOR_32 only use the low-order 32 bits of the input operands. The result of these operations is 32 bits long and bits 32-63 of C are set to zero. + +#### Multiplication +There are 5 different multiplication operations. MUL_64 and MULH_64 both take 64-bit unsigned operands, but MUL_64 produces the low 64 bits of the result and MULH_64 produces the high 64 bits. MUL_32 and IMUL_32 use only the low-order 32 bits of the operands and produce a 64-bit result. The signed variant interprets the arguments as signed integers. IMULH_64 takes two 64-bit signed operands and produces the high-order 64 bits of the result. + +#### Division +For the division instructions, the dividend is 64 bits long and the divisor 32 bits long. The IDIV_64 instruction interprets both operands as signed integers. In case of division by zero or signed overflow, the result is equal to the dividend `A`. + +75% of division instructions use a runtime-constant divisor and can be optimized using a multiplication and shifts. 
+ +#### Shift and rotate +The shift/rotate instructions use just the bottom 6 bits of the `B` operand (`imm8` is used as the immediate value). All treat `A` as unsigned except SAR_64, which performs an arithmetic right shift by copying the sign bit. + +## Floating point instructions +There are 5 floating point instructions. All floating point instructions are vector instructions that operate on two packed double precision floating point values. + +|# opcodes|instruction|C| +|-|-|-| +|20|FPADD|`A + B`| +|20|FPSUB|`A - B`| +|22|FPMUL|`A * B`| +|8|FPDIV|`A / B`| +|6|FPSQRT|`sqrt(abs(A))`| + +#### Conversion of operand A +Operand A is loaded from memory as a 64-bit value. All floating point instructions interpret A as two packed 32-bit signed integers and convert them into two packed double precision floating point values. + +#### Rounding +FPU instructions conform to the IEEE-754 specification, so they must give correctly rounded results. Initial rounding mode is *roundTiesToEven*. Rounding mode can be changed by the `FPROUND` control instruction. Denormal values must always be flushed to zero. + +#### NaN +If an operation produces NaN, the result is converted into positive zero. NaN results may never be written into registers or memory. Only division and multiplication must be checked for NaN results (`0.0 / 0.0` and `0.0 * Infinity` result in NaN). + +## Control instructions +There are 4 control instructions. + +|# opcodes|instruction|description|condition| +|-|-|-|-| +|2|FPROUND|change floating point rounding mode|- +|11|JUMP|conditional jump|(see condition table below) +|11|CALL|conditional procedure call|(see condition table below) +|12|RET|return from procedure|stack is not empty + +All control instructions behave as an 'arithmetic no-op' and simply copy the input operand A into the destination C. 
+ +The JUMP and CALL instructions use a condition function, which takes the lower 32 bits of operand B (register) and the value `imm32` and evaluates a condition based on the `B.LOC.C` flag: + +|`B.LOC.C`|signed|jump condition|probability|*x86*|*ARM* +|---|---|----------|-----|--|----| +|0|no|`B <= imm32`|0% - 100%|`JBE`|`BLS` +|1|no|`B > imm32`|0% - 100%|`JA`|`BHI` +|2|yes|`B - imm32 < 0`|50%|`JS`|`BMI` +|3|yes|`B - imm32 >= 0`|50%|`JNS`|`BPL` +|4|yes|`B - imm32` overflows|0% - 50%|`JO`|`BVS` +|5|yes|`B - imm32` doesn't overflow|50% - 100%|`JNO`|`BVC` +|6|yes|`B < imm32`|0% - 100%|`JL`|`BLT` +|7|yes|`B >= imm32`|0% - 100%|`JGE`|`BGE` + +The 'signed' column specifies if the operands are interpreted as signed or unsigned 32-bit numbers. Column 'probability' lists the expected jump probability (range means that the actual value for a specific instruction depends on `imm32`). *Columns 'x86' and 'ARM' list the corresponding hardware instructions (following a `CMP` instruction).* + +### FPROUND +The FPROUND instruction changes the rounding mode for all subsequent FPU operations depending on a two-bit flag. The flag is calculated by rotating A `imm8` bits to the right and taking the two least-significant bits: + +``` +rounding flag = (A >>> imm8)[1:0] +``` + +|rounding flag|rounding mode| +|-------|------------| +|00|roundTiesToEven| +|01|roundTowardNegative| +|10|roundTowardPositive| +|11|roundTowardZero| + +The rounding modes are defined by the IEEE-754 standard. + +*The two-bit flag value exactly corresponds to bits 13-14 of the x86 `MXCSR` register and bits 23 and 22 (reversed) of the ARM `FPSCR` register.* + +### JUMP +If the jump condition is `true`, the JUMP instruction performs a forward jump relative to the value of `pc`. The forward offset is equal to `16 * (imm8[6:0] + 1)` bytes (1-128 instructions forward). 
+ +### CALL +If the jump condition is `true`, the CALL instruction pushes the value of `pc` (program counter) onto the stack and then performs a forward jump relative to the value of `pc`. The forward offset is equal to `16 * (imm8[6:0] + 1)` bytes (1-128 instructions forward). + +### RET +If the stack is not empty, the RET instruction pops the return address from the stack (it's the instruction following the previous CALL) and jumps to it. + +## Reference implementation +A portable C++ implementation of all integer and floating point instructions is available in [instructionsPortable.cpp](../src/instructionsPortable.cpp). \ No newline at end of file diff --git a/doc/isa.md b/doc/isa.md index 0c0ab7b..4f1cc5d 100644 --- a/doc/isa.md +++ b/doc/isa.md @@ -1,213 +1,181 @@ +# RandomX instruction encoding +The instruction set was designed in such a way that any random 16-byte word is a valid instruction and any sequence of valid instructions is a valid program. There are no syntax rules. -## RandomX instruction set -RandomX uses a simple low-level language (instruction set), which was designed so that any random bitstring forms a valid program. +The encoding of each 128-bit instruction word is as follows: -Each RandomX instruction has a length of 128 bits. The encoding is following: +![Imgur](https://i.imgur.com/xi8zuAZ.png) -![Imgur](https://i.imgur.com/mbndESz.png) +## opcode +There are 256 opcodes, which are distributed between 3 groups of instructions. There are 31 distinct operations (each operation can be encoded using multiple opcodes - for example opcodes `0x00` to `0x0d` correspond to integer addition). 
-*All flags are aligned to an 8-bit boundary for easier decoding.* +**Table 1: Instruction groups** +|group|# operations|# opcodes|| +|---------|-----------------|----|-| +|integer (IA)|22|144|56.3%| +|floating point (FP)|5|76|29.7%| +|control (CL)|4|36|14.0% +||**31**|**256**|**100%** -#### Opcode -There are 256 opcodes, which are distributed between 30 instructions based on their weight (how often they will occur in the program on average). Instructions are divided into 5 groups: +Full description of all instructions: [isa-ops.md](isa-ops.md). -|group|number of opcodes||comment| -|---------|-----------------|----|------| -|IA|115|44.9%|integer arithmetic operations -|IS|21|8.2%|bitwise shift and rotate -|FA|70|27.4%|floating point arithmetic operations -|FS|8|3.1%|floating point single-input operations -|CF|42|16.4%|control flow instructions (branches) -||**256**|**100%** +## A.LOC +**Table 2: `A.LOC` encoding** -#### Operand A -The first 64-bit operand is read from memory. The location is determined by the `loc(a)` flag: +|bits|description| +|----|--------| +|0-1|`A.LOC.W` flag| +|2-5|Reserved| +|6-7|`A.LOC.X` flag| -|loc(a)[2:0]|read A from|address size (W) +The `A.LOC.W` flag determines the address width when reading operand A from the scratchpad: + +**Table 3: Operand A read address width** + +|`A.LOC.W`|address width (W) |---------|-|-| -|000|dataset|32 bits| -|001|dataset|32 bits| -|010|dataset|32 bits| -|011|dataset|32 bits| -|100|scratchpad|15 bits| -|101|scratchpad|11 bits| -|110|scratchpad|11 bits| -|111|scratchpad|11 bits| +|0|15 bits (256 KiB)| +|1-3|11 bits (16 KiB)| -Flag `reg(a)` encodes an integer register `r0`-`r7`. The read address is calculated as: +If the `A.LOC.W` flag is zero, the address space covers the whole 256 KiB scratchpad. Otherwise, just the first 16 KiB of the scratchpad are addressed. + +If the `A.LOC.X` flag is zero, the instruction mixes the scratchpad read address into the `mx` register using XOR. 
This mixing happens before the address is truncated to W bits (see pseudocode below). + +## A.REG +**Table 4: `A.REG` encoding** + +|bits|description| +|----|--------| +|0-2|`A.REG.R` flag| +|3-7|Reserved| + +The `A.REG.R` flag encodes "readAddressRegister", which is an integer register `r0`-`r7` to be used for scratchpad read address generation. Read address is generated as follows (pseudocode): + +```python +readAddressRegister = IntegerRegister(A.REG.R) +readAddressRegister = readAddressRegister XOR SignExtend(A.mask32) +readAddress = readAddressRegister[31:0] +# dataset is read if the ic register is divisible by 64 +IF ic mod 64 == 0: + DatasetRead(readAddress) +# optional mixing into the mx register +IF A.LOC.X == 0: + mx = mx XOR readAddress +# truncate to W bits +W = GetAddressWidth(A.LOC.W) +readAddress = readAddress[W-1:0] ``` -reg(a) = reg(a) XOR signExtend(addr(a)) -read_addr = reg(a)[W-1:0] + +Note that the value of the read address register is modified during address generation. + +## B.LOC +**Table 5: `B.LOC` encoding** + +|bits|description| +|----|--------| +|0-1|`B.LOC.L` flag| +|0-2|`B.LOC.C` flag| +|3-7|Reserved| + +The `B.LOC.L` flag determines the B operand. It can be either a register or immediate value. + +**Table 6: Operand B** + +|`B.LOC.L`|IA/DIV|IA/SHIFT|IA/MATH|FP|CL| +|----|--------|----|------|----|---| +|0|register|register|register|register|register| +|1|`imm32`|register|register|register|register| +|2|`imm32`|`imm8`|register|register|register| +|3|`imm32`|`imm8`|`imm32`|register|register| + +Integer instructions are split into 3 classes: integer division (IA/DIV), shift and rotate (IA/SHIFT) and other (IA/MATH). Floating point (FP) and control (CL) instructions always use a register operand. + +Register to be used as operand B is encoded in the `B.REG.R` flag (see below). + +The `B.LOC.C` flag determines the condition for the JUMP and CALL instructions. The flag partially overlaps with the `B.LOC.L` flag. 
+ +## B.REG +**Table 7: `B.REG` encoding** + +|bits|description| +|----|--------| +|0-2|`B.REG.R` flag| +|3-7|Reserved| + +Register encoded by the `B.REG.R` flag depends on the instruction group: + +**Table 8: Register operands by group** + +|group|registers| +|----|--------| +|IA|`r0`-`r7`| +|FP|`f0`-`f7`| +|CL|`r0`-`r7`| + +## C.LOC +**Table 9: `C.LOC` encoding** + +|bits|description| +|----|--------| +|0-1|`C.LOC.W` flag| +|2|`C.LOC.R` flag| +|3-6|Reserved| +|7|`C.LOC.H` flag| + +The `C.LOC.W` flag determines the address width when writing operand C to the scratchpad: + +**Table 10: Operand C write address width** + +|`C.LOC.W`|address width (W) +|---------|-| +|0|15 bits (256 KiB)| +|1-3|11 bits (16 KiB)| + +If the `C.LOC.W` flag is zero, the address space covers the whole 256 KiB scratchpad. Otherwise, just the first 16 KiB of the scratchpad are addressed. + +The `C.LOC.R` flag determines the destination where operand C is written: + +**Table 11: Operand C destination** + +|`C.LOC.R`|groups IA, CL|group FP +|---------|-|-| +|0|scratchpad|register +|1|register|register + scratchpad + +Integer and control instructions (groups IA and CL) write either to the scratchpad or to a register. Floating point instructions always write to a register and can also write to the scratchpad. In that case, flag `C.LOC.H` determines if the low or high half of the register is written: + +**Table 12: Floating point register write** + +|`C.LOC.H`|write bits| +|---------|----------| +|0|0-63| +|1|64-127| + +## C.REG +**Table 13: `C.REG` encoding** + +|bits|description| +|----|--------| +|0-2|`C.REG.R` flag| +|3-7|Reserved| + +The `C.REG.R` flag encodes both the write address register (if writing to the scratchpad) and the destination register (if writing to a register). The destination register depends on the instruction group (see Table 8). 
Write address is always generated from an integer register: + +```python +writeAddressRegister = IntegerRegister(C.REG.R) +writeAddress = writeAddressRegister[31:0] XOR C.mask32 +# truncate to W bits +W = GetAddressWidth(C.LOC.W) +writeAddress = writeAddress[W-1:0] ``` -`W` is the address width from the above table. For reading from the scratchpad, `read_addr` is multiplied by 8 for 8-byte aligned access. -#### Operand B -The second operand is loaded either from a register or from an immediate value encoded within the instruction. The `reg(b)` flag encodes an integer register (instruction groups IA and IS) or a floating point register (instruction group FA). Instruction group FS doesn't use operand B. +## imm8 +`imm8` is an 8-bit immediate value that is used as the B operand by IA/SHIFT instructions (see Table 6). Additionally, it's used by some control instructions. -|loc(b)[2:0]|B (IA)|B (IS)|B (FA)|B (FS) -|---------|-|-|-|-| -|000|integer `reg(b)`|integer `reg(b)`|floating point `reg(b)`|- -|001|integer `reg(b)`|integer `reg(b)`|floating point `reg(b)`|- -|010|integer `reg(b)`|integer `reg(b)`|floating point `reg(b)`|- -|011|integer `reg(b)`|integer `reg(b)`|floating point `reg(b)`|- -|100|integer `reg(b)`|`imm8`|floating point `reg(b)`|- -|101|integer `reg(b)`|`imm8`|floating point `reg(b)`|- -|110|`imm32`|`imm8`|floating point `reg(b)`|- -|111|`imm32`|`imm8`|floating point `reg(b)`|- +## A.mask32 +`A.mask32` is a 32-bit address mask that is used to calculate the read address for the A operand. It's sign-extended to 64 bits before use. -`imm8` is an 8-bit immediate value, which is used for shift and rotate integer instructions (group IS). Only bits 0-5 are used. +## imm32 +`imm32` is a 32-bit immediate value which is used for integer instructions from groups IA/DIV and IA/MATH (see Table 6). The immediate value is sign-extended for instructions that expect 64-bit operands. 
-`imm32` is a 32-bit immediate value which is used for integer instructions from group IA. - -Floating point instructions don't use immediate values. - -#### Operand C -The third operand is the location where the result is stored. It can be a register or a 64-bit scratchpad location, depending on the value of flag `loc(c)`. - -|loc\(c\)[2:0]|address size (W)| C (IA, IS)|C (FA, FS) -|---------|-|-|-|-|-| -|000|15 bits|scratchpad|floating point `reg(c)` -|001|11 bits|scratchpad|floating point `reg(c)` -|010|11 bits|scratchpad|floating point `reg(c)` -|011|11 bits|scratchpad|floating point `reg(c)` -|100|15 bits|integer `reg(c)`|floating point `reg(c)`, scratchpad -|101|11 bits|integer `reg(c)`|floating point `reg(c)`, scratchpad -|110|11 bits|integer `reg(c)`|floating point `reg(c)`, scratchpad -|111|11 bits|integer `reg(c)`|floating point `reg(c)`, scratchpad - -Integer operations write either to the scratchpad or to a register. Floating point operations always write to a register and can also write to the scratchpad. In that case, bit 3 of the `loc(c)` flag determines if the low or high half of the register is written: - -|loc\(c\)[3]|write to scratchpad| -|------------|-----------------------| -|0|floating point `reg(c)[63:0]` -|1|floating point `reg(c)[127:64]` - -The FPROUND instruction is an exception and always writes the low half of the register. - -For writing to the scratchpad, an integer register is always used to calculate the address: -``` -write_addr = 8 * (addr(c) XOR reg(c)[31:0])[W-1:0] -``` -*CPUs are typically designed for a 2:1 load:store ratio, so each VM instruction performs on average 1 memory read and 0.5 writes to memory.* - -#### imm8 -An 8-bit immediate value that is used as the shift/rotate count by group IS instructions and as the jump offset of the CALL instruction. - -#### addr(a) -A 32-bit address mask that is used to calculate the read address for the A operand. It's sign-extended to 64 bits. 
- -#### addr\(c\) -A 32-bit address mask that is used to calculate the write address for the C operand. `addr(c)` is equal to `imm32`. - -### ALU instructions - -|weight|instruction|group|signed|A width|B width|C|C width| -|-|-|-|-|-|-|-|-| -|10|ADD_64|IA|no|64|64|`A + B`|64| -|2|ADD_32|IA|no|32|32|`A + B`|32| -|10|SUB_64|IA|no|64|64|`A - B`|64| -|2|SUB_32|IA|no|32|32|`A - B`|32| -|21|MUL_64|IA|no|64|64|`A * B`|64| -|10|MULH_64|IA|no|64|64|`A * B`|64| -|15|MUL_32|IA|no|32|32|`A * B`|64| -|15|IMUL_32|IA|yes|32|32|`A * B`|64| -|10|IMULH_64|IA|yes|64|64|`A * B`|64| -|1|DIV_64|IA|no|64|32|`A / B`|32| -|1|IDIV_64|IA|yes|64|32|`A / B`|32| -|4|AND_64|IA|no|64|64|`A & B`|64| -|2|AND_32|IA|no|32|32|`A & B`|32| -|4|OR_64|IA|no|64|64|`A | B`|64| -|2|OR_32|IA|no|32|32|`A | B`|32| -|4|XOR_64|IA|no|64|64|`A ^ B`|64| -|2|XOR_32|IA|no|32|32|`A ^ B`|32| -|3|SHL_64|IS|no|64|6|`A << B`|64| -|3|SHR_64|IS|no|64|6|`A >> B`|64| -|3|SAR_64|IS|yes|64|6|`A >> B`|64| -|6|ROL_64|IS|no|64|6|`A <<< B`|64| -|6|ROR_64|IS|no|64|6|`A >>> B`|64| - -##### 32-bit operations -Instructions ADD_32, SUB_32, AND_32, OR_32, XOR_32 only use the low-order 32 bits of the input operands. The result of these operations is 32 bits long and bits 32-63 of C are set to zero. - -##### Multiplication -There are 5 different multiplication operations. MUL_64 and MULH_64 both take 64-bit unsigned operands, but MUL_64 produces the low 64 bits of the result and MULH_64 produces the high 64 bits. MUL_32 and IMUL_32 use only the low-order 32 bits of the operands and produce a 64-bit result. The signed variant interprets the arguments as signed integers. IMULH_64 takes two 64-bit signed operands and produces the high-order 64 bits of the result. - -##### Division -For the division instructions, the dividend is 64 bits long and the divisor 32 bits long. The IDIV_64 instruction interprets both operands as signed integers. In case of division by zero or signed overflow, the result is equal to the dividend `A`. 
- -*Division by zero can be handled without branching by a conditional move. Signed overflow happens only for the signed variant when the minimum negative value is divided by -1. This rare case must be handled in x86 (ARM produces the "correct" result).* - -##### Shift and rotate -The shift/rotate instructions use just the bottom 6 bits of the `B` operand (`imm8` is used as the immediate value). All treat `A` as unsigned except SAR_64, which performs an arithmetic right shift by copying the sign bit. - -### FPU instructions - -|weight|instruction|group|C| -|-|-|-|-| -|20|FPADD|FA|`A + B`| -|20|FPSUB|FA|`A - B`| -|22|FPMUL|FA|`A * B`| -|8|FPDIV|FA|`A / B`| -|6|FPSQRT|FS|`sqrt(abs(A))`| -|2|FPROUND|FS|`convertSigned52(A)`| - -All floating point instructions apart FPROUND are vector instructions that operate on two packed double precision floating point values. - -#### Conversion of operand A -Operand A is loaded from memory as a 64-bit value. All floating point instructions apart FPROUND interpret A as two packed 32-bit signed integers and convert them into two packed double precision floating point values. - -The FPROUND instruction has a scalar output and interprets A as a 64-bit signed integer. The 11 least-significant bits are cleared before conversion to a double precision format. This is done so the number fits exactly into the 52-bit mantissa without rounding. Output of FPROUND is always written into the lower half of the result register and only this lower half may be written into the scratchpad. - -#### Rounding -FPU instructions conform to the IEEE-754 specification, so they must give correctly rounded results. Initial rounding mode is *roundTiesToEven*. Rounding mode can be changed by the `FPROUND` instruction. Denormal values must be flushed to zero. - -#### NaN -If an operation produces NaN, the result is converted into positive zero. NaN results may never be written into registers or memory. 
Only division and multiplication must be checked for NaN results (`0.0 / 0.0` and `0.0 * Infinity` result in NaN). - -##### FPROUND -The FPROUND instruction changes the rounding mode for all subsequent FPU operations depending on the two least-significant bits of A. - -|A[1:0]|rounding mode| -|-------|------------| -|00|roundTiesToEven| -|01|roundTowardNegative| -|10|roundTowardPositive| -|11|roundTowardZero| - -The rounding modes are defined by the IEEE-754 standard. - -*The two-bit flag value exactly corresponds to bits 13-14 of the x86 `MXCSR` register and bits 23 and 22 (reversed) of the ARM `FPSCR` register.* - -### Control instructions -The following 2 control instructions are supported: - -|weight|instruction|function|condition| -|-|-|-|-| -|20|CALL|near procedure call|(see condition table below) -|22|RET|return from procedure|stack is not empty - -Both instructions are conditional. If the condition evaluates to `false`, CALL and RET behave as "arithmetic no-op" and simply copy operand A into destination C without jumping. - -##### CALL -The CALL instruction uses a condition function, which takes the lower 32 bits of integer register `reg(b)` and the value `imm32` and evaluates a condition based on the `loc(b)` flag: - -|loc(b)[2:0]|signed|jump condition|probability|*x86*|*ARM* -|---|---|----------|-----|--|----| -|000|no|`reg(b)[31:0] <= imm32`|0% - 100%|`JBE`|`BLS` -|001|no|`reg(b)[31:0] > imm32`|0% - 100%|`JA`|`BHI` -|010|yes|`reg(b)[31:0] - imm32 < 0`|50%|`JS`|`BMI` -|011|yes|`reg(b)[31:0] - imm32 >= 0`|50%|`JNS`|`BPL` -|100|yes|`reg(b)[31:0] - imm32` overflows|0% - 50%|`JO`|`BVS` -|101|yes|`reg(b)[31:0] - imm32` doesn't overflow|50% - 100%|`JNO`|`BVC` -|110|yes|`reg(b)[31:0] < imm32`|0% - 100%|`JL`|`BLT` -|111|yes|`reg(b)[31:0] >= imm32`|0% - 100%|`JGE`|`BGE` - -The 'signed' column specifies if the operands are interpreted as signed or unsigned 32-bit numbers. 
Column 'probability' lists the expected jump probability (range means that the actual value for a specific instruction depends on `imm32`). *Columns 'x86' and 'ARM' list the corresponding hardware instructions (following a `CMP` instruction).* - -Taken CALL instruction pushes the values `A` and `pc` (program counter) onto the stack and then performs a forward jump relative to the value of `pc`. The forward offset is equal to `16 * (imm8[6:0] + 1)`. Maximum jump distance is therefore 128 instructions forward (this means that at least 4 correctly spaced CALL instructions are needed to form a loop in the program). - -##### RET -The RET instruction is taken only if the stack is not empty. Taken RET instruction pops the return address `raddr` from the stack (it's the instruction following the previous CALL), then pops a return value `retval` from the stack and sets `C = A XOR retval`. Finally, the instruction jumps back to `raddr`. - -## Reference implementation -A portable C++ implementation of all ALU and FPU instructions is available in [instructionsPortable.cpp](../src/instructionsPortable.cpp). \ No newline at end of file +## C.mask32 +`C.mask32` is a 32-bit address mask that is used to calculate the write address for the C operand. `C.mask32` is equal to `imm32`. 
From 557241cd957f8a798805c1c71d43cc56057b6872 Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 11 Jan 2019 09:58:06 +0100 Subject: [PATCH 09/35] JUMP instruction --- src/AssemblyGeneratorX86.cpp | 9 + src/AssemblyGeneratorX86.hpp | 1 + src/Instruction.cpp | 1 + src/JitCompilerX86.cpp | 19 + src/JitCompilerX86.hpp | 1 + src/instructionWeights.hpp | 11 +- src/program.inc | 2264 ++++++++++++++++------------------ 7 files changed, 1092 insertions(+), 1214 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index c2394c9..619afd3 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -496,6 +496,14 @@ namespace RandomX { } } + void AssemblyGeneratorX86::h_JUMP(Instruction& instr, int i) { + genar(instr, i); + gencr(instr); + asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm32 << std::endl; + asmCode << "\t" << jumpCondition(instr); + asmCode << " rx_i_" << wrapInstr(i + (instr.imm8 & 127) + 2) << std::endl; + } + void AssemblyGeneratorX86::h_CALL(Instruction& instr, int i) { genar(instr, i); asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm32 << std::endl; @@ -554,6 +562,7 @@ namespace RandomX { INST_HANDLE(FPDIV) INST_HANDLE(FPSQRT) INST_HANDLE(FPROUND) + INST_HANDLE(JUMP) INST_HANDLE(CALL) INST_HANDLE(RET) }; diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index bf5238a..2a1be1b 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -77,6 +77,7 @@ namespace RandomX { void h_FPDIV(Instruction&, int); void h_FPSQRT(Instruction&, int); void h_FPROUND(Instruction&, int); + void h_JUMP(Instruction&, int); void h_CALL(Instruction&, int); void h_RET(Instruction&, int); }; diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 4ab128a..b668a81 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -63,6 +63,7 @@ namespace RandomX { INST_NAME(FPDIV) INST_NAME(FPSQRT) INST_NAME(FPROUND) + 
INST_NAME(JUMP) INST_NAME(CALL) INST_NAME(RET) }; diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index fda3746..c7f753b 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -603,6 +603,24 @@ namespace RandomX { } } + void JitCompilerX86::h_JUMP(Instruction& instr, int i) { + genar(instr); + gencr(instr); + emit(uint16_t(0x8141)); //cmp regb, imm32 + emitByte(0xf8 + (instr.regb % RegistersCount)); + emit(instr.imm32); + emitByte(0x0f); //near jump + emitByte(jumpCondition(instr) + 0x10); + i = wrapInstr(i + (instr.imm8 & 127) + 2); + if (i < instructionOffsets.size()) { + emit(instructionOffsets[i] - (codePos + 4)); + } + else { + callOffsets.push_back(CallOffset(codePos, i)); + codePos += 4; + } + } + void JitCompilerX86::h_CALL(Instruction& instr, int i) { genar(instr); emit(uint16_t(0x8141)); //cmp regb, imm32 @@ -677,6 +695,7 @@ namespace RandomX { INST_HANDLE(FPDIV) INST_HANDLE(FPSQRT) INST_HANDLE(FPROUND) + INST_HANDLE(JUMP) INST_HANDLE(CALL) INST_HANDLE(RET) }; diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index cea067c..e4277c6 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -110,6 +110,7 @@ namespace RandomX { void h_FPDIV(Instruction&, int); void h_FPSQRT(Instruction&, int); void h_FPROUND(Instruction&, int); + void h_JUMP(Instruction&, int); void h_CALL(Instruction&, int); void h_RET(Instruction&, int); }; diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index bb99ca7..39f8dec 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -19,9 +19,9 @@ along with RandomX. If not, see. #pragma once -#define WT_ADD_64 11 +#define WT_ADD_64 15 #define WT_ADD_32 2 -#define WT_SUB_64 11 +#define WT_SUB_64 15 #define WT_SUB_32 2 #define WT_MUL_64 23 #define WT_MULH_64 10 @@ -47,8 +47,9 @@ along with RandomX. If not, see. 
#define WT_FPDIV 8 #define WT_FPSQRT 6 #define WT_FPROUND 2 -#define WT_CALL 20 -#define WT_RET 22 +#define WT_JUMP 11 +#define WT_CALL 11 +#define WT_RET 12 constexpr int wtSum = WT_ADD_64 + WT_ADD_32 + WT_SUB_64 + WT_SUB_32 + \ @@ -56,7 +57,7 @@ WT_MUL_64 + WT_MULH_64 + WT_MUL_32 + WT_IMUL_32 + WT_IMULH_64 + \ WT_DIV_64 + WT_IDIV_64 + WT_AND_64 + WT_AND_32 + WT_OR_64 + \ WT_OR_32 + WT_XOR_64 + WT_XOR_32 + WT_SHL_64 + WT_SHR_64 + \ WT_SAR_64 + WT_ROL_64 + WT_ROR_64 + WT_FPADD + WT_FPSUB + WT_FPMUL \ -+ WT_FPDIV + WT_FPSQRT + WT_FPROUND + WT_CALL + WT_RET; ++ WT_FPDIV + WT_FPSQRT + WT_FPROUND + WT_JUMP + WT_CALL + WT_RET; static_assert(wtSum == 256, "Sum of instruction weights must be 256"); diff --git a/src/program.inc b/src/program.inc index e078cc3..2d4fc25 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,4 +1,4 @@ -rx_i_0: ;RET +rx_i_0: ;CALL dec ebx jz rx_finish xor r9, 0ca9788ah @@ -10,23 +10,19 @@ rx_body_0: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_0 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r12d - xor eax, 01a8e4171h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_0: + cmp r11d, 445530481 + jbe short taken_call_0 mov rcx, rax mov eax, r12d xor eax, 01a8e4171h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx + jmp rx_i_1 +taken_call_0: + push rax + call rx_i_30 -rx_i_1: ;AND_64 +rx_i_1: ;IMULH_64 dec ebx jz rx_finish xor r15, 06afc2fa4h @@ -37,10 +33,12 @@ rx_i_1: ;AND_64 rx_body_1: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and rax, r10 + mov rcx, r10 + imul rcx + mov rax, rdx mov r12, rax -rx_i_2: ;CALL +rx_i_2: ;JUMP dec ebx jz rx_finish xor r15, 097210f7bh @@ -52,19 +50,15 @@ rx_body_2: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r11d, 1348521207 - jno short taken_call_2 mov rcx, rax mov eax, r9d xor eax, 05060ccf7h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_3 -taken_call_2: - push rax - call rx_i_47 + cmp 
r11d, 1348521207 + jno rx_i_47 -rx_i_3: ;FPROUND +rx_i_3: ;FPDIV dec ebx jz rx_finish xor r13, 082c73195h @@ -75,14 +69,16 @@ rx_i_3: ;FPROUND rx_body_3: xor rbp, rcx and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] - mov rcx, rax - shl eax, 13 - and eax, 24576 - or eax, 40896 - mov dword ptr [rsp - 8], eax - ldmxcsr dword ptr [rsp - 8] - mov r8, rcx + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + divpd xmm0, xmm9 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm8, xmm0 + mov eax, r8d + xor eax, 06bb1a0b2h + and eax, 32767 + movhpd qword ptr [rsi + rax * 8], xmm8 rx_i_4: ;MULH_64 dec ebx @@ -104,7 +100,7 @@ rx_body_4: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_5: ;IMUL_32 +rx_i_5: ;MUL_32 dec ebx jz rx_finish xor r15, 0379f9ee0h @@ -116,8 +112,8 @@ rx_body_5: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r12d + mov ecx, eax + mov eax, r12d imul rax, rcx mov r12, rax @@ -157,7 +153,7 @@ rx_body_7: and eax, 32767 movlpd qword ptr [rsi + rax * 8], xmm6 -rx_i_8: ;SHL_64 +rx_i_8: ;XOR_64 dec ebx jz rx_finish xor r13, 068c1e5d2h @@ -168,14 +164,14 @@ rx_i_8: ;SHL_64 rx_body_8: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - shl rax, 47 + xor rax, 1344700093 mov rcx, rax mov eax, r12d xor eax, 050267ebdh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_9: ;AND_64 +rx_i_9: ;IMULH_64 dec ebx jz rx_finish xor r14, 085121c54h @@ -187,10 +183,12 @@ rx_body_9: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and rax, 565870810 + mov rcx, 565870810 + imul rcx + mov rax, rdx mov r10, rax -rx_i_10: ;OR_64 +rx_i_10: ;AND_64 dec ebx jz rx_finish xor r8, 052efde3eh @@ -201,7 +199,7 @@ rx_i_10: ;OR_64 rx_body_10: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or rax, -727859809 + and rax, -727859809 mov r13, rax rx_i_11: ;FPADD @@ -222,7 +220,7 @@ rx_body_11: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm4 -rx_i_12: ;CALL +rx_i_12: ;FPSQRT dec ebx jz rx_finish xor r10, 0db2691ch @@ -232,16 
+230,15 @@ rx_i_12: ;CALL call rx_read_l2 rx_body_12: and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] - cmp r8d, -1763940407 - jge short taken_call_12 - mov r8, rax - jmp rx_i_13 -taken_call_12: - push rax - call rx_i_35 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + andps xmm0, xmm10 + sqrtpd xmm8, xmm0 + mov eax, r8d + xor eax, 096dc67c9h + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm8 -rx_i_13: ;FPSUB +rx_i_13: ;FPADD dec ebx jz rx_finish xor r12, 061c0d34dh @@ -253,10 +250,10 @@ rx_body_13: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm3 + addpd xmm0, xmm3 movaps xmm9, xmm0 -rx_i_14: ;SHR_64 +rx_i_14: ;XOR_64 dec ebx jz rx_finish xor r10, 0e761d1beh @@ -268,7 +265,7 @@ rx_body_14: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - shr rax, 4 + xor rax, r9 mov rcx, rax mov eax, r10d xor eax, 03c1a72f8h @@ -392,7 +389,7 @@ rx_body_20: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm7 -rx_i_21: ;FPADD +rx_i_21: ;ROR_64 dec ebx jz rx_finish xor r8, 0977f0284h @@ -403,11 +400,16 @@ rx_i_21: ;FPADD rx_body_21: xor rbp, rcx and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm9 - movaps xmm7, xmm0 + mov rax, qword ptr [rsi+rcx*8] + mov rcx, r9 + ror rax, cl + mov rcx, rax + mov eax, r15d + xor eax, 0db5e0aafh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx -rx_i_22: ;ADD_32 +rx_i_22: ;ADD_64 dec ebx jz rx_finish xor r13, 080bdfefah @@ -419,7 +421,7 @@ rx_body_22: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add eax, r8d + add rax, r8 mov rcx, rax mov eax, r10d xor eax, 0cfa09799h @@ -440,7 +442,7 @@ rx_body_23: imul rax, r11 mov r8, rax -rx_i_24: ;IMULH_64 +rx_i_24: ;IMUL_32 dec ebx jz rx_finish xor r8, 070d3b8c7h @@ -452,9 +454,9 @@ rx_body_24: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - imul rcx - mov rax, rdx + movsxd rcx, eax + movsxd rax, r15d + imul rax, rcx mov rcx, rax mov eax, r15d xor eax, 099b77a68h @@ -520,7 +522,7 @@ rx_body_27: andps 
xmm0, xmm1 movaps xmm6, xmm0 -rx_i_28: ;XOR_64 +rx_i_28: ;AND_32 dec ebx jz rx_finish xor r13, 067df757eh @@ -531,10 +533,10 @@ rx_i_28: ;XOR_64 rx_body_28: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor rax, r13 + and eax, r13d mov r14, rax -rx_i_29: ;SUB_64 +rx_i_29: ;ADD_64 dec ebx jz rx_finish xor r12, 0be2e7c42h @@ -545,7 +547,7 @@ rx_i_29: ;SUB_64 rx_body_29: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - sub rax, 1944166515 + add rax, 1944166515 mov r14, rax rx_i_30: ;FPADD @@ -562,7 +564,7 @@ rx_body_30: addpd xmm0, xmm3 movaps xmm7, xmm0 -rx_i_31: ;FPADD +rx_i_31: ;ROR_64 dec ebx jz rx_finish xor r14, 0d352ce37h @@ -573,15 +575,12 @@ rx_i_31: ;FPADD rx_body_31: xor rbp, rcx and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm3 - movaps xmm6, xmm0 - mov eax, r14d - xor eax, 01e2da792h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm6 + mov rax, qword ptr [rsi+rcx*8] + mov rcx, r11 + ror rax, cl + mov r14, rax -rx_i_32: ;XOR_64 +rx_i_32: ;AND_32 dec ebx jz rx_finish xor r12, 0a1f248dah @@ -592,10 +591,10 @@ rx_i_32: ;XOR_64 rx_body_32: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor rax, -1936869641 + and eax, -1936869641 mov r9, rax -rx_i_33: ;MULH_64 +rx_i_33: ;MUL_64 dec ebx jz rx_finish xor r9, 0554720fch @@ -606,9 +605,7 @@ rx_i_33: ;MULH_64 rx_body_33: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - mul rcx - mov rax, rdx + imul rax, r15 mov r12, rax rx_i_34: ;CALL @@ -631,7 +628,7 @@ taken_call_34: push rax call rx_i_108 -rx_i_35: ;RET +rx_i_35: ;CALL dec ebx jz rx_finish xor r15, 05ef1be79h @@ -642,13 +639,13 @@ rx_i_35: ;RET rx_body_35: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_35 - xor rax, qword ptr [rsp + 8] - mov r8, rax - ret 8 -not_taken_ret_35: + cmp r9d, -2040787098 + js short taken_call_35 mov r8, rax + jmp rx_i_36 +taken_call_35: + push rax + call rx_i_58 rx_i_36: ;FPMUL dec ebx @@ -699,7 +696,7 @@ rx_body_38: sub rax, r14 mov r10, rax -rx_i_39: 
;ADD_32 +rx_i_39: ;ADD_64 dec ebx jz rx_finish xor r14, 02c1f1eb0h @@ -711,10 +708,10 @@ rx_body_39: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add eax, r14d + add rax, r14 mov r14, rax -rx_i_40: ;RET +rx_i_40: ;CALL dec ebx jz rx_finish xor r10, 068fd9009h @@ -725,23 +722,19 @@ rx_i_40: ;RET rx_body_40: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_40 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r9d - xor eax, 0b2a27eceh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_40: + cmp r12d, -1297973554 + js short taken_call_40 mov rcx, rax mov eax, r9d xor eax, 0b2a27eceh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx + jmp rx_i_41 +taken_call_40: + push rax + call rx_i_90 -rx_i_41: ;CALL +rx_i_41: ;JUMP dec ebx jz rx_finish xor r9, 037a30933h @@ -752,13 +745,9 @@ rx_i_41: ;CALL rx_body_41: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r14d, -1070581824 - jo short taken_call_41 mov r9, rax - jmp rx_i_42 -taken_call_41: - push rax - call rx_i_127 + cmp r14d, -1070581824 + jo rx_i_127 rx_i_42: ;FPADD dec ebx @@ -775,7 +764,7 @@ rx_body_42: addpd xmm0, xmm6 movaps xmm6, xmm0 -rx_i_43: ;SUB_64 +rx_i_43: ;ADD_32 dec ebx jz rx_finish xor r12, 02b2a2eech @@ -786,14 +775,14 @@ rx_i_43: ;SUB_64 rx_body_43: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, 1693705407 + add eax, 1693705407 mov rcx, rax mov eax, r11d xor eax, 064f3e4bfh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_44: ;ROL_64 +rx_i_44: ;SAR_64 dec ebx jz rx_finish xor r11, 0685817abh @@ -805,7 +794,7 @@ rx_body_44: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 - rol rax, cl + sar rax, cl mov r15, rax rx_i_45: ;FPSUB @@ -841,7 +830,7 @@ rx_body_46: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_47: ;CALL +rx_i_47: ;JUMP dec ebx jz rx_finish xor r12, 05ba232c6h @@ -853,19 +842,15 @@ rx_body_47: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp r10d, 119251505 - jbe 
short taken_call_47 mov rcx, rax mov eax, r13d xor eax, 071ba231h and eax, 32767 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_48 -taken_call_47: - push rax - call rx_i_131 + cmp r10d, 119251505 + jbe rx_i_131 -rx_i_48: ;FPSQRT +rx_i_48: ;FPDIV dec ebx jz rx_finish xor r8, 0aaed618fh @@ -876,8 +861,11 @@ rx_i_48: ;FPSQRT rx_body_48: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - andps xmm0, xmm10 - sqrtpd xmm9, xmm0 + divpd xmm0, xmm3 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm9, xmm0 mov eax, r9d xor eax, 020e5d9e9h and eax, 2047 @@ -897,7 +885,7 @@ rx_body_49: subpd xmm0, xmm3 movaps xmm5, xmm0 -rx_i_50: ;OR_32 +rx_i_50: ;AND_64 dec ebx jz rx_finish xor r9, 0da3e4842h @@ -909,7 +897,7 @@ rx_body_50: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or eax, r10d + and rax, r10 mov rcx, rax mov eax, r15d xor eax, 06ac56a2ah @@ -930,7 +918,7 @@ rx_body_51: sub rax, 419241919 mov r15, rax -rx_i_52: ;CALL +rx_i_52: ;FPSQRT dec ebx jz rx_finish xor r11, 0fa88f48bh @@ -940,18 +928,9 @@ rx_i_52: ;CALL call rx_read_l2 rx_body_52: and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] - cmp r13d, -534426193 - js short taken_call_52 - mov rcx, rax - mov eax, r15d - xor eax, 0e0254dafh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_53 -taken_call_52: - push rax - call rx_i_94 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + andps xmm0, xmm10 + sqrtpd xmm7, xmm0 rx_i_53: ;RET dec ebx @@ -972,7 +951,7 @@ rx_body_53: not_taken_ret_53: mov r13, rax -rx_i_54: ;IMULH_64 +rx_i_54: ;IMUL_32 dec ebx jz rx_finish xor r11, 060638de0h @@ -983,9 +962,9 @@ rx_i_54: ;IMULH_64 rx_body_54: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, 282209221 - imul rcx - mov rax, rdx + movsxd rcx, eax + mov rax, 282209221 + imul rax, rcx mov rcx, rax mov eax, r12d xor eax, 010d22bc5h @@ -1013,7 +992,7 @@ rx_body_55: and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm3 -rx_i_56: ;AND_64 +rx_i_56: ;IMULH_64 dec ebx jz rx_finish xor r14, 0f1456b8eh @@ 
-1024,14 +1003,16 @@ rx_i_56: ;AND_64 rx_body_56: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and rax, r15 + mov rcx, r15 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r8d xor eax, 0fcf95491h and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_57: ;MUL_64 +rx_i_57: ;SUB_32 dec ebx jz rx_finish xor r9, 010dc4571h @@ -1042,14 +1023,14 @@ rx_i_57: ;MUL_64 rx_body_57: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - imul rax, r14 + sub eax, r14d mov rcx, rax mov eax, r15d xor eax, 0a426387h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_58: ;IDIV_64 +rx_i_58: ;IMUL_32 dec ebx jz rx_finish xor r14, 0bcec0ebah @@ -1060,21 +1041,9 @@ rx_i_58: ;IDIV_64 rx_body_58: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov edx, r13d - cmp edx, -1 - jne short safe_idiv_58 - mov rcx, rax - rol rcx, 1 - dec rcx - jz short result_idiv_58 -safe_idiv_58: - mov ecx, 1 - test edx, edx - cmovne ecx, edx - movsxd rcx, ecx - cqo - idiv rcx -result_idiv_58: + movsxd rcx, eax + movsxd rax, r13d + imul rax, rcx mov r8, rax rx_i_59: ;FPSUB @@ -1091,7 +1060,7 @@ rx_body_59: subpd xmm0, xmm8 movaps xmm7, xmm0 -rx_i_60: ;RET +rx_i_60: ;CALL dec ebx jz rx_finish xor r15, 03de14d1eh @@ -1102,23 +1071,19 @@ rx_i_60: ;RET rx_body_60: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_60 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r11d - xor eax, 07bb60f45h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_60: + cmp r11d, 2075529029 + jno short taken_call_60 mov rcx, rax mov eax, r11d xor eax, 07bb60f45h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx + jmp rx_i_61 +taken_call_60: + push rax + call rx_i_116 -rx_i_61: ;CALL +rx_i_61: ;JUMP dec ebx jz rx_finish xor r13, 05058ce64h @@ -1129,13 +1094,9 @@ rx_i_61: ;CALL rx_body_61: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r15d, 1933164545 - jns short taken_call_61 mov r11, rax - jmp rx_i_62 -taken_call_61: - push rax - call rx_i_120 + cmp r15d, 1933164545 + 
jns rx_i_120 rx_i_62: ;FPSUB dec ebx @@ -1183,7 +1144,7 @@ rx_body_64: sub rax, r15 mov r9, rax -rx_i_65: ;CALL +rx_i_65: ;JUMP dec ebx jz rx_finish xor r13, 07b366ce6h @@ -1194,15 +1155,11 @@ rx_i_65: ;CALL rx_body_65: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r8d, 1498056607 - js short taken_call_65 mov r11, rax - jmp rx_i_66 -taken_call_65: - push rax - call rx_i_129 + cmp r8d, 1498056607 + js rx_i_129 -rx_i_66: ;FPSQRT +rx_i_66: ;FPDIV dec ebx jz rx_finish xor r15, 015a1b689h @@ -1214,14 +1171,17 @@ rx_body_66: xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - andps xmm0, xmm10 - sqrtpd xmm9, xmm0 + divpd xmm0, xmm3 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm9, xmm0 mov eax, r9d xor eax, 07305e78h and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm9 -rx_i_67: ;CALL +rx_i_67: ;JUMP dec ebx jz rx_finish xor r14, 088393ba0h @@ -1232,13 +1192,9 @@ rx_i_67: ;CALL rx_body_67: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r13d, 2031541081 - jns short taken_call_67 mov r9, rax - jmp rx_i_68 -taken_call_67: - push rax - call rx_i_79 + cmp r13d, 2031541081 + jns rx_i_79 rx_i_68: ;FPADD dec ebx @@ -1273,7 +1229,7 @@ rx_body_69: addpd xmm0, xmm5 movaps xmm8, xmm0 -rx_i_70: ;MULH_64 +rx_i_70: ;MUL_64 dec ebx jz rx_finish xor r8, 0bbbec3fah @@ -1285,9 +1241,7 @@ rx_body_70: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r9 - mul rcx - mov rax, rdx + imul rax, r9 mov r13, rax rx_i_71: ;FPMUL @@ -1307,7 +1261,7 @@ rx_body_71: andps xmm0, xmm1 movaps xmm7, xmm0 -rx_i_72: ;CALL +rx_i_72: ;JUMP dec ebx jz rx_finish xor r13, 0f4e51e28h @@ -1318,19 +1272,15 @@ rx_i_72: ;CALL rx_body_72: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r9d, -631091751 - jno short taken_call_72 mov rcx, rax mov eax, r11d xor eax, 0da624dd9h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_73 -taken_call_72: - push rax - call rx_i_191 + cmp r9d, -631091751 + jno rx_i_191 -rx_i_73: ;FPROUND +rx_i_73: ;FPDIV 
dec ebx jz rx_finish xor r12, 0c24ddbd4h @@ -1340,17 +1290,12 @@ rx_i_73: ;FPROUND call rx_read_l2 rx_body_73: and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] - mov rcx, rax - shl eax, 13 - and eax, 24576 - or eax, 40896 - mov dword ptr [rsp - 8], eax - ldmxcsr dword ptr [rsp - 8] - mov eax, r10d - xor eax, 040624270h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + divpd xmm0, xmm3 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm2, xmm0 rx_i_74: ;MUL_64 dec ebx @@ -1371,7 +1316,7 @@ rx_body_74: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_75: ;RET +rx_i_75: ;CALL dec ebx jz rx_finish xor r14, 03bcc02e3h @@ -1382,13 +1327,13 @@ rx_i_75: ;RET rx_body_75: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_75 - xor rax, qword ptr [rsp + 8] - mov r13, rax - ret 8 -not_taken_ret_75: + cmp r11d, -1160798683 + jo short taken_call_75 mov r13, rax + jmp rx_i_76 +taken_call_75: + push rax + call rx_i_202 rx_i_76: ;FPADD dec ebx @@ -1435,7 +1380,7 @@ not_taken_ret_77: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_78: ;MUL_32 +rx_i_78: ;MULH_64 dec ebx jz rx_finish xor r9, 0edeca680h @@ -1446,12 +1391,12 @@ rx_i_78: ;MUL_32 rx_body_78: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r8d - imul rax, rcx + mov rcx, r8 + mul rcx + mov rax, rdx mov r15, rax -rx_i_79: ;RET +rx_i_79: ;CALL dec ebx jz rx_finish xor r11, 0fbdddcb5h @@ -1462,23 +1407,19 @@ rx_i_79: ;RET rx_body_79: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_79 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r11d - xor eax, 06b4a7b43h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_79: + cmp r13d, 1800043331 + jbe short taken_call_79 mov rcx, rax mov eax, r11d xor eax, 06b4a7b43h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx + jmp rx_i_80 +taken_call_79: + push rax + call rx_i_93 -rx_i_80: ;FPADD 
+rx_i_80: ;ROR_64 dec ebx jz rx_finish xor r13, 09cec97a1h @@ -1488,11 +1429,15 @@ rx_i_80: ;FPADD call rx_read_l2 rx_body_80: and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm3 - movaps xmm3, xmm0 + mov rax, qword ptr [rsi+rcx*8] + ror rax, 4 + mov rcx, rax + mov eax, r11d + xor eax, 01a681d13h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx -rx_i_81: ;OR_64 +rx_i_81: ;AND_64 dec ebx jz rx_finish xor r15, 078228167h @@ -1503,10 +1448,10 @@ rx_i_81: ;OR_64 rx_body_81: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or rax, r13 + and rax, r13 mov r8, rax -rx_i_82: ;CALL +rx_i_82: ;JUMP dec ebx jz rx_finish xor r11, 078cae1ffh @@ -1517,19 +1462,15 @@ rx_i_82: ;CALL rx_body_82: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r12d, -68969733 - jo short taken_call_82 mov rcx, rax mov eax, r10d xor eax, 0fbe39afbh and eax, 32767 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_83 -taken_call_82: - push rax - call rx_i_145 + cmp r12d, -68969733 + jo rx_i_145 -rx_i_83: ;AND_64 +rx_i_83: ;IMULH_64 dec ebx jz rx_finish xor r10, 0d9b6a533h @@ -1540,10 +1481,12 @@ rx_i_83: ;AND_64 rx_body_83: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and rax, r10 + mov rcx, r10 + imul rcx + mov rax, rdx mov r12, rax -rx_i_84: ;ROR_64 +rx_i_84: ;SAR_64 dec ebx jz rx_finish xor r15, 0e9e75336h @@ -1555,7 +1498,7 @@ rx_body_84: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 - ror rax, cl + sar rax, cl mov rcx, rax mov eax, r13d xor eax, 0ec5c52e6h @@ -1576,7 +1519,7 @@ rx_body_85: imul rax, r8 mov r10, rax -rx_i_86: ;OR_64 +rx_i_86: ;AND_64 dec ebx jz rx_finish xor r11, 04386e368h @@ -1587,14 +1530,14 @@ rx_i_86: ;OR_64 rx_body_86: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or rax, r8 + and rax, r8 mov rcx, rax mov eax, r12d xor eax, 0a90410e4h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_87: ;SUB_64 +rx_i_87: ;ADD_32 dec ebx jz rx_finish xor r9, 0d75a0ecfh @@ -1606,10 +1549,10 @@ rx_body_87: xor rbp, rcx and ecx, 2047 mov rax, qword ptr 
[rsi+rcx*8] - sub rax, r12 + add eax, r12d mov r8, rax -rx_i_88: ;FPADD +rx_i_88: ;ROR_64 dec ebx jz rx_finish xor r9, 031bb7f7ah @@ -1620,13 +1563,10 @@ rx_i_88: ;FPADD rx_body_88: xor rbp, rcx and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm6 - movaps xmm9, xmm0 - mov eax, r9d - xor eax, 0c149906eh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm9 + mov rax, qword ptr [rsi+rcx*8] + mov rcx, r14 + ror rax, cl + mov r9, rax rx_i_89: ;MUL_64 dec ebx @@ -1678,7 +1618,7 @@ rx_body_91: andps xmm0, xmm1 movaps xmm4, xmm0 -rx_i_92: ;CALL +rx_i_92: ;JUMP dec ebx jz rx_finish xor r8, 0729260e1h @@ -1689,13 +1629,9 @@ rx_i_92: ;CALL rx_body_92: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp r14d, 1288893603 - jge short taken_call_92 mov r12, rax - jmp rx_i_93 -taken_call_92: - push rax - call rx_i_170 + cmp r14d, 1288893603 + jge rx_i_170 rx_i_93: ;FPADD dec ebx @@ -1716,7 +1652,7 @@ rx_body_93: and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm2 -rx_i_94: ;RET +rx_i_94: ;CALL dec ebx jz rx_finish xor r13, 0ea326630h @@ -1728,13 +1664,13 @@ rx_body_94: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_94 - xor rax, qword ptr [rsp + 8] - mov r8, rax - ret 8 -not_taken_ret_94: + cmp r13d, -343122976 + js short taken_call_94 mov r8, rax + jmp rx_i_95 +taken_call_94: + push rax + call rx_i_157 rx_i_95: ;MUL_64 dec ebx @@ -1754,7 +1690,7 @@ rx_body_95: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_96: ;IMUL_32 +rx_i_96: ;MUL_32 dec ebx jz rx_finish xor r11, 04f912ef8h @@ -1766,12 +1702,12 @@ rx_body_96: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, -1354397081 + mov ecx, eax + mov eax, -1354397081 imul rax, rcx mov r11, rax -rx_i_97: ;FPSQRT +rx_i_97: ;FPDIV dec ebx jz rx_finish xor r15, 0acc45b3bh @@ -1782,8 +1718,11 @@ rx_i_97: ;FPSQRT rx_body_97: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - andps xmm0, xmm10 - sqrtpd xmm5, xmm0 + divpd 
xmm0, xmm9 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm5, xmm0 mov eax, r13d xor eax, 0c477e850h and eax, 2047 @@ -1803,7 +1742,7 @@ rx_body_98: sub rax, r15 mov r14, rax -rx_i_99: ;FPDIV +rx_i_99: ;FPMUL dec ebx jz rx_finish xor r9, 0841b2984h @@ -1814,7 +1753,7 @@ rx_i_99: ;FPDIV rx_body_99: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm6 + mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 @@ -1852,7 +1791,7 @@ rx_body_101: sub rax, r8 mov r11, rax -rx_i_102: ;FPDIV +rx_i_102: ;FPMUL dec ebx jz rx_finish xor r10, 0e50bf07ah @@ -1863,7 +1802,7 @@ rx_i_102: ;FPDIV rx_body_102: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm3 + mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 @@ -1887,7 +1826,7 @@ rx_body_103: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_104: ;IMULH_64 +rx_i_104: ;IMUL_32 dec ebx jz rx_finish xor r11, 075deaf71h @@ -1898,16 +1837,16 @@ rx_i_104: ;IMULH_64 rx_body_104: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, -1913070089 - imul rcx - mov rax, rdx + movsxd rcx, eax + mov rax, -1913070089 + imul rax, rcx mov rcx, rax mov eax, r15d xor eax, 08df8ddf7h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_105: ;MUL_32 +rx_i_105: ;MULH_64 dec ebx jz rx_finish xor r13, 036a51f72h @@ -1919,9 +1858,9 @@ rx_body_105: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r15d - imul rax, rcx + mov rcx, r15 + mul rcx + mov rax, rdx mov rcx, rax mov eax, r14d xor eax, 09c8724edh @@ -1949,7 +1888,7 @@ rx_body_106: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm4 -rx_i_107: ;CALL +rx_i_107: ;JUMP dec ebx jz rx_finish xor r12, 0f1d2e50h @@ -1960,19 +1899,15 @@ rx_i_107: ;CALL rx_body_107: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r11d, 1917037441 - jl short taken_call_107 mov rcx, rax mov eax, r14d xor eax, 07243ab81h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_108 
-taken_call_107: - push rax - call rx_i_143 + cmp r11d, 1917037441 + jl rx_i_143 -rx_i_108: ;FPDIV +rx_i_108: ;FPMUL dec ebx jz rx_finish xor r9, 07327ba60h @@ -1983,7 +1918,7 @@ rx_i_108: ;FPDIV rx_body_108: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm5 + mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 @@ -1993,7 +1928,7 @@ rx_body_108: and eax, 32767 movhpd qword ptr [rsi + rax * 8], xmm9 -rx_i_109: ;FPADD +rx_i_109: ;ROR_64 dec ebx jz rx_finish xor r15, 0594e37deh @@ -2004,11 +1939,16 @@ rx_i_109: ;FPADD rx_body_109: xor rbp, rcx and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm2 - movaps xmm3, xmm0 + mov rax, qword ptr [rsi+rcx*8] + mov rcx, r10 + ror rax, cl + mov rcx, rax + mov eax, r11d + xor eax, 094ab5a5ch + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx -rx_i_110: ;ROL_64 +rx_i_110: ;SHR_64 dec ebx jz rx_finish xor r9, 04cdf5ebah @@ -2020,14 +1960,14 @@ rx_body_110: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 - rol rax, cl + shr rax, cl mov rcx, rax mov eax, r14d xor eax, 0ec68532fh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_111: ;RET +rx_i_111: ;CALL dec ebx jz rx_finish xor r8, 02e16c97ch @@ -2038,21 +1978,17 @@ rx_i_111: ;RET rx_body_111: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_111 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r12d - xor eax, 05d237d0bh - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_111: + cmp r14d, 1562606859 + jge short taken_call_111 mov rcx, rax mov eax, r12d xor eax, 05d237d0bh and eax, 32767 mov qword ptr [rsi + rax * 8], rcx + jmp rx_i_112 +taken_call_111: + push rax + call rx_i_212 rx_i_112: ;SUB_64 dec ebx @@ -2072,7 +2008,7 @@ rx_body_112: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_113: ;MULH_64 +rx_i_113: ;MUL_64 dec ebx jz rx_finish xor r10, 07a4f8cbbh @@ -2083,12 +2019,10 @@ rx_i_113: ;MULH_64 rx_body_113: and ecx, 2047 mov rax, 
qword ptr [rsi+rcx*8] - mov rcx, r9 - mul rcx - mov rax, rdx + imul rax, r9 mov r13, rax -rx_i_114: ;IMULH_64 +rx_i_114: ;IMUL_32 dec ebx jz rx_finish xor r13, 06e83e2cdh @@ -2099,12 +2033,12 @@ rx_i_114: ;IMULH_64 rx_body_114: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - imul rcx - mov rax, rdx + movsxd rcx, eax + movsxd rax, r15d + imul rax, rcx mov r14, rax -rx_i_115: ;OR_64 +rx_i_115: ;IDIV_64 dec ebx jz rx_finish xor r14, 0336c980eh @@ -2115,10 +2049,24 @@ rx_i_115: ;OR_64 rx_body_115: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - or rax, r10 + mov edx, r10d + cmp edx, -1 + jne short safe_idiv_115 + mov rcx, rax + rol rcx, 1 + dec rcx + jz short result_idiv_115 +safe_idiv_115: + mov ecx, 1 + test edx, edx + cmovne ecx, edx + movsxd rcx, ecx + cqo + idiv rcx +result_idiv_115: mov r14, rax -rx_i_116: ;IMULH_64 +rx_i_116: ;IMUL_32 dec ebx jz rx_finish xor r10, 0d122702eh @@ -2129,16 +2077,16 @@ rx_i_116: ;IMULH_64 rx_body_116: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, -1850776691 - imul rcx - mov rax, rdx + movsxd rcx, eax + mov rax, -1850776691 + imul rax, rcx mov rcx, rax mov eax, r8d xor eax, 091af638dh and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_117: ;AND_64 +rx_i_117: ;IMULH_64 dec ebx jz rx_finish xor r11, 015f2012bh @@ -2149,7 +2097,9 @@ rx_i_117: ;AND_64 rx_body_117: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and rax, -1205826972 + mov rcx, -1205826972 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r15d xor eax, 0b8208a64h @@ -2213,7 +2163,7 @@ rx_body_121: subpd xmm0, xmm5 movaps xmm8, xmm0 -rx_i_122: ;RET +rx_i_122: ;CALL dec ebx jz rx_finish xor r10, 04e0dbd40h @@ -2224,23 +2174,19 @@ rx_i_122: ;RET rx_body_122: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_122 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r14d - xor eax, 078f6ec29h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_122: + cmp r11d, 2029448233 + jo short 
taken_call_122 mov rcx, rax mov eax, r14d xor eax, 078f6ec29h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx + jmp rx_i_123 +taken_call_122: + push rax + call rx_i_192 -rx_i_123: ;ADD_32 +rx_i_123: ;ADD_64 dec ebx jz rx_finish xor r13, 073e9f58ah @@ -2251,10 +2197,10 @@ rx_i_123: ;ADD_32 rx_body_123: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add eax, r15d + add rax, r15 mov r13, rax -rx_i_124: ;CALL +rx_i_124: ;JUMP dec ebx jz rx_finish xor r12, 0e3fa3670h @@ -2265,17 +2211,13 @@ rx_i_124: ;CALL rx_body_124: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp r11d, 1719505436 - jns short taken_call_124 mov rcx, rax mov eax, r11d xor eax, 0667d921ch and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_125 -taken_call_124: - push rax - call rx_i_237 + cmp r11d, 1719505436 + jns rx_i_237 rx_i_125: ;MUL_32 dec ebx @@ -2294,7 +2236,7 @@ rx_body_125: imul rax, rcx mov r14, rax -rx_i_126: ;FPDIV +rx_i_126: ;FPMUL dec ebx jz rx_finish xor r8, 01feb5264h @@ -2305,7 +2247,7 @@ rx_i_126: ;FPDIV rx_body_126: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm6 + mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 @@ -2342,7 +2284,7 @@ rx_body_128: imul rax, r9 mov r9, rax -rx_i_129: ;CALL +rx_i_129: ;JUMP dec ebx jz rx_finish xor r9, 081918b4ch @@ -2353,15 +2295,11 @@ rx_i_129: ;CALL rx_body_129: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r13d, -590624856 - jge short taken_call_129 mov r9, rax - jmp rx_i_130 -taken_call_129: - push rax - call rx_i_154 + cmp r13d, -590624856 + jge rx_i_154 -rx_i_130: ;OR_64 +rx_i_130: ;DIV_64 dec ebx jz rx_finish xor r9, 077c3b332h @@ -2372,7 +2310,9 @@ rx_i_130: ;OR_64 rx_body_130: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or rax, -281794782 + mov ecx, -281794782 + xor edx, edx + div rcx mov rcx, rax mov eax, r11d xor eax, 0ef342722h @@ -2420,7 +2360,7 @@ rx_body_132: addpd xmm0, xmm6 movaps xmm7, xmm0 -rx_i_133: ;XOR_64 +rx_i_133: ;OR_64 dec ebx jz rx_finish xor r14, 0822f8b60h @@ 
-2431,7 +2371,7 @@ rx_i_133: ;XOR_64 rx_body_133: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor rax, -1000526796 + or rax, -1000526796 mov rcx, rax mov eax, r15d xor eax, 0c45d2c34h @@ -2469,7 +2409,7 @@ rx_body_135: andps xmm0, xmm1 movaps xmm8, xmm0 -rx_i_136: ;FPSQRT +rx_i_136: ;FPDIV dec ebx jz rx_finish xor r8, 01ae56e03h @@ -2480,14 +2420,17 @@ rx_i_136: ;FPSQRT rx_body_136: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - andps xmm0, xmm10 - sqrtpd xmm5, xmm0 + divpd xmm0, xmm8 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm5, xmm0 mov eax, r13d xor eax, 0efd7799dh and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm5 -rx_i_137: ;ROL_64 +rx_i_137: ;SHR_64 dec ebx jz rx_finish xor r11, 015a24231h @@ -2500,7 +2443,7 @@ rx_body_137: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 - rol rax, cl + shr rax, cl mov r11, rax rx_i_138: ;RET @@ -2576,7 +2519,7 @@ rx_body_141: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm9 -rx_i_142: ;CALL +rx_i_142: ;JUMP dec ebx jz rx_finish xor r11, 0b11a4f2ch @@ -2587,19 +2530,15 @@ rx_i_142: ;CALL rx_body_142: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp r12d, 1365939282 - js short taken_call_142 mov rcx, rax mov eax, r10d xor eax, 0516a9452h and eax, 32767 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_143 -taken_call_142: - push rax - call rx_i_257 + cmp r12d, 1365939282 + js rx_i_257 -rx_i_143: ;IMUL_32 +rx_i_143: ;MUL_32 dec ebx jz rx_finish xor r15, 037f4b5d0h @@ -2610,12 +2549,12 @@ rx_i_143: ;IMUL_32 rx_body_143: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r11d + mov ecx, eax + mov eax, r11d imul rax, rcx mov r9, rax -rx_i_144: ;IMULH_64 +rx_i_144: ;IMUL_32 dec ebx jz rx_finish xor r10, 02e59e00ah @@ -2626,12 +2565,12 @@ rx_i_144: ;IMULH_64 rx_body_144: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r11 - imul rcx - mov rax, rdx + movsxd rcx, eax + movsxd rax, r11d + imul rax, rcx mov r15, rax -rx_i_145: ;IMULH_64 +rx_i_145: 
;IMUL_32 dec ebx jz rx_finish xor r13, 08d5c798h @@ -2642,9 +2581,9 @@ rx_i_145: ;IMULH_64 rx_body_145: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r11 - imul rcx - mov rax, rdx + movsxd rcx, eax + movsxd rax, r11d + imul rax, rcx mov rcx, rax mov eax, r10d xor eax, 0dd491985h @@ -2668,7 +2607,7 @@ rx_body_146: imul rax, rcx mov r10, rax -rx_i_147: ;MULH_64 +rx_i_147: ;MUL_64 dec ebx jz rx_finish xor r13, 03a7df043h @@ -2680,16 +2619,14 @@ rx_body_147: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, 1784404616 - mul rcx - mov rax, rdx + imul rax, rax, 1784404616 mov rcx, rax mov eax, r12d xor eax, 06a5bda88h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_148: ;SUB_64 +rx_i_148: ;ADD_32 dec ebx jz rx_finish xor r10, 0783e5c4eh @@ -2700,7 +2637,7 @@ rx_i_148: ;SUB_64 rx_body_148: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, r14 + add eax, r14d mov rcx, rax mov eax, r10d xor eax, 08c783d2ch @@ -2727,7 +2664,7 @@ rx_body_149: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_150: ;DIV_64 +rx_i_150: ;IMUL_32 dec ebx jz rx_finish xor r9, 01504ca7ah @@ -2738,19 +2675,16 @@ rx_i_150: ;DIV_64 rx_body_150: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, 1 - mov edx, r8d - test edx, edx - cmovne ecx, edx - xor edx, edx - div rcx + movsxd rcx, eax + movsxd rax, r8d + imul rax, rcx mov rcx, rax mov eax, r9d xor eax, 0c854a524h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_151: ;OR_32 +rx_i_151: ;AND_64 dec ebx jz rx_finish xor r9, 0ea72a7cfh @@ -2761,14 +2695,14 @@ rx_i_151: ;OR_32 rx_body_151: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or eax, r13d + and rax, r13 mov rcx, rax mov eax, r11d xor eax, 087aed7f2h and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_152: ;ROR_64 +rx_i_152: ;SAR_64 dec ebx jz rx_finish xor r13, 0ad0e7a88h @@ -2780,10 +2714,10 @@ rx_body_152: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 - ror rax, cl + sar rax, cl mov r10, rax -rx_i_153: ;FPDIV +rx_i_153: 
;FPMUL dec ebx jz rx_finish xor r15, 0fd95ab87h @@ -2794,7 +2728,7 @@ rx_i_153: ;FPDIV rx_body_153: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm2 + mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 @@ -2820,7 +2754,7 @@ rx_body_154: imul rax, rcx mov r10, rax -rx_i_155: ;ROR_64 +rx_i_155: ;ROL_64 dec ebx jz rx_finish xor r11, 0d23f3b78h @@ -2832,7 +2766,7 @@ rx_body_155: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 - ror rax, cl + rol rax, cl mov rcx, rax mov eax, r13d xor eax, 01c5d3ebeh @@ -2883,7 +2817,7 @@ rx_body_158: add rax, 1233402159 mov r10, rax -rx_i_159: ;RET +rx_i_159: ;CALL dec ebx jz rx_finish xor r13, 0952a3abbh @@ -2894,23 +2828,19 @@ rx_i_159: ;RET rx_body_159: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_159 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r13d - xor eax, 0ff7d3697h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_159: + cmp r15d, -8571241 + jbe short taken_call_159 mov rcx, rax mov eax, r13d xor eax, 0ff7d3697h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx + jmp rx_i_160 +taken_call_159: + push rax + call rx_i_181 -rx_i_160: ;SUB_64 +rx_i_160: ;ADD_32 dec ebx jz rx_finish xor r14, 0b1685b90h @@ -2922,14 +2852,14 @@ rx_body_160: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, 1518778665 + add eax, 1518778665 mov rcx, rax mov eax, r10d xor eax, 05a86b929h and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_161: ;OR_64 +rx_i_161: ;IDIV_64 dec ebx jz rx_finish xor r15, 0ea992531h @@ -2940,10 +2870,24 @@ rx_i_161: ;OR_64 rx_body_161: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or rax, r14 + mov edx, r14d + cmp edx, -1 + jne short safe_idiv_161 + mov rcx, rax + rol rcx, 1 + dec rcx + jz short result_idiv_161 +safe_idiv_161: + mov ecx, 1 + test edx, edx + cmovne ecx, edx + movsxd rcx, ecx + cqo + idiv rcx +result_idiv_161: mov r8, rax -rx_i_162: ;SAR_64 +rx_i_162: ;SHL_64 
dec ebx jz rx_finish xor r9, 01fd57a4ah @@ -2955,10 +2899,10 @@ rx_body_162: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 - sar rax, cl + shl rax, cl mov r13, rax -rx_i_163: ;SUB_64 +rx_i_163: ;ADD_32 dec ebx jz rx_finish xor r12, 0e3486c0ah @@ -2970,14 +2914,14 @@ rx_body_163: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - sub rax, -2101130488 + add eax, -2101130488 mov rcx, rax mov eax, r14d xor eax, 082c34b08h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_164: ;MUL_32 +rx_i_164: ;MULH_64 dec ebx jz rx_finish xor r12, 01f0c2737h @@ -2989,9 +2933,9 @@ rx_body_164: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r9d - imul rax, rcx + mov rcx, r9 + mul rcx + mov rax, rdx mov rcx, rax mov eax, r13d xor eax, 09aa6da19h @@ -3025,7 +2969,7 @@ not_taken_ret_165: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_166: ;ROL_64 +rx_i_166: ;SHR_64 dec ebx jz rx_finish xor r9, 0fe684081h @@ -3037,7 +2981,7 @@ rx_body_166: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 - rol rax, cl + shr rax, cl mov rcx, rax mov eax, r13d xor eax, 0bb67f8abh @@ -3065,7 +3009,7 @@ rx_body_167: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm2 -rx_i_168: ;FPSQRT +rx_i_168: ;FPDIV dec ebx jz rx_finish xor r12, 071b15effh @@ -3076,10 +3020,13 @@ rx_i_168: ;FPSQRT rx_body_168: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - andps xmm0, xmm10 - sqrtpd xmm7, xmm0 + divpd xmm0, xmm5 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm7, xmm0 -rx_i_169: ;RET +rx_i_169: ;CALL dec ebx jz rx_finish xor r11, 072790347h @@ -3090,23 +3037,19 @@ rx_i_169: ;RET rx_body_169: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_169 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r14d - xor eax, 0b353bf8dh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_169: + cmp r10d, -1286357107 + jbe short taken_call_169 mov rcx, rax mov eax, 
r14d xor eax, 0b353bf8dh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx + jmp rx_i_170 +taken_call_169: + push rax + call rx_i_197 -rx_i_170: ;CALL +rx_i_170: ;FPSQRT dec ebx jz rx_finish xor r8, 04ae8a020h @@ -3116,16 +3059,15 @@ rx_i_170: ;CALL call rx_read_l2 rx_body_170: and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] - cmp r10d, -1541051751 - jl short taken_call_170 - mov r14, rax - jmp rx_i_171 -taken_call_170: - push rax - call rx_i_204 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + andps xmm0, xmm10 + sqrtpd xmm6, xmm0 + mov eax, r14d + xor eax, 0a4256a99h + and eax, 32767 + movlpd qword ptr [rsi + rax * 8], xmm6 -rx_i_171: ;IMULH_64 +rx_i_171: ;IMUL_32 dec ebx jz rx_finish xor r15, 09901e05bh @@ -3136,9 +3078,9 @@ rx_i_171: ;IMULH_64 rx_body_171: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r12 - imul rcx - mov rax, rdx + movsxd rcx, eax + movsxd rax, r12d + imul rax, rcx mov r12, rax rx_i_172: ;SUB_64 @@ -3174,7 +3116,7 @@ rx_body_173: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_174: ;FPROUND +rx_i_174: ;FPDIV dec ebx jz rx_finish xor r12, 0a025c3dbh @@ -3184,16 +3126,18 @@ rx_i_174: ;FPROUND call rx_read_l1 rx_body_174: and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] - mov rcx, rax - shl eax, 13 - and eax, 24576 - or eax, 40896 - mov dword ptr [rsp - 8], eax - ldmxcsr dword ptr [rsp - 8] - mov r14, rcx + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + divpd xmm0, xmm9 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm6, xmm0 + mov eax, r14d + xor eax, 02be6989fh + and eax, 32767 + movlpd qword ptr [rsi + rax * 8], xmm6 -rx_i_175: ;SAR_64 +rx_i_175: ;XOR_32 dec ebx jz rx_finish xor r13, 08f74c11h @@ -3204,8 +3148,7 @@ rx_i_175: ;SAR_64 rx_body_175: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - sar rax, cl + xor eax, r8d mov r8, rax rx_i_176: ;SUB_64 @@ -3281,7 +3224,7 @@ rx_body_179: addpd xmm0, xmm2 movaps xmm8, xmm0 -rx_i_180: ;XOR_64 +rx_i_180: ;AND_32 dec ebx jz rx_finish xor r15, 01cb3ce1fh @@ -3292,14 
+3235,14 @@ rx_i_180: ;XOR_64 rx_body_180: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - xor rax, 1995308563 + and eax, 1995308563 mov rcx, rax mov eax, r9d xor eax, 076edfe13h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_181: ;RET +rx_i_181: ;CALL dec ebx jz rx_finish xor r10, 023c7845fh @@ -3311,13 +3254,13 @@ rx_body_181: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_181 - xor rax, qword ptr [rsp + 8] - mov r10, rax - ret 8 -not_taken_ret_181: + cmp r12d, -1612576918 + ja short taken_call_181 mov r10, rax + jmp rx_i_182 +taken_call_181: + push rax + call rx_i_211 rx_i_182: ;FPSUB dec ebx @@ -3348,7 +3291,7 @@ rx_body_183: add rax, 137260710 mov r10, rax -rx_i_184: ;SAR_64 +rx_i_184: ;XOR_32 dec ebx jz rx_finish xor r12, 04764cdf7h @@ -3360,10 +3303,10 @@ rx_body_184: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - sar rax, 40 + xor eax, 790123591 mov r12, rax -rx_i_185: ;CALL +rx_i_185: ;JUMP dec ebx jz rx_finish xor r10, 03c41026fh @@ -3374,19 +3317,15 @@ rx_i_185: ;CALL rx_body_185: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r15d, -1510284125 - jbe short taken_call_185 mov rcx, rax mov eax, r9d xor eax, 0a5fae4a3h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_186 -taken_call_185: - push rax - call rx_i_246 + cmp r15d, -1510284125 + jbe rx_i_246 -rx_i_186: ;XOR_32 +rx_i_186: ;OR_64 dec ebx jz rx_finish xor r9, 0cded414bh @@ -3398,14 +3337,14 @@ rx_body_186: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor eax, r15d + or rax, r15 mov rcx, rax mov eax, r10d xor eax, 0b55bfba0h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_187: ;FPDIV +rx_i_187: ;FPMUL dec ebx jz rx_finish xor r13, 05c6d64a8h @@ -3416,7 +3355,7 @@ rx_i_187: ;FPDIV rx_body_187: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm6 + mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 @@ -3437,7 +3376,7 @@ rx_body_188: subpd xmm0, xmm3 movaps 
xmm4, xmm0 -rx_i_189: ;FPROUND +rx_i_189: ;FPDIV dec ebx jz rx_finish xor r11, 0c52741d5h @@ -3447,17 +3386,12 @@ rx_i_189: ;FPROUND call rx_read_l1 rx_body_189: and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] - mov rcx, rax - shl eax, 13 - and eax, 24576 - or eax, 40896 - mov dword ptr [rsp - 8], eax - ldmxcsr dword ptr [rsp - 8] - mov eax, r13d - xor eax, 0e6f1a3b7h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + divpd xmm0, xmm7 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm5, xmm0 rx_i_190: ;RET dec ebx @@ -3478,7 +3412,7 @@ rx_body_190: not_taken_ret_190: mov r13, rax -rx_i_191: ;CALL +rx_i_191: ;FPSQRT dec ebx jz rx_finish xor r15, 0884f3526h @@ -3488,20 +3422,11 @@ rx_i_191: ;CALL call rx_read_l1 rx_body_191: and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] - cmp r11d, 1687119072 - jno short taken_call_191 - mov rcx, rax - mov eax, r14d - xor eax, 0648f64e0h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_192 -taken_call_191: - push rax - call rx_i_275 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + andps xmm0, xmm10 + sqrtpd xmm6, xmm0 -rx_i_192: ;CALL +rx_i_192: ;FPSQRT dec ebx jz rx_finish xor r8, 0d76edad3h @@ -3511,16 +3436,15 @@ rx_i_192: ;CALL call rx_read_l1 rx_body_192: and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] - cmp r14d, -117628864 - jns short taken_call_192 - mov r8, rax - jmp rx_i_193 -taken_call_192: - push rax - call rx_i_305 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + andps xmm0, xmm10 + sqrtpd xmm8, xmm0 + mov eax, r8d + xor eax, 0f8fd2040h + and eax, 32767 + movlpd qword ptr [rsi + rax * 8], xmm8 -rx_i_193: ;MUL_32 +rx_i_193: ;MULH_64 dec ebx jz rx_finish xor r12, 0e9939ach @@ -3531,9 +3455,9 @@ rx_i_193: ;MUL_32 rx_body_193: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r12d - imul rax, rcx + mov rcx, r12 + mul rcx + mov rax, rdx mov rcx, rax mov eax, r15d xor eax, 074e097dch @@ -3561,7 +3485,7 @@ rx_body_194: and eax, 2047 movlpd qword 
ptr [rsi + rax * 8], xmm5 -rx_i_195: ;ROL_64 +rx_i_195: ;SHL_64 dec ebx jz rx_finish xor r10, 09405152ch @@ -3573,10 +3497,10 @@ rx_body_195: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 - rol rax, cl + shl rax, cl mov r9, rax -rx_i_196: ;SUB_64 +rx_i_196: ;ADD_32 dec ebx jz rx_finish xor r8, 0c2a9f41bh @@ -3587,7 +3511,7 @@ rx_i_196: ;SUB_64 rx_body_196: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - sub rax, -1907903895 + add eax, -1907903895 mov rcx, rax mov eax, r13d xor eax, 08e47b269h @@ -3609,7 +3533,7 @@ rx_body_197: imul rax, r15 mov r11, rax -rx_i_198: ;MULH_64 +rx_i_198: ;MUL_64 dec ebx jz rx_finish xor r14, 0c8d95bbbh @@ -3620,16 +3544,14 @@ rx_i_198: ;MULH_64 rx_body_198: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r14 - mul rcx - mov rax, rdx + imul rax, r14 mov rcx, rax mov eax, r8d xor eax, 01149cba0h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_199: ;MULH_64 +rx_i_199: ;MUL_64 dec ebx jz rx_finish xor r13, 050049e2eh @@ -3640,9 +3562,7 @@ rx_i_199: ;MULH_64 rx_body_199: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r10 - mul rcx - mov rax, rdx + imul rax, r10 mov rcx, rax mov eax, r10d xor eax, 0d0e71e9ah @@ -3686,7 +3606,7 @@ rx_body_201: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm4 -rx_i_202: ;FPSUB +rx_i_202: ;FPADD dec ebx jz rx_finish xor r13, 0fa44b04ah @@ -3697,7 +3617,7 @@ rx_i_202: ;FPSUB rx_body_202: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm9 + addpd xmm0, xmm9 movaps xmm5, xmm0 rx_i_203: ;FPSUB @@ -3736,7 +3656,7 @@ rx_body_204: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_205: ;FPDIV +rx_i_205: ;FPMUL dec ebx jz rx_finish xor r14, 094e997c5h @@ -3748,7 +3668,7 @@ rx_body_205: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm8 + mulpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 @@ -3769,7 +3689,7 @@ rx_body_206: subpd xmm0, xmm7 movaps xmm4, xmm0 -rx_i_207: ;AND_32 +rx_i_207: ;IMULH_64 dec ebx 
jz rx_finish xor r9, 039ccdd30h @@ -3781,7 +3701,9 @@ rx_body_207: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and eax, r12d + mov rcx, r12 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r9d xor eax, 012bbcc84h @@ -3802,7 +3724,7 @@ rx_body_208: imul rax, r12 mov r10, rax -rx_i_209: ;SHR_64 +rx_i_209: ;XOR_64 dec ebx jz rx_finish xor r8, 0b84811f1h @@ -3814,7 +3736,7 @@ rx_body_209: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - shr rax, 30 + xor rax, -1016364182 mov rcx, rax mov eax, r12d xor eax, 0c36b836ah @@ -3842,7 +3764,7 @@ rx_body_210: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_211: ;FPADD +rx_i_211: ;ROR_64 dec ebx jz rx_finish xor r12, 0ce533072h @@ -3852,9 +3774,13 @@ rx_i_211: ;FPADD call rx_read_l2 rx_body_211: and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm9 - movaps xmm3, xmm0 + mov rax, qword ptr [rsi+rcx*8] + ror rax, 27 + mov rcx, rax + mov eax, r11d + xor eax, 0212e615h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_212: ;MUL_64 dec ebx @@ -3892,7 +3818,7 @@ rx_body_213: imul rax, rcx mov r14, rax -rx_i_214: ;ROL_64 +rx_i_214: ;SHL_64 dec ebx jz rx_finish xor r9, 0a159f313h @@ -3904,10 +3830,10 @@ rx_body_214: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 - rol rax, cl + shl rax, cl mov r14, rax -rx_i_215: ;SUB_64 +rx_i_215: ;ADD_64 dec ebx jz rx_finish xor r15, 08359265eh @@ -3919,7 +3845,7 @@ rx_body_215: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, r12 + add rax, r12 mov r10, rax rx_i_216: ;MUL_64 @@ -3940,7 +3866,7 @@ rx_body_216: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_217: ;IMUL_32 +rx_i_217: ;MUL_32 dec ebx jz rx_finish xor r8, 040d5b526h @@ -3951,8 +3877,8 @@ rx_i_217: ;IMUL_32 rx_body_217: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r9d + mov ecx, eax + mov eax, r9d imul rax, rcx mov rcx, rax mov eax, r10d @@ -3960,7 +3886,7 @@ rx_body_217: and eax, 2047 mov qword ptr [rsi + rax 
* 8], rcx -rx_i_218: ;CALL +rx_i_218: ;FPSQRT dec ebx jz rx_finish xor r11, 083c0bd93h @@ -3970,16 +3896,15 @@ rx_i_218: ;CALL call rx_read_l2 rx_body_218: and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] - cmp r8d, -585552250 - jge short taken_call_218 - mov r11, rax - jmp rx_i_219 -taken_call_218: - push rax - call rx_i_240 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + andps xmm0, xmm10 + sqrtpd xmm3, xmm0 + mov eax, r11d + xor eax, 0dd192e86h + and eax, 2047 + movhpd qword ptr [rsi + rax * 8], xmm3 -rx_i_219: ;XOR_64 +rx_i_219: ;OR_64 dec ebx jz rx_finish xor r8, 0ca37f668h @@ -3990,7 +3915,7 @@ rx_i_219: ;XOR_64 rx_body_219: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor rax, -740915304 + or rax, -740915304 mov rcx, rax mov eax, r15d xor eax, 0d3d68798h @@ -4017,7 +3942,7 @@ rx_body_220: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_221: ;IMULH_64 +rx_i_221: ;IMUL_32 dec ebx jz rx_finish xor r9, 0a3deb512h @@ -4028,9 +3953,9 @@ rx_i_221: ;IMULH_64 rx_body_221: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - imul rcx - mov rax, rdx + movsxd rcx, eax + movsxd rax, r15d + imul rax, rcx mov rcx, rax mov eax, r11d xor eax, 07feab351h @@ -4077,7 +4002,7 @@ rx_body_223: and eax, 32767 movhpd qword ptr [rsi + rax * 8], xmm2 -rx_i_224: ;SAR_64 +rx_i_224: ;XOR_32 dec ebx jz rx_finish xor r12, 053982440h @@ -4088,15 +4013,14 @@ rx_i_224: ;SAR_64 rx_body_224: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r14 - sar rax, cl + xor eax, r14d mov rcx, rax mov eax, r11d xor eax, 0e500c69dh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_225: ;DIV_64 +rx_i_225: ;IMUL_32 dec ebx jz rx_finish xor r13, 0c558367eh @@ -4107,19 +4031,16 @@ rx_i_225: ;DIV_64 rx_body_225: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov ecx, 1 - mov edx, r10d - test edx, edx - cmovne ecx, edx - xor edx, edx - div rcx + movsxd rcx, eax + movsxd rax, r10d + imul rax, rcx mov rcx, rax mov eax, r12d xor eax, 0fe304a4ah and eax, 32767 mov qword ptr [rsi + rax * 8], 
rcx -rx_i_226: ;CALL +rx_i_226: ;JUMP dec ebx jz rx_finish xor r10, 040139b65h @@ -4130,19 +4051,15 @@ rx_i_226: ;CALL rx_body_226: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r8d, -1752488808 - jno short taken_call_226 mov rcx, rax mov eax, r8d xor eax, 0978b2498h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_227 -taken_call_226: - push rax - call rx_i_328 + cmp r8d, -1752488808 + jno rx_i_328 -rx_i_227: ;FPDIV +rx_i_227: ;FPMUL dec ebx jz rx_finish xor r11, 0fa312dbdh @@ -4153,7 +4070,7 @@ rx_i_227: ;FPDIV rx_body_227: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm7 + mulpd xmm0, xmm7 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 @@ -4163,7 +4080,7 @@ rx_body_227: and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm3 -rx_i_228: ;CALL +rx_i_228: ;FPSQRT dec ebx jz rx_finish xor r11, 0b64246c0h @@ -4173,18 +4090,9 @@ rx_i_228: ;CALL call rx_read_l1 rx_body_228: and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] - cmp r10d, -2099304 - jns short taken_call_228 - mov rcx, rax - mov eax, r15d - xor eax, 0ffdff798h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_229 -taken_call_228: - push rax - call rx_i_283 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + andps xmm0, xmm10 + sqrtpd xmm7, xmm0 rx_i_229: ;IMUL_32 dec ebx @@ -4255,7 +4163,7 @@ not_taken_ret_231: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_232: ;FPDIV +rx_i_232: ;FPMUL dec ebx jz rx_finish xor r15, 09ab46ab3h @@ -4266,13 +4174,13 @@ rx_i_232: ;FPDIV rx_body_232: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm3 + mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm7, xmm0 -rx_i_233: ;CALL +rx_i_233: ;JUMP dec ebx jz rx_finish xor r13, 08eb2cd76h @@ -4284,15 +4192,11 @@ rx_body_233: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r12d, 392389867 - jo short taken_call_233 mov r14, rax - jmp rx_i_234 -taken_call_233: - push rax - call rx_i_268 + cmp r12d, 392389867 + jo rx_i_268 
-rx_i_234: ;FPROUND +rx_i_234: ;FPDIV dec ebx jz rx_finish xor r15, 0ba687578h @@ -4302,19 +4206,14 @@ rx_i_234: ;FPROUND call rx_read_l1 rx_body_234: and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] - mov rcx, rax - shl eax, 13 - and eax, 24576 - or eax, 40896 - mov dword ptr [rsp - 8], eax - ldmxcsr dword ptr [rsp - 8] - mov eax, r12d - xor eax, 04d2e9e7dh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + divpd xmm0, xmm4 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm4, xmm0 -rx_i_235: ;IMUL_32 +rx_i_235: ;MUL_32 dec ebx jz rx_finish xor r13, 0b6cb9ff2h @@ -4325,8 +4224,8 @@ rx_i_235: ;IMUL_32 rx_body_235: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r12d + mov ecx, eax + mov eax, r12d imul rax, rcx mov rcx, rax mov eax, r15d @@ -4348,7 +4247,7 @@ rx_body_236: addpd xmm0, xmm4 movaps xmm3, xmm0 -rx_i_237: ;CALL +rx_i_237: ;JUMP dec ebx jz rx_finish xor r15, 0fab4600h @@ -4359,13 +4258,9 @@ rx_i_237: ;CALL rx_body_237: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp r12d, -121899164 - jge short taken_call_237 mov r11, rax - jmp rx_i_238 -taken_call_237: - push rax - call rx_i_295 + cmp r12d, -121899164 + jge rx_i_295 rx_i_238: ;FPADD dec ebx @@ -4455,7 +4350,7 @@ rx_body_242: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_243: ;XOR_64 +rx_i_243: ;OR_64 dec ebx jz rx_finish xor r12, 0d6c2ce3dh @@ -4466,10 +4361,10 @@ rx_i_243: ;XOR_64 rx_body_243: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor rax, 1198180774 + or rax, 1198180774 mov r14, rax -rx_i_244: ;FPADD +rx_i_244: ;ROR_64 dec ebx jz rx_finish xor r11, 0c6a6248h @@ -4479,11 +4374,16 @@ rx_i_244: ;FPADD call rx_read_l2 rx_body_244: and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm6 - movaps xmm9, xmm0 + mov rax, qword ptr [rsi+rcx*8] + mov rcx, r14 + ror rax, cl + mov rcx, rax + mov eax, r9d + xor eax, 0b4a1fad6h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx 
-rx_i_245: ;XOR_64 +rx_i_245: ;AND_32 dec ebx jz rx_finish xor r13, 084505739h @@ -4495,14 +4395,14 @@ rx_body_245: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor rax, -1546539637 + and eax, -1546539637 mov rcx, rax mov eax, r12d xor eax, 0a3d1ad8bh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_246: ;AND_64 +rx_i_246: ;IMULH_64 dec ebx jz rx_finish xor r15, 027eeaa2eh @@ -4514,10 +4414,12 @@ rx_body_246: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - and rax, r9 + mov rcx, r9 + imul rcx + mov rax, rdx mov r12, rax -rx_i_247: ;IMUL_32 +rx_i_247: ;MUL_32 dec ebx jz rx_finish xor r10, 0c4de0296h @@ -4528,8 +4430,8 @@ rx_i_247: ;IMUL_32 rx_body_247: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r14d + mov ecx, eax + mov eax, r14d imul rax, rcx mov rcx, rax mov eax, r9d @@ -4537,7 +4439,7 @@ rx_body_247: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_248: ;MUL_32 +rx_i_248: ;MULH_64 dec ebx jz rx_finish xor r8, 0649df46fh @@ -4549,9 +4451,9 @@ rx_body_248: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r15d - imul rax, rcx + mov rcx, r15 + mul rcx + mov rax, rdx mov rcx, rax mov eax, r9d xor eax, 07b10fc32h @@ -4579,7 +4481,7 @@ rx_body_249: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_250: ;MUL_64 +rx_i_250: ;SUB_32 dec ebx jz rx_finish xor r13, 083eafe6fh @@ -4590,7 +4492,7 @@ rx_i_250: ;MUL_64 rx_body_250: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - imul rax, r8 + sub eax, r8d mov rcx, rax mov eax, r14d xor eax, 031115b87h @@ -4618,7 +4520,7 @@ rx_body_251: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm4 -rx_i_252: ;ROL_64 +rx_i_252: ;SHL_64 dec ebx jz rx_finish xor r14, 08a75ad41h @@ -4630,7 +4532,7 @@ rx_body_252: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 - rol rax, cl + shl rax, cl mov r14, rax rx_i_253: ;CALL @@ -4656,7 +4558,7 @@ taken_call_253: push rax call rx_i_367 -rx_i_254: ;FPSUB +rx_i_254: ;FPADD dec ebx 
jz rx_finish xor r14, 04cfb709eh @@ -4667,7 +4569,7 @@ rx_i_254: ;FPSUB rx_body_254: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm4 + addpd xmm0, xmm4 movaps xmm8, xmm0 mov eax, r8d xor eax, 0c251872eh @@ -4692,7 +4594,7 @@ rx_body_255: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm6 -rx_i_256: ;MULH_64 +rx_i_256: ;MUL_64 dec ebx jz rx_finish xor r8, 08375472ch @@ -4704,9 +4606,7 @@ rx_body_256: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - mul rcx - mov rax, rdx + imul rax, r15 mov rcx, rax mov eax, r9d xor eax, 0f8942c0h @@ -4781,7 +4681,7 @@ rx_body_260: subpd xmm0, xmm5 movaps xmm9, xmm0 -rx_i_261: ;FPSQRT +rx_i_261: ;FPDIV dec ebx jz rx_finish xor r14, 02346171ch @@ -4793,14 +4693,17 @@ rx_body_261: xor rbp, rcx and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - andps xmm0, xmm10 - sqrtpd xmm3, xmm0 + divpd xmm0, xmm3 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm3, xmm0 mov eax, r11d xor eax, 0745a48e9h and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm3 -rx_i_262: ;OR_32 +rx_i_262: ;AND_64 dec ebx jz rx_finish xor r10, 01c42baa6h @@ -4812,14 +4715,14 @@ rx_body_262: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or eax, r13d + and rax, r13 mov rcx, rax mov eax, r11d xor eax, 0a271ff06h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_263: ;FPDIV +rx_i_263: ;FPMUL dec ebx jz rx_finish xor r11, 0b39b140h @@ -4831,7 +4734,7 @@ rx_body_263: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm8 + mulpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 @@ -4873,7 +4776,7 @@ rx_body_265: and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm2 -rx_i_266: ;RET +rx_i_266: ;CALL dec ebx jz rx_finish xor r13, 03d0a3a89h @@ -4884,15 +4787,15 @@ rx_i_266: ;RET rx_body_266: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_266 - xor rax, qword ptr [rsp + 8] - mov r10, rax - ret 8 
-not_taken_ret_266: + cmp r12d, 136160027 + ja short taken_call_266 mov r10, rax + jmp rx_i_267 +taken_call_266: + push rax + call rx_i_295 -rx_i_267: ;ROR_64 +rx_i_267: ;ROL_64 dec ebx jz rx_finish xor r8, 0c6c7b37h @@ -4903,10 +4806,10 @@ rx_i_267: ;ROR_64 rx_body_267: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - ror rax, 56 + rol rax, 56 mov r11, rax -rx_i_268: ;CALL +rx_i_268: ;JUMP dec ebx jz rx_finish xor r12, 0c2510cebh @@ -4918,15 +4821,11 @@ rx_body_268: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp r15d, -2062812966 - jl short taken_call_268 mov r13, rax - jmp rx_i_269 -taken_call_268: - push rax - call rx_i_381 + cmp r15d, -2062812966 + jl rx_i_381 -rx_i_269: ;ROR_64 +rx_i_269: ;ROL_64 dec ebx jz rx_finish xor r11, 0c80cc899h @@ -4938,7 +4837,7 @@ rx_body_269: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 - ror rax, cl + rol rax, cl mov rcx, rax mov eax, r10d xor eax, 01ba81447h @@ -4982,7 +4881,7 @@ rx_body_271: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_272: ;OR_32 +rx_i_272: ;AND_64 dec ebx jz rx_finish xor r12, 0695a5dd2h @@ -4993,10 +4892,10 @@ rx_i_272: ;OR_32 rx_body_272: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - or eax, r12d + and rax, r12 mov r13, rax -rx_i_273: ;CALL +rx_i_273: ;JUMP dec ebx jz rx_finish xor r9, 0d315e4dch @@ -5008,19 +4907,15 @@ rx_body_273: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r12d, 1670848568 - jl short taken_call_273 mov rcx, rax mov eax, r13d xor eax, 063972038h and eax, 32767 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_274 -taken_call_273: - push rax - call rx_i_372 + cmp r12d, 1670848568 + jl rx_i_372 -rx_i_274: ;FPSUB +rx_i_274: ;FPADD dec ebx jz rx_finish xor r15, 0b66ca7e0h @@ -5031,14 +4926,14 @@ rx_i_274: ;FPSUB rx_body_274: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm4 + addpd xmm0, xmm4 movaps xmm6, xmm0 mov eax, r14d xor eax, 06a2b2b5bh and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm6 -rx_i_275: ;OR_64 
+rx_i_275: ;DIV_64 dec ebx jz rx_finish xor r10, 0788eceb7h @@ -5050,10 +4945,15 @@ rx_body_275: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - or rax, r11 + mov ecx, 1 + mov edx, r11d + test edx, edx + cmovne ecx, edx + xor edx, edx + div rcx mov r13, rax -rx_i_276: ;CALL +rx_i_276: ;JUMP dec ebx jz rx_finish xor r9, 0c6ac5edah @@ -5065,17 +4965,13 @@ rx_body_276: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r11d, -1236180570 - jns short taken_call_276 mov rcx, rax mov eax, r12d xor eax, 0b65161a6h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_277 -taken_call_276: - push rax - call rx_i_404 + cmp r11d, -1236180570 + jns rx_i_404 rx_i_277: ;IMUL_32 dec ebx @@ -5115,7 +5011,7 @@ rx_body_278: and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm4 -rx_i_279: ;FPSUB +rx_i_279: ;FPADD dec ebx jz rx_finish xor r15, 0f1a91458h @@ -5126,14 +5022,14 @@ rx_i_279: ;FPSUB rx_body_279: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm5 + addpd xmm0, xmm5 movaps xmm9, xmm0 mov eax, r9d xor eax, 0475ade01h and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm9 -rx_i_280: ;AND_64 +rx_i_280: ;IMULH_64 dec ebx jz rx_finish xor r12, 066246b43h @@ -5144,7 +5040,9 @@ rx_i_280: ;AND_64 rx_body_280: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - and rax, r11 + mov rcx, r11 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r13d xor eax, 0211aeb00h @@ -5169,7 +5067,7 @@ rx_body_281: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_282: ;SUB_32 +rx_i_282: ;SUB_64 dec ebx jz rx_finish xor r15, 0de1ab603h @@ -5180,10 +5078,10 @@ rx_i_282: ;SUB_32 rx_body_282: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub eax, 1367326224 + sub rax, 1367326224 mov r11, rax -rx_i_283: ;ADD_32 +rx_i_283: ;ADD_64 dec ebx jz rx_finish xor r9, 0df4d084fh @@ -5195,7 +5093,7 @@ rx_body_283: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add eax, -1156732976 + add rax, -1156732976 mov rcx, rax mov eax, r12d xor eax, 0bb0da7d0h @@ 
-5221,7 +5119,7 @@ rx_body_284: and eax, 32767 movlpd qword ptr [rsi + rax * 8], xmm9 -rx_i_285: ;IMUL_32 +rx_i_285: ;MUL_32 dec ebx jz rx_finish xor r8, 09adb333bh @@ -5232,12 +5130,12 @@ rx_i_285: ;IMUL_32 rx_body_285: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r8d + mov ecx, eax + mov eax, r8d imul rax, rcx mov r14, rax -rx_i_286: ;FPADD +rx_i_286: ;ROL_64 dec ebx jz rx_finish xor r14, 082f5e36ch @@ -5247,11 +5145,16 @@ rx_i_286: ;FPADD call rx_read_l1 rx_body_286: and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm9 - movaps xmm7, xmm0 + mov rax, qword ptr [rsi+rcx*8] + mov rcx, r9 + rol rax, cl + mov rcx, rax + mov eax, r15d + xor eax, 0546e75d1h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx -rx_i_287: ;OR_64 +rx_i_287: ;IDIV_64 dec ebx jz rx_finish xor r11, 049547c9ch @@ -5263,7 +5166,21 @@ rx_body_287: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or rax, r15 + mov edx, r15d + cmp edx, -1 + jne short safe_idiv_287 + mov rcx, rax + rol rcx, 1 + dec rcx + jz short result_idiv_287 +safe_idiv_287: + mov ecx, 1 + test edx, edx + cmovne ecx, edx + movsxd rcx, ecx + cqo + idiv rcx +result_idiv_287: mov rcx, rax mov eax, r8d xor eax, 04926c7fah @@ -5288,7 +5205,7 @@ rx_body_288: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_289: ;FPDIV +rx_i_289: ;FPMUL dec ebx jz rx_finish xor r14, 0efef52b5h @@ -5299,7 +5216,7 @@ rx_i_289: ;FPDIV rx_body_289: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm9 + mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 @@ -5346,7 +5263,7 @@ not_taken_ret_291: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_292: ;ROR_64 +rx_i_292: ;ROL_64 dec ebx jz rx_finish xor r13, 05a87cc3dh @@ -5357,10 +5274,10 @@ rx_i_292: ;ROR_64 rx_body_292: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - ror rax, 23 + rol rax, 23 mov r10, rax -rx_i_293: ;FPSUB +rx_i_293: ;FPADD dec ebx jz rx_finish xor r9, 0c61f4279h @@ -5371,7 +5288,7 
@@ rx_i_293: ;FPSUB rx_body_293: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm5 + addpd xmm0, xmm5 movaps xmm8, xmm0 rx_i_294: ;RET @@ -5416,7 +5333,7 @@ rx_body_295: subpd xmm0, xmm8 movaps xmm7, xmm0 -rx_i_296: ;CALL +rx_i_296: ;FPSQRT dec ebx jz rx_finish xor r14, 018738758h @@ -5426,18 +5343,9 @@ rx_i_296: ;CALL call rx_read_l1 rx_body_296: and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] - cmp r9d, -207252278 - jns short taken_call_296 - mov rcx, rax - mov eax, r8d - xor eax, 0f3a594cah - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_297 -taken_call_296: - push rax - call rx_i_395 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + andps xmm0, xmm10 + sqrtpd xmm8, xmm0 rx_i_297: ;ADD_64 dec ebx @@ -5569,7 +5477,7 @@ rx_body_304: imul rax, r15 mov r13, rax -rx_i_305: ;MUL_64 +rx_i_305: ;SUB_32 dec ebx jz rx_finish xor r11, 03c6c62b8h @@ -5580,7 +5488,7 @@ rx_i_305: ;MUL_64 rx_body_305: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, -65873120 + sub eax, -65873120 mov r10, rax rx_i_306: ;ADD_64 @@ -5597,7 +5505,7 @@ rx_body_306: add rax, r15 mov r13, rax -rx_i_307: ;SAR_64 +rx_i_307: ;SHL_64 dec ebx jz rx_finish xor r15, 04c36adb1h @@ -5609,7 +5517,7 @@ rx_body_307: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r8 - sar rax, cl + shl rax, cl mov r10, rax rx_i_308: ;MUL_64 @@ -5626,7 +5534,7 @@ rx_body_308: imul rax, r13 mov r15, rax -rx_i_309: ;IMULH_64 +rx_i_309: ;IMUL_32 dec ebx jz rx_finish xor r9, 090c42304h @@ -5637,9 +5545,9 @@ rx_i_309: ;IMULH_64 rx_body_309: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, -1652850028 - imul rcx - mov rax, rdx + movsxd rcx, eax + mov rax, -1652850028 + imul rax, rcx mov rcx, rax mov eax, r9d xor eax, 09d7b8294h @@ -5684,7 +5592,7 @@ rx_body_311: andps xmm0, xmm1 movaps xmm4, xmm0 -rx_i_312: ;MUL_32 +rx_i_312: ;MULH_64 dec ebx jz rx_finish xor r13, 0b18904cdh @@ -5695,12 +5603,12 @@ rx_i_312: ;MUL_32 rx_body_312: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov 
ecx, eax - mov eax, -1147928648 - imul rax, rcx + mov rcx, -1147928648 + mul rcx + mov rax, rdx mov r10, rax -rx_i_313: ;FPADD +rx_i_313: ;ROR_64 dec ebx jz rx_finish xor r8, 0a0d0befh @@ -5710,9 +5618,13 @@ rx_i_313: ;FPADD call rx_read_l1 rx_body_313: and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm5 - movaps xmm6, xmm0 + mov rax, qword ptr [rsi+rcx*8] + ror rax, 62 + mov rcx, rax + mov eax, r14d + xor eax, 09500d514h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_314: ;IMUL_32 dec ebx @@ -5734,7 +5646,7 @@ rx_body_314: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_315: ;SHR_64 +rx_i_315: ;XOR_64 dec ebx jz rx_finish xor r9, 02e36ddafh @@ -5745,8 +5657,7 @@ rx_i_315: ;SHR_64 rx_body_315: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - shr rax, cl + xor rax, r15 mov r9, rax rx_i_316: ;RET @@ -5791,7 +5702,7 @@ rx_body_317: addpd xmm0, xmm7 movaps xmm5, xmm0 -rx_i_318: ;FPADD +rx_i_318: ;ROR_64 dec ebx jz rx_finish xor r9, 057621d9ah @@ -5801,11 +5712,15 @@ rx_i_318: ;FPADD call rx_read_l1 rx_body_318: and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm3 - movaps xmm7, xmm0 + mov rax, qword ptr [rsi+rcx*8] + ror rax, 41 + mov rcx, rax + mov eax, r15d + xor eax, 061cb9db8h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx -rx_i_319: ;ROL_64 +rx_i_319: ;SHR_64 dec ebx jz rx_finish xor r13, 08ee02d99h @@ -5817,7 +5732,7 @@ rx_body_319: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r15 - rol rax, cl + shr rax, cl mov rcx, rax mov eax, r11d xor eax, 01f931a08h @@ -5842,7 +5757,7 @@ rx_body_320: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm2 -rx_i_321: ;IMUL_32 +rx_i_321: ;MUL_32 dec ebx jz rx_finish xor r11, 0a7bae383h @@ -5853,8 +5768,8 @@ rx_i_321: ;IMUL_32 rx_body_321: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r9d + mov ecx, eax + mov eax, r9d imul rax, rcx mov rcx, rax mov eax, r12d @@ -5862,7 +5777,7 @@ rx_body_321: and eax, 2047 mov qword 
ptr [rsi + rax * 8], rcx -rx_i_322: ;RET +rx_i_322: ;CALL dec ebx jz rx_finish xor r14, 08215399bh @@ -5873,23 +5788,19 @@ rx_i_322: ;RET rx_body_322: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_322 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r11d - xor eax, 054292224h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_322: + cmp r11d, 1411981860 + jo short taken_call_322 mov rcx, rax mov eax, r11d xor eax, 054292224h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx + jmp rx_i_323 +taken_call_322: + push rax + call rx_i_343 -rx_i_323: ;MULH_64 +rx_i_323: ;MUL_64 dec ebx jz rx_finish xor r14, 07b07664bh @@ -5901,16 +5812,14 @@ rx_body_323: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, -696924877 - mul rcx - mov rax, rdx + imul rax, rax, -696924877 mov rcx, rax mov eax, r14d xor eax, 0d675c533h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_324: ;FPSQRT +rx_i_324: ;FPDIV dec ebx jz rx_finish xor r9, 0f956baffh @@ -5921,14 +5830,17 @@ rx_i_324: ;FPSQRT rx_body_324: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - andps xmm0, xmm10 - sqrtpd xmm9, xmm0 + divpd xmm0, xmm2 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm9, xmm0 mov eax, r9d xor eax, 0944856d4h and eax, 32767 movhpd qword ptr [rsi + rax * 8], xmm9 -rx_i_325: ;SHL_64 +rx_i_325: ;OR_32 dec ebx jz rx_finish xor r11, 0708ab9d1h @@ -5939,10 +5851,10 @@ rx_i_325: ;SHL_64 rx_body_325: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - shl rax, 24 + or eax, -281580460 mov r13, rax -rx_i_326: ;MULH_64 +rx_i_326: ;MUL_64 dec ebx jz rx_finish xor r11, 0d1b27540h @@ -5954,16 +5866,14 @@ rx_body_326: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - mul rcx - mov rax, rdx + imul rax, r8 mov rcx, rax mov eax, r9d xor eax, 0b67623c3h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_327: ;AND_64 +rx_i_327: ;IMULH_64 dec ebx jz rx_finish xor r9, 09665f98dh @@ 
-5975,10 +5885,12 @@ rx_body_327: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and rax, r15 + mov rcx, r15 + imul rcx + mov rax, rdx mov r12, rax -rx_i_328: ;ROL_64 +rx_i_328: ;SHR_64 dec ebx jz rx_finish xor r12, 0fb9c32adh @@ -5990,7 +5902,7 @@ rx_body_328: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r13 - rol rax, cl + shr rax, cl mov r9, rax rx_i_329: ;RET @@ -6067,7 +5979,7 @@ rx_body_332: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm3 -rx_i_333: ;XOR_64 +rx_i_333: ;OR_64 dec ebx jz rx_finish xor r14, 0f009758bh @@ -6079,10 +5991,10 @@ rx_body_333: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - xor rax, -175125848 + or rax, -175125848 mov r11, rax -rx_i_334: ;ADD_32 +rx_i_334: ;ADD_64 dec ebx jz rx_finish xor r8, 0dda04168h @@ -6094,7 +6006,7 @@ rx_body_334: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add eax, r13d + add rax, r13 mov r8, rax rx_i_335: ;SUB_64 @@ -6115,7 +6027,7 @@ rx_body_335: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_336: ;FPADD +rx_i_336: ;ROR_64 dec ebx jz rx_finish xor r15, 0aea0a435h @@ -6126,11 +6038,16 @@ rx_i_336: ;FPADD rx_body_336: xor rbp, rcx and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm2 - movaps xmm3, xmm0 + mov rax, qword ptr [rsi+rcx*8] + mov rcx, r10 + ror rax, cl + mov rcx, rax + mov eax, r11d + xor eax, 02644c5ah + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx -rx_i_337: ;ADD_32 +rx_i_337: ;ADD_64 dec ebx jz rx_finish xor r8, 03d6c4ab2h @@ -6141,7 +6058,7 @@ rx_i_337: ;ADD_32 rx_body_337: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add eax, r12d + add rax, r12 mov rcx, rax mov eax, r13d xor eax, 0dab07c39h @@ -6177,7 +6094,7 @@ rx_body_339: addpd xmm0, xmm6 movaps xmm2, xmm0 -rx_i_340: ;FPSUB +rx_i_340: ;FPADD dec ebx jz rx_finish xor r15, 0e51629cch @@ -6188,10 +6105,10 @@ rx_i_340: ;FPSUB rx_body_340: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm5 + addpd xmm0, xmm5 movaps xmm5, xmm0 -rx_i_341: 
;MUL_32 +rx_i_341: ;MULH_64 dec ebx jz rx_finish xor r12, 019eb9ea5h @@ -6202,9 +6119,9 @@ rx_i_341: ;MUL_32 rx_body_341: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r15d - imul rax, rcx + mov rcx, r15 + mul rcx + mov rax, rdx mov rcx, rax mov eax, r8d xor eax, 024736405h @@ -6225,7 +6142,7 @@ rx_body_342: subpd xmm0, xmm2 movaps xmm3, xmm0 -rx_i_343: ;SHR_64 +rx_i_343: ;XOR_64 dec ebx jz rx_finish xor r14, 056f6cf0bh @@ -6237,7 +6154,7 @@ rx_body_343: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - shr rax, 48 + xor rax, r13 mov rcx, rax mov eax, r15d xor eax, 0d9a469a9h @@ -6258,7 +6175,7 @@ rx_body_344: subpd xmm0, xmm6 movaps xmm5, xmm0 -rx_i_345: ;MULH_64 +rx_i_345: ;MUL_64 dec ebx jz rx_finish xor r12, 0bbbcdbach @@ -6269,16 +6186,14 @@ rx_i_345: ;MULH_64 rx_body_345: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r13 - mul rcx - mov rax, rdx + imul rax, r13 mov rcx, rax mov eax, r9d xor eax, 0ef03b0ddh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_346: ;XOR_64 +rx_i_346: ;AND_32 dec ebx jz rx_finish xor r12, 0ae9d1e96h @@ -6289,7 +6204,7 @@ rx_i_346: ;XOR_64 rx_body_346: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor rax, r15 + and eax, r15d mov rcx, rax mov eax, r13d xor eax, 0ed2d3987h @@ -6328,7 +6243,7 @@ rx_body_348: and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm9 -rx_i_349: ;XOR_32 +rx_i_349: ;OR_64 dec ebx jz rx_finish xor r8, 018e0e5ddh @@ -6339,7 +6254,7 @@ rx_i_349: ;XOR_32 rx_body_349: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - xor eax, r15d + or rax, r15 mov r13, rax rx_i_350: ;CALL @@ -6416,7 +6331,7 @@ rx_body_353: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm7 -rx_i_354: ;MULH_64 +rx_i_354: ;MUL_64 dec ebx jz rx_finish xor r13, 02412fc10h @@ -6427,9 +6342,7 @@ rx_i_354: ;MULH_64 rx_body_354: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r13 - mul rcx - mov rax, rdx + imul rax, r13 mov r13, rax rx_i_355: ;MUL_64 @@ -6450,7 +6363,7 @@ rx_body_355: and eax, 
2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_356: ;MUL_64 +rx_i_356: ;SUB_64 dec ebx jz rx_finish xor r10, 01cd85d80h @@ -6462,7 +6375,7 @@ rx_body_356: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - imul rax, r10 + sub rax, r10 mov r11, rax rx_i_357: ;ADD_64 @@ -6479,7 +6392,7 @@ rx_body_357: add rax, 820073637 mov r11, rax -rx_i_358: ;DIV_64 +rx_i_358: ;IMUL_32 dec ebx jz rx_finish xor r13, 088fa6e5ah @@ -6490,12 +6403,9 @@ rx_i_358: ;DIV_64 rx_body_358: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov ecx, 1 - mov edx, r11d - test edx, edx - cmovne ecx, edx - xor edx, edx - div rcx + movsxd rcx, eax + movsxd rax, r11d + imul rax, rcx mov r9, rax rx_i_359: ;FPSUB @@ -6534,7 +6444,7 @@ rx_body_360: andps xmm0, xmm1 movaps xmm8, xmm0 -rx_i_361: ;FPSQRT +rx_i_361: ;FPDIV dec ebx jz rx_finish xor r15, 01d125a7fh @@ -6545,8 +6455,11 @@ rx_i_361: ;FPSQRT rx_body_361: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - andps xmm0, xmm10 - sqrtpd xmm6, xmm0 + divpd xmm0, xmm6 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm6, xmm0 mov eax, r14d xor eax, 0ad0b81f5h and eax, 2047 @@ -6603,7 +6516,7 @@ rx_body_364: mov rax, rdx mov r8, rax -rx_i_365: ;IMUL_32 +rx_i_365: ;MUL_32 dec ebx jz rx_finish xor r15, 02db4444ah @@ -6614,8 +6527,8 @@ rx_i_365: ;IMUL_32 rx_body_365: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r9d + mov ecx, eax + mov eax, r9d imul rax, rcx mov rcx, rax mov eax, r12d @@ -6643,7 +6556,7 @@ rx_body_366: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_367: ;FPADD +rx_i_367: ;ROR_64 dec ebx jz rx_finish xor r9, 04d14cb3ah @@ -6654,15 +6567,12 @@ rx_i_367: ;FPADD rx_body_367: xor rbp, rcx and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm9 - movaps xmm4, xmm0 - mov eax, r12d - xor eax, 0ad9b92e8h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm4 + mov rax, qword ptr [rsi+rcx*8] + mov rcx, r9 + ror rax, cl + mov r12, rax -rx_i_368: ;MUL_64 
+rx_i_368: ;SUB_64 dec ebx jz rx_finish xor r10, 0a14836bah @@ -6673,10 +6583,10 @@ rx_i_368: ;MUL_64 rx_body_368: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, r10 + sub rax, r10 mov r8, rax -rx_i_369: ;AND_64 +rx_i_369: ;IMULH_64 dec ebx jz rx_finish xor r9, 053fe22e2h @@ -6687,7 +6597,9 @@ rx_i_369: ;AND_64 rx_body_369: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and rax, r13 + mov rcx, r13 + imul rcx + mov rax, rdx mov r9, rax rx_i_370: ;FPSUB @@ -6727,7 +6639,7 @@ rx_body_371: and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm5 -rx_i_372: ;ROL_64 +rx_i_372: ;SHL_64 dec ebx jz rx_finish xor r10, 098ab79d7h @@ -6739,10 +6651,10 @@ rx_body_372: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r13 - rol rax, cl + shl rax, cl mov r9, rax -rx_i_373: ;FPDIV +rx_i_373: ;FPMUL dec ebx jz rx_finish xor r15, 056438b3h @@ -6753,7 +6665,7 @@ rx_i_373: ;FPDIV rx_body_373: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm8 + mulpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 @@ -6826,7 +6738,7 @@ rx_body_377: subpd xmm0, xmm3 movaps xmm7, xmm0 -rx_i_378: ;MUL_32 +rx_i_378: ;MULH_64 dec ebx jz rx_finish xor r12, 082aa21ach @@ -6837,12 +6749,12 @@ rx_i_378: ;MUL_32 rx_body_378: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, 547725353 - imul rax, rcx + mov rcx, 547725353 + mul rcx + mov rax, rdx mov r15, rax -rx_i_379: ;FPADD +rx_i_379: ;ROR_64 dec ebx jz rx_finish xor r10, 05dba41fbh @@ -6852,15 +6764,11 @@ rx_i_379: ;FPADD call rx_read_l2 rx_body_379: and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm9 - movaps xmm5, xmm0 - mov eax, r13d - xor eax, 03a2dc429h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm5 + mov rax, qword ptr [rsi+rcx*8] + ror rax, 56 + mov r13, rax -rx_i_380: ;MUL_64 +rx_i_380: ;SUB_32 dec ebx jz rx_finish xor r11, 0229e3d6eh @@ -6871,14 +6779,14 @@ rx_i_380: ;MUL_64 rx_body_380: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, 
-1443002912 + sub eax, -1443002912 mov rcx, rax mov eax, r13d xor eax, 0a9fd85e0h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_381: ;SAR_64 +rx_i_381: ;XOR_32 dec ebx jz rx_finish xor r8, 019816ff9h @@ -6890,11 +6798,10 @@ rx_body_381: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r14 - sar rax, cl + xor eax, r14d mov r9, rax -rx_i_382: ;FPADD +rx_i_382: ;ROL_64 dec ebx jz rx_finish xor r14, 036b5b81fh @@ -6904,13 +6811,9 @@ rx_i_382: ;FPADD call rx_read_l1 rx_body_382: and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm3 - movaps xmm3, xmm0 - mov eax, r11d - xor eax, 0a6a2e0b1h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm3 + mov rax, qword ptr [rsi+rcx*8] + rol rax, 55 + mov r11, rax rx_i_383: ;FPSUB dec ebx @@ -6930,7 +6833,7 @@ rx_body_383: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm5 -rx_i_384: ;SHR_64 +rx_i_384: ;XOR_64 dec ebx jz rx_finish xor r10, 05b459fd7h @@ -6941,8 +6844,7 @@ rx_i_384: ;SHR_64 rx_body_384: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r11 - shr rax, cl + xor rax, r11 mov rcx, rax mov eax, r9d xor eax, 054439464h @@ -6981,7 +6883,7 @@ rx_body_386: addpd xmm0, xmm8 movaps xmm9, xmm0 -rx_i_387: ;MUL_64 +rx_i_387: ;SUB_64 dec ebx jz rx_finish xor r9, 0d4f7bc6ah @@ -6993,7 +6895,7 @@ rx_body_387: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - imul rax, r15 + sub rax, r15 mov r9, rax rx_i_388: ;RET @@ -7023,7 +6925,7 @@ not_taken_ret_388: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_389: ;CALL +rx_i_389: ;JUMP dec ebx jz rx_finish xor r11, 06531ad2eh @@ -7034,13 +6936,9 @@ rx_i_389: ;CALL rx_body_389: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r9d, -350609584 - jge short taken_call_389 mov r14, rax - jmp rx_i_390 -taken_call_389: - push rax - call rx_i_421 + cmp r9d, -350609584 + jge rx_i_421 rx_i_390: ;FPADD dec ebx @@ -7070,7 +6968,7 @@ rx_body_391: addpd xmm0, xmm3 movaps xmm6, xmm0 -rx_i_392: ;ROR_64 +rx_i_392: ;SAR_64 dec ebx 
jz rx_finish xor r14, 01ebc1f0dh @@ -7081,14 +6979,14 @@ rx_i_392: ;ROR_64 rx_body_392: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - ror rax, 0 + sar rax, 0 mov rcx, rax mov eax, r13d xor eax, 08c4a0f0dh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_393: ;OR_32 +rx_i_393: ;AND_64 dec ebx jz rx_finish xor r14, 0742e95b1h @@ -7099,7 +6997,7 @@ rx_i_393: ;OR_32 rx_body_393: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or eax, 552339548 + and rax, 552339548 mov rcx, rax mov eax, r13d xor eax, 020ec085ch @@ -7120,7 +7018,7 @@ rx_body_394: addpd xmm0, xmm9 movaps xmm6, xmm0 -rx_i_395: ;IDIV_64 +rx_i_395: ;IMUL_32 dec ebx jz rx_finish xor r8, 04ae4fe8ch @@ -7132,24 +7030,12 @@ rx_body_395: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov edx, r13d - cmp edx, -1 - jne short safe_idiv_395 - mov rcx, rax - rol rcx, 1 - dec rcx - jz short result_idiv_395 -safe_idiv_395: - mov ecx, 1 - test edx, edx - cmovne ecx, edx - movsxd rcx, ecx - cqo - idiv rcx -result_idiv_395: + movsxd rcx, eax + movsxd rax, r13d + imul rax, rcx mov r8, rax -rx_i_396: ;FPADD +rx_i_396: ;ROR_64 dec ebx jz rx_finish xor r10, 07b41862bh @@ -7159,11 +7045,15 @@ rx_i_396: ;FPADD call rx_read_l1 rx_body_396: and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm7 - movaps xmm4, xmm0 + mov rax, qword ptr [rsi+rcx*8] + ror rax, 62 + mov rcx, rax + mov eax, r12d + xor eax, 01ee1c837h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx -rx_i_397: ;MUL_64 +rx_i_397: ;SUB_64 dec ebx jz rx_finish xor r8, 0916f3819h @@ -7175,14 +7065,14 @@ rx_body_397: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, r12 + sub rax, r12 mov rcx, rax mov eax, r10d xor eax, 0146db5dfh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_398: ;ROL_64 +rx_i_398: ;SHR_64 dec ebx jz rx_finish xor r8, 04eb6fd2ah @@ -7193,14 +7083,14 @@ rx_i_398: ;ROL_64 rx_body_398: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - rol rax, 44 + shr rax, 44 mov rcx, rax mov eax, r11d xor 
eax, 0724e7136h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_399: ;FPDIV +rx_i_399: ;FPMUL dec ebx jz rx_finish xor r11, 0899a98cfh @@ -7212,13 +7102,13 @@ rx_body_399: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm2 + mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm6, xmm0 -rx_i_400: ;OR_32 +rx_i_400: ;AND_64 dec ebx jz rx_finish xor r13, 0aae75db6h @@ -7229,7 +7119,7 @@ rx_i_400: ;OR_32 rx_body_400: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or eax, r11d + and rax, r11 mov rcx, rax mov eax, r14d xor eax, 094ac538ch @@ -7273,7 +7163,7 @@ rx_body_402: not_taken_ret_402: mov r14, rax -rx_i_403: ;IDIV_64 +rx_i_403: ;IMUL_32 dec ebx jz rx_finish xor r9, 0e59500f7h @@ -7284,28 +7174,16 @@ rx_i_403: ;IDIV_64 rx_body_403: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov edx, r12d - cmp edx, -1 - jne short safe_idiv_403 - mov rcx, rax - rol rcx, 1 - dec rcx - jz short result_idiv_403 -safe_idiv_403: - mov ecx, 1 - test edx, edx - cmovne ecx, edx - movsxd rcx, ecx - cqo - idiv rcx -result_idiv_403: + movsxd rcx, eax + movsxd rax, r12d + imul rax, rcx mov rcx, rax mov eax, r11d xor eax, 01ff394a0h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_404: ;MUL_32 +rx_i_404: ;MULH_64 dec ebx jz rx_finish xor r15, 05b8ceb2fh @@ -7316,12 +7194,12 @@ rx_i_404: ;MUL_32 rx_body_404: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r8d - imul rax, rcx + mov rcx, r8 + mul rcx + mov rax, rdx mov r15, rax -rx_i_405: ;RET +rx_i_405: ;CALL dec ebx jz rx_finish xor r8, 0f61082a3h @@ -7332,23 +7210,19 @@ rx_i_405: ;RET rx_body_405: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_405 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r12d - xor eax, 06b0af6c1h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_405: + cmp r10d, 1795880641 + jbe short taken_call_405 mov rcx, rax mov eax, r12d xor eax, 06b0af6c1h and 
eax, 2047 mov qword ptr [rsi + rax * 8], rcx + jmp rx_i_406 +taken_call_405: + push rax + call rx_i_494 -rx_i_406: ;FPROUND +rx_i_406: ;FPDIV dec ebx jz rx_finish xor r9, 0af6886b7h @@ -7358,14 +7232,16 @@ rx_i_406: ;FPROUND call rx_read_l2 rx_body_406: and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] - mov rcx, rax - shl eax, 13 - and eax, 24576 - or eax, 40896 - mov dword ptr [rsp - 8], eax - ldmxcsr dword ptr [rsp - 8] - mov r9, rcx + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + divpd xmm0, xmm7 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm9, xmm0 + mov eax, r9d + xor eax, 09862adefh + and eax, 32767 + movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_407: ;FPSUB dec ebx @@ -7453,7 +7329,7 @@ rx_body_411: not_taken_ret_411: mov r12, rax -rx_i_412: ;FPSQRT +rx_i_412: ;FPDIV dec ebx jz rx_finish xor r10, 0ac90e7ah @@ -7464,14 +7340,17 @@ rx_i_412: ;FPSQRT rx_body_412: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - andps xmm0, xmm10 - sqrtpd xmm3, xmm0 + divpd xmm0, xmm4 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm3, xmm0 mov eax, r11d xor eax, 0bbd2640ah and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm3 -rx_i_413: ;FPDIV +rx_i_413: ;FPMUL dec ebx jz rx_finish xor r11, 04b6037abh @@ -7482,13 +7361,13 @@ rx_i_413: ;FPDIV rx_body_413: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm2 + mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm4, xmm0 -rx_i_414: ;OR_64 +rx_i_414: ;AND_64 dec ebx jz rx_finish xor r14, 06c01554dh @@ -7500,14 +7379,14 @@ rx_body_414: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or rax, r8 + and rax, r8 mov rcx, rax mov eax, r10d xor eax, 0e973b3b1h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_415: ;DIV_64 +rx_i_415: ;IMUL_32 dec ebx jz rx_finish xor r8, 08c3e59a1h @@ -7519,12 +7398,12 @@ rx_body_415: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, -538093385 - xor edx, edx - div rcx + movsxd rcx, eax 
+ mov rax, -538093385 + imul rax, rcx mov r9, rax -rx_i_416: ;FPSUB +rx_i_416: ;FPADD dec ebx jz rx_finish xor r12, 0f3fafde9h @@ -7536,7 +7415,7 @@ rx_body_416: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm3 + addpd xmm0, xmm3 movaps xmm5, xmm0 mov eax, r13d xor eax, 0f84b5382h @@ -7557,7 +7436,7 @@ rx_body_417: sub rax, r12 mov r10, rax -rx_i_418: ;MULH_64 +rx_i_418: ;MUL_64 dec ebx jz rx_finish xor r10, 02bd61c5fh @@ -7568,12 +7447,10 @@ rx_i_418: ;MULH_64 rx_body_418: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r11 - mul rcx - mov rax, rdx + imul rax, r11 mov r10, rax -rx_i_419: ;XOR_64 +rx_i_419: ;OR_64 dec ebx jz rx_finish xor r9, 0b6ab9d32h @@ -7584,14 +7461,14 @@ rx_i_419: ;XOR_64 rx_body_419: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor rax, r14 + or rax, r14 mov rcx, rax mov eax, r14d xor eax, 0beeca8dbh and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_420: ;FPADD +rx_i_420: ;ROR_64 dec ebx jz rx_finish xor r9, 0f9690ceah @@ -7601,15 +7478,12 @@ rx_i_420: ;FPADD call rx_read_l1 rx_body_420: and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm3 - movaps xmm9, xmm0 - mov eax, r9d - xor eax, 08f7bb3ech - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm9 + mov rax, qword ptr [rsi+rcx*8] + mov rcx, r11 + ror rax, cl + mov r9, rax -rx_i_421: ;RET +rx_i_421: ;CALL dec ebx jz rx_finish xor r12, 01ada0f39h @@ -7621,15 +7495,15 @@ rx_body_421: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_421 - xor rax, qword ptr [rsp + 8] - mov r10, rax - ret 8 -not_taken_ret_421: + cmp r8d, -1600409762 + jno short taken_call_421 mov r10, rax + jmp rx_i_422 +taken_call_421: + push rax + call rx_i_31 -rx_i_422: ;IMUL_32 +rx_i_422: ;MUL_32 dec ebx jz rx_finish xor r11, 04dd16ca4h @@ -7640,12 +7514,12 @@ rx_i_422: ;IMUL_32 rx_body_422: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r10d + mov ecx, eax + mov eax, r10d 
imul rax, rcx mov r13, rax -rx_i_423: ;MUL_64 +rx_i_423: ;SUB_64 dec ebx jz rx_finish xor r12, 04df5ce05h @@ -7656,7 +7530,7 @@ rx_i_423: ;MUL_64 rx_body_423: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, r10 + sub rax, r10 mov rcx, rax mov eax, r15d xor eax, 0a5d40d0ah @@ -7682,7 +7556,7 @@ rx_body_424: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm9 -rx_i_425: ;IMUL_32 +rx_i_425: ;MUL_32 dec ebx jz rx_finish xor r8, 0a3c5391dh @@ -7693,12 +7567,12 @@ rx_i_425: ;IMUL_32 rx_body_425: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r10d + mov ecx, eax + mov eax, r10d imul rax, rcx mov r14, rax -rx_i_426: ;AND_64 +rx_i_426: ;IMULH_64 dec ebx jz rx_finish xor r12, 09dd55ba0h @@ -7709,14 +7583,16 @@ rx_i_426: ;AND_64 rx_body_426: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - and rax, r9 + mov rcx, r9 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r14d xor eax, 0dcca31efh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_427: ;MUL_32 +rx_i_427: ;MULH_64 dec ebx jz rx_finish xor r11, 0d6cae9aeh @@ -7728,9 +7604,9 @@ rx_body_427: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r11d - imul rax, rcx + mov rcx, r11 + mul rcx + mov rax, rdx mov rcx, rax mov eax, r9d xor eax, 0801190f4h @@ -7797,7 +7673,7 @@ rx_body_430: and eax, 32767 movhpd qword ptr [rsi + rax * 8], xmm5 -rx_i_431: ;FPADD +rx_i_431: ;ROR_64 dec ebx jz rx_finish xor r12, 0ed17ab58h @@ -7807,13 +7683,10 @@ rx_i_431: ;FPADD call rx_read_l1 rx_body_431: and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm5 - movaps xmm5, xmm0 - mov eax, r13d - xor eax, 019fe4aadh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm5 + mov rax, qword ptr [rsi+rcx*8] + mov rcx, r13 + ror rax, cl + mov r13, rax rx_i_432: ;SUB_64 dec ebx @@ -7829,7 +7702,7 @@ rx_body_432: sub rax, r10 mov r8, rax -rx_i_433: ;ADD_32 +rx_i_433: ;ADD_64 dec ebx jz rx_finish xor r13, 0bbb88499h @@ -7840,14 +7713,14 @@ rx_i_433: ;ADD_32 
rx_body_433: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add eax, r12d + add rax, r12 mov rcx, rax mov eax, r12d xor eax, 04722b36fh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_434: ;FPSQRT +rx_i_434: ;FPDIV dec ebx jz rx_finish xor r13, 0167edabdh @@ -7858,8 +7731,11 @@ rx_i_434: ;FPSQRT rx_body_434: and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - andps xmm0, xmm10 - sqrtpd xmm9, xmm0 + divpd xmm0, xmm3 + movaps xmm1, xmm0 + cmpeqpd xmm1, xmm1 + andps xmm0, xmm1 + movaps xmm9, xmm0 mov eax, r9d xor eax, 08c1cfc74h and eax, 2047 @@ -7902,7 +7778,7 @@ rx_body_436: and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm7 -rx_i_437: ;FPDIV +rx_i_437: ;FPMUL dec ebx jz rx_finish xor r8, 098a6bcf7h @@ -7914,7 +7790,7 @@ rx_body_437: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm3 + mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 @@ -7937,7 +7813,7 @@ rx_body_438: andps xmm0, xmm1 movaps xmm4, xmm0 -rx_i_439: ;XOR_32 +rx_i_439: ;OR_64 dec ebx jz rx_finish xor r13, 05e807e81h @@ -7948,14 +7824,14 @@ rx_i_439: ;XOR_32 rx_body_439: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - xor eax, r15d + or rax, r15 mov rcx, rax mov eax, r10d xor eax, 0b28e6e01h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_440: ;RET +rx_i_440: ;CALL dec ebx jz rx_finish xor r10, 062f83728h @@ -7967,13 +7843,13 @@ rx_body_440: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp rsp, rdi - je short not_taken_ret_440 - xor rax, qword ptr [rsp + 8] - mov r9, rax - ret 8 -not_taken_ret_440: + cmp r12d, 2127765370 + js short taken_call_440 mov r9, rax + jmp rx_i_441 +taken_call_440: + push rax + call rx_i_41 rx_i_441: ;ADD_64 dec ebx @@ -7994,7 +7870,7 @@ rx_body_441: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_442: ;CALL +rx_i_442: ;FPSQRT dec ebx jz rx_finish xor r14, 0a53dd1bh @@ -8004,18 +7880,9 @@ rx_i_442: ;CALL call rx_read_l1 rx_body_442: and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] - cmp 
r15d, 799523062 - jbe short taken_call_442 - mov rcx, rax - mov eax, r11d - xor eax, 02fa7c0f6h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_443 -taken_call_442: - push rax - call rx_i_9 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + andps xmm0, xmm10 + sqrtpd xmm3, xmm0 rx_i_443: ;RET dec ebx @@ -8080,7 +7947,7 @@ rx_body_445: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_446: ;MUL_32 +rx_i_446: ;MULH_64 dec ebx jz rx_finish xor r12, 01734708eh @@ -8092,9 +7959,9 @@ rx_body_446: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r15d - imul rax, rcx + mov rcx, r15 + mul rcx + mov rax, rdx mov rcx, rax mov eax, r13d xor eax, 03166163h @@ -8134,7 +8001,7 @@ rx_body_448: subpd xmm0, xmm3 movaps xmm9, xmm0 -rx_i_449: ;ROR_64 +rx_i_449: ;ROL_64 dec ebx jz rx_finish xor r8, 04f27744bh @@ -8145,10 +8012,10 @@ rx_i_449: ;ROR_64 rx_body_449: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - ror rax, 28 + rol rax, 28 mov r8, rax -rx_i_450: ;ROL_64 +rx_i_450: ;SAR_64 dec ebx jz rx_finish xor r8, 04e2c76ffh @@ -8160,7 +8027,7 @@ rx_body_450: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r12 - rol rax, cl + sar rax, cl mov rcx, rax mov eax, r8d xor eax, 0f6de92ach @@ -8209,7 +8076,7 @@ not_taken_ret_452: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_453: ;IMULH_64 +rx_i_453: ;IMUL_32 dec ebx jz rx_finish xor r11, 0a2096aa4h @@ -8220,9 +8087,9 @@ rx_i_453: ;IMULH_64 rx_body_453: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r14 - imul rcx - mov rax, rdx + movsxd rcx, eax + movsxd rax, r14d + imul rax, rcx mov r8, rax rx_i_454: ;FPADD @@ -8243,7 +8110,7 @@ rx_body_454: and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm4 -rx_i_455: ;XOR_64 +rx_i_455: ;OR_64 dec ebx jz rx_finish xor r8, 059263cdbh @@ -8255,10 +8122,10 @@ rx_body_455: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor rax, r9 + or rax, r9 mov r8, rax -rx_i_456: ;OR_32 +rx_i_456: ;AND_64 dec ebx jz rx_finish xor r9, 
010e8fe6h @@ -8269,7 +8136,7 @@ rx_i_456: ;OR_32 rx_body_456: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - or eax, r11d + and rax, r11 mov rcx, rax mov eax, r9d xor eax, 017f52c3fh @@ -8295,7 +8162,7 @@ rx_body_457: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_458: ;ROL_64 +rx_i_458: ;SAR_64 dec ebx jz rx_finish xor r11, 05c79df6eh @@ -8306,10 +8173,10 @@ rx_i_458: ;ROL_64 rx_body_458: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - rol rax, 22 + sar rax, 22 mov r14, rax -rx_i_459: ;MUL_64 +rx_i_459: ;SUB_64 dec ebx jz rx_finish xor r9, 0346f46adh @@ -8320,14 +8187,14 @@ rx_i_459: ;MUL_64 rx_body_459: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, 381354340 + sub rax, 381354340 mov rcx, rax mov eax, r13d xor eax, 016bb0164h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_460: ;SUB_64 +rx_i_460: ;ADD_64 dec ebx jz rx_finish xor r11, 098ab71fch @@ -8338,14 +8205,14 @@ rx_i_460: ;SUB_64 rx_body_460: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, r14 + add rax, r14 mov rcx, rax mov eax, r12d xor eax, 0eb453a97h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_461: ;SHR_64 +rx_i_461: ;XOR_64 dec ebx jz rx_finish xor r11, 0c814e926h @@ -8356,8 +8223,7 @@ rx_i_461: ;SHR_64 rx_body_461: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r13 - shr rax, cl + xor rax, r13 mov rcx, rax mov eax, r12d xor eax, 062ef5b99h @@ -8378,7 +8244,7 @@ rx_body_462: add rax, -1734323376 mov r15, rax -rx_i_463: ;SUB_64 +rx_i_463: ;ADD_64 dec ebx jz rx_finish xor r9, 08c29341h @@ -8389,7 +8255,7 @@ rx_i_463: ;SUB_64 rx_body_463: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, r15 + add rax, r15 mov r10, rax rx_i_464: ;MUL_64 @@ -8424,7 +8290,7 @@ rx_body_465: addpd xmm0, xmm5 movaps xmm2, xmm0 -rx_i_466: ;IMUL_32 +rx_i_466: ;MUL_32 dec ebx jz rx_finish xor r13, 05c541c42h @@ -8436,8 +8302,8 @@ rx_body_466: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, 282682508 + mov ecx, eax + mov eax, 
282682508 imul rax, rcx mov r9, rax @@ -8456,7 +8322,7 @@ rx_body_467: addpd xmm0, xmm9 movaps xmm8, xmm0 -rx_i_468: ;IDIV_64 +rx_i_468: ;IMUL_32 dec ebx jz rx_finish xor r8, 091044dc3h @@ -8468,21 +8334,9 @@ rx_body_468: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov edx, -13394825 - cmp edx, -1 - jne short safe_idiv_468 - mov rcx, rax - rol rcx, 1 - dec rcx - jz short result_idiv_468 -safe_idiv_468: - mov ecx, 1 - test edx, edx - cmovne ecx, edx - movsxd rcx, ecx - cqo - idiv rcx -result_idiv_468: + movsxd rcx, eax + mov rax, -13394825 + imul rax, rcx mov rcx, rax mov eax, r8d xor eax, 0ff339c77h @@ -8509,7 +8363,7 @@ rx_body_469: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_470: ;XOR_32 +rx_i_470: ;OR_64 dec ebx jz rx_finish xor r14, 090849e3eh @@ -8520,7 +8374,7 @@ rx_i_470: ;XOR_32 rx_body_470: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor eax, r11d + or rax, r11 mov rcx, rax mov eax, r14d xor eax, 090d56b4ch @@ -8543,7 +8397,7 @@ rx_body_471: imul rax, rcx mov r14, rax -rx_i_472: ;CALL +rx_i_472: ;JUMP dec ebx jz rx_finish xor r9, 038f4b9d6h @@ -8555,13 +8409,9 @@ rx_body_472: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp r10d, 1738497427 - jl short taken_call_472 mov r10, rax - jmp rx_i_473 -taken_call_472: - push rax - call rx_i_8 + cmp r10d, 1738497427 + jl rx_i_8 rx_i_473: ;MUL_64 dec ebx @@ -8577,7 +8427,7 @@ rx_body_473: imul rax, rax, -751043211 mov r12, rax -rx_i_474: ;CALL +rx_i_474: ;JUMP dec ebx jz rx_finish xor r9, 0b5c0b4d4h @@ -8589,13 +8439,9 @@ rx_body_474: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp r15d, -233120543 - jo short taken_call_474 mov r15, rax - jmp rx_i_475 -taken_call_474: - push rax - call rx_i_69 + cmp r15d, -233120543 + jo rx_i_69 rx_i_475: ;FPSUB dec ebx @@ -8611,7 +8457,7 @@ rx_body_475: subpd xmm0, xmm9 movaps xmm7, xmm0 -rx_i_476: ;FPSUB +rx_i_476: ;FPADD dec ebx jz rx_finish xor r8, 07ab3b5a4h @@ -8622,7 +8468,7 @@ rx_i_476: ;FPSUB rx_body_476: and 
ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm2 + addpd xmm0, xmm2 movaps xmm9, xmm0 rx_i_477: ;FPADD @@ -8675,7 +8521,7 @@ rx_body_479: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_480: ;FPSUB +rx_i_480: ;FPADD dec ebx jz rx_finish xor r9, 0a9cc4f01h @@ -8686,10 +8532,10 @@ rx_i_480: ;FPSUB rx_body_480: and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - subpd xmm0, xmm4 + addpd xmm0, xmm4 movaps xmm6, xmm0 -rx_i_481: ;DIV_64 +rx_i_481: ;IMUL_32 dec ebx jz rx_finish xor r14, 0225ba1f9h @@ -8700,15 +8546,12 @@ rx_i_481: ;DIV_64 rx_body_481: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, 1 - mov edx, r13d - test edx, edx - cmovne ecx, edx - xor edx, edx - div rcx + movsxd rcx, eax + movsxd rax, r13d + imul rax, rcx mov r12, rax -rx_i_482: ;XOR_64 +rx_i_482: ;AND_32 dec ebx jz rx_finish xor r14, 044a0f592h @@ -8719,7 +8562,7 @@ rx_i_482: ;XOR_64 rx_body_482: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - xor rax, r12 + and eax, r12d mov r11, rax rx_i_483: ;FPADD @@ -8736,7 +8579,7 @@ rx_body_483: addpd xmm0, xmm6 movaps xmm6, xmm0 -rx_i_484: ;ROL_64 +rx_i_484: ;SHR_64 dec ebx jz rx_finish xor r12, 07027bacdh @@ -8748,10 +8591,10 @@ rx_body_484: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - rol rax, 37 + shr rax, 37 mov r11, rax -rx_i_485: ;CALL +rx_i_485: ;JUMP dec ebx jz rx_finish xor r13, 03a04647h @@ -8763,17 +8606,13 @@ rx_body_485: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp r8d, 554879918 - jno short taken_call_485 mov rcx, rax mov eax, r15d xor eax, 02112cbaeh and eax, 32767 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_486 -taken_call_485: - push rax - call rx_i_58 + cmp r8d, 554879918 + jno rx_i_58 rx_i_486: ;ADD_64 dec ebx @@ -8807,7 +8646,7 @@ rx_body_487: sub rax, -333279706 mov r11, rax -rx_i_488: ;IMULH_64 +rx_i_488: ;IMUL_32 dec ebx jz rx_finish xor r12, 0d8b1788eh @@ -8818,12 +8657,12 @@ rx_i_488: ;IMULH_64 rx_body_488: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, 
297357073 - imul rcx - mov rax, rdx + movsxd rcx, eax + mov rax, 297357073 + imul rax, rcx mov r12, rax -rx_i_489: ;CALL +rx_i_489: ;JUMP dec ebx jz rx_finish xor r10, 0b2ec9f3ah @@ -8835,19 +8674,15 @@ rx_body_489: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r15d, -1127175870 - jge short taken_call_489 mov rcx, rax mov eax, r8d xor eax, 0bcd0a942h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_490 -taken_call_489: - push rax - call rx_i_75 + cmp r15d, -1127175870 + jge rx_i_75 -rx_i_490: ;FPADD +rx_i_490: ;ROR_64 dec ebx jz rx_finish xor r11, 015c7f598h @@ -8858,9 +8693,14 @@ rx_i_490: ;FPADD rx_body_490: xor rbp, rcx and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - addpd xmm0, xmm9 - movaps xmm7, xmm0 + mov rax, qword ptr [rsi+rcx*8] + mov rcx, r9 + ror rax, cl + mov rcx, rax + mov eax, r15d + xor eax, 0ab8bd68h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_491: ;FPADD dec ebx @@ -8880,7 +8720,7 @@ rx_body_491: and eax, 32767 movhpd qword ptr [rsi + rax * 8], xmm7 -rx_i_492: ;OR_64 +rx_i_492: ;IDIV_64 dec ebx jz rx_finish xor r9, 0491090d9h @@ -8891,7 +8731,21 @@ rx_i_492: ;OR_64 rx_body_492: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - or rax, r9 + mov edx, r9d + cmp edx, -1 + jne short safe_idiv_492 + mov rcx, rax + rol rcx, 1 + dec rcx + jz short result_idiv_492 +safe_idiv_492: + mov ecx, 1 + test edx, edx + cmovne ecx, edx + movsxd rcx, ecx + cqo + idiv rcx +result_idiv_492: mov r12, rax rx_i_493: ;FPSUB @@ -8908,7 +8762,7 @@ rx_body_493: subpd xmm0, xmm9 movaps xmm4, xmm0 -rx_i_494: ;MUL_32 +rx_i_494: ;MULH_64 dec ebx jz rx_finish xor r10, 0b0d50e46h @@ -8919,9 +8773,9 @@ rx_i_494: ;MUL_32 rx_body_494: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r11d - imul rax, rcx + mov rcx, r11 + mul rcx + mov rax, rdx mov r14, rax rx_i_495: ;FPMUL @@ -8941,7 +8795,7 @@ rx_body_495: andps xmm0, xmm1 movaps xmm8, xmm0 -rx_i_496: ;OR_64 +rx_i_496: ;DIV_64 dec ebx jz rx_finish xor r14, 
0fe757b73h @@ -8952,10 +8806,12 @@ rx_i_496: ;OR_64 rx_body_496: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or rax, -359802064 + mov ecx, -359802064 + xor edx, edx + div rcx mov r9, rax -rx_i_497: ;FPDIV +rx_i_497: ;FPMUL dec ebx jz rx_finish xor r8, 08d25742eh @@ -8967,7 +8823,7 @@ rx_body_497: xor rbp, rcx and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] - divpd xmm0, xmm3 + mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 @@ -8994,7 +8850,7 @@ rx_body_498: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm8 -rx_i_499: ;IMUL_32 +rx_i_499: ;MUL_32 dec ebx jz rx_finish xor r12, 08925556bh @@ -9005,12 +8861,12 @@ rx_i_499: ;IMUL_32 rx_body_499: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, -1795485757 + mov ecx, eax + mov eax, -1795485757 imul rax, rcx mov r8, rax -rx_i_500: ;CALL +rx_i_500: ;FPSQRT dec ebx jz rx_finish xor r10, 04bc870ebh @@ -9020,20 +8876,11 @@ rx_i_500: ;CALL call rx_read_l1 rx_body_500: and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] - cmp r13d, 1243939650 - jl short taken_call_500 - mov rcx, rax - mov eax, r10d - xor eax, 04a250342h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_501 -taken_call_500: - push rax - call rx_i_511 + cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + andps xmm0, xmm10 + sqrtpd xmm2, xmm0 -rx_i_501: ;SHR_64 +rx_i_501: ;XOR_64 dec ebx jz rx_finish xor r8, 07d46c503h @@ -9044,8 +8891,7 @@ rx_i_501: ;SHR_64 rx_body_501: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r10 - shr rax, cl + xor rax, r10 mov rcx, rax mov eax, r12d xor eax, 03e22874bh @@ -9186,7 +9032,7 @@ rx_body_508: not_taken_ret_508: mov r8, rax -rx_i_509: ;CALL +rx_i_509: ;FPROUND dec ebx jz rx_finish xor r11, 01b2873f2h @@ -9197,13 +9043,13 @@ rx_i_509: ;CALL rx_body_509: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r8d, 1826115244 - jno short taken_call_509 - mov r10, rax - jmp rx_i_510 -taken_call_509: - push rax - call rx_i_42 + mov rcx, rax + shl eax, 13 + and eax, 24576 + 
or eax, 40896 + mov dword ptr [rsp - 8], eax + ldmxcsr dword ptr [rsp - 8] + mov r10, rcx rx_i_510: ;FPADD dec ebx @@ -9219,7 +9065,7 @@ rx_body_510: addpd xmm0, xmm2 movaps xmm9, xmm0 -rx_i_511: ;ROL_64 +rx_i_511: ;SHR_64 dec ebx jz rx_finish xor r11, 02bd79286h @@ -9231,7 +9077,7 @@ rx_body_511: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 - rol rax, cl + shr rax, cl mov r11, rax jmp rx_i_0 From e487092f076c50a34f60f84417c1d660b8e4ba73 Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 11 Jan 2019 10:18:24 +0100 Subject: [PATCH 10/35] Simplified CALL and RET --- src/AssemblyGeneratorX86.cpp | 21 +- src/JitCompilerX86.cpp | 22 +- src/program.inc | 515 +++++++++++------------------------ 3 files changed, 176 insertions(+), 382 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 619afd3..2b8db69 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -506,28 +506,19 @@ namespace RandomX { void AssemblyGeneratorX86::h_CALL(Instruction& instr, int i) { genar(instr, i); - asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm32 << std::endl; - asmCode << "\t" << jumpCondition(instr); - asmCode << " short taken_call_" << i << std::endl; gencr(instr); - asmCode << "\tjmp rx_i_" << wrapInstr(i + 1) << std::endl; - asmCode << "taken_call_" << i << ":" << std::endl; - if (trace) { - asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rax" << std::endl; - } - asmCode << "\tpush rax" << std::endl; + asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm32 << std::endl; + asmCode << "\t" << jumpCondition(instr, true); + asmCode << " short rx_i_" << wrapInstr(i + 1) << std::endl; asmCode << "\tcall rx_i_" << wrapInstr(i + (instr.imm8 & 127) + 2) << std::endl; } void AssemblyGeneratorX86::h_RET(Instruction& instr, int i) { genar(instr, i); + gencr(instr); asmCode << "\tcmp rsp, " << regStackBeginAddr << std::endl; - 
asmCode << "\tje short not_taken_ret_" << i << std::endl; - asmCode << "\txor rax, qword ptr [rsp + 8]" << std::endl; - gencr(instr); - asmCode << "\tret 8" << std::endl; - asmCode << "not_taken_ret_" << i << ":" << std::endl; - gencr(instr); + asmCode << "\tje short rx_i_" << wrapInstr(i + 1) << std::endl; + asmCode << "\tret" << std::endl; } #include "instructionWeights.hpp" diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index c7f753b..7018b97 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -623,19 +623,12 @@ namespace RandomX { void JitCompilerX86::h_CALL(Instruction& instr, int i) { genar(instr); + gencr(instr); emit(uint16_t(0x8141)); //cmp regb, imm32 emitByte(0xf8 + (instr.regb % RegistersCount)); emit(instr.imm32); - emitByte(jumpCondition(instr)); - if ((instr.locc & 7) <= 3) { - emitByte(0x16); - } - else { - emitByte(0x05); - } - gencr(instr); - emit(uint16_t(0x06eb)); //jmp to next - emitByte(0x50); //push rax + emitByte(jumpCondition(instr, true)); + emitByte(0x05); emitByte(0xe8); //call i = wrapInstr(i + (instr.imm8 & 127) + 2); if (i < instructionOffsets.size()) { @@ -654,13 +647,8 @@ namespace RandomX { crlen = 17; } emit(0x74e73b48); //cmp rsp, rdi; je - emitByte(11 + crlen); - emitByte(0x48); - emit(0x08244433); //xor rax,QWORD PTR [rsp+0x8] - gencr(instr); - emitByte(0xc2); //ret 8 - emit(uint16_t(0x0008)); - gencr(instr); + emitByte(0x01); + emitByte(0xc3); //ret } #include "instructionWeights.hpp" diff --git a/src/program.inc b/src/program.inc index 2d4fc25..dd8cb36 100644 --- a/src/program.inc +++ b/src/program.inc @@ -10,16 +10,13 @@ rx_body_0: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r11d, 445530481 - jbe short taken_call_0 mov rcx, rax mov eax, r12d xor eax, 01a8e4171h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_1 -taken_call_0: - push rax + cmp r11d, 445530481 + ja short rx_i_1 call rx_i_30 rx_i_1: ;IMULH_64 @@ -283,21 +280,14 @@ rx_i_15: ;RET rx_body_15: and ecx, 
32767 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax + mov eax, r14d + xor eax, 0468b38b8h + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi - je short not_taken_ret_15 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r14d - xor eax, 0468b38b8h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_15: - mov rcx, rax - mov eax, r14d - xor eax, 0468b38b8h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + je short rx_i_16 + ret rx_i_16: ;ADD_64 dec ebx @@ -620,12 +610,9 @@ rx_body_34: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r14d, -380224718 - js short taken_call_34 mov r15, rax - jmp rx_i_35 -taken_call_34: - push rax + cmp r14d, -380224718 + jns short rx_i_35 call rx_i_108 rx_i_35: ;CALL @@ -639,12 +626,9 @@ rx_i_35: ;CALL rx_body_35: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r9d, -2040787098 - js short taken_call_35 mov r8, rax - jmp rx_i_36 -taken_call_35: - push rax + cmp r9d, -2040787098 + jns short rx_i_36 call rx_i_58 rx_i_36: ;FPMUL @@ -722,16 +706,13 @@ rx_i_40: ;CALL rx_body_40: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r12d, -1297973554 - js short taken_call_40 mov rcx, rax mov eax, r9d xor eax, 0b2a27eceh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_41 -taken_call_40: - push rax + cmp r12d, -1297973554 + jns short rx_i_41 call rx_i_90 rx_i_41: ;JUMP @@ -943,13 +924,10 @@ rx_i_53: ;RET rx_body_53: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov r13, rax cmp rsp, rdi - je short not_taken_ret_53 - xor rax, qword ptr [rsp + 8] - mov r13, rax - ret 8 -not_taken_ret_53: - mov r13, rax + je short rx_i_54 + ret rx_i_54: ;IMUL_32 dec ebx @@ -1071,16 +1049,13 @@ rx_i_60: ;CALL rx_body_60: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r11d, 2075529029 - jno short taken_call_60 mov rcx, rax mov eax, r11d xor eax, 07bb60f45h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_61 -taken_call_60: - push rax + cmp r11d, 2075529029 + jo short 
rx_i_61 call rx_i_116 rx_i_61: ;JUMP @@ -1327,12 +1302,9 @@ rx_i_75: ;CALL rx_body_75: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp r11d, -1160798683 - jo short taken_call_75 mov r13, rax - jmp rx_i_76 -taken_call_75: - push rax + cmp r11d, -1160798683 + jno short rx_i_76 call rx_i_202 rx_i_76: ;FPADD @@ -1364,21 +1336,14 @@ rx_i_77: ;RET rx_body_77: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax + mov eax, r11d + xor eax, 03a92bc7ah + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi - je short not_taken_ret_77 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r11d - xor eax, 03a92bc7ah - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_77: - mov rcx, rax - mov eax, r11d - xor eax, 03a92bc7ah - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + je short rx_i_78 + ret rx_i_78: ;MULH_64 dec ebx @@ -1407,16 +1372,13 @@ rx_i_79: ;CALL rx_body_79: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r13d, 1800043331 - jbe short taken_call_79 mov rcx, rax mov eax, r11d xor eax, 06b4a7b43h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_80 -taken_call_79: - push rax + cmp r13d, 1800043331 + ja short rx_i_80 call rx_i_93 rx_i_80: ;ROR_64 @@ -1664,12 +1626,9 @@ rx_body_94: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r13d, -343122976 - js short taken_call_94 mov r8, rax - jmp rx_i_95 -taken_call_94: - push rax + cmp r13d, -343122976 + jns short rx_i_95 call rx_i_157 rx_i_95: ;MUL_64 @@ -1978,16 +1937,13 @@ rx_i_111: ;CALL rx_body_111: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r14d, 1562606859 - jge short taken_call_111 mov rcx, rax mov eax, r12d xor eax, 05d237d0bh and eax, 32767 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_112 -taken_call_111: - push rax + cmp r14d, 1562606859 + jl short rx_i_112 call rx_i_212 rx_i_112: ;SUB_64 @@ -2174,16 +2130,13 @@ rx_i_122: ;CALL rx_body_122: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r11d, 2029448233 - jo short 
taken_call_122 mov rcx, rax mov eax, r14d xor eax, 078f6ec29h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_123 -taken_call_122: - push rax + cmp r11d, 2029448233 + jno short rx_i_123 call rx_i_192 rx_i_123: ;ADD_64 @@ -2330,21 +2283,14 @@ rx_i_131: ;RET rx_body_131: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax + mov eax, r15d + xor eax, 0dff06f75h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi - je short not_taken_ret_131 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r15d - xor eax, 0dff06f75h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_131: - mov rcx, rax - mov eax, r15d - xor eax, 0dff06f75h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + je short rx_i_132 + ret rx_i_132: ;FPADD dec ebx @@ -2457,13 +2403,10 @@ rx_i_138: ;RET rx_body_138: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov r10, rax cmp rsp, rdi - je short not_taken_ret_138 - xor rax, qword ptr [rsp + 8] - mov r10, rax - ret 8 -not_taken_ret_138: - mov r10, rax + je short rx_i_139 + ret rx_i_139: ;ADD_64 dec ebx @@ -2828,16 +2771,13 @@ rx_i_159: ;CALL rx_body_159: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r15d, -8571241 - jbe short taken_call_159 mov rcx, rax mov eax, r13d xor eax, 0ff7d3697h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_160 -taken_call_159: - push rax + cmp r15d, -8571241 + ja short rx_i_160 call rx_i_181 rx_i_160: ;ADD_32 @@ -2953,21 +2893,14 @@ rx_i_165: ;RET rx_body_165: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax + mov eax, r9d + xor eax, 06450685ch + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi - je short not_taken_ret_165 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r9d - xor eax, 06450685ch - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_165: - mov rcx, rax - mov eax, r9d - xor eax, 06450685ch - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + je short rx_i_166 + ret 
rx_i_166: ;SHR_64 dec ebx @@ -3037,16 +2970,13 @@ rx_i_169: ;CALL rx_body_169: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r10d, -1286357107 - jbe short taken_call_169 mov rcx, rax mov eax, r14d xor eax, 0b353bf8dh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_170 -taken_call_169: - push rax + cmp r10d, -1286357107 + ja short rx_i_170 call rx_i_197 rx_i_170: ;FPSQRT @@ -3194,21 +3124,14 @@ rx_i_178: ;RET rx_body_178: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax + mov eax, r12d + xor eax, 0c366b275h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi - je short not_taken_ret_178 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r12d - xor eax, 0c366b275h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_178: - mov rcx, rax - mov eax, r12d - xor eax, 0c366b275h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + je short rx_i_179 + ret rx_i_179: ;FPADD dec ebx @@ -3254,12 +3177,9 @@ rx_body_181: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp r12d, -1612576918 - ja short taken_call_181 mov r10, rax - jmp rx_i_182 -taken_call_181: - push rax + cmp r12d, -1612576918 + jbe short rx_i_182 call rx_i_211 rx_i_182: ;FPSUB @@ -3404,13 +3324,10 @@ rx_i_190: ;RET rx_body_190: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] + mov r13, rax cmp rsp, rdi - je short not_taken_ret_190 - xor rax, qword ptr [rsp + 8] - mov r13, rax - ret 8 -not_taken_ret_190: - mov r13, rax + je short rx_i_191 + ret rx_i_191: ;FPSQRT dec ebx @@ -4147,21 +4064,14 @@ rx_body_231: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax + mov eax, r9d + xor eax, 0e6c9edaah + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi - je short not_taken_ret_231 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r9d - xor eax, 0e6c9edaah - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_231: - mov rcx, rax - mov eax, r9d - xor eax, 0e6c9edaah - and eax, 
2047 - mov qword ptr [rsi + rax * 8], rcx + je short rx_i_232 + ret rx_i_232: ;FPMUL dec ebx @@ -4546,16 +4456,13 @@ rx_i_253: ;CALL rx_body_253: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r15d, 1699431947 - jns short taken_call_253 mov rcx, rax mov eax, r13d xor eax, 0654b460bh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_254 -taken_call_253: - push rax + cmp r15d, 1699431947 + js short rx_i_254 call rx_i_367 rx_i_254: ;FPADD @@ -4787,12 +4694,9 @@ rx_i_266: ;CALL rx_body_266: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r12d, 136160027 - ja short taken_call_266 mov r10, rax - jmp rx_i_267 -taken_call_266: - push rax + cmp r12d, 136160027 + jbe short rx_i_267 call rx_i_295 rx_i_267: ;ROL_64 @@ -5247,21 +5151,14 @@ rx_i_291: ;RET rx_body_291: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax + mov eax, r14d + xor eax, 0768a9d75h + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi - je short not_taken_ret_291 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r14d - xor eax, 0768a9d75h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_291: - mov rcx, rax - mov eax, r14d - xor eax, 0768a9d75h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + je short rx_i_292 + ret rx_i_292: ;ROL_64 dec ebx @@ -5302,21 +5199,14 @@ rx_i_294: ;RET rx_body_294: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax + mov eax, r8d + xor eax, 0ef8571b7h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi - je short not_taken_ret_294 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r8d - xor eax, 0ef8571b7h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_294: - mov rcx, rax - mov eax, r8d - xor eax, 0ef8571b7h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + je short rx_i_295 + ret rx_i_295: ;FPSUB dec ebx @@ -5672,21 +5562,14 @@ rx_body_316: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax + mov eax, r8d + xor 
eax, 03602c513h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi - je short not_taken_ret_316 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r8d - xor eax, 03602c513h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_316: - mov rcx, rax - mov eax, r8d - xor eax, 03602c513h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + je short rx_i_317 + ret rx_i_317: ;FPADD dec ebx @@ -5788,16 +5671,13 @@ rx_i_322: ;CALL rx_body_322: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r11d, 1411981860 - jo short taken_call_322 mov rcx, rax mov eax, r11d xor eax, 054292224h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_323 -taken_call_322: - push rax + cmp r11d, 1411981860 + jno short rx_i_323 call rx_i_343 rx_i_323: ;MUL_64 @@ -5916,13 +5796,10 @@ rx_i_329: ;RET rx_body_329: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov r11, rax cmp rsp, rdi - je short not_taken_ret_329 - xor rax, qword ptr [rsp + 8] - mov r11, rax - ret 8 -not_taken_ret_329: - mov r11, rax + je short rx_i_330 + ret rx_i_330: ;MUL_32 dec ebx @@ -6268,16 +6145,13 @@ rx_i_350: ;CALL rx_body_350: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r9d, -980411581 - jbe short taken_call_350 mov rcx, rax mov eax, r12d xor eax, 0c5901b43h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_351 -taken_call_350: - push rax + cmp r9d, -980411581 + ja short rx_i_351 call rx_i_352 rx_i_351: ;MUL_64 @@ -6909,21 +6783,14 @@ rx_i_388: ;RET rx_body_388: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax + mov eax, r11d + xor eax, 0a0985cc2h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi - je short not_taken_ret_388 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r11d - xor eax, 0a0985cc2h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_388: - mov rcx, rax - mov eax, r11d - xor eax, 0a0985cc2h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + je short rx_i_389 + ret 
rx_i_389: ;JUMP dec ebx @@ -7155,13 +7022,10 @@ rx_i_402: ;RET rx_body_402: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov r14, rax cmp rsp, rdi - je short not_taken_ret_402 - xor rax, qword ptr [rsp + 8] - mov r14, rax - ret 8 -not_taken_ret_402: - mov r14, rax + je short rx_i_403 + ret rx_i_403: ;IMUL_32 dec ebx @@ -7210,16 +7074,13 @@ rx_i_405: ;CALL rx_body_405: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r10d, 1795880641 - jbe short taken_call_405 mov rcx, rax mov eax, r12d xor eax, 06b0af6c1h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx - jmp rx_i_406 -taken_call_405: - push rax + cmp r10d, 1795880641 + ja short rx_i_406 call rx_i_494 rx_i_406: ;FPDIV @@ -7302,13 +7163,10 @@ rx_i_410: ;RET rx_body_410: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov r8, rax cmp rsp, rdi - je short not_taken_ret_410 - xor rax, qword ptr [rsp + 8] - mov r8, rax - ret 8 -not_taken_ret_410: - mov r8, rax + je short rx_i_411 + ret rx_i_411: ;RET dec ebx @@ -7321,13 +7179,10 @@ rx_i_411: ;RET rx_body_411: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov r12, rax cmp rsp, rdi - je short not_taken_ret_411 - xor rax, qword ptr [rsp + 8] - mov r12, rax - ret 8 -not_taken_ret_411: - mov r12, rax + je short rx_i_412 + ret rx_i_412: ;FPDIV dec ebx @@ -7495,12 +7350,9 @@ rx_body_421: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - cmp r8d, -1600409762 - jno short taken_call_421 mov r10, rax - jmp rx_i_422 -taken_call_421: - push rax + cmp r8d, -1600409762 + jo short rx_i_422 call rx_i_31 rx_i_422: ;MUL_32 @@ -7624,21 +7476,14 @@ rx_i_428: ;RET rx_body_428: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax + mov eax, r8d + xor eax, 0e3b86b2fh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi - je short not_taken_ret_428 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r8d - xor eax, 0e3b86b2fh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_428: - mov rcx, rax - mov eax, r8d - xor eax, 0e3b86b2fh 
- and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + je short rx_i_429 + ret rx_i_429: ;MUL_64 dec ebx @@ -7843,12 +7688,9 @@ rx_body_440: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - cmp r12d, 2127765370 - js short taken_call_440 mov r9, rax - jmp rx_i_441 -taken_call_440: - push rax + cmp r12d, 2127765370 + jns short rx_i_441 call rx_i_41 rx_i_441: ;ADD_64 @@ -7895,21 +7737,14 @@ rx_i_443: ;RET rx_body_443: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax + mov eax, r9d + xor eax, 04f71c419h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi - je short not_taken_ret_443 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r9d - xor eax, 04f71c419h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_443: - mov rcx, rax - mov eax, r9d - xor eax, 04f71c419h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + je short rx_i_444 + ret rx_i_444: ;FPSUB dec ebx @@ -8060,21 +7895,14 @@ rx_i_452: ;RET rx_body_452: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax + mov eax, r11d + xor eax, 0e27dea25h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi - je short not_taken_ret_452 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r11d - xor eax, 0e27dea25h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_452: - mov rcx, rax - mov eax, r11d - xor eax, 0e27dea25h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + je short rx_i_453 + ret rx_i_453: ;IMUL_32 dec ebx @@ -8910,21 +8738,14 @@ rx_body_502: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov rcx, rax + mov eax, r9d + xor eax, 08d85312h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi - je short not_taken_ret_502 - xor rax, qword ptr [rsp + 8] - mov rcx, rax - mov eax, r9d - xor eax, 08d85312h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - ret 8 -not_taken_ret_502: - mov rcx, rax - mov eax, r9d - xor eax, 08d85312h - and eax, 2047 - mov qword ptr 
[rsi + rax * 8], rcx + je short rx_i_503 + ret rx_i_503: ;FPSUB dec ebx @@ -9005,13 +8826,10 @@ rx_i_507: ;RET rx_body_507: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov r14, rax cmp rsp, rdi - je short not_taken_ret_507 - xor rax, qword ptr [rsp + 8] - mov r14, rax - ret 8 -not_taken_ret_507: - mov r14, rax + je short rx_i_508 + ret rx_i_508: ;RET dec ebx @@ -9024,13 +8842,10 @@ rx_i_508: ;RET rx_body_508: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] + mov r8, rax cmp rsp, rdi - je short not_taken_ret_508 - xor rax, qword ptr [rsp + 8] - mov r8, rax - ret 8 -not_taken_ret_508: - mov r8, rax + je short rx_i_509 + ret rx_i_509: ;FPROUND dec ebx From c02ee4291d174e8a6511ba2f8839a15b3e34f257 Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 11 Jan 2019 10:52:12 +0100 Subject: [PATCH 11/35] FPROUND - variable flag offset --- doc/isa-ops.md | 3 ++- doc/isa.md | 9 +++++---- src/AssemblyGeneratorX86.cpp | 4 +++- src/JitCompilerX86.cpp | 10 +++++++++- src/program.inc | 2 +- 5 files changed, 20 insertions(+), 8 deletions(-) diff --git a/doc/isa-ops.md b/doc/isa-ops.md index fd5f286..5e389e3 100644 --- a/doc/isa-ops.md +++ b/doc/isa-ops.md @@ -12,6 +12,7 @@ There are 31 unique instructions divided into 3 groups: ## Integer instructions There are 22 integer instructions. They are divided into 3 classes (MATH, DIV, SHIFT) with different B operand selection rules. + |# opcodes|instruction|class|signed|A width|B width|C|C width| |-|-|-|-|-|-|-|-| |12|ADD_64|MATH|no|64|64|`A + B`|64| @@ -55,7 +56,7 @@ The shift/rotate instructions use just the bottom 6 bits of the `B` operand (`im There are 5 floating point instructions. All floating point instructions are vector instructions that operate on two packed double precision floating point values. 
|# opcodes|instruction|C| -|-|-|-|-| +|-|-|-| |20|FPADD|`A + B`| |20|FPSUB|`A - B`| |22|FPMUL|`A * B`| diff --git a/doc/isa.md b/doc/isa.md index 4f1cc5d..cedece9 100644 --- a/doc/isa.md +++ b/doc/isa.md @@ -9,6 +9,7 @@ The encoding of each 128-bit instruction word is following: There are 256 opcodes, which are distributed between 3 groups of instructions. There are 31 distinct operations (each operation can be encoded using multiple opcodes - for example opcodes `0x00` to `0x0d` correspond to integer addition). **Table 1: Instruction groups** + |group|# operations|# opcodes|| |---------|-----------------|----|-| |integer (IA)|22|144|56.3%| @@ -31,8 +32,8 @@ The `A.LOC.W` flag determines the address width when reading operand A from the **Table 3: Operand A read address width** -|`A.LOC.W`|address width (W) -|---------|-|-| +|`A.LOC.W`|address width (W)| +|---------|-| |0|15 bits (256 KiB)| |1-3|11 bits (16 KiB)| @@ -125,8 +126,8 @@ The `C.LOC.W` flag determines the address width when writing operand C to the sc **Table 10: Operand C write address width** -|`C.LOC.W`|address width (W) -|---------|-|-| +|`C.LOC.W`|address width (W)| +|---------|-| |0|15 bits (256 KiB)| |1-3|11 bits (16 KiB)| diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 2b8db69..1fbf2f2 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -466,7 +466,9 @@ namespace RandomX { void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) { genar(instr, i); asmCode << "\tmov rcx, rax" << std::endl; - asmCode << "\tshl eax, 13" << std::endl; + int rotate = (13 - (instr.imm8 & 63)) & 63; + if (rotate != 0) + asmCode << "\trol rax, " << rotate << std::endl; asmCode << "\tand eax, 24576" << std::endl; asmCode << "\tor eax, 40896" << std::endl; asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl; diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 7018b97..1f09cd9 100644 --- a/src/JitCompilerX86.cpp +++ 
b/src/JitCompilerX86.cpp @@ -574,7 +574,15 @@ namespace RandomX { void JitCompilerX86::h_FPROUND(Instruction& instr, int i) { genar(instr); - emit(0x00250de0c1c88b48); //mov rcx,rax; shl eax,0xd + emitByte(0x48); + emit(uint16_t(0xc88b)); //mov rcx,rax + int rotate = (13 - (instr.imm8 & 63)) & 63; + if (rotate != 0) { + emitByte(0x48); + emit(uint16_t(0xc0c1)); //rol rax + emitByte(rotate); + } + emit(uint16_t(0x0025)); emit(0x00009fc00d000060); //and eax,0x6000; or eax,0x9fc0 emit(0x2454ae0ff8244489); //ldmxcsr DWORD PTR [rsp-0x8] emitByte(0xf8); diff --git a/src/program.inc b/src/program.inc index dd8cb36..66b9147 100644 --- a/src/program.inc +++ b/src/program.inc @@ -8859,7 +8859,7 @@ rx_body_509: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax - shl eax, 13 + rol rax, 34 and eax, 24576 or eax, 40896 mov dword ptr [rsp - 8], eax From 451dfc5730898b33f3c3e1ae1faea1ef9c65badb Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 11 Jan 2019 14:08:21 +0100 Subject: [PATCH 12/35] Optimized division by constants --- makefile | 5 +- src/AssemblyGeneratorX86.cpp | 99 +++++++++++++++++++- src/divideByConstantCodegen.c | 169 ++++++++++++++++++++++++++++++++++ src/divideByConstantCodegen.h | 117 +++++++++++++++++++++++ 4 files changed, 385 insertions(+), 5 deletions(-) create mode 100644 src/divideByConstantCodegen.c create mode 100644 src/divideByConstantCodegen.h diff --git a/makefile b/makefile index 55e1abd..d0a969c 100644 --- a/makefile +++ b/makefile @@ -11,7 +11,7 @@ SRCDIR=src OBJDIR=obj LDFLAGS=-lpthread TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) -ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o) +ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o 
CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o divideByConstantCodegen.o) ifeq ($(PLATFORM),x86_64) ROBJS += $(OBJDIR)/JitCompilerX86-static.o endif @@ -57,6 +57,9 @@ $(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachin $(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/dataset.cpp -o $@ +$(OBJDIR)/divideByConstantCodegen.o: $(addprefix $(SRCDIR)/,divideByConstantCodegen.c divideByConstantCodegen.h) | $(OBJDIR) + $(CC) $(CCFLAGS) -c $(SRCDIR)/divideByConstantCodegen.c -o $@ + $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@ diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 1fbf2f2..9389634 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -17,10 +17,14 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. 
*/ //#define TRACE +//#define MAGIC_DIVISION #include "AssemblyGeneratorX86.hpp" #include "Pcg32.hpp" #include "common.hpp" #include "instructions.hpp" +#ifdef MAGIC_DIVISION +#include "divideByConstantCodegen.h" +#endif namespace RandomX { @@ -315,34 +319,118 @@ namespace RandomX { void AssemblyGeneratorX86::h_DIV_64(Instruction& instr, int i) { genar(instr, i); if ((instr.locb & 7) >= 6) { +#ifdef MAGIC_DIVISION + if (instr.imm32 != 0) { + uint32_t divisor = instr.imm32; + asmCode << "\t; magic divide by " << divisor << std::endl; + if (divisor & (divisor - 1)) { + magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8); + if (mi.pre_shift > 0) + asmCode << "\tshr rax, " << mi.pre_shift << std::endl; + if (mi.increment) { + asmCode << "\tadd rax, 1" << std::endl; + asmCode << "\tsbb rax, 0" << std::endl; + } + asmCode << "\tmov rcx, " << mi.multiplier << std::endl; + asmCode << "\tmul rcx" << std::endl; + asmCode << "\tmov rax, rdx" << std::endl; + if (mi.post_shift > 0) + asmCode << "\tshr rax, " << mi.post_shift << std::endl; + } + else { //divisor is a power of two + int shift = 0; + while (divisor >>= 1) + ++shift; + if(shift > 0) + asmCode << "\tshr rax, " << shift << std::endl; + } + } +#else if (instr.imm32 == 0) { asmCode << "\tmov ecx, 1" << std::endl; } else { asmCode << "\tmov ecx, " << instr.imm32 << std::endl; } +#endif } else { asmCode << "\tmov ecx, 1" << std::endl; asmCode << "\tmov edx, " << regR32[instr.regb % RegistersCount] << std::endl; asmCode << "\ttest edx, edx" << std::endl; asmCode << "\tcmovne ecx, edx" << std::endl; +#ifdef MAGIC_DIVISION + asmCode << "\txor edx, edx" << std::endl; + asmCode << "\tdiv rcx" << std::endl; +#endif } +#ifndef MAGIC_DIVISION asmCode << "\txor edx, edx" << std::endl; asmCode << "\tdiv rcx" << std::endl; +#endif gencr(instr); } void AssemblyGeneratorX86::h_IDIV_64(Instruction& instr, int i) { genar(instr, i); +#ifdef MAGIC_DIVISION + if ((instr.locb & 7) >= 6) { + int64_t divisor = 
instr.imm32; + asmCode << "\t; magic divide by " << divisor << std::endl; + if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { + // +/- power of two + bool negative = divisor < 0; + if (negative) + divisor = -divisor; + int shift = 0; + uint64_t unsignedDivisor = divisor; + while (unsignedDivisor >>= 1) + ++shift; + if (shift > 0) { + asmCode << "\tmov rcx, rax" << std::endl; + asmCode << "\tsar rcx, 63" << std::endl; + uint32_t mask = (1ULL << shift) + 0xFFFFFFFF; + asmCode << "\tand ecx, 0" << std::hex << mask << std::dec << "h" << std::endl; + asmCode << "\tadd rax, rcx" << std::endl; + asmCode << "\tsar rax, " << shift << std::endl; + } + if(negative) + asmCode << "\tneg rax" << std::endl; + } else if(divisor != 0) { + magics_info mi = compute_signed_magic_info(divisor); + if ((divisor >= 0) != (mi.multiplier >= 0)) + asmCode << "\tmov rcx, rax" << std::endl; + asmCode << "\tmov rdx, " << mi.multiplier << std::endl; + asmCode << "\timul rdx" << std::endl; + asmCode << "\tmov rax, rdx" << std::endl; + asmCode << "\txor edx, edx" << std::endl; + bool haveSF = false; + if (divisor > 0 && mi.multiplier < 0) { + asmCode << "\tadd rax, rcx" << std::endl; + haveSF = true; + } + if (divisor < 0 && mi.multiplier > 0) { + asmCode << "\tsub rax, rcx" << std::endl; + haveSF = true; + } + if (mi.shift > 0) { + asmCode << "\tsar rax, " << mi.shift << std::endl; + haveSF = true; + } + if (!haveSF) + asmCode << "\ttest rax, rax" << std::endl; + asmCode << "\tsets dl" << std::endl; + asmCode << "\tadd rax, rdx" << std::endl; + } + } + else { +#endif asmCode << "\tmov edx, "; genbr132(instr); asmCode << "\tcmp edx, -1" << std::endl; asmCode << "\tjne short safe_idiv_" << i << std::endl; - asmCode << "\tmov rcx, rax" << std::endl; - asmCode << "\trol rcx, 1" << std::endl; - asmCode << "\tdec rcx" << std::endl; - asmCode << "\tjz short result_idiv_" << i << std::endl; + asmCode << "\tneg rax" << std::endl; + asmCode << "\tjmp short result_idiv_" << i << 
std::endl; asmCode << "safe_idiv_" << i << ":" << std::endl; asmCode << "\tmov ecx, 1" << std::endl; asmCode << "\ttest edx, edx" << std::endl; @@ -351,6 +439,9 @@ namespace RandomX { asmCode << "\tcqo" << std::endl; asmCode << "\tidiv rcx" << std::endl; asmCode << "result_idiv_" << i << ":" << std::endl; +#ifdef MAGIC_DIVISION + } +#endif gencr(instr); } diff --git a/src/divideByConstantCodegen.c b/src/divideByConstantCodegen.c new file mode 100644 index 0000000..4b06712 --- /dev/null +++ b/src/divideByConstantCodegen.c @@ -0,0 +1,169 @@ +/* + Reference implementations of computing and using the "magic number" approach to dividing + by constants, including codegen instructions. The unsigned division incorporates the + "round down" optimization per ridiculous_fish. + + This is free and unencumbered software. Any copyright is dedicated to the Public Domain. +*/ + +#include <limits.h> //for CHAR_BIT +#include <assert.h> + +#include "divideByConstantCodegen.h" + +struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits) { + + //The numerator must fit in a uint + assert(num_bits > 0 && num_bits <= sizeof(uint) * CHAR_BIT); + + // D must be larger than zero and not a power of 2 + assert(D & (D - 1)); + + // The eventual result + struct magicu_info result; + + // Bits in a uint + const unsigned UINT_BITS = sizeof(uint) * CHAR_BIT; + + // The extra shift implicit in the difference between UINT_BITS and num_bits + const unsigned extra_shift = UINT_BITS - num_bits; + + // The initial power of 2 is one less than the first one that can possibly work + const uint initial_power_of_2 = (uint)1 << (UINT_BITS - 1); + + // The remainder and quotient of our power of 2 divided by d + uint quotient = initial_power_of_2 / D, remainder = initial_power_of_2 % D; + + // ceil(log_2 D) + unsigned ceil_log_2_D; + + // The magic info for the variant "round down" algorithm + uint down_multiplier = 0; + unsigned down_exponent = 0; + int has_magic_down = 0; + + // Compute ceil(log_2 D) + ceil_log_2_D 
= 0; + uint tmp; + for (tmp = D; tmp > 0; tmp >>= 1) + ceil_log_2_D += 1; + + + // Begin a loop that increments the exponent, until we find a power of 2 that works. + unsigned exponent; + for (exponent = 0; ; exponent++) { + // Quotient and remainder is from previous exponent; compute it for this exponent. + if (remainder >= D - remainder) { + // Doubling remainder will wrap around D + quotient = quotient * 2 + 1; + remainder = remainder * 2 - D; + } + else { + // Remainder will not wrap + quotient = quotient * 2; + remainder = remainder * 2; + } + + // We're done if this exponent works for the round_up algorithm. + // Note that exponent may be larger than the maximum shift supported, + // so the check for >= ceil_log_2_D is critical. + if ((exponent + extra_shift >= ceil_log_2_D) || (D - remainder) <= ((uint)1 << (exponent + extra_shift))) + break; + + // Set magic_down if we have not set it yet and this exponent works for the round_down algorithm + if (!has_magic_down && remainder <= ((uint)1 << (exponent + extra_shift))) { + has_magic_down = 1; + down_multiplier = quotient; + down_exponent = exponent; + } + } + + if (exponent < ceil_log_2_D) { + // magic_up is efficient + result.multiplier = quotient + 1; + result.pre_shift = 0; + result.post_shift = exponent; + result.increment = 0; + } + else if (D & 1) { + // Odd divisor, so use magic_down, which must have been set + assert(has_magic_down); + result.multiplier = down_multiplier; + result.pre_shift = 0; + result.post_shift = down_exponent; + result.increment = 1; + } + else { + // Even divisor, so use a prefix-shifted dividend + unsigned pre_shift = 0; + uint shifted_D = D; + while ((shifted_D & 1) == 0) { + shifted_D >>= 1; + pre_shift += 1; + } + result = compute_unsigned_magic_info(shifted_D, num_bits - pre_shift); + assert(result.increment == 0 && result.pre_shift == 0); //expect no increment or pre_shift in this path + result.pre_shift = pre_shift; + } + return result; +} + +struct magics_info 
compute_signed_magic_info(sint D) { + // D must not be zero and must not be a power of 2 (or its negative) + assert(D != 0 && (D & -D) != D && (D & -D) != -D); + + // Our result + struct magics_info result; + + // Bits in an sint + const unsigned SINT_BITS = sizeof(sint) * CHAR_BIT; + + // Absolute value of D (we know D is not the most negative value since that's a power of 2) + const uint abs_d = (D < 0 ? -D : D); + + // The initial power of 2 is one less than the first one that can possibly work + // "two31" in Warren + unsigned exponent = SINT_BITS - 1; + const uint initial_power_of_2 = (uint)1 << exponent; + + // Compute the absolute value of our "test numerator," + // which is the largest dividend whose remainder with d is d-1. + // This is called anc in Warren. + const uint tmp = initial_power_of_2 + (D < 0); + const uint abs_test_numer = tmp - 1 - tmp % abs_d; + + // Initialize our quotients and remainders (q1, r1, q2, r2 in Warren) + uint quotient1 = initial_power_of_2 / abs_test_numer, remainder1 = initial_power_of_2 % abs_test_numer; + uint quotient2 = initial_power_of_2 / abs_d, remainder2 = initial_power_of_2 % abs_d; + uint delta; + + // Begin our loop + do { + // Update the exponent + exponent++; + + // Update quotient1 and remainder1 + quotient1 *= 2; + remainder1 *= 2; + if (remainder1 >= abs_test_numer) { + quotient1 += 1; + remainder1 -= abs_test_numer; + } + + // Update quotient2 and remainder2 + quotient2 *= 2; + remainder2 *= 2; + if (remainder2 >= abs_d) { + quotient2 += 1; + remainder2 -= abs_d; + } + + // Keep going as long as (2**exponent) / abs_d <= delta + delta = abs_d - remainder2; + } while (quotient1 < delta || (quotient1 == delta && remainder1 == 0)); + + result.multiplier = quotient2 + 1; + if (D < 0) result.multiplier = -result.multiplier; + result.shift = exponent - SINT_BITS; + return result; +} diff --git a/src/divideByConstantCodegen.h b/src/divideByConstantCodegen.h new file mode 100644 index 0000000..1ac55e8 --- /dev/null +++ 
b/src/divideByConstantCodegen.h @@ -0,0 +1,117 @@ +/* +Copyright (c) 2018 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once +#include <stdint.h> + +#if defined(__cplusplus) +extern "C" { +#endif + + typedef uint64_t uint; + typedef int64_t sint; + + /* Computes "magic info" for performing signed division by a fixed integer D. + The type 'sint' is assumed to be defined as a signed integer type large enough + to hold both the dividend and the divisor. + Here >> is arithmetic (signed) shift, and >>> is logical shift. + + To emit code for n/d, rounding towards zero, use the following sequence: + + m = compute_signed_magic_info(D) + emit("result = (m.multiplier * n) >> SINT_BITS"); + if d > 0 and m.multiplier < 0: emit("result += n") + if d < 0 and m.multiplier > 0: emit("result -= n") + if m.shift > 0: emit("result >>= m.shift") + emit("result += (result < 0)") + + The shifts by SINT_BITS may be "free" if the high half of the full multiply + is put in a separate register. + + The final add can of course be implemented via the sign bit, e.g. + result += (result >>> (SINT_BITS - 1)) + or + result -= (result >> (SINT_BITS - 1)) + + This code is heavily indebted to Hacker's Delight by Henry Warren. 
+ See http://www.hackersdelight.org/HDcode/magic.c.txt + Used with permission from http://www.hackersdelight.org/permissions.htm + */ + + struct magics_info { + sint multiplier; // the "magic number" multiplier + unsigned shift; // shift for the dividend after multiplying + }; + struct magics_info compute_signed_magic_info(sint D); + + + /* Computes "magic info" for performing unsigned division by a fixed positive integer D. + The type 'uint' is assumed to be defined as an unsigned integer type large enough + to hold both the dividend and the divisor. num_bits can be set appropriately if n is + known to be smaller than the largest uint; if this is not known then pass + (sizeof(uint) * CHAR_BIT) for num_bits. + + Assume we have a hardware register of width UINT_BITS, a known constant D which is + not zero and not a power of 2, and a variable n of width num_bits (which may be + up to UINT_BITS). To emit code for n/d, use one of the two following sequences + (here >>> refers to a logical bitshift): + + m = compute_unsigned_magic_info(D, num_bits) + if m.pre_shift > 0: emit("n >>>= m.pre_shift") + if m.increment: emit("n = saturated_increment(n)") + emit("result = (m.multiplier * n) >>> UINT_BITS") + if m.post_shift > 0: emit("result >>>= m.post_shift") + + or + + m = compute_unsigned_magic_info(D, num_bits) + if m.pre_shift > 0: emit("n >>>= m.pre_shift") + emit("result = m.multiplier * n") + if m.increment: emit("result = result + m.multiplier") + emit("result >>>= UINT_BITS") + if m.post_shift > 0: emit("result >>>= m.post_shift") + + The shifts by UINT_BITS may be "free" if the high half of the full multiply + is put in a separate register. + + saturated_increment(n) means "increment n unless it would wrap to 0," i.e. + if n == (1 << UINT_BITS)-1: result = n + else: result = n+1 + A common way to implement this is with the carry bit. 
For example, on x86: + add 1 + sbb 0 + + Some invariants: + 1: At least one of pre_shift and increment is zero + 2: multiplier is never zero + + This code incorporates the "round down" optimization per ridiculous_fish. + */ + + struct magicu_info { + uint multiplier; // the "magic number" multiplier + unsigned pre_shift; // shift for the dividend before multiplying + unsigned post_shift; //shift for the dividend after multiplying + int increment; // 0 or 1; if set then increment the numerator, using one of the two strategies + }; + struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits); + +#if defined(__cplusplus) +} +#endif \ No newline at end of file From 2756bcdcfe85647ef494f4e5a222c3bea9968157 Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 11 Jan 2019 16:53:52 +0100 Subject: [PATCH 13/35] Added magic division to JIT compiler New B operand selection rules --- doc/isa.md | 4 +- src/AssemblyGeneratorX86.cpp | 190 ++-- src/AssemblyGeneratorX86.hpp | 7 +- src/JitCompilerX86.cpp | 211 +++- src/JitCompilerX86.hpp | 7 +- src/divideByConstantCodegen.c | 44 +- src/divideByConstantCodegen.h | 20 +- src/instructionWeights.hpp | 8 +- src/program.inc | 1882 +++++++++++++++++---------------- 9 files changed, 1237 insertions(+), 1136 deletions(-) diff --git a/doc/isa.md b/doc/isa.md index cedece9..d46b16e 100644 --- a/doc/isa.md +++ b/doc/isa.md @@ -83,10 +83,10 @@ The `B.LOC.L` flag determines the B operand. It can be either a register or imme |`B.LOC.L`|IA/DIV|IA/SHIFT|IA/MATH|FP|CL| |----|--------|----|------|----|---| -|0|register|register|register|register|register| +|0|register|`imm8`|`imm32`|register|register| |1|`imm32`|register|register|register|register| |2|`imm32`|`imm8`|register|register|register| -|3|`imm32`|`imm8`|`imm32`|register|register| +|3|`imm32`|register|register|register|register| Integer instructions are split into 3 classes: integer division (IA/DIV), shift and rotate (IA/SHIFT) and other (IA/MATH). 
Floating point (FP) and control (CL) instructions always use a register operand. diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 9389634..efa0818 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -17,7 +17,7 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. */ //#define TRACE -//#define MAGIC_DIVISION +#define MAGIC_DIVISION #include "AssemblyGeneratorX86.hpp" #include "Pcg32.hpp" #include "common.hpp" @@ -64,108 +64,61 @@ namespace RandomX { (this->*generator)(instr, i); } - void AssemblyGeneratorX86::genar(Instruction& instr, int i) { + void AssemblyGeneratorX86::gena(Instruction& instr, int i) { asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; asmCode << "\ttest " << regIc8 << ", 63" << std::endl; asmCode << "\tjnz short rx_body_" << i << std::endl; - switch (instr.loca & 3) - { - case 0: - case 1: - case 2: - asmCode << "\tcall rx_read_l1" << std::endl; - asmCode << "rx_body_" << i << ":" << std::endl; - if ((instr.loca & 192) == 0) - asmCode << "\txor " << regMx << ", rcx" << std::endl; - asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl; - break; - default: //3 - asmCode << "\tcall rx_read_l2" << std::endl; - asmCode << "rx_body_" << i << ":" << std::endl; - if ((instr.loca & 192) == 0) - asmCode << "\txor " << regMx << ", rcx" << std::endl; - asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl; - break; + if (instr.loca & 3) { + asmCode << "\tcall rx_read_l1" << std::endl; + asmCode << "rx_body_" << i << ":" << std::endl; + if ((instr.loca & 192) == 0) + asmCode << "\txor " << regMx << ", rcx" << std::endl; + asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl; } + else { + asmCode << "\tcall rx_read_l2" << std::endl; + asmCode << "rx_body_" << i << ":" << std::endl; + 
if ((instr.loca & 192) == 0) + asmCode << "\txor " << regMx << ", rcx" << std::endl; + asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl; + } + } + + void AssemblyGeneratorX86::genar(Instruction& instr, int i) { + gena(instr, i); asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl; } void AssemblyGeneratorX86::genaf(Instruction& instr, int i) { - asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; - asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; - asmCode << "\ttest " << regIc8 << ", 63" << std::endl; - asmCode << "\tjnz short rx_body_" << i << std::endl; - switch (instr.loca & 3) - { - case 0: - case 1: - case 2: - asmCode << "\tcall rx_read_l1" << std::endl; - asmCode << "rx_body_" << i << ":" << std::endl; - if((instr.loca & 192) == 0) - asmCode << "\txor " << regMx << ", rcx" << std::endl; - asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl; - break; - default: //3 - asmCode << "\tcall rx_read_l2" << std::endl; - asmCode << "rx_body_" << i << ":" << std::endl; - if ((instr.loca & 192) == 0) - asmCode << "\txor " << regMx << ", rcx" << std::endl; - asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl; - break; - } + gena(instr, i); asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl; } - void AssemblyGeneratorX86::genbr0(Instruction& instr, const char* instrx86) { - switch (instr.locb & 7) - { - case 0: - case 1: - case 2: - case 3: + void AssemblyGeneratorX86::genbiashift(Instruction& instr, const char* instrx86) { + if (instr.locb & 1) { asmCode << "\tmov rcx, " << regR[instr.regb % RegistersCount] << std::endl; asmCode << "\t" << instrx86 << " rax, cl" << std::endl; - return; - default: + } else { asmCode << "\t" << instrx86 << " rax, " << (instr.imm8 & 63) << std::endl;; - return; } } - void AssemblyGeneratorX86::genbr1(Instruction& instr) { - switch (instr.locb 
& 7) - { - case 0: - case 1: - case 2: - case 3: - case 4: - case 5: + void AssemblyGeneratorX86::genbia(Instruction& instr) { + if (instr.locb & 3) { asmCode << regR[instr.regb % RegistersCount] << std::endl; - return; - default: + } else { asmCode << instr.imm32 << std::endl;; - return; } } - void AssemblyGeneratorX86::genbr132(Instruction& instr) { - switch (instr.locb & 7) - { - case 0: - case 1: - case 2: - case 3: - case 4: - case 5: + void AssemblyGeneratorX86::genbia32(Instruction& instr) { + if (instr.locb & 3) { asmCode << regR32[instr.regb % RegistersCount] << std::endl; - return; - default: + } + else { asmCode << instr.imm32 << std::endl;; - return; } } @@ -241,28 +194,28 @@ namespace RandomX { void AssemblyGeneratorX86::h_ADD_64(Instruction& instr, int i) { genar(instr, i); asmCode << "\tadd rax, "; - genbr1(instr); + genbia(instr); gencr(instr); } void AssemblyGeneratorX86::h_ADD_32(Instruction& instr, int i) { genar(instr, i); asmCode << "\tadd eax, "; - genbr132(instr); + genbia32(instr); gencr(instr); } void AssemblyGeneratorX86::h_SUB_64(Instruction& instr, int i) { genar(instr, i); asmCode << "\tsub rax, "; - genbr1(instr); + genbia(instr); gencr(instr); } void AssemblyGeneratorX86::h_SUB_32(Instruction& instr, int i) { genar(instr, i); asmCode << "\tsub eax, "; - genbr132(instr); + genbia32(instr); gencr(instr); } @@ -272,14 +225,14 @@ namespace RandomX { if ((instr.locb & 7) >= 6) { asmCode << "rax, "; } - genbr1(instr); + genbia(instr); gencr(instr); } void AssemblyGeneratorX86::h_MULH_64(Instruction& instr, int i) { genar(instr, i); asmCode << "\tmov rcx, "; - genbr1(instr); + genbia(instr); asmCode << "\tmul rcx" << std::endl; asmCode << "\tmov rax, rdx" << std::endl; gencr(instr); @@ -289,7 +242,7 @@ namespace RandomX { genar(instr, i); asmCode << "\tmov ecx, eax" << std::endl; asmCode << "\tmov eax, "; - genbr132(instr); + genbia32(instr); asmCode << "\timul rax, rcx" << std::endl; gencr(instr); } @@ -310,7 +263,7 @@ namespace RandomX { 
void AssemblyGeneratorX86::h_IMULH_64(Instruction& instr, int i) { genar(instr, i); asmCode << "\tmov rcx, "; - genbr1(instr); + genbia(instr); asmCode << "\timul rcx" << std::endl; asmCode << "\tmov rax, rdx" << std::endl; gencr(instr); @@ -318,7 +271,7 @@ namespace RandomX { void AssemblyGeneratorX86::h_DIV_64(Instruction& instr, int i) { genar(instr, i); - if ((instr.locb & 7) >= 6) { + if (instr.locb & 3) { #ifdef MAGIC_DIVISION if (instr.imm32 != 0) { uint32_t divisor = instr.imm32; @@ -373,8 +326,8 @@ namespace RandomX { void AssemblyGeneratorX86::h_IDIV_64(Instruction& instr, int i) { genar(instr, i); + if (instr.locb & 3) { #ifdef MAGIC_DIVISION - if ((instr.locb & 7) >= 6) { int64_t divisor = instr.imm32; asmCode << "\t; magic divide by " << divisor << std::endl; if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { @@ -394,9 +347,10 @@ namespace RandomX { asmCode << "\tadd rax, rcx" << std::endl; asmCode << "\tsar rax, " << shift << std::endl; } - if(negative) + if (negative) asmCode << "\tneg rax" << std::endl; - } else if(divisor != 0) { + } + else if (divisor != 0) { magics_info mi = compute_signed_magic_info(divisor); if ((divisor >= 0) != (mi.multiplier >= 0)) asmCode << "\tmov rcx, rax" << std::endl; @@ -422,25 +376,29 @@ namespace RandomX { asmCode << "\tsets dl" << std::endl; asmCode << "\tadd rax, rdx" << std::endl; } +#else + asmCode << "\tmov edx, " << instr.imm32 << std::endl; +#endif } else { -#endif - asmCode << "\tmov edx, "; - genbr132(instr); - asmCode << "\tcmp edx, -1" << std::endl; - asmCode << "\tjne short safe_idiv_" << i << std::endl; - asmCode << "\tneg rax" << std::endl; - asmCode << "\tjmp short result_idiv_" << i << std::endl; - asmCode << "safe_idiv_" << i << ":" << std::endl; - asmCode << "\tmov ecx, 1" << std::endl; - asmCode << "\ttest edx, edx" << std::endl; - asmCode << "\tcmovne ecx, edx" << std::endl; - asmCode << "\tmovsxd rcx, ecx" << std::endl; - asmCode << "\tcqo" << std::endl; - asmCode << 
"\tidiv rcx" << std::endl; - asmCode << "result_idiv_" << i << ":" << std::endl; -#ifdef MAGIC_DIVISION + asmCode << "\tmov edx, " << regR32[instr.regb % RegistersCount] << std::endl; +#ifndef MAGIC_DIVISION } +#endif + asmCode << "\tcmp edx, -1" << std::endl; + asmCode << "\tjne short body_idiv_" << i << std::endl; + asmCode << "\tneg rax" << std::endl; + asmCode << "\tjmp short result_idiv_" << i << std::endl; + asmCode << "body_idiv_" << i << ":" << std::endl; + asmCode << "\tmov ecx, 1" << std::endl; + asmCode << "\ttest edx, edx" << std::endl; + asmCode << "\tcmovne ecx, edx" << std::endl; + asmCode << "\tmovsxd rcx, ecx" << std::endl; + asmCode << "\tcqo" << std::endl; + asmCode << "\tidiv rcx" << std::endl; + asmCode << "result_idiv_" << i << ":" << std::endl; +#ifdef MAGIC_DIVISION + } #endif gencr(instr); } @@ -448,72 +406,72 @@ namespace RandomX { void AssemblyGeneratorX86::h_AND_64(Instruction& instr, int i) { genar(instr, i); asmCode << "\tand rax, "; - genbr1(instr); + genbia(instr); gencr(instr); } void AssemblyGeneratorX86::h_AND_32(Instruction& instr, int i) { genar(instr, i); asmCode << "\tand eax, "; - genbr132(instr); + genbia32(instr); gencr(instr); } void AssemblyGeneratorX86::h_OR_64(Instruction& instr, int i) { genar(instr, i); asmCode << "\tor rax, "; - genbr1(instr); + genbia(instr); gencr(instr); } void AssemblyGeneratorX86::h_OR_32(Instruction& instr, int i) { genar(instr, i); asmCode << "\tor eax, "; - genbr132(instr); + genbia32(instr); gencr(instr); } void AssemblyGeneratorX86::h_XOR_64(Instruction& instr, int i) { genar(instr, i); asmCode << "\txor rax, "; - genbr1(instr); + genbia(instr); gencr(instr); } void AssemblyGeneratorX86::h_XOR_32(Instruction& instr, int i) { genar(instr, i); asmCode << "\txor eax, "; - genbr132(instr); + genbia32(instr); gencr(instr); } void AssemblyGeneratorX86::h_SHL_64(Instruction& instr, int i) { genar(instr, i); - genbr0(instr, "shl"); + genbiashift(instr, "shl"); gencr(instr); } void 
AssemblyGeneratorX86::h_SHR_64(Instruction& instr, int i) { genar(instr, i); - genbr0(instr, "shr"); + genbiashift(instr, "shr"); gencr(instr); } void AssemblyGeneratorX86::h_SAR_64(Instruction& instr, int i) { genar(instr, i); - genbr0(instr, "sar"); + genbiashift(instr, "sar"); gencr(instr); } void AssemblyGeneratorX86::h_ROL_64(Instruction& instr, int i) { genar(instr, i); - genbr0(instr, "rol"); + genbiashift(instr, "rol"); gencr(instr); } void AssemblyGeneratorX86::h_ROR_64(Instruction& instr, int i) { genar(instr, i); - genbr0(instr, "ror"); + genbiashift(instr, "ror"); gencr(instr); } diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 2a1be1b..d2e2eb0 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -38,11 +38,12 @@ namespace RandomX { static InstructionGenerator engine[256]; std::stringstream asmCode; + void gena(Instruction&, int); void genar(Instruction&, int); void genaf(Instruction&, int); - void genbr0(Instruction&, const char*); - void genbr1(Instruction&); - void genbr132(Instruction&); + void genbiashift(Instruction&, const char*); + void genbia(Instruction&); + void genbia32(Instruction&); void genbf(Instruction&, const char*); void gencr(Instruction&, bool); void gencf(Instruction&, bool); diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 1f09cd9..32bad3a 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -17,10 +17,14 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. 
*/ +//#define MAGIC_DIVISION #include "JitCompilerX86.hpp" #include "Pcg32.hpp" #include #include +#ifdef MAGIC_DIVISION +#include "divideByConstantCodegen.h" +#endif #ifdef _WIN32 #include @@ -152,6 +156,17 @@ namespace RandomX { instructionOffsets.push_back(codePos); emit(0x840fcbff); //dec ebx; jz emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative) + auto generator = engine[instr.opcode]; + (this->*generator)(instr, i); + } + + void JitCompilerX86::fixCallOffsets() { + for (CallOffset& co : callOffsets) { + *reinterpret_cast(code + co.pos) = instructionOffsets[co.index] - (co.pos + 4); + } + } + + void JitCompilerX86::gena(Instruction& instr) { emit(uint16_t(0x8149)); //xor emitByte(0xf0 + (instr.rega % RegistersCount)); emit(instr.addra); @@ -169,41 +184,28 @@ namespace RandomX { emit(uint16_t(0x3348)); emitByte(0xe9); //xor rbp, rcx } - auto generator = engine[instr.opcode]; - (this->*generator)(instr, i); - } - - void JitCompilerX86::fixCallOffsets() { - for (CallOffset& co : callOffsets) { - *reinterpret_cast(code + co.pos) = instructionOffsets[co.index] - (co.pos + 4); + emit(uint16_t(0xe181)); //and ecx, + if (instr.loca & 3) { + emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad + } + else { + emit(ScratchpadL2 - 1); //whole scratchpad } } void JitCompilerX86::genar(Instruction& instr) { - emit(uint16_t(0xe181)); //and ecx, - if (instr.loca & 3) { - emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad - } - else { - emit(ScratchpadL2 - 1); //whole scratchpad - } + gena(instr); emit(0xce048b48); //mov rax,QWORD PTR [rsi+rcx*8] } void JitCompilerX86::genaf(Instruction& instr) { - emit(uint16_t(0xe181)); //and ecx, - if (instr.loca & 3) { - emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad - } - else { - emit(ScratchpadL2 - 1); //whole scratchpad - } + gena(instr); emitByte(0xf3); emit(0xce04e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rcx*8] } - void JitCompilerX86::genbr0(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { - if 
((instr.locb & 7) <= 3) { + void JitCompilerX86::genbiashift(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { + if (instr.locb & 1) { emit(uint16_t(0x8b49)); //mov emitByte(0xc8 + (instr.regb % RegistersCount)); //rcx, regb emitByte(0x48); //REX.W @@ -216,8 +218,8 @@ namespace RandomX { } } - void JitCompilerX86::genbr1(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { - if ((instr.locb & 7) <= 5) { + void JitCompilerX86::genbia(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { + if (instr.locb & 3) { emit(opcodeReg); // xxx rax, r64 emitByte(0xc0 + (instr.regb % RegistersCount)); } @@ -227,8 +229,8 @@ namespace RandomX { } } - void JitCompilerX86::genbr132(Instruction& instr, uint16_t opcodeReg, uint8_t opcodeImm) { - if ((instr.locb & 7) <= 5) { + void JitCompilerX86::genbia32(Instruction& instr, uint16_t opcodeReg, uint8_t opcodeImm) { + if (instr.locb & 3) { emit(opcodeReg); // xxx eax, r32 emitByte(0xc0 + (instr.regb % RegistersCount)); } @@ -328,25 +330,25 @@ namespace RandomX { void JitCompilerX86::h_ADD_64(Instruction& instr, int i) { genar(instr); - genbr1(instr, 0x0349, 0x0548); + genbia(instr, 0x0349, 0x0548); gencr(instr); } void JitCompilerX86::h_ADD_32(Instruction& instr, int i) { genar(instr); - genbr132(instr, 0x0341, 0x05); + genbia32(instr, 0x0341, 0x05); gencr(instr); } void JitCompilerX86::h_SUB_64(Instruction& instr, int i) { genar(instr); - genbr1(instr, 0x2b49, 0x2d48); + genbia(instr, 0x2b49, 0x2d48); gencr(instr); } void JitCompilerX86::h_SUB_32(Instruction& instr, int i) { genar(instr); - genbr132(instr, 0x2b41, 0x2d); + genbia32(instr, 0x2b41, 0x2d); gencr(instr); } @@ -435,104 +437,209 @@ namespace RandomX { void JitCompilerX86::h_DIV_64(Instruction& instr, int i) { genar(instr); - if ((instr.locb & 7) <= 5) { + if (instr.locb & 3) { +#ifdef MAGIC_DIVISION + if (instr.imm32 != 0) { + uint32_t divisor = instr.imm32; + if (divisor & (divisor - 1)) { + magicu_info mi = 
compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8); + if (mi.pre_shift > 0) { + if (mi.pre_shift == 1) { + emitByte(0x48); + emit(uint16_t(0xe8d1)); //shr rax,1 + } + else { + emit(0x00e8c148 | (mi.pre_shift << 24)); //shr rax, pre_shift + } + } + if (mi.increment) { + emit(0x00d8834801c08348); //add rax,1; sbb rax,0 + } + emit(uint16_t(0xb948)); //movabs rcx, multiplier + emit(mi.multiplier); + emit(0x48e1f748); //mul rcx; REX + emit(uint16_t(0xc28b)); //mov rax,rdx + if (mi.post_shift > 0) + emit(0x00e8c148 | (mi.post_shift << 24)); //shr rax, post_shift + } + else { //divisor is a power of two + int shift = 0; + while (divisor >>= 1) + ++shift; + if (shift > 0) + emit(0x00e8c148 | (shift << 24)); //shr rax, shift + } + } +#else + emitByte(0xb9); //mov ecx, imm32 + emit(instr.imm32 != 0 ? instr.imm32 : 1); +#endif + } + else { emitByte(0xb9); //mov ecx, 1 emit(1); emit(uint16_t(0x8b41)); //mov edx, r32 emitByte(0xd0 + (instr.regb % RegistersCount)); emit(0x450fd285); //test edx, edx; cmovne ecx,edx emitByte(0xca); +#ifdef MAGIC_DIVISION + emit(0xf748d233); //xor edx,edx; div rcx + emitByte(0xf1); +#endif } - else { - emitByte(0xb9); //mov ecx, imm32 - emit(instr.imm32 != 0 ? 
instr.imm32 : 1); - } +#ifndef MAGIC_DIVISION emit(0xf748d233); //xor edx,edx; div rcx emitByte(0xf1); +#endif gencr(instr); } void JitCompilerX86::h_IDIV_64(Instruction& instr, int i) { genar(instr); - if ((instr.locb & 7) <= 5) { - emit(uint16_t(0x8b41)); //mov edx, r32 - emitByte(0xd0 + (instr.regb % RegistersCount)); + if (instr.locb & 3) { +#ifdef MAGIC_DIVISION + int64_t divisor = instr.imm32; + if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { + // +/- power of two + bool negative = divisor < 0; + if (negative) + divisor = -divisor; + int shift = 0; + uint64_t unsignedDivisor = divisor; + while (unsignedDivisor >>= 1) + ++shift; + if (shift > 0) { + emitByte(0x48); + emit(uint16_t(0xc88b)); //mov rcx, rax + emit(0x3ff9c148); //sar rcx, 63 + uint32_t mask = (1ULL << shift) - 1; + emit(uint16_t(0xe181)); //and ecx, mask + emit(mask); + emitByte(0x48); + emit(uint16_t(0xc103)); //add rax, rcx + emit(0x00f8c148 | (shift << 24)); //sar rax, shift + } + if (negative) { + emitByte(0x48); + emit(uint16_t(0xd8f7)); //neg rax + } + } + else if (divisor != 0) { + magics_info mi = compute_signed_magic_info(divisor); + if ((divisor >= 0) != (mi.multiplier >= 0)) { + emitByte(0x48); + emit(uint16_t(0xc88b)); //mov rcx, rax + } + emit(uint16_t(0xba48)); //movabs rdx, multiplier + emit(mi.multiplier); + emit(0xd233c28b48eaf748); //imul rdx; mov rax,rdx; xor edx,edx + bool haveSF = false; + if (divisor > 0 && mi.multiplier < 0) { + emitByte(0x48); + emit(uint16_t(0xc103)); //add rax, rcx + haveSF = true; + } + if (divisor < 0 && mi.multiplier > 0) { + emitByte(0x48); + emit(uint16_t(0xc12b)); //sub rax, rcx + haveSF = true; + } + if (mi.shift > 0) { + emit(0x00f8c148 | (mi.shift << 24)); //sar rax, shift + haveSF = true; + } + if (!haveSF) { + emitByte(0x48); + emit(uint16_t(0x85c0)); + } + emit(0x48c2980f); //sets dl; add rax, rdx + emit(uint16_t(0xc203)); + } +#else + emitByte(0xba); // mov edx, imm32 + emit(instr.imm32); +#endif } else { - 
emitByte(0xba); // xxx edx, imm32 - emit(instr.imm32); + emit(uint16_t(0x8b41)); //mov edx, r32 + emitByte(0xd0 + (instr.regb % RegistersCount)); +#ifndef MAGIC_DIVISION } +#endif emit(0xc88b480b75fffa83); emit(0x1274c9ff48c1d148); emit(0x0fd28500000001b9); emit(0x489948c96348ca45); emit(uint16_t(0xf9f7)); //idiv rcx +#ifdef MAGIC_DIVISION + } +#endif gencr(instr); } void JitCompilerX86::h_AND_64(Instruction& instr, int i) { genar(instr); - genbr1(instr, 0x2349, 0x2548); + genbia(instr, 0x2349, 0x2548); gencr(instr); } void JitCompilerX86::h_AND_32(Instruction& instr, int i) { genar(instr); - genbr132(instr, 0x2341, 0x25); + genbia32(instr, 0x2341, 0x25); gencr(instr); } void JitCompilerX86::h_OR_64(Instruction& instr, int i) { genar(instr); - genbr1(instr, 0x0b49, 0x0d48); + genbia(instr, 0x0b49, 0x0d48); gencr(instr); } void JitCompilerX86::h_OR_32(Instruction& instr, int i) { genar(instr); - genbr132(instr, 0x0b41, 0x0d); + genbia32(instr, 0x0b41, 0x0d); gencr(instr); } void JitCompilerX86::h_XOR_64(Instruction& instr, int i) { genar(instr); - genbr1(instr, 0x3349, 0x3548); + genbia(instr, 0x3349, 0x3548); gencr(instr); } void JitCompilerX86::h_XOR_32(Instruction& instr, int i) { genar(instr); - genbr132(instr, 0x3341, 0x35); + genbia32(instr, 0x3341, 0x35); gencr(instr); } void JitCompilerX86::h_SHL_64(Instruction& instr, int i) { genar(instr); - genbr0(instr, 0xe0d3, 0xe0c1); + genbiashift(instr, 0xe0d3, 0xe0c1); gencr(instr); } void JitCompilerX86::h_SHR_64(Instruction& instr, int i) { genar(instr); - genbr0(instr, 0xe8d3, 0xe8c1); + genbiashift(instr, 0xe8d3, 0xe8c1); gencr(instr); } void JitCompilerX86::h_SAR_64(Instruction& instr, int i) { genar(instr); - genbr0(instr, 0xf8d3, 0xf8c1); + genbiashift(instr, 0xf8d3, 0xf8c1); gencr(instr); } void JitCompilerX86::h_ROL_64(Instruction& instr, int i) { genar(instr); - genbr0(instr, 0xc0d3, 0xc0c1); + genbiashift(instr, 0xc0d3, 0xc0c1); gencr(instr); } void JitCompilerX86::h_ROR_64(Instruction& instr, int i) { 
genar(instr); - genbr0(instr, 0xc8d3, 0xc8c1); + genbiashift(instr, 0xc8d3, 0xc8c1); gencr(instr); } diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index e4277c6..d95cbad 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -58,11 +58,12 @@ namespace RandomX { std::vector instructionOffsets; std::vector callOffsets; + void gena(Instruction&); void genar(Instruction&); void genaf(Instruction&); - void genbr0(Instruction&, uint16_t, uint16_t); - void genbr1(Instruction&, uint16_t, uint16_t); - void genbr132(Instruction&, uint16_t, uint8_t); + void genbiashift(Instruction&, uint16_t, uint16_t); + void genbia(Instruction&, uint16_t, uint16_t); + void genbia32(Instruction&, uint16_t, uint8_t); void genbf(Instruction&, uint8_t); void scratchpadStoreR(Instruction&, uint32_t, bool); void scratchpadStoreF(Instruction&, int, uint32_t, bool); diff --git a/src/divideByConstantCodegen.c b/src/divideByConstantCodegen.c index 4b06712..255baf4 100644 --- a/src/divideByConstantCodegen.c +++ b/src/divideByConstantCodegen.c @@ -11,10 +11,10 @@ #include "divideByConstantCodegen.h" -struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits) { +struct magicu_info compute_unsigned_magic_info(unsigned_type D, unsigned num_bits) { - //The numerator must fit in a uint - assert(num_bits > 0 && num_bits <= sizeof(uint) * CHAR_BIT); + //The numerator must fit in a unsigned_type + assert(num_bits > 0 && num_bits <= sizeof(unsigned_type) * CHAR_BIT); // D must be larger than zero and not a power of 2 assert(D & (D - 1)); @@ -22,29 +22,29 @@ struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits) { // The eventual result struct magicu_info result; - // Bits in a uint - const unsigned UINT_BITS = sizeof(uint) * CHAR_BIT; + // Bits in a unsigned_type + const unsigned UINT_BITS = sizeof(unsigned_type) * CHAR_BIT; // The extra shift implicit in the difference between UINT_BITS and num_bits const unsigned extra_shift = UINT_BITS - num_bits; 
// The initial power of 2 is one less than the first one that can possibly work - const uint initial_power_of_2 = (uint)1 << (UINT_BITS - 1); + const unsigned_type initial_power_of_2 = (unsigned_type)1 << (UINT_BITS - 1); // The remainder and quotient of our power of 2 divided by d - uint quotient = initial_power_of_2 / D, remainder = initial_power_of_2 % D; + unsigned_type quotient = initial_power_of_2 / D, remainder = initial_power_of_2 % D; // ceil(log_2 D) unsigned ceil_log_2_D; // The magic info for the variant "round down" algorithm - uint down_multiplier = 0; + unsigned_type down_multiplier = 0; unsigned down_exponent = 0; int has_magic_down = 0; // Compute ceil(log_2 D) ceil_log_2_D = 0; - uint tmp; + unsigned_type tmp; for (tmp = D; tmp > 0; tmp >>= 1) ceil_log_2_D += 1; @@ -67,11 +67,11 @@ struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits) { // We're done if this exponent works for the round_up algorithm. // Note that exponent may be larger than the maximum shift supported, // so the check for >= ceil_log_2_D is critical. 
- if ((exponent + extra_shift >= ceil_log_2_D) || (D - remainder) <= ((uint)1 << (exponent + extra_shift))) + if ((exponent + extra_shift >= ceil_log_2_D) || (D - remainder) <= ((unsigned_type)1 << (exponent + extra_shift))) break; // Set magic_down if we have not set it yet and this exponent works for the round_down algorithm - if (!has_magic_down && remainder <= ((uint)1 << (exponent + extra_shift))) { + if (!has_magic_down && remainder <= ((unsigned_type)1 << (exponent + extra_shift))) { has_magic_down = 1; down_multiplier = quotient; down_exponent = exponent; @@ -96,7 +96,7 @@ struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits) { else { // Even divisor, so use a prefix-shifted dividend unsigned pre_shift = 0; - uint shifted_D = D; + unsigned_type shifted_D = D; while ((shifted_D & 1) == 0) { shifted_D >>= 1; pre_shift += 1; @@ -108,34 +108,34 @@ struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits) { return result; } -struct magics_info compute_signed_magic_info(sint D) { +struct magics_info compute_signed_magic_info(signed_type D) { // D must not be zero and must not be a power of 2 (or its negative) assert(D != 0 && (D & -D) != D && (D & -D) != -D); // Our result struct magics_info result; - // Bits in an sint - const unsigned SINT_BITS = sizeof(sint) * CHAR_BIT; + // Bits in an signed_type + const unsigned SINT_BITS = sizeof(signed_type) * CHAR_BIT; // Absolute value of D (we know D is not the most negative value since that's a power of 2) - const uint abs_d = (D < 0 ? -D : D); + const unsigned_type abs_d = (D < 0 ? -D : D); // The initial power of 2 is one less than the first one that can possibly work // "two31" in Warren unsigned exponent = SINT_BITS - 1; - const uint initial_power_of_2 = (uint)1 << exponent; + const unsigned_type initial_power_of_2 = (unsigned_type)1 << exponent; // Compute the absolute value of our "test numerator," // which is the largest dividend whose remainder with d is d-1. 
// This is called anc in Warren. - const uint tmp = initial_power_of_2 + (D < 0); - const uint abs_test_numer = tmp - 1 - tmp % abs_d; + const unsigned_type tmp = initial_power_of_2 + (D < 0); + const unsigned_type abs_test_numer = tmp - 1 - tmp % abs_d; // Initialize our quotients and remainders (q1, r1, q2, r2 in Warren) - uint quotient1 = initial_power_of_2 / abs_test_numer, remainder1 = initial_power_of_2 % abs_test_numer; - uint quotient2 = initial_power_of_2 / abs_d, remainder2 = initial_power_of_2 % abs_d; - uint delta; + unsigned_type quotient1 = initial_power_of_2 / abs_test_numer, remainder1 = initial_power_of_2 % abs_test_numer; + unsigned_type quotient2 = initial_power_of_2 / abs_d, remainder2 = initial_power_of_2 % abs_d; + unsigned_type delta; // Begin our loop do { diff --git a/src/divideByConstantCodegen.h b/src/divideByConstantCodegen.h index 1ac55e8..800647c 100644 --- a/src/divideByConstantCodegen.h +++ b/src/divideByConstantCodegen.h @@ -24,11 +24,11 @@ along with RandomX. If not, see. extern "C" { #endif - typedef uint64_t uint; - typedef int64_t sint; + typedef uint64_t unsigned_type; + typedef int64_t signed_type; /* Computes "magic info" for performing signed division by a fixed integer D. - The type 'sint' is assumed to be defined as a signed integer type large enough + The type 'signed_type' is assumed to be defined as a signed integer type large enough to hold both the dividend and the divisor. Here >> is arithmetic (signed) shift, and >>> is logical shift. @@ -55,17 +55,17 @@ extern "C" { */ struct magics_info { - sint multiplier; // the "magic number" multiplier + signed_type multiplier; // the "magic number" multiplier unsigned shift; // shift for the dividend after multiplying }; - struct magics_info compute_signed_magic_info(sint D); + struct magics_info compute_signed_magic_info(signed_type D); /* Computes "magic info" for performing unsigned division by a fixed positive integer D. 
- The type 'uint' is assumed to be defined as an unsigned integer type large enough + The type 'unsigned_type' is assumed to be defined as an unsigned integer type large enough to hold both the dividend and the divisor. num_bits can be set appropriately if n is - known to be smaller than the largest uint; if this is not known then pass - (sizeof(uint) * CHAR_BIT) for num_bits. + known to be smaller than the largest unsigned_type; if this is not known then pass + (sizeof(unsigned_type) * CHAR_BIT) for num_bits. Assume we have a hardware register of width UINT_BITS, a known constant D which is not zero and not a power of 2, and a variable n of width num_bits (which may be @@ -105,12 +105,12 @@ extern "C" { */ struct magicu_info { - uint multiplier; // the "magic number" multiplier + unsigned_type multiplier; // the "magic number" multiplier unsigned pre_shift; // shift for the dividend before multiplying unsigned post_shift; //shift for the dividend after multiplying int increment; // 0 or 1; if set then increment the numerator, using one of the two strategies }; - struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits); + struct magicu_info compute_unsigned_magic_info(unsigned_type D, unsigned num_bits); #if defined(__cplusplus) } diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 39f8dec..7771a35 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -19,17 +19,17 @@ along with RandomX. If not, see. 
#pragma once -#define WT_ADD_64 15 +#define WT_ADD_64 12 #define WT_ADD_32 2 -#define WT_SUB_64 15 +#define WT_SUB_64 12 #define WT_SUB_32 2 #define WT_MUL_64 23 #define WT_MULH_64 10 #define WT_MUL_32 15 #define WT_IMUL_32 15 #define WT_IMULH_64 6 -#define WT_DIV_64 1 -#define WT_IDIV_64 1 +#define WT_DIV_64 4 +#define WT_IDIV_64 4 #define WT_AND_64 4 #define WT_AND_32 2 #define WT_OR_64 4 diff --git a/src/program.inc b/src/program.inc index 66b9147..79a7dda 100644 --- a/src/program.inc +++ b/src/program.inc @@ -5,10 +5,10 @@ rx_i_0: ;CALL mov ecx, r9d test bl, 63 jnz short rx_body_0 - call rx_read_l1 + call rx_read_l2 rx_body_0: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r12d @@ -19,20 +19,23 @@ rx_body_0: ja short rx_i_1 call rx_i_30 -rx_i_1: ;IMULH_64 +rx_i_1: ;DIV_64 dec ebx jz rx_finish xor r15, 06afc2fa4h mov ecx, r15d test bl, 63 jnz short rx_body_1 - call rx_read_l1 + call rx_read_l2 rx_body_1: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r10 - imul rcx - mov rax, rdx + mov ecx, 1 + mov edx, r10d + test edx, edx + cmovne ecx, edx + xor edx, edx + div rcx mov r12, rax rx_i_2: ;JUMP @@ -62,10 +65,10 @@ rx_i_3: ;FPDIV mov ecx, r13d test bl, 63 jnz short rx_body_3 - call rx_read_l1 + call rx_read_l2 rx_body_3: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm9 movaps xmm1, xmm0 @@ -84,9 +87,9 @@ rx_i_4: ;MULH_64 mov ecx, r14d test bl, 63 jnz short rx_body_4 - call rx_read_l1 + call rx_read_l2 rx_body_4: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 mul rcx @@ -104,13 +107,13 @@ rx_i_5: ;MUL_32 mov ecx, r15d test bl, 63 jnz short rx_body_5 - call rx_read_l2 + call rx_read_l1 rx_body_5: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax - mov eax, r12d + mov eax, 1037420699 imul rax, rcx mov r12, rax @@ -139,9 +142,9 @@ rx_i_7: ;FPADD mov ecx, r10d test bl, 63 
jnz short rx_body_7 - call rx_read_l1 + call rx_read_l2 rx_body_7: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm6, xmm0 @@ -157,32 +160,34 @@ rx_i_8: ;XOR_64 mov ecx, r13d test bl, 63 jnz short rx_body_8 - call rx_read_l1 + call rx_read_l2 rx_body_8: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - xor rax, 1344700093 + xor rax, r11 mov rcx, rax mov eax, r12d xor eax, 050267ebdh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_9: ;IMULH_64 +rx_i_9: ;DIV_64 dec ebx jz rx_finish xor r14, 085121c54h mov ecx, r14d test bl, 63 jnz short rx_body_9 - call rx_read_l1 + call rx_read_l2 rx_body_9: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, 565870810 - imul rcx + ; magic divide by 565870810 + mov rcx, 8750690209911200579 + mul rcx mov rax, rdx + shr rax, 28 mov r10, rax rx_i_10: ;AND_64 @@ -196,7 +201,7 @@ rx_i_10: ;AND_64 rx_body_10: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and rax, -727859809 + and rax, r10 mov r13, rax rx_i_11: ;FPADD @@ -206,9 +211,9 @@ rx_i_11: ;FPADD mov ecx, r10d test bl, 63 jnz short rx_body_11 - call rx_read_l2 + call rx_read_l1 rx_body_11: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm4, xmm0 @@ -224,9 +229,9 @@ rx_i_12: ;FPSQRT mov ecx, r10d test bl, 63 jnz short rx_body_12 - call rx_read_l2 + call rx_read_l1 rx_body_12: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm8, xmm0 @@ -276,9 +281,9 @@ rx_i_15: ;RET mov ecx, r11d test bl, 63 jnz short rx_body_15 - call rx_read_l2 + call rx_read_l1 rx_body_15: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r14d @@ -314,9 +319,9 @@ rx_i_17: ;FPMUL mov ecx, r11d test bl, 63 jnz short rx_body_17 - call rx_read_l1 + call rx_read_l2 rx_body_17: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm4 movaps xmm1, xmm0 @@ 
-386,10 +391,10 @@ rx_i_21: ;ROR_64 mov ecx, r8d test bl, 63 jnz short rx_body_21 - call rx_read_l2 + call rx_read_l1 rx_body_21: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 ror rax, cl @@ -406,10 +411,10 @@ rx_i_22: ;ADD_64 mov ecx, r13d test bl, 63 jnz short rx_body_22 - call rx_read_l1 + call rx_read_l2 rx_body_22: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] add rax, r8 mov rcx, rax @@ -429,7 +434,7 @@ rx_i_23: ;MUL_64 rx_body_23: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, r11 + imul rax, 1283724485 mov r8, rax rx_i_24: ;IMUL_32 @@ -439,10 +444,10 @@ rx_i_24: ;IMUL_32 mov ecx, r8d test bl, 63 jnz short rx_body_24 - call rx_read_l1 + call rx_read_l2 rx_body_24: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r15d @@ -460,10 +465,10 @@ rx_i_25: ;FPMUL mov ecx, r12d test bl, 63 jnz short rx_body_25 - call rx_read_l2 + call rx_read_l1 rx_body_25: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm9 movaps xmm1, xmm0 @@ -502,9 +507,9 @@ rx_i_27: ;FPMUL mov ecx, r12d test bl, 63 jnz short rx_body_27 - call rx_read_l2 + call rx_read_l1 rx_body_27: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 @@ -523,21 +528,21 @@ rx_i_28: ;AND_32 rx_body_28: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and eax, r13d + and eax, 565865719 mov r14, rax -rx_i_29: ;ADD_64 +rx_i_29: ;SUB_64 dec ebx jz rx_finish xor r12, 0be2e7c42h mov ecx, r12d test bl, 63 jnz short rx_body_29 - call rx_read_l2 + call rx_read_l1 rx_body_29: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, 1944166515 + sub rax, r13 mov r14, rax rx_i_30: ;FPADD @@ -561,13 +566,12 @@ rx_i_31: ;ROR_64 mov ecx, r14d test bl, 63 jnz short rx_body_31 - call rx_read_l2 + call rx_read_l1 rx_body_31: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, 
qword ptr [rsi+rcx*8] - mov rcx, r11 - ror rax, cl + ror rax, 55 mov r14, rax rx_i_32: ;AND_32 @@ -577,11 +581,11 @@ rx_i_32: ;AND_32 mov ecx, r12d test bl, 63 jnz short rx_body_32 - call rx_read_l1 + call rx_read_l2 rx_body_32: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - and eax, -1936869641 + and eax, r14d mov r9, rax rx_i_33: ;MUL_64 @@ -591,9 +595,9 @@ rx_i_33: ;MUL_64 mov ecx, r9d test bl, 63 jnz short rx_body_33 - call rx_read_l2 + call rx_read_l1 rx_body_33: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov r12, rax @@ -622,9 +626,9 @@ rx_i_35: ;CALL mov ecx, r15d test bl, 63 jnz short rx_body_35 - call rx_read_l1 + call rx_read_l2 rx_body_35: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov r8, rax cmp r9d, -2040787098 @@ -655,9 +659,9 @@ rx_i_37: ;FPSUB mov ecx, r12d test bl, 63 jnz short rx_body_37 - call rx_read_l1 + call rx_read_l2 rx_body_37: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm9, xmm0 @@ -687,10 +691,10 @@ rx_i_39: ;ADD_64 mov ecx, r14d test bl, 63 jnz short rx_body_39 - call rx_read_l1 + call rx_read_l2 rx_body_39: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] add rax, r14 mov r14, rax @@ -722,9 +726,9 @@ rx_i_41: ;JUMP mov ecx, r9d test bl, 63 jnz short rx_body_41 - call rx_read_l1 + call rx_read_l2 rx_body_41: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov r9, rax cmp r14d, -1070581824 @@ -737,26 +741,26 @@ rx_i_42: ;FPADD mov ecx, r15d test bl, 63 jnz short rx_body_42 - call rx_read_l1 + call rx_read_l2 rx_body_42: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm6, xmm0 -rx_i_43: ;ADD_32 +rx_i_43: ;SUB_64 dec ebx jz rx_finish xor r12, 02b2a2eech mov ecx, r12d test bl, 63 jnz short rx_body_43 - call rx_read_l1 + call rx_read_l2 rx_body_43: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - 
add eax, 1693705407 + sub rax, r8 mov rcx, rax mov eax, r11d xor eax, 064f3e4bfh @@ -785,10 +789,10 @@ rx_i_45: ;FPSUB mov ecx, r12d test bl, 63 jnz short rx_body_45 - call rx_read_l2 + call rx_read_l1 rx_body_45: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm5, xmm0 @@ -800,9 +804,9 @@ rx_i_46: ;ADD_64 mov ecx, r8d test bl, 63 jnz short rx_body_46 - call rx_read_l2 + call rx_read_l1 rx_body_46: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] add rax, r9 mov rcx, rax @@ -818,10 +822,10 @@ rx_i_47: ;JUMP mov ecx, r12d test bl, 63 jnz short rx_body_47 - call rx_read_l2 + call rx_read_l1 rx_body_47: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r13d @@ -873,10 +877,10 @@ rx_i_50: ;AND_64 mov ecx, r9d test bl, 63 jnz short rx_body_50 - call rx_read_l1 + call rx_read_l2 rx_body_50: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] and rax, r10 mov rcx, rax @@ -892,11 +896,11 @@ rx_i_51: ;SUB_64 mov ecx, r10d test bl, 63 jnz short rx_body_51 - call rx_read_l2 + call rx_read_l1 rx_body_51: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, 419241919 + sub rax, r15 mov r15, rax rx_i_52: ;FPSQRT @@ -906,9 +910,9 @@ rx_i_52: ;FPSQRT mov ecx, r11d test bl, 63 jnz short rx_body_52 - call rx_read_l2 + call rx_read_l1 rx_body_52: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm7, xmm0 @@ -929,20 +933,20 @@ rx_body_53: je short rx_i_54 ret -rx_i_54: ;IMUL_32 +rx_i_54: ;IMULH_64 dec ebx jz rx_finish xor r11, 060638de0h mov ecx, r11d test bl, 63 jnz short rx_body_54 - call rx_read_l2 + call rx_read_l1 rx_body_54: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, 282209221 - imul rax, rcx + mov rcx, r8 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r12d xor eax, 010d22bc5h @@ -970,58 +974,62 @@ rx_body_55: and 
eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm3 -rx_i_56: ;IMULH_64 +rx_i_56: ;DIV_64 dec ebx jz rx_finish xor r14, 0f1456b8eh mov ecx, r14d test bl, 63 jnz short rx_body_56 - call rx_read_l1 + call rx_read_l2 rx_body_56: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - imul rcx + ; magic divide by 4244198545 + add rax, 1 + sbb rax, 0 + mov rcx, 9333701248213440683 + mul rcx mov rax, rdx + shr rax, 31 mov rcx, rax mov eax, r8d xor eax, 0fcf95491h and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_57: ;SUB_32 +rx_i_57: ;MUL_64 dec ebx jz rx_finish xor r9, 010dc4571h mov ecx, r9d test bl, 63 jnz short rx_body_57 - call rx_read_l2 + call rx_read_l1 rx_body_57: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub eax, r14d + imul rax, 172123015 mov rcx, rax mov eax, r15d xor eax, 0a426387h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_58: ;IMUL_32 +rx_i_58: ;IMULH_64 dec ebx jz rx_finish xor r14, 0bcec0ebah mov ecx, r14d test bl, 63 jnz short rx_body_58 - call rx_read_l2 + call rx_read_l1 rx_body_58: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r13d - imul rax, rcx + mov rcx, r13 + imul rcx + mov rax, rdx mov r8, rax rx_i_59: ;FPSUB @@ -1045,9 +1053,9 @@ rx_i_60: ;CALL mov ecx, r15d test bl, 63 jnz short rx_body_60 - call rx_read_l1 + call rx_read_l2 rx_body_60: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r11d @@ -1112,9 +1120,9 @@ rx_i_64: ;SUB_64 mov ecx, r13d test bl, 63 jnz short rx_body_64 - call rx_read_l1 + call rx_read_l2 rx_body_64: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] sub rax, r15 mov r9, rax @@ -1126,9 +1134,9 @@ rx_i_65: ;JUMP mov ecx, r13d test bl, 63 jnz short rx_body_65 - call rx_read_l1 + call rx_read_l2 rx_body_65: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov r11, rax cmp r8d, 1498056607 @@ -1141,10 +1149,10 @@ rx_i_66: ;FPDIV mov ecx, r15d test bl, 
63 jnz short rx_body_66 - call rx_read_l2 + call rx_read_l1 rx_body_66: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm3 movaps xmm1, xmm0 @@ -1178,9 +1186,9 @@ rx_i_68: ;FPADD mov ecx, r13d test bl, 63 jnz short rx_body_68 - call rx_read_l1 + call rx_read_l2 rx_body_68: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 movaps xmm4, xmm0 @@ -1196,27 +1204,29 @@ rx_i_69: ;FPADD mov ecx, r15d test bl, 63 jnz short rx_body_69 - call rx_read_l2 + call rx_read_l1 rx_body_69: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm8, xmm0 -rx_i_70: ;MUL_64 +rx_i_70: ;MULH_64 dec ebx jz rx_finish xor r8, 0bbbec3fah mov ecx, r8d test bl, 63 jnz short rx_body_70 - call rx_read_l2 + call rx_read_l1 rx_body_70: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, r9 + mov rcx, r9 + mul rcx + mov rax, rdx mov r13, rax rx_i_71: ;FPMUL @@ -1262,9 +1272,9 @@ rx_i_73: ;FPDIV mov ecx, r12d test bl, 63 jnz short rx_body_73 - call rx_read_l2 + call rx_read_l1 rx_body_73: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm3 movaps xmm1, xmm0 @@ -1284,7 +1294,7 @@ rx_body_74: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, -1431647438 + imul rax, rax, r13 mov rcx, rax mov eax, r9d xor eax, 0aaaacb32h @@ -1298,9 +1308,9 @@ rx_i_75: ;CALL mov ecx, r14d test bl, 63 jnz short rx_body_75 - call rx_read_l2 + call rx_read_l1 rx_body_75: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov r13, rax cmp r11d, -1160798683 @@ -1314,9 +1324,9 @@ rx_i_76: ;FPADD mov ecx, r11d test bl, 63 jnz short rx_body_76 - call rx_read_l1 + call rx_read_l2 rx_body_76: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm7, xmm0 @@ -1332,9 +1342,9 @@ rx_i_77: ;RET mov ecx, r14d test bl, 63 jnz short rx_body_77 - call 
rx_read_l2 + call rx_read_l1 rx_body_77: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r11d @@ -1368,9 +1378,9 @@ rx_i_79: ;CALL mov ecx, r11d test bl, 63 jnz short rx_body_79 - call rx_read_l1 + call rx_read_l2 rx_body_79: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r11d @@ -1388,11 +1398,12 @@ rx_i_80: ;ROR_64 mov ecx, r13d test bl, 63 jnz short rx_body_80 - call rx_read_l2 + call rx_read_l1 rx_body_80: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - ror rax, 4 + mov rcx, r11 + ror rax, cl mov rcx, rax mov eax, r11d xor eax, 01a681d13h @@ -1410,7 +1421,7 @@ rx_i_81: ;AND_64 rx_body_81: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and rax, r13 + and rax, 338325607 mov r8, rax rx_i_82: ;JUMP @@ -1432,20 +1443,22 @@ rx_body_82: cmp r12d, -68969733 jo rx_i_145 -rx_i_83: ;IMULH_64 +rx_i_83: ;DIV_64 dec ebx jz rx_finish xor r10, 0d9b6a533h mov ecx, r10d test bl, 63 jnz short rx_body_83 - call rx_read_l1 + call rx_read_l2 rx_body_83: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r10 - imul rcx + ; magic divide by 91850728 + mov rcx, 13477737914993774191 + mul rcx mov rax, rdx + shr rax, 26 mov r12, rax rx_i_84: ;SAR_64 @@ -1455,12 +1468,11 @@ rx_i_84: ;SAR_64 mov ecx, r15d test bl, 63 jnz short rx_body_84 - call rx_read_l2 + call rx_read_l1 rx_body_84: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r10 - sar rax, cl + sar rax, 45 mov rcx, rax mov eax, r13d xor eax, 0ec5c52e6h @@ -1478,7 +1490,7 @@ rx_i_85: ;MUL_64 rx_body_85: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, r8 + imul rax, 20014507 mov r10, rax rx_i_86: ;AND_64 @@ -1499,7 +1511,7 @@ rx_body_86: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_87: ;ADD_32 +rx_i_87: ;SUB_64 dec ebx jz rx_finish xor r9, 0d75a0ecfh @@ -1511,7 +1523,7 @@ rx_body_87: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add eax, r12d + sub rax, r12 
mov r8, rax rx_i_88: ;ROR_64 @@ -1537,9 +1549,9 @@ rx_i_89: ;MUL_64 mov ecx, r9d test bl, 63 jnz short rx_body_89 - call rx_read_l2 + call rx_read_l1 rx_body_89: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r8 mov rcx, rax @@ -1555,9 +1567,9 @@ rx_i_90: ;FPADD mov ecx, r12d test bl, 63 jnz short rx_body_90 - call rx_read_l1 + call rx_read_l2 rx_body_90: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm6, xmm0 @@ -1587,9 +1599,9 @@ rx_i_92: ;JUMP mov ecx, r8d test bl, 63 jnz short rx_body_92 - call rx_read_l2 + call rx_read_l1 rx_body_92: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov r12, rax cmp r14d, 1288893603 @@ -1621,10 +1633,10 @@ rx_i_94: ;CALL mov ecx, r13d test bl, 63 jnz short rx_body_94 - call rx_read_l1 + call rx_read_l2 rx_body_94: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov r8, rax cmp r13d, -343122976 @@ -1656,13 +1668,13 @@ rx_i_96: ;MUL_32 mov ecx, r11d test bl, 63 jnz short rx_body_96 - call rx_read_l2 + call rx_read_l1 rx_body_96: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax - mov eax, -1354397081 + mov eax, r11d imul rax, rcx mov r11, rax @@ -1673,9 +1685,9 @@ rx_i_97: ;FPDIV mov ecx, r15d test bl, 63 jnz short rx_body_97 - call rx_read_l1 + call rx_read_l2 rx_body_97: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm9 movaps xmm1, xmm0 @@ -1694,9 +1706,9 @@ rx_i_98: ;SUB_64 mov ecx, r14d test bl, 63 jnz short rx_body_98 - call rx_read_l1 + call rx_read_l2 rx_body_98: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] sub rax, r15 mov r14, rax @@ -1708,9 +1720,9 @@ rx_i_99: ;FPMUL mov ecx, r9d test bl, 63 jnz short rx_body_99 - call rx_read_l2 + call rx_read_l1 rx_body_99: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 @@ -1747,7 +1759,7 @@ rx_i_101: ;SUB_64 
rx_body_101: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, r8 + sub rax, 1732300336 mov r11, rax rx_i_102: ;FPMUL @@ -1774,9 +1786,9 @@ rx_i_103: ;MUL_64 mov ecx, r10d test bl, 63 jnz short rx_body_103 - call rx_read_l1 + call rx_read_l2 rx_body_103: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r13 mov rcx, rax @@ -1792,9 +1804,9 @@ rx_i_104: ;IMUL_32 mov ecx, r11d test bl, 63 jnz short rx_body_104 - call rx_read_l1 + call rx_read_l2 rx_body_104: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax mov rax, -1913070089 @@ -1805,7 +1817,7 @@ rx_body_104: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_105: ;MULH_64 +rx_i_105: ;MUL_32 dec ebx jz rx_finish xor r13, 036a51f72h @@ -1817,9 +1829,9 @@ rx_body_105: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r15d + imul rax, rcx mov rcx, rax mov eax, r14d xor eax, 09c8724edh @@ -1914,9 +1926,9 @@ rx_i_110: ;SHR_64 mov ecx, r9d test bl, 63 jnz short rx_body_110 - call rx_read_l1 + call rx_read_l2 rx_body_110: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 shr rax, cl @@ -1933,9 +1945,9 @@ rx_i_111: ;CALL mov ecx, r8d test bl, 63 jnz short rx_body_111 - call rx_read_l1 + call rx_read_l2 rx_body_111: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r12d @@ -1953,32 +1965,34 @@ rx_i_112: ;SUB_64 mov ecx, r12d test bl, 63 jnz short rx_body_112 - call rx_read_l2 + call rx_read_l1 rx_body_112: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, r13 + sub rax, -1025977295 mov rcx, rax mov eax, r14d xor eax, 0c2d8d431h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_113: ;MUL_64 +rx_i_113: ;MULH_64 dec ebx jz rx_finish xor r10, 07a4f8cbbh mov ecx, r10d test bl, 63 jnz short rx_body_113 - call rx_read_l1 + call rx_read_l2 rx_body_113: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr 
[rsi+rcx*8] - imul rax, r9 + mov rcx, r9 + mul rcx + mov rax, rdx mov r13, rax -rx_i_114: ;IMUL_32 +rx_i_114: ;IMULH_64 dec ebx jz rx_finish xor r13, 06e83e2cdh @@ -1989,9 +2003,9 @@ rx_i_114: ;IMUL_32 rx_body_114: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r15d - imul rax, rcx + mov rcx, r15 + imul rcx + mov rax, rdx mov r14, rax rx_i_115: ;IDIV_64 @@ -2001,25 +2015,18 @@ rx_i_115: ;IDIV_64 mov ecx, r14d test bl, 63 jnz short rx_body_115 - call rx_read_l2 + call rx_read_l1 rx_body_115: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov edx, r10d - cmp edx, -1 - jne short safe_idiv_115 - mov rcx, rax - rol rcx, 1 - dec rcx - jz short result_idiv_115 -safe_idiv_115: - mov ecx, 1 - test edx, edx - cmovne ecx, edx - movsxd rcx, ecx - cqo - idiv rcx -result_idiv_115: + ; magic divide by 587029837 + mov rdx, 527204905636414983 + imul rdx + mov rax, rdx + xor edx, edx + sar rax, 24 + sets dl + add rax, rdx mov r14, rax rx_i_116: ;IMUL_32 @@ -2042,7 +2049,7 @@ rx_body_116: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_117: ;IMULH_64 +rx_i_117: ;DIV_64 dec ebx jz rx_finish xor r11, 015f2012bh @@ -2053,9 +2060,11 @@ rx_i_117: ;IMULH_64 rx_body_117: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, -1205826972 - imul rcx + ; magic divide by 3089140324 + mov rcx, 12823658721283834045 + mul rcx mov rax, rdx + shr rax, 31 mov rcx, rax mov eax, r15d xor eax, 0b8208a64h @@ -2069,9 +2078,9 @@ rx_i_118: ;FPSUB mov ecx, r9d test bl, 63 jnz short rx_body_118 - call rx_read_l2 + call rx_read_l1 rx_body_118: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm5 movaps xmm6, xmm0 @@ -2097,10 +2106,10 @@ rx_i_120: ;FPADD mov ecx, r12d test bl, 63 jnz short rx_body_120 - call rx_read_l1 + call rx_read_l2 rx_body_120: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm4 movaps xmm8, xmm0 @@ -2112,9 +2121,9 @@ rx_i_121: ;FPSUB mov ecx, r9d test 
bl, 63 jnz short rx_body_121 - call rx_read_l1 + call rx_read_l2 rx_body_121: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm5 movaps xmm8, xmm0 @@ -2126,9 +2135,9 @@ rx_i_122: ;CALL mov ecx, r10d test bl, 63 jnz short rx_body_122 - call rx_read_l1 + call rx_read_l2 rx_body_122: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r14d @@ -2139,7 +2148,7 @@ rx_body_122: jno short rx_i_123 call rx_i_192 -rx_i_123: ;ADD_64 +rx_i_123: ;ADD_32 dec ebx jz rx_finish xor r13, 073e9f58ah @@ -2150,7 +2159,7 @@ rx_i_123: ;ADD_64 rx_body_123: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, r15 + add eax, 1530846772 mov r13, rax rx_i_124: ;JUMP @@ -2160,9 +2169,9 @@ rx_i_124: ;JUMP mov ecx, r12d test bl, 63 jnz short rx_body_124 - call rx_read_l2 + call rx_read_l1 rx_body_124: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r11d @@ -2179,13 +2188,13 @@ rx_i_125: ;MUL_32 mov ecx, r8d test bl, 63 jnz short rx_body_125 - call rx_read_l2 + call rx_read_l1 rx_body_125: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax - mov eax, r14d + mov eax, 1774711622 imul rax, rcx mov r14, rax @@ -2196,9 +2205,9 @@ rx_i_126: ;FPMUL mov ecx, r8d test bl, 63 jnz short rx_body_126 - call rx_read_l1 + call rx_read_l2 rx_body_126: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 @@ -2229,10 +2238,10 @@ rx_i_128: ;MUL_64 mov ecx, r13d test bl, 63 jnz short rx_body_128 - call rx_read_l2 + call rx_read_l1 rx_body_128: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r9 mov r9, rax @@ -2252,7 +2261,7 @@ rx_body_129: cmp r13d, -590624856 jge rx_i_154 -rx_i_130: ;DIV_64 +rx_i_130: ;IDIV_64 dec ebx jz rx_finish xor r9, 077c3b332h @@ -2263,9 +2272,14 @@ rx_i_130: ;DIV_64 rx_body_130: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, -281794782 + ; magic 
divide by -281794782 + mov rdx, -8786110448882479839 + imul rdx + mov rax, rdx xor edx, edx - div rcx + sar rax, 27 + sets dl + add rax, rdx mov rcx, rax mov eax, r11d xor eax, 0ef342722h @@ -2317,7 +2331,7 @@ rx_i_133: ;OR_64 rx_body_133: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or rax, -1000526796 + or rax, r13 mov rcx, rax mov eax, r15d xor eax, 0c45d2c34h @@ -2335,7 +2349,7 @@ rx_i_134: ;ADD_64 rx_body_134: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, 1516102347 + add rax, r8 mov r13, rax rx_i_135: ;FPMUL @@ -2383,10 +2397,10 @@ rx_i_137: ;SHR_64 mov ecx, r11d test bl, 63 jnz short rx_body_137 - call rx_read_l1 + call rx_read_l2 rx_body_137: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 shr rax, cl @@ -2420,7 +2434,7 @@ rx_body_139: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, 515364082 + add rax, r8 mov rcx, rax mov eax, r11d xor eax, 01eb7d4f2h @@ -2434,9 +2448,9 @@ rx_i_140: ;IMUL_32 mov ecx, r14d test bl, 63 jnz short rx_body_140 - call rx_read_l2 + call rx_read_l1 rx_body_140: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r11d @@ -2469,9 +2483,9 @@ rx_i_142: ;JUMP mov ecx, r11d test bl, 63 jnz short rx_body_142 - call rx_read_l2 + call rx_read_l1 rx_body_142: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r10d @@ -2481,39 +2495,39 @@ rx_body_142: cmp r12d, 1365939282 js rx_i_257 -rx_i_143: ;MUL_32 +rx_i_143: ;IMUL_32 dec ebx jz rx_finish xor r15, 037f4b5d0h mov ecx, r15d test bl, 63 jnz short rx_body_143 - call rx_read_l2 + call rx_read_l1 rx_body_143: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r11d + movsxd rcx, eax + movsxd rax, r11d imul rax, rcx mov r9, rax -rx_i_144: ;IMUL_32 +rx_i_144: ;IMULH_64 dec ebx jz rx_finish xor r10, 02e59e00ah mov ecx, r10d test bl, 63 jnz short rx_body_144 - call rx_read_l2 + call rx_read_l1 rx_body_144: - and 
ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r11d - imul rax, rcx + mov rcx, -1304483355 + imul rcx + mov rax, rdx mov r15, rax -rx_i_145: ;IMUL_32 +rx_i_145: ;IMULH_64 dec ebx jz rx_finish xor r13, 08d5c798h @@ -2524,9 +2538,9 @@ rx_i_145: ;IMUL_32 rx_body_145: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r11d - imul rax, rcx + mov rcx, r11 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r10d xor eax, 0dd491985h @@ -2562,14 +2576,14 @@ rx_body_147: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, 1784404616 + imul rax, rax, r11 mov rcx, rax mov eax, r12d xor eax, 06a5bda88h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_148: ;ADD_32 +rx_i_148: ;SUB_64 dec ebx jz rx_finish xor r10, 0783e5c4eh @@ -2580,7 +2594,7 @@ rx_i_148: ;ADD_32 rx_body_148: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add eax, r14d + sub rax, r14 mov rcx, rax mov eax, r10d xor eax, 08c783d2ch @@ -2607,7 +2621,7 @@ rx_body_149: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_150: ;IMUL_32 +rx_i_150: ;IMULH_64 dec ebx jz rx_finish xor r9, 01504ca7ah @@ -2618,9 +2632,9 @@ rx_i_150: ;IMUL_32 rx_body_150: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r8d - imul rax, rcx + mov rcx, -933976796 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r9d xor eax, 0c854a524h @@ -2638,7 +2652,7 @@ rx_i_151: ;AND_64 rx_body_151: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and rax, r13 + and rax, -2018584590 mov rcx, rax mov eax, r11d xor eax, 087aed7f2h @@ -2688,12 +2702,12 @@ rx_i_154: ;MUL_32 mov ecx, r10d test bl, 63 jnz short rx_body_154 - call rx_read_l2 + call rx_read_l1 rx_body_154: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax - mov eax, r13d + mov eax, -820047839 imul rax, rcx mov r10, rax @@ -2704,9 +2718,9 @@ rx_i_155: ;ROL_64 mov ecx, r11d test bl, 63 jnz short rx_body_155 - call rx_read_l1 + call rx_read_l2 rx_body_155: 
- and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, r10 rol rax, cl @@ -2723,9 +2737,9 @@ rx_i_156: ;IMUL_32 mov ecx, r10d test bl, 63 jnz short rx_body_156 - call rx_read_l2 + call rx_read_l1 rx_body_156: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r15d @@ -2757,7 +2771,7 @@ rx_i_158: ;ADD_64 rx_body_158: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, 1233402159 + add rax, r13 mov r10, rax rx_i_159: ;CALL @@ -2767,9 +2781,9 @@ rx_i_159: ;CALL mov ecx, r13d test bl, 63 jnz short rx_body_159 - call rx_read_l1 + call rx_read_l2 rx_body_159: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r13d @@ -2780,7 +2794,7 @@ rx_body_159: ja short rx_i_160 call rx_i_181 -rx_i_160: ;ADD_32 +rx_i_160: ;SUB_64 dec ebx jz rx_finish xor r14, 0b1685b90h @@ -2792,7 +2806,7 @@ rx_body_160: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add eax, 1518778665 + sub rax, r14 mov rcx, rax mov eax, r10d xor eax, 05a86b929h @@ -2806,18 +2820,16 @@ rx_i_161: ;IDIV_64 mov ecx, r15d test bl, 63 jnz short rx_body_161 - call rx_read_l1 + call rx_read_l2 rx_body_161: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov edx, r14d cmp edx, -1 - jne short safe_idiv_161 - mov rcx, rax - rol rcx, 1 - dec rcx - jz short result_idiv_161 -safe_idiv_161: + jne short body_idiv_161 + neg rax + jmp short result_idiv_161 +body_idiv_161: mov ecx, 1 test edx, edx cmovne ecx, edx @@ -2838,23 +2850,22 @@ rx_i_162: ;SHL_64 rx_body_162: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r10 - shl rax, cl + shl rax, 7 mov r13, rax -rx_i_163: ;ADD_32 +rx_i_163: ;SUB_64 dec ebx jz rx_finish xor r12, 0e3486c0ah mov ecx, r12d test bl, 63 jnz short rx_body_163 - call rx_read_l2 + call rx_read_l1 rx_body_163: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add eax, -2101130488 + sub rax, r8 mov rcx, rax mov eax, r14d xor eax, 082c34b08h @@ -2909,12 
+2920,11 @@ rx_i_166: ;SHR_64 mov ecx, r9d test bl, 63 jnz short rx_body_166 - call rx_read_l2 + call rx_read_l1 rx_body_166: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - shr rax, cl + shr rax, 62 mov rcx, rax mov eax, r13d xor eax, 0bb67f8abh @@ -2986,9 +2996,9 @@ rx_i_170: ;FPSQRT mov ecx, r8d test bl, 63 jnz short rx_body_170 - call rx_read_l2 + call rx_read_l1 rx_body_170: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm6, xmm0 @@ -2997,7 +3007,7 @@ rx_body_170: and eax, 32767 movlpd qword ptr [rsi + rax * 8], xmm6 -rx_i_171: ;IMUL_32 +rx_i_171: ;IMULH_64 dec ebx jz rx_finish xor r15, 09901e05bh @@ -3008,9 +3018,9 @@ rx_i_171: ;IMUL_32 rx_body_171: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r12d - imul rax, rcx + mov rcx, r12 + imul rcx + mov rax, rdx mov r12, rax rx_i_172: ;SUB_64 @@ -3024,7 +3034,7 @@ rx_i_172: ;SUB_64 rx_body_172: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, r11 + sub rax, -478081934 mov r12, rax rx_i_173: ;MUL_64 @@ -3039,7 +3049,7 @@ rx_body_173: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, r12 + imul rax, -1386172772 mov rcx, rax mov eax, r12d xor eax, 0ad60ae9ch @@ -3088,11 +3098,11 @@ rx_i_176: ;SUB_64 mov ecx, r9d test bl, 63 jnz short rx_body_176 - call rx_read_l2 + call rx_read_l1 rx_body_176: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, r14 + sub rax, -2101315181 mov r10, rax rx_i_177: ;ADD_64 @@ -3102,11 +3112,11 @@ rx_i_177: ;ADD_64 mov ecx, r10d test bl, 63 jnz short rx_body_177 - call rx_read_l2 + call rx_read_l1 rx_body_177: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, r10 + add rax, 794235831 mov rcx, rax mov eax, r13d xor eax, 02f5713b7h @@ -3120,9 +3130,9 @@ rx_i_178: ;RET mov ecx, r15d test bl, 63 jnz short rx_body_178 - call rx_read_l1 + call rx_read_l2 rx_body_178: - and ecx, 2047 + and ecx, 32767 mov rax, 
qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r12d @@ -3140,9 +3150,9 @@ rx_i_179: ;FPADD mov ecx, r12d test bl, 63 jnz short rx_body_179 - call rx_read_l1 + call rx_read_l2 rx_body_179: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 movaps xmm8, xmm0 @@ -3154,11 +3164,11 @@ rx_i_180: ;AND_32 mov ecx, r15d test bl, 63 jnz short rx_body_180 - call rx_read_l2 + call rx_read_l1 rx_body_180: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and eax, 1995308563 + and eax, r9d mov rcx, rax mov eax, r9d xor eax, 076edfe13h @@ -3172,10 +3182,10 @@ rx_i_181: ;CALL mov ecx, r10d test bl, 63 jnz short rx_body_181 - call rx_read_l2 + call rx_read_l1 rx_body_181: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov r10, rax cmp r12d, -1612576918 @@ -3208,7 +3218,7 @@ rx_i_183: ;ADD_64 rx_body_183: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, 137260710 + add rax, r11 mov r10, rax rx_i_184: ;XOR_32 @@ -3218,12 +3228,12 @@ rx_i_184: ;XOR_32 mov ecx, r12d test bl, 63 jnz short rx_body_184 - call rx_read_l2 + call rx_read_l1 rx_body_184: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor eax, 790123591 + xor eax, r13d mov r12, rax rx_i_185: ;JUMP @@ -3233,9 +3243,9 @@ rx_i_185: ;JUMP mov ecx, r10d test bl, 63 jnz short rx_body_185 - call rx_read_l1 + call rx_read_l2 rx_body_185: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r9d @@ -3257,7 +3267,7 @@ rx_body_186: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or rax, r15 + or rax, -1252263008 mov rcx, rax mov eax, r10d xor eax, 0b55bfba0h @@ -3271,9 +3281,9 @@ rx_i_187: ;FPMUL mov ecx, r13d test bl, 63 jnz short rx_body_187 - call rx_read_l2 + call rx_read_l1 rx_body_187: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 @@ -3288,10 +3298,10 @@ rx_i_188: ;FPSUB mov ecx, r9d test bl, 63 jnz short 
rx_body_188 - call rx_read_l1 + call rx_read_l2 rx_body_188: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm3 movaps xmm4, xmm0 @@ -3320,9 +3330,9 @@ rx_i_190: ;RET mov ecx, r12d test bl, 63 jnz short rx_body_190 - call rx_read_l2 + call rx_read_l1 rx_body_190: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov r13, rax cmp rsp, rdi @@ -3336,9 +3346,9 @@ rx_i_191: ;FPSQRT mov ecx, r15d test bl, 63 jnz short rx_body_191 - call rx_read_l1 + call rx_read_l2 rx_body_191: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm6, xmm0 @@ -3388,9 +3398,9 @@ rx_i_194: ;FPMUL mov ecx, r12d test bl, 63 jnz short rx_body_194 - call rx_read_l2 + call rx_read_l1 rx_body_194: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 @@ -3413,22 +3423,21 @@ rx_i_195: ;SHL_64 rx_body_195: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - shl rax, cl + shl rax, 27 mov r9, rax -rx_i_196: ;ADD_32 +rx_i_196: ;SUB_64 dec ebx jz rx_finish xor r8, 0c2a9f41bh mov ecx, r8d test bl, 63 jnz short rx_body_196 - call rx_read_l2 + call rx_read_l1 rx_body_196: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add eax, -1907903895 + sub rax, r8 mov rcx, rax mov eax, r13d xor eax, 08e47b269h @@ -3442,44 +3451,48 @@ rx_i_197: ;MUL_64 mov ecx, r12d test bl, 63 jnz short rx_body_197 - call rx_read_l1 + call rx_read_l2 rx_body_197: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov r11, rax -rx_i_198: ;MUL_64 +rx_i_198: ;MULH_64 dec ebx jz rx_finish xor r14, 0c8d95bbbh mov ecx, r14d test bl, 63 jnz short rx_body_198 - call rx_read_l1 + call rx_read_l2 rx_body_198: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - imul rax, r14 + mov rcx, r14 + mul rcx + mov rax, rdx mov rcx, rax mov eax, r8d xor eax, 01149cba0h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx 
-rx_i_199: ;MUL_64 +rx_i_199: ;MULH_64 dec ebx jz rx_finish xor r13, 050049e2eh mov ecx, r13d test bl, 63 jnz short rx_body_199 - call rx_read_l2 + call rx_read_l1 rx_body_199: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, r10 + mov rcx, r10 + mul rcx + mov rax, rdx mov rcx, rax mov eax, r10d xor eax, 0d0e71e9ah @@ -3530,9 +3543,9 @@ rx_i_202: ;FPADD mov ecx, r13d test bl, 63 jnz short rx_body_202 - call rx_read_l2 + call rx_read_l1 rx_body_202: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm5, xmm0 @@ -3544,9 +3557,9 @@ rx_i_203: ;FPSUB mov ecx, r10d test bl, 63 jnz short rx_body_203 - call rx_read_l1 + call rx_read_l2 rx_body_203: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm7, xmm0 @@ -3562,9 +3575,9 @@ rx_i_204: ;MUL_64 mov ecx, r9d test bl, 63 jnz short rx_body_204 - call rx_read_l1 + call rx_read_l2 rx_body_204: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r15 mov rcx, rax @@ -3606,7 +3619,7 @@ rx_body_206: subpd xmm0, xmm7 movaps xmm4, xmm0 -rx_i_207: ;IMULH_64 +rx_i_207: ;IDIV_64 dec ebx jz rx_finish xor r9, 039ccdd30h @@ -3618,9 +3631,14 @@ rx_body_207: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r12 - imul rcx + ; magic divide by 314297476 + mov rdx, 1969376361274661135 + imul rdx mov rax, rdx + xor edx, edx + sar rax, 25 + sets dl + add rax, rdx mov rcx, rax mov eax, r9d xor eax, 012bbcc84h @@ -3638,7 +3656,7 @@ rx_i_208: ;MUL_64 rx_body_208: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, r12 + imul rax, -486588965 mov r10, rax rx_i_209: ;XOR_64 @@ -3653,7 +3671,7 @@ rx_body_209: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor rax, -1016364182 + xor rax, r15 mov rcx, rax mov eax, r12d xor eax, 0c36b836ah @@ -3667,13 +3685,13 @@ rx_i_210: ;MUL_32 mov ecx, r12d test bl, 63 jnz short rx_body_210 - call rx_read_l2 + call rx_read_l1 rx_body_210: xor 
rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax - mov eax, -1027162400 + mov eax, r12d imul rax, rcx mov rcx, rax mov eax, r15d @@ -3688,11 +3706,12 @@ rx_i_211: ;ROR_64 mov ecx, r12d test bl, 63 jnz short rx_body_211 - call rx_read_l2 + call rx_read_l1 rx_body_211: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - ror rax, 27 + mov rcx, r9 + ror rax, cl mov rcx, rax mov eax, r11d xor eax, 0212e615h @@ -3750,7 +3769,7 @@ rx_body_214: shl rax, cl mov r14, rax -rx_i_215: ;ADD_64 +rx_i_215: ;ADD_32 dec ebx jz rx_finish xor r15, 08359265eh @@ -3762,7 +3781,7 @@ rx_body_215: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, r12 + add eax, r12d mov r10, rax rx_i_216: ;MUL_64 @@ -3772,9 +3791,9 @@ rx_i_216: ;MUL_64 mov ecx, r12d test bl, 63 jnz short rx_body_216 - call rx_read_l1 + call rx_read_l2 rx_body_216: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r13 mov rcx, rax @@ -3783,7 +3802,7 @@ rx_body_216: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_217: ;MUL_32 +rx_i_217: ;IMUL_32 dec ebx jz rx_finish xor r8, 040d5b526h @@ -3794,8 +3813,8 @@ rx_i_217: ;MUL_32 rx_body_217: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r9d + movsxd rcx, eax + movsxd rax, r9d imul rax, rcx mov rcx, rax mov eax, r10d @@ -3810,9 +3829,9 @@ rx_i_218: ;FPSQRT mov ecx, r11d test bl, 63 jnz short rx_body_218 - call rx_read_l2 + call rx_read_l1 rx_body_218: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm3, xmm0 @@ -3832,7 +3851,7 @@ rx_i_219: ;OR_64 rx_body_219: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or rax, -740915304 + or rax, r10 mov rcx, rax mov eax, r15d xor eax, 0d3d68798h @@ -3859,7 +3878,7 @@ rx_body_220: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_221: ;IMUL_32 +rx_i_221: ;IMULH_64 dec ebx jz rx_finish xor r9, 0a3deb512h @@ -3870,9 +3889,9 @@ rx_i_221: ;IMUL_32 rx_body_221: and ecx, 
2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r15d - imul rax, rcx + mov rcx, 2146087761 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r11d xor eax, 07feab351h @@ -3886,9 +3905,9 @@ rx_i_222: ;FPMUL mov ecx, r9d test bl, 63 jnz short rx_body_222 - call rx_read_l2 + call rx_read_l1 rx_body_222: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm5 movaps xmm1, xmm0 @@ -3926,31 +3945,31 @@ rx_i_224: ;XOR_32 mov ecx, r12d test bl, 63 jnz short rx_body_224 - call rx_read_l1 + call rx_read_l2 rx_body_224: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - xor eax, r14d + xor eax, -452933987 mov rcx, rax mov eax, r11d xor eax, 0e500c69dh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_225: ;IMUL_32 +rx_i_225: ;IMULH_64 dec ebx jz rx_finish xor r13, 0c558367eh mov ecx, r13d test bl, 63 jnz short rx_body_225 - call rx_read_l2 + call rx_read_l1 rx_body_225: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r10d - imul rax, rcx + mov rcx, r10 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r12d xor eax, 0fe304a4ah @@ -3983,9 +4002,9 @@ rx_i_227: ;FPMUL mov ecx, r11d test bl, 63 jnz short rx_body_227 - call rx_read_l1 + call rx_read_l2 rx_body_227: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm7 movaps xmm1, xmm0 @@ -4018,9 +4037,9 @@ rx_i_229: ;IMUL_32 mov ecx, r11d test bl, 63 jnz short rx_body_229 - call rx_read_l2 + call rx_read_l1 rx_body_229: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r12d @@ -4130,12 +4149,12 @@ rx_i_235: ;MUL_32 mov ecx, r13d test bl, 63 jnz short rx_body_235 - call rx_read_l2 + call rx_read_l1 rx_body_235: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax - mov eax, r12d + mov eax, 212286089 imul rax, rcx mov rcx, rax mov eax, r15d @@ -4150,9 +4169,9 @@ rx_i_236: ;FPADD mov ecx, r15d test bl, 63 jnz short 
rx_body_236 - call rx_read_l2 + call rx_read_l1 rx_body_236: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm4 movaps xmm3, xmm0 @@ -4164,9 +4183,9 @@ rx_i_237: ;JUMP mov ecx, r15d test bl, 63 jnz short rx_body_237 - call rx_read_l2 + call rx_read_l1 rx_body_237: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov r11, rax cmp r12d, -121899164 @@ -4179,10 +4198,10 @@ rx_i_238: ;FPADD mov ecx, r8d test bl, 63 jnz short rx_body_238 - call rx_read_l2 + call rx_read_l1 rx_body_238: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm6 movaps xmm7, xmm0 @@ -4212,10 +4231,10 @@ rx_i_240: ;IMUL_32 mov ecx, r9d test bl, 63 jnz short rx_body_240 - call rx_read_l1 + call rx_read_l2 rx_body_240: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax mov rax, -423830277 @@ -4247,11 +4266,11 @@ rx_i_242: ;MULH_64 mov ecx, r12d test bl, 63 jnz short rx_body_242 - call rx_read_l2 + call rx_read_l1 rx_body_242: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, 319324914 + mov rcx, r12 mul rcx mov rax, rdx mov rcx, rax @@ -4271,7 +4290,7 @@ rx_i_243: ;OR_64 rx_body_243: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or rax, 1198180774 + or rax, r9 mov r14, rax rx_i_244: ;ROR_64 @@ -4281,9 +4300,9 @@ rx_i_244: ;ROR_64 mov ecx, r11d test bl, 63 jnz short rx_body_244 - call rx_read_l2 + call rx_read_l1 rx_body_244: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r14 ror rax, cl @@ -4300,33 +4319,35 @@ rx_i_245: ;AND_32 mov ecx, r13d test bl, 63 jnz short rx_body_245 - call rx_read_l1 + call rx_read_l2 rx_body_245: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - and eax, -1546539637 + and eax, r10d mov rcx, rax mov eax, r12d xor eax, 0a3d1ad8bh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_246: ;IMULH_64 +rx_i_246: ;DIV_64 dec ebx jz rx_finish xor r15, 027eeaa2eh 
mov ecx, r15d test bl, 63 jnz short rx_body_246 - call rx_read_l2 + call rx_read_l1 rx_body_246: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r9 - imul rcx + ; magic divide by 4138158808 + mov rcx, 9572876028959826425 + mul rcx mov rax, rdx + shr rax, 31 mov r12, rax rx_i_247: ;MUL_32 @@ -4349,21 +4370,21 @@ rx_body_247: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_248: ;MULH_64 +rx_i_248: ;MUL_32 dec ebx jz rx_finish xor r8, 0649df46fh mov ecx, r8d test bl, 63 jnz short rx_body_248 - call rx_read_l2 + call rx_read_l1 rx_body_248: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r15d + imul rax, rcx mov rcx, rax mov eax, r9d xor eax, 07b10fc32h @@ -4377,10 +4398,10 @@ rx_i_249: ;IMUL_32 mov ecx, r15d test bl, 63 jnz short rx_body_249 - call rx_read_l2 + call rx_read_l1 rx_body_249: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r11d @@ -4391,18 +4412,18 @@ rx_body_249: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_250: ;SUB_32 +rx_i_250: ;MUL_64 dec ebx jz rx_finish xor r13, 083eafe6fh mov ecx, r13d test bl, 63 jnz short rx_body_250 - call rx_read_l2 + call rx_read_l1 rx_body_250: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub eax, r8d + imul rax, r8 mov rcx, rax mov eax, r14d xor eax, 031115b87h @@ -4416,9 +4437,9 @@ rx_i_251: ;FPMUL mov ecx, r13d test bl, 63 jnz short rx_body_251 - call rx_read_l2 + call rx_read_l1 rx_body_251: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 @@ -4437,12 +4458,11 @@ rx_i_252: ;SHL_64 mov ecx, r14d test bl, 63 jnz short rx_body_252 - call rx_read_l2 + call rx_read_l1 rx_body_252: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - shl rax, cl + shl rax, 53 mov r14, rax rx_i_253: ;CALL @@ -4490,9 +4510,9 @@ 
rx_i_255: ;FPADD mov ecx, r9d test bl, 63 jnz short rx_body_255 - call rx_read_l1 + call rx_read_l2 rx_body_255: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm6, xmm0 @@ -4501,7 +4521,7 @@ rx_body_255: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm6 -rx_i_256: ;MUL_64 +rx_i_256: ;MULH_64 dec ebx jz rx_finish xor r8, 08375472ch @@ -4513,7 +4533,9 @@ rx_body_256: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, r15 + mov rcx, r15 + mul rcx + mov rax, rdx mov rcx, rax mov eax, r9d xor eax, 0f8942c0h @@ -4527,9 +4549,9 @@ rx_i_257: ;FPADD mov ecx, r12d test bl, 63 jnz short rx_body_257 - call rx_read_l2 + call rx_read_l1 rx_body_257: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm3, xmm0 @@ -4545,10 +4567,10 @@ rx_i_258: ;MUL_32 mov ecx, r11d test bl, 63 jnz short rx_body_258 - call rx_read_l1 + call rx_read_l2 rx_body_258: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax mov eax, r14d @@ -4580,10 +4602,10 @@ rx_i_260: ;FPSUB mov ecx, r13d test bl, 63 jnz short rx_body_260 - call rx_read_l2 + call rx_read_l1 rx_body_260: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm5 movaps xmm9, xmm0 @@ -4595,10 +4617,10 @@ rx_i_261: ;FPDIV mov ecx, r14d test bl, 63 jnz short rx_body_261 - call rx_read_l2 + call rx_read_l1 rx_body_261: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm3 movaps xmm1, xmm0 @@ -4622,7 +4644,7 @@ rx_body_262: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and rax, r13 + and rax, -1569587450 mov rcx, rax mov eax, r11d xor eax, 0a271ff06h @@ -4636,10 +4658,10 @@ rx_i_263: ;FPMUL mov ecx, r11d test bl, 63 jnz short rx_body_263 - call rx_read_l1 + call rx_read_l2 rx_body_263: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm8 movaps 
xmm1, xmm0 @@ -4654,9 +4676,9 @@ rx_i_264: ;FPMUL mov ecx, r11d test bl, 63 jnz short rx_body_264 - call rx_read_l2 + call rx_read_l1 rx_body_264: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 @@ -4671,10 +4693,10 @@ rx_i_265: ;FPADD mov ecx, r13d test bl, 63 jnz short rx_body_265 - call rx_read_l2 + call rx_read_l1 rx_body_265: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm8 movaps xmm2, xmm0 @@ -4690,9 +4712,9 @@ rx_i_266: ;CALL mov ecx, r13d test bl, 63 jnz short rx_body_266 - call rx_read_l1 + call rx_read_l2 rx_body_266: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov r10, rax cmp r12d, 136160027 @@ -4710,7 +4732,8 @@ rx_i_267: ;ROL_64 rx_body_267: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - rol rax, 56 + mov rcx, r10 + rol rax, cl mov r11, rax rx_i_268: ;JUMP @@ -4720,10 +4743,10 @@ rx_i_268: ;JUMP mov ecx, r12d test bl, 63 jnz short rx_body_268 - call rx_read_l2 + call rx_read_l1 rx_body_268: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov r13, rax cmp r15d, -2062812966 @@ -4740,8 +4763,7 @@ rx_i_269: ;ROL_64 rx_body_269: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - rol rax, cl + rol rax, 50 mov rcx, rax mov eax, r10d xor eax, 01ba81447h @@ -4777,7 +4799,7 @@ rx_body_271: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax - mov eax, -2032281772 + mov eax, r10d imul rax, rcx mov rcx, rax mov eax, r9d @@ -4792,9 +4814,9 @@ rx_i_272: ;AND_64 mov ecx, r12d test bl, 63 jnz short rx_body_272 - call rx_read_l2 + call rx_read_l1 rx_body_272: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] and rax, r12 mov r13, rax @@ -4826,9 +4848,9 @@ rx_i_274: ;FPADD mov ecx, r15d test bl, 63 jnz short rx_body_274 - call rx_read_l2 + call rx_read_l1 rx_body_274: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm4 movaps xmm6, xmm0 @@ -4837,24 
+4859,26 @@ rx_body_274: and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm6 -rx_i_275: ;DIV_64 +rx_i_275: ;IDIV_64 dec ebx jz rx_finish xor r10, 0788eceb7h mov ecx, r10d test bl, 63 jnz short rx_body_275 - call rx_read_l2 + call rx_read_l1 rx_body_275: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, 1 - mov edx, r11d - test edx, edx - cmovne ecx, edx + ; magic divide by -333089764 + mov rdx, -7433071640624659213 + imul rdx + mov rax, rdx xor edx, edx - div rcx + sar rax, 27 + sets dl + add rax, rdx mov r13, rax rx_i_276: ;JUMP @@ -4864,10 +4888,10 @@ rx_i_276: ;JUMP mov ecx, r9d test bl, 63 jnz short rx_body_276 - call rx_read_l1 + call rx_read_l2 rx_body_276: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r12d @@ -4884,9 +4908,9 @@ rx_i_277: ;IMUL_32 mov ecx, r11d test bl, 63 jnz short rx_body_277 - call rx_read_l2 + call rx_read_l1 rx_body_277: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r10d @@ -4922,9 +4946,9 @@ rx_i_279: ;FPADD mov ecx, r15d test bl, 63 jnz short rx_body_279 - call rx_read_l1 + call rx_read_l2 rx_body_279: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm9, xmm0 @@ -4933,20 +4957,22 @@ rx_body_279: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm9 -rx_i_280: ;IMULH_64 +rx_i_280: ;DIV_64 dec ebx jz rx_finish xor r12, 066246b43h mov ecx, r12d test bl, 63 jnz short rx_body_280 - call rx_read_l2 + call rx_read_l1 rx_body_280: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r11 - imul rcx + ; magic divide by 555412224 + mov rcx, 2228867111296024113 + mul rcx mov rax, rdx + shr rax, 26 mov rcx, rax mov eax, r13d xor eax, 0211aeb00h @@ -4964,7 +4990,7 @@ rx_i_281: ;SUB_64 rx_body_281: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, r10 + sub rax, -202979002 mov rcx, rax mov eax, r11d xor eax, 0f3e6c946h @@ -4982,7 +5008,7 @@ 
rx_i_282: ;SUB_64 rx_body_282: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, 1367326224 + sub rax, r12 mov r11, rax rx_i_283: ;ADD_64 @@ -4992,12 +5018,12 @@ rx_i_283: ;ADD_64 mov ecx, r9d test bl, 63 jnz short rx_body_283 - call rx_read_l1 + call rx_read_l2 rx_body_283: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - add rax, -1156732976 + add rax, r12 mov rcx, rax mov eax, r12d xor eax, 0bb0da7d0h @@ -5023,7 +5049,7 @@ rx_body_284: and eax, 32767 movlpd qword ptr [rsi + rax * 8], xmm9 -rx_i_285: ;MUL_32 +rx_i_285: ;IMUL_32 dec ebx jz rx_finish xor r8, 09adb333bh @@ -5034,8 +5060,8 @@ rx_i_285: ;MUL_32 rx_body_285: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r8d + movsxd rcx, eax + movsxd rax, r8d imul rax, rcx mov r14, rax @@ -5070,21 +5096,14 @@ rx_body_287: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov edx, r15d - cmp edx, -1 - jne short safe_idiv_287 - mov rcx, rax - rol rcx, 1 - dec rcx - jz short result_idiv_287 -safe_idiv_287: - mov ecx, 1 - test edx, edx - cmovne ecx, edx - movsxd rcx, ecx - cqo - idiv rcx -result_idiv_287: + ; magic divide by 1227278330 + mov rdx, 8069498232143512385 + imul rdx + mov rax, rdx + xor edx, edx + sar rax, 29 + sets dl + add rax, rdx mov rcx, rax mov eax, r8d xor eax, 04926c7fah @@ -5116,9 +5135,9 @@ rx_i_289: ;FPMUL mov ecx, r14d test bl, 63 jnz short rx_body_289 - call rx_read_l2 + call rx_read_l1 rx_body_289: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm9 movaps xmm1, xmm0 @@ -5133,9 +5152,9 @@ rx_i_290: ;FPSUB mov ecx, r15d test bl, 63 jnz short rx_body_290 - call rx_read_l2 + call rx_read_l1 rx_body_290: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm8 movaps xmm9, xmm0 @@ -5147,9 +5166,9 @@ rx_i_291: ;RET mov ecx, r13d test bl, 63 jnz short rx_body_291 - call rx_read_l2 + call rx_read_l1 rx_body_291: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr 
[rsi+rcx*8] mov rcx, rax mov eax, r14d @@ -5167,11 +5186,12 @@ rx_i_292: ;ROL_64 mov ecx, r13d test bl, 63 jnz short rx_body_292 - call rx_read_l1 + call rx_read_l2 rx_body_292: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - rol rax, 23 + mov rcx, r8 + rol rax, cl mov r10, rax rx_i_293: ;FPADD @@ -5181,9 +5201,9 @@ rx_i_293: ;FPADD mov ecx, r9d test bl, 63 jnz short rx_body_293 - call rx_read_l2 + call rx_read_l1 rx_body_293: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm8, xmm0 @@ -5195,9 +5215,9 @@ rx_i_294: ;RET mov ecx, r14d test bl, 63 jnz short rx_body_294 - call rx_read_l2 + call rx_read_l1 rx_body_294: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r8d @@ -5215,10 +5235,10 @@ rx_i_295: ;FPSUB mov ecx, r9d test bl, 63 jnz short rx_body_295 - call rx_read_l1 + call rx_read_l2 rx_body_295: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm8 movaps xmm7, xmm0 @@ -5230,9 +5250,9 @@ rx_i_296: ;FPSQRT mov ecx, r14d test bl, 63 jnz short rx_body_296 - call rx_read_l1 + call rx_read_l2 rx_body_296: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm8, xmm0 @@ -5258,9 +5278,9 @@ rx_i_298: ;FPSUB mov ecx, r14d test bl, 63 jnz short rx_body_298 - call rx_read_l1 + call rx_read_l2 rx_body_298: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm7 movaps xmm6, xmm0 @@ -5277,7 +5297,7 @@ rx_body_299: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, 21400308 + add rax, r10 mov rcx, rax mov eax, r12d xor eax, 01468af4h @@ -5291,10 +5311,10 @@ rx_i_300: ;FPSUB mov ecx, r12d test bl, 63 jnz short rx_body_300 - call rx_read_l2 + call rx_read_l1 rx_body_300: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm8 movaps xmm2, xmm0 @@ -5342,9 +5362,9 @@ rx_i_303: ;FPADD mov ecx, r14d 
test bl, 63 jnz short rx_body_303 - call rx_read_l1 + call rx_read_l2 rx_body_303: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm9, xmm0 @@ -5364,21 +5384,21 @@ rx_i_304: ;MUL_64 rx_body_304: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, r15 + imul rax, 2007686513 mov r13, rax -rx_i_305: ;SUB_32 +rx_i_305: ;MUL_64 dec ebx jz rx_finish xor r11, 03c6c62b8h mov ecx, r11d test bl, 63 jnz short rx_body_305 - call rx_read_l2 + call rx_read_l1 rx_body_305: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub eax, -65873120 + imul rax, rax, r15 mov r10, rax rx_i_306: ;ADD_64 @@ -5388,11 +5408,11 @@ rx_i_306: ;ADD_64 mov ecx, r15d test bl, 63 jnz short rx_body_306 - call rx_read_l2 + call rx_read_l1 rx_body_306: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, r15 + add rax, 400578979 mov r13, rax rx_i_307: ;SHL_64 @@ -5406,8 +5426,7 @@ rx_i_307: ;SHL_64 rx_body_307: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - shl rax, cl + shl rax, 33 mov r10, rax rx_i_308: ;MUL_64 @@ -5417,9 +5436,9 @@ rx_i_308: ;MUL_64 mov ecx, r11d test bl, 63 jnz short rx_body_308 - call rx_read_l2 + call rx_read_l1 rx_body_308: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r13 mov r15, rax @@ -5431,9 +5450,9 @@ rx_i_309: ;IMUL_32 mov ecx, r9d test bl, 63 jnz short rx_body_309 - call rx_read_l1 + call rx_read_l2 rx_body_309: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax mov rax, -1652850028 @@ -5482,20 +5501,20 @@ rx_body_311: andps xmm0, xmm1 movaps xmm4, xmm0 -rx_i_312: ;MULH_64 +rx_i_312: ;MUL_32 dec ebx jz rx_finish xor r13, 0b18904cdh mov ecx, r13d test bl, 63 jnz short rx_body_312 - call rx_read_l1 + call rx_read_l2 rx_body_312: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, -1147928648 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r14d + imul rax, rcx mov r10, rax rx_i_313: ;ROR_64 
@@ -5523,9 +5542,9 @@ rx_i_314: ;IMUL_32 mov ecx, r15d test bl, 63 jnz short rx_body_314 - call rx_read_l1 + call rx_read_l2 rx_body_314: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r9d @@ -5543,9 +5562,9 @@ rx_i_315: ;XOR_64 mov ecx, r9d test bl, 63 jnz short rx_body_315 - call rx_read_l1 + call rx_read_l2 rx_body_315: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] xor rax, r15 mov r9, rax @@ -5592,11 +5611,12 @@ rx_i_318: ;ROR_64 mov ecx, r9d test bl, 63 jnz short rx_body_318 - call rx_read_l1 + call rx_read_l2 rx_body_318: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - ror rax, 41 + mov rcx, r11 + ror rax, cl mov rcx, rax mov eax, r15d xor eax, 061cb9db8h @@ -5610,12 +5630,11 @@ rx_i_319: ;SHR_64 mov ecx, r13d test bl, 63 jnz short rx_body_319 - call rx_read_l2 + call rx_read_l1 rx_body_319: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - shr rax, cl + shr rax, 46 mov rcx, rax mov eax, r11d xor eax, 01f931a08h @@ -5640,19 +5659,19 @@ rx_body_320: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm2 -rx_i_321: ;MUL_32 +rx_i_321: ;IMUL_32 dec ebx jz rx_finish xor r11, 0a7bae383h mov ecx, r11d test bl, 63 jnz short rx_body_321 - call rx_read_l1 + call rx_read_l2 rx_body_321: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r9d + movsxd rcx, eax + movsxd rax, r9d imul rax, rcx mov rcx, rax mov eax, r12d @@ -5680,19 +5699,21 @@ rx_body_322: jno short rx_i_323 call rx_i_343 -rx_i_323: ;MUL_64 +rx_i_323: ;MULH_64 dec ebx jz rx_finish xor r14, 07b07664bh mov ecx, r14d test bl, 63 jnz short rx_body_323 - call rx_read_l1 + call rx_read_l2 rx_body_323: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, -696924877 + mov rcx, r14 + mul rcx + mov rax, rdx mov rcx, rax mov eax, r14d xor eax, 0d675c533h @@ -5731,10 +5752,10 @@ rx_i_325: ;OR_32 rx_body_325: and ecx, 2047 mov rax, 
qword ptr [rsi+rcx*8] - or eax, -281580460 + or eax, r8d mov r13, rax -rx_i_326: ;MUL_64 +rx_i_326: ;MULH_64 dec ebx jz rx_finish xor r11, 0d1b27540h @@ -5746,14 +5767,16 @@ rx_body_326: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, r8 + mov rcx, -1233771581 + mul rcx + mov rax, rdx mov rcx, rax mov eax, r9d xor eax, 0b67623c3h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_327: ;IMULH_64 +rx_i_327: ;DIV_64 dec ebx jz rx_finish xor r9, 09665f98dh @@ -5765,9 +5788,11 @@ rx_body_327: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - imul rcx + ; magic divide by 1572662125 + mov rcx, 12594593786994192665 + mul rcx mov rax, rdx + shr rax, 30 mov r12, rax rx_i_328: ;SHR_64 @@ -5781,8 +5806,7 @@ rx_i_328: ;SHR_64 rx_body_328: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r13 - shr rax, cl + shr rax, 18 mov r9, rax rx_i_329: ;RET @@ -5792,9 +5816,9 @@ rx_i_329: ;RET mov ecx, r11d test bl, 63 jnz short rx_body_329 - call rx_read_l1 + call rx_read_l2 rx_body_329: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov r11, rax cmp rsp, rdi @@ -5808,13 +5832,13 @@ rx_i_330: ;MUL_32 mov ecx, r9d test bl, 63 jnz short rx_body_330 - call rx_read_l1 + call rx_read_l2 rx_body_330: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax - mov eax, r13d + mov eax, -1349816041 imul rax, rcx mov rcx, rax mov eax, r11d @@ -5829,10 +5853,10 @@ rx_i_331: ;FPADD mov ecx, r9d test bl, 63 jnz short rx_body_331 - call rx_read_l1 + call rx_read_l2 rx_body_331: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm3 movaps xmm9, xmm0 @@ -5863,12 +5887,12 @@ rx_i_333: ;OR_64 mov ecx, r14d test bl, 63 jnz short rx_body_333 - call rx_read_l2 + call rx_read_l1 rx_body_333: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or rax, -175125848 + or rax, r12 mov r11, rax rx_i_334: ;ADD_64 @@ -5878,10 +5902,10 @@ 
rx_i_334: ;ADD_64 mov ecx, r8d test bl, 63 jnz short rx_body_334 - call rx_read_l1 + call rx_read_l2 rx_body_334: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] add rax, r13 mov r8, rax @@ -5893,9 +5917,9 @@ rx_i_335: ;SUB_64 mov ecx, r15d test bl, 63 jnz short rx_body_335 - call rx_read_l1 + call rx_read_l2 rx_body_335: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] sub rax, r8 mov rcx, rax @@ -5916,8 +5940,7 @@ rx_body_336: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r10 - ror rax, cl + ror rax, 42 mov rcx, rax mov eax, r11d xor eax, 02644c5ah @@ -5949,10 +5972,10 @@ rx_i_338: ;MUL_64 mov ecx, r12d test bl, 63 jnz short rx_body_338 - call rx_read_l2 + call rx_read_l1 rx_body_338: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r12 mov r11, rax @@ -5978,9 +6001,9 @@ rx_i_340: ;FPADD mov ecx, r15d test bl, 63 jnz short rx_body_340 - call rx_read_l1 + call rx_read_l2 rx_body_340: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm5, xmm0 @@ -6012,9 +6035,9 @@ rx_i_342: ;FPSUB mov ecx, r9d test bl, 63 jnz short rx_body_342 - call rx_read_l2 + call rx_read_l1 rx_body_342: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm3, xmm0 @@ -6045,25 +6068,27 @@ rx_i_344: ;FPSUB mov ecx, r10d test bl, 63 jnz short rx_body_344 - call rx_read_l2 + call rx_read_l1 rx_body_344: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm6 movaps xmm5, xmm0 -rx_i_345: ;MUL_64 +rx_i_345: ;MULH_64 dec ebx jz rx_finish xor r12, 0bbbcdbach mov ecx, r12d test bl, 63 jnz short rx_body_345 - call rx_read_l1 + call rx_read_l2 rx_body_345: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - imul rax, r13 + mov rcx, r13 + mul rcx + mov rax, rdx mov rcx, rax mov eax, r9d xor eax, 0ef03b0ddh @@ -6077,9 +6102,9 @@ rx_i_346: ;AND_32 mov ecx, r12d test bl, 63 
jnz short rx_body_346 - call rx_read_l1 + call rx_read_l2 rx_body_346: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] and eax, r15d mov rcx, rax @@ -6127,9 +6152,9 @@ rx_i_349: ;OR_64 mov ecx, r8d test bl, 63 jnz short rx_body_349 - call rx_read_l2 + call rx_read_l1 rx_body_349: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] or rax, r15 mov r13, rax @@ -6141,9 +6166,9 @@ rx_i_350: ;CALL mov ecx, r9d test bl, 63 jnz short rx_body_350 - call rx_read_l1 + call rx_read_l2 rx_body_350: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r12d @@ -6161,9 +6186,9 @@ rx_i_351: ;MUL_64 mov ecx, r11d test bl, 63 jnz short rx_body_351 - call rx_read_l2 + call rx_read_l1 rx_body_351: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] imul rax, r10 mov r13, rax @@ -6205,18 +6230,20 @@ rx_body_353: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm7 -rx_i_354: ;MUL_64 +rx_i_354: ;MULH_64 dec ebx jz rx_finish xor r13, 02412fc10h mov ecx, r13d test bl, 63 jnz short rx_body_354 - call rx_read_l2 + call rx_read_l1 rx_body_354: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, r13 + mov rcx, r13 + mul rcx + mov rax, rdx mov r13, rax rx_i_355: ;MUL_64 @@ -6226,9 +6253,9 @@ rx_i_355: ;MUL_64 mov ecx, r10d test bl, 63 jnz short rx_body_355 - call rx_read_l1 + call rx_read_l2 rx_body_355: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r14 mov rcx, rax @@ -6237,19 +6264,19 @@ rx_body_355: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_356: ;SUB_64 +rx_i_356: ;MUL_64 dec ebx jz rx_finish xor r10, 01cd85d80h mov ecx, r10d test bl, 63 jnz short rx_body_356 - call rx_read_l2 + call rx_read_l1 rx_body_356: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, r10 + imul rax, r10 mov r11, rax rx_i_357: ;ADD_64 @@ -6259,27 +6286,27 @@ rx_i_357: ;ADD_64 mov ecx, r10d test bl, 63 jnz short rx_body_357 - call rx_read_l2 + 
call rx_read_l1 rx_body_357: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, 820073637 + add rax, r11 mov r11, rax -rx_i_358: ;IMUL_32 +rx_i_358: ;IMULH_64 dec ebx jz rx_finish xor r13, 088fa6e5ah mov ecx, r13d test bl, 63 jnz short rx_body_358 - call rx_read_l2 + call rx_read_l1 rx_body_358: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r11d - imul rax, rcx + mov rcx, r11 + imul rcx + mov rax, rdx mov r9, rax rx_i_359: ;FPSUB @@ -6289,10 +6316,10 @@ rx_i_359: ;FPSUB mov ecx, r10d test bl, 63 jnz short rx_body_359 - call rx_read_l2 + call rx_read_l1 rx_body_359: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm9 movaps xmm4, xmm0 @@ -6350,7 +6377,7 @@ rx_i_362: ;SUB_64 rx_body_362: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, 1082179469 + sub rax, r9 mov rcx, rax mov eax, r15d xor eax, 04080bf8dh @@ -6364,9 +6391,9 @@ rx_i_363: ;FPMUL mov ecx, r12d test bl, 63 jnz short rx_body_363 - call rx_read_l1 + call rx_read_l2 rx_body_363: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 @@ -6390,19 +6417,19 @@ rx_body_364: mov rax, rdx mov r8, rax -rx_i_365: ;MUL_32 +rx_i_365: ;IMUL_32 dec ebx jz rx_finish xor r15, 02db4444ah mov ecx, r15d test bl, 63 jnz short rx_body_365 - call rx_read_l2 + call rx_read_l1 rx_body_365: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r9d + movsxd rcx, eax + movsxd rax, r9d imul rax, rcx mov rcx, rax mov eax, r12d @@ -6417,9 +6444,9 @@ rx_i_366: ;IMUL_32 mov ecx, r12d test bl, 63 jnz short rx_body_366 - call rx_read_l2 + call rx_read_l1 rx_body_366: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r8d @@ -6437,43 +6464,44 @@ rx_i_367: ;ROR_64 mov ecx, r9d test bl, 63 jnz short rx_body_367 - call rx_read_l2 + call rx_read_l1 rx_body_367: xor rbp, rcx - and ecx, 32767 + 
and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r9 - ror rax, cl + ror rax, 18 mov r12, rax -rx_i_368: ;SUB_64 +rx_i_368: ;SUB_32 dec ebx jz rx_finish xor r10, 0a14836bah mov ecx, r10d test bl, 63 jnz short rx_body_368 - call rx_read_l1 + call rx_read_l2 rx_body_368: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - sub rax, r10 + sub eax, r10d mov r8, rax -rx_i_369: ;IMULH_64 +rx_i_369: ;DIV_64 dec ebx jz rx_finish xor r9, 053fe22e2h mov ecx, r9d test bl, 63 jnz short rx_body_369 - call rx_read_l1 + call rx_read_l2 rx_body_369: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r13 - imul rcx + ; magic divide by 470792991 + mov rcx, 1314739240972876203 + mul rcx mov rax, rdx + shr rax, 25 mov r9, rax rx_i_370: ;FPSUB @@ -6483,9 +6511,9 @@ rx_i_370: ;FPSUB mov ecx, r15d test bl, 63 jnz short rx_body_370 - call rx_read_l1 + call rx_read_l2 rx_body_370: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm6 movaps xmm6, xmm0 @@ -6520,9 +6548,9 @@ rx_i_372: ;SHL_64 mov ecx, r10d test bl, 63 jnz short rx_body_372 - call rx_read_l2 + call rx_read_l1 rx_body_372: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r13 shl rax, cl @@ -6535,9 +6563,9 @@ rx_i_373: ;FPMUL mov ecx, r15d test bl, 63 jnz short rx_body_373 - call rx_read_l2 + call rx_read_l1 rx_body_373: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm8 movaps xmm1, xmm0 @@ -6569,11 +6597,11 @@ rx_i_375: ;ADD_64 mov ecx, r9d test bl, 63 jnz short rx_body_375 - call rx_read_l1 + call rx_read_l2 rx_body_375: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - add rax, r15 + add rax, -332030999 mov rcx, rax mov eax, r12d xor eax, 0ec359be9h @@ -6591,7 +6619,7 @@ rx_i_376: ;ADD_64 rx_body_376: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, 476136066 + add rax, r9 mov rcx, rax mov eax, r8d xor eax, 01c614282h @@ -6612,20 +6640,20 @@ rx_body_377: subpd xmm0, 
xmm3 movaps xmm7, xmm0 -rx_i_378: ;MULH_64 +rx_i_378: ;MUL_32 dec ebx jz rx_finish xor r12, 082aa21ach mov ecx, r12d test bl, 63 jnz short rx_body_378 - call rx_read_l1 + call rx_read_l2 rx_body_378: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, 547725353 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r14d + imul rax, rcx mov r15, rax rx_i_379: ;ROR_64 @@ -6635,25 +6663,26 @@ rx_i_379: ;ROR_64 mov ecx, r10d test bl, 63 jnz short rx_body_379 - call rx_read_l2 + call rx_read_l1 rx_body_379: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - ror rax, 56 + mov rcx, r9 + ror rax, cl mov r13, rax -rx_i_380: ;SUB_32 +rx_i_380: ;MUL_64 dec ebx jz rx_finish xor r11, 0229e3d6eh mov ecx, r11d test bl, 63 jnz short rx_body_380 - call rx_read_l1 + call rx_read_l2 rx_body_380: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - sub eax, -1443002912 + imul rax, rax, r10 mov rcx, rax mov eax, r13d xor eax, 0a9fd85e0h @@ -6667,10 +6696,10 @@ rx_i_381: ;XOR_32 mov ecx, r8d test bl, 63 jnz short rx_body_381 - call rx_read_l2 + call rx_read_l1 rx_body_381: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] xor eax, r14d mov r9, rax @@ -6682,9 +6711,9 @@ rx_i_382: ;ROL_64 mov ecx, r14d test bl, 63 jnz short rx_body_382 - call rx_read_l1 + call rx_read_l2 rx_body_382: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] rol rax, 55 mov r11, rax @@ -6718,7 +6747,7 @@ rx_i_384: ;XOR_64 rx_body_384: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor rax, r11 + xor rax, 1413715044 mov rcx, rax mov eax, r9d xor eax, 054439464h @@ -6750,26 +6779,26 @@ rx_i_386: ;FPADD mov ecx, r9d test bl, 63 jnz short rx_body_386 - call rx_read_l2 + call rx_read_l1 rx_body_386: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm8 movaps xmm9, xmm0 -rx_i_387: ;SUB_64 +rx_i_387: ;SUB_32 dec ebx jz rx_finish xor r9, 0d4f7bc6ah mov ecx, r9d test bl, 63 jnz short rx_body_387 - call 
rx_read_l2 + call rx_read_l1 rx_body_387: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, r15 + sub eax, r15d mov r9, rax rx_i_388: ;RET @@ -6779,9 +6808,9 @@ rx_i_388: ;RET mov ecx, r8d test bl, 63 jnz short rx_body_388 - call rx_read_l2 + call rx_read_l1 rx_body_388: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r11d @@ -6799,9 +6828,9 @@ rx_i_389: ;JUMP mov ecx, r11d test bl, 63 jnz short rx_body_389 - call rx_read_l1 + call rx_read_l2 rx_body_389: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov r14, rax cmp r9d, -350609584 @@ -6842,11 +6871,12 @@ rx_i_392: ;SAR_64 mov ecx, r14d test bl, 63 jnz short rx_body_392 - call rx_read_l2 + call rx_read_l1 rx_body_392: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sar rax, 0 + mov rcx, r9 + sar rax, cl mov rcx, rax mov eax, r13d xor eax, 08c4a0f0dh @@ -6864,7 +6894,7 @@ rx_i_393: ;AND_64 rx_body_393: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and rax, 552339548 + and rax, r12 mov rcx, rax mov eax, r13d xor eax, 020ec085ch @@ -6878,14 +6908,14 @@ rx_i_394: ;FPADD mov ecx, r12d test bl, 63 jnz short rx_body_394 - call rx_read_l2 + call rx_read_l1 rx_body_394: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm6, xmm0 -rx_i_395: ;IMUL_32 +rx_i_395: ;IMULH_64 dec ebx jz rx_finish xor r8, 04ae4fe8ch @@ -6897,9 +6927,9 @@ rx_body_395: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r13d - imul rax, rcx + mov rcx, r13 + imul rcx + mov rax, rdx mov r8, rax rx_i_396: ;ROR_64 @@ -6909,9 +6939,9 @@ rx_i_396: ;ROR_64 mov ecx, r10d test bl, 63 jnz short rx_body_396 - call rx_read_l1 + call rx_read_l2 rx_body_396: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] ror rax, 62 mov rcx, rax @@ -6920,19 +6950,19 @@ rx_body_396: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_397: ;SUB_64 +rx_i_397: ;SUB_32 dec 
ebx jz rx_finish xor r8, 0916f3819h mov ecx, r8d test bl, 63 jnz short rx_body_397 - call rx_read_l1 + call rx_read_l2 rx_body_397: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - sub rax, r12 + sub eax, r12d mov rcx, rax mov eax, r10d xor eax, 0146db5dfh @@ -6946,11 +6976,12 @@ rx_i_398: ;SHR_64 mov ecx, r8d test bl, 63 jnz short rx_body_398 - call rx_read_l1 + call rx_read_l2 rx_body_398: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - shr rax, 44 + mov rcx, r8 + shr rax, cl mov rcx, rax mov eax, r11d xor eax, 0724e7136h @@ -6982,11 +7013,11 @@ rx_i_400: ;AND_64 mov ecx, r13d test bl, 63 jnz short rx_body_400 - call rx_read_l1 + call rx_read_l2 rx_body_400: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - and rax, r11 + and rax, -1800645748 mov rcx, rax mov eax, r14d xor eax, 094ac538ch @@ -7000,9 +7031,9 @@ rx_i_401: ;FPSUB mov ecx, r13d test bl, 63 jnz short rx_body_401 - call rx_read_l1 + call rx_read_l2 rx_body_401: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm4 movaps xmm6, xmm0 @@ -7027,7 +7058,7 @@ rx_body_402: je short rx_i_403 ret -rx_i_403: ;IMUL_32 +rx_i_403: ;IMULH_64 dec ebx jz rx_finish xor r9, 0e59500f7h @@ -7038,29 +7069,29 @@ rx_i_403: ;IMUL_32 rx_body_403: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r12d - imul rax, rcx + mov rcx, r12 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r11d xor eax, 01ff394a0h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_404: ;MULH_64 +rx_i_404: ;MUL_32 dec ebx jz rx_finish xor r15, 05b8ceb2fh mov ecx, r15d test bl, 63 jnz short rx_body_404 - call rx_read_l1 + call rx_read_l2 rx_body_404: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r8d + imul rax, rcx mov r15, rax rx_i_405: ;CALL @@ -7070,9 +7101,9 @@ rx_i_405: ;CALL mov ecx, r8d test bl, 63 jnz short rx_body_405 - call rx_read_l1 + 
call rx_read_l2 rx_body_405: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r12d @@ -7090,9 +7121,9 @@ rx_i_406: ;FPDIV mov ecx, r9d test bl, 63 jnz short rx_body_406 - call rx_read_l2 + call rx_read_l1 rx_body_406: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm7 movaps xmm1, xmm0 @@ -7111,10 +7142,10 @@ rx_i_407: ;FPSUB mov ecx, r14d test bl, 63 jnz short rx_body_407 - call rx_read_l2 + call rx_read_l1 rx_body_407: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm9 movaps xmm8, xmm0 @@ -7126,11 +7157,11 @@ rx_i_408: ;MUL_64 mov ecx, r15d test bl, 63 jnz short rx_body_408 - call rx_read_l1 + call rx_read_l2 rx_body_408: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - imul rax, r9 + imul rax, 693109961 mov rcx, rax mov eax, r10d xor eax, 0295004c9h @@ -7159,9 +7190,9 @@ rx_i_410: ;RET mov ecx, r15d test bl, 63 jnz short rx_body_410 - call rx_read_l1 + call rx_read_l2 rx_body_410: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov r8, rax cmp rsp, rdi @@ -7229,33 +7260,33 @@ rx_i_414: ;AND_64 mov ecx, r14d test bl, 63 jnz short rx_body_414 - call rx_read_l1 + call rx_read_l2 rx_body_414: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - and rax, r8 + and rax, -378293327 mov rcx, rax mov eax, r10d xor eax, 0e973b3b1h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_415: ;IMUL_32 +rx_i_415: ;IMULH_64 dec ebx jz rx_finish xor r8, 08c3e59a1h mov ecx, r8d test bl, 63 jnz short rx_body_415 - call rx_read_l1 + call rx_read_l2 rx_body_415: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, -538093385 - imul rax, rcx + mov rcx, r8 + imul rcx + mov rax, rdx mov r9, rax rx_i_416: ;FPADD @@ -7291,18 +7322,20 @@ rx_body_417: sub rax, r12 mov r10, rax -rx_i_418: ;MUL_64 +rx_i_418: ;MULH_64 dec ebx jz rx_finish xor r10, 02bd61c5fh mov 
ecx, r10d test bl, 63 jnz short rx_body_418 - call rx_read_l1 + call rx_read_l2 rx_body_418: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - imul rax, r11 + mov rcx, r11 + mul rcx + mov rax, rdx mov r10, rax rx_i_419: ;OR_64 @@ -7312,9 +7345,9 @@ rx_i_419: ;OR_64 mov ecx, r9d test bl, 63 jnz short rx_body_419 - call rx_read_l1 + call rx_read_l2 rx_body_419: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] or rax, r14 mov rcx, rax @@ -7334,8 +7367,7 @@ rx_i_420: ;ROR_64 rx_body_420: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r11 - ror rax, cl + ror rax, 38 mov r9, rax rx_i_421: ;CALL @@ -7345,33 +7377,33 @@ rx_i_421: ;CALL mov ecx, r12d test bl, 63 jnz short rx_body_421 - call rx_read_l2 + call rx_read_l1 rx_body_421: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov r10, rax cmp r8d, -1600409762 jo short rx_i_422 call rx_i_31 -rx_i_422: ;MUL_32 +rx_i_422: ;IMUL_32 dec ebx jz rx_finish xor r11, 04dd16ca4h mov ecx, r11d test bl, 63 jnz short rx_body_422 - call rx_read_l2 + call rx_read_l1 rx_body_422: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r10d + movsxd rcx, eax + movsxd rax, r10d imul rax, rcx mov r13, rax -rx_i_423: ;SUB_64 +rx_i_423: ;MUL_64 dec ebx jz rx_finish xor r12, 04df5ce05h @@ -7382,7 +7414,7 @@ rx_i_423: ;SUB_64 rx_body_423: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, r10 + imul rax, r10 mov rcx, rax mov eax, r15d xor eax, 0a5d40d0ah @@ -7396,10 +7428,10 @@ rx_i_424: ;FPADD mov ecx, r13d test bl, 63 jnz short rx_body_424 - call rx_read_l2 + call rx_read_l1 rx_body_424: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm7 movaps xmm9, xmm0 @@ -7408,7 +7440,7 @@ rx_body_424: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm9 -rx_i_425: ;MUL_32 +rx_i_425: ;IMUL_32 dec ebx jz rx_finish xor r8, 0a3c5391dh @@ -7419,25 +7451,27 @@ rx_i_425: ;MUL_32 rx_body_425: and ecx, 2047 
mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r10d + movsxd rcx, eax + movsxd rax, r10d imul rax, rcx mov r14, rax -rx_i_426: ;IMULH_64 +rx_i_426: ;DIV_64 dec ebx jz rx_finish xor r12, 09dd55ba0h mov ecx, r12d test bl, 63 jnz short rx_body_426 - call rx_read_l2 + call rx_read_l1 rx_body_426: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r9 - imul rcx + ; magic divide by 3704238575 + mov rcx, 1336782190693946083 + mul rcx mov rax, rdx + shr rax, 28 mov rcx, rax mov eax, r14d xor eax, 0dcca31efh @@ -7456,7 +7490,7 @@ rx_body_427: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r11 + mov rcx, -2146332428 mul rcx mov rax, rdx mov rcx, rax @@ -7492,11 +7526,11 @@ rx_i_429: ;MUL_64 mov ecx, r12d test bl, 63 jnz short rx_body_429 - call rx_read_l2 + call rx_read_l1 rx_body_429: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, 1990438276 + imul rax, rax, r9 mov r15, rax rx_i_430: ;FPADD @@ -7540,14 +7574,14 @@ rx_i_432: ;SUB_64 mov ecx, r10d test bl, 63 jnz short rx_body_432 - call rx_read_l2 + call rx_read_l1 rx_body_432: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, r10 + sub rax, 876274173 mov r8, rax -rx_i_433: ;ADD_64 +rx_i_433: ;ADD_32 dec ebx jz rx_finish xor r13, 0bbb88499h @@ -7558,7 +7592,7 @@ rx_i_433: ;ADD_64 rx_body_433: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, r12 + add eax, 1193456495 mov rcx, rax mov eax, r12d xor eax, 04722b36fh @@ -7572,9 +7606,9 @@ rx_i_434: ;FPDIV mov ecx, r13d test bl, 63 jnz short rx_body_434 - call rx_read_l2 + call rx_read_l1 rx_body_434: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] divpd xmm0, xmm3 movaps xmm1, xmm0 @@ -7598,7 +7632,7 @@ rx_body_435: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, r15 + imul rax, 1971717631 mov rcx, rax mov eax, r9d xor eax, 0758605ffh @@ -7612,9 +7646,9 @@ rx_i_436: ;FPADD mov ecx, r15d test bl, 63 jnz short 
rx_body_436 - call rx_read_l2 + call rx_read_l1 rx_body_436: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm2 movaps xmm7, xmm0 @@ -7665,11 +7699,11 @@ rx_i_439: ;OR_64 mov ecx, r13d test bl, 63 jnz short rx_body_439 - call rx_read_l2 + call rx_read_l1 rx_body_439: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - or rax, r15 + or rax, -1299288575 mov rcx, rax mov eax, r10d xor eax, 0b28e6e01h @@ -7705,7 +7739,7 @@ rx_body_441: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, 529736748 + add rax, r14 mov rcx, rax mov eax, r9d xor eax, 01f93242ch @@ -7753,9 +7787,9 @@ rx_i_444: ;FPSUB mov ecx, r8d test bl, 63 jnz short rx_body_444 - call rx_read_l2 + call rx_read_l1 rx_body_444: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm7 movaps xmm5, xmm0 @@ -7881,7 +7915,7 @@ rx_body_451: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, -287502157 + add rax, r10 mov r8, rax rx_i_452: ;RET @@ -7891,9 +7925,9 @@ rx_i_452: ;RET mov ecx, r13d test bl, 63 jnz short rx_body_452 - call rx_read_l1 + call rx_read_l2 rx_body_452: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r11d @@ -7904,20 +7938,20 @@ rx_body_452: je short rx_i_453 ret -rx_i_453: ;IMUL_32 +rx_i_453: ;IMULH_64 dec ebx jz rx_finish xor r11, 0a2096aa4h mov ecx, r11d test bl, 63 jnz short rx_body_453 - call rx_read_l1 + call rx_read_l2 rx_body_453: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r14d - imul rax, rcx + mov rcx, r14 + imul rcx + mov rax, rdx mov r8, rax rx_i_454: ;FPADD @@ -7927,9 +7961,9 @@ rx_i_454: ;FPADD mov ecx, r13d test bl, 63 jnz short rx_body_454 - call rx_read_l1 + call rx_read_l2 rx_body_454: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm4, xmm0 @@ -7960,11 +7994,11 @@ rx_i_456: ;AND_64 mov ecx, r9d test bl, 63 jnz short 
rx_body_456 - call rx_read_l2 + call rx_read_l1 rx_body_456: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and rax, r11 + and rax, 401943615 mov rcx, rax mov eax, r9d xor eax, 017f52c3fh @@ -7983,7 +8017,7 @@ rx_body_457: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, r10 + sub rax, 1482178870 mov rcx, rax mov eax, r10d xor eax, 058584136h @@ -7997,14 +8031,15 @@ rx_i_458: ;SAR_64 mov ecx, r11d test bl, 63 jnz short rx_body_458 - call rx_read_l1 + call rx_read_l2 rx_body_458: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - sar rax, 22 + mov rcx, r8 + sar rax, cl mov r14, rax -rx_i_459: ;SUB_64 +rx_i_459: ;MUL_64 dec ebx jz rx_finish xor r9, 0346f46adh @@ -8015,14 +8050,14 @@ rx_i_459: ;SUB_64 rx_body_459: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, 381354340 + imul rax, rax, r9 mov rcx, rax mov eax, r13d xor eax, 016bb0164h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_460: ;ADD_64 +rx_i_460: ;ADD_32 dec ebx jz rx_finish xor r11, 098ab71fch @@ -8033,7 +8068,7 @@ rx_i_460: ;ADD_64 rx_body_460: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, r14 + add eax, -347784553 mov rcx, rax mov eax, r12d xor eax, 0eb453a97h @@ -8047,11 +8082,11 @@ rx_i_461: ;XOR_64 mov ecx, r11d test bl, 63 jnz short rx_body_461 - call rx_read_l2 + call rx_read_l1 rx_body_461: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - xor rax, r13 + xor rax, 1659853721 mov rcx, rax mov eax, r12d xor eax, 062ef5b99h @@ -8065,14 +8100,14 @@ rx_i_462: ;ADD_64 mov ecx, r10d test bl, 63 jnz short rx_body_462 - call rx_read_l2 + call rx_read_l1 rx_body_462: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, -1734323376 + add rax, r8 mov r15, rax -rx_i_463: ;ADD_64 +rx_i_463: ;ADD_32 dec ebx jz rx_finish xor r9, 08c29341h @@ -8083,7 +8118,7 @@ rx_i_463: ;ADD_64 rx_body_463: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - add rax, r15 + add eax, r15d mov r10, rax rx_i_464: ;MUL_64 @@ 
-8111,14 +8146,14 @@ rx_i_465: ;FPADD mov ecx, r12d test bl, 63 jnz short rx_body_465 - call rx_read_l2 + call rx_read_l1 rx_body_465: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm5 movaps xmm2, xmm0 -rx_i_466: ;MUL_32 +rx_i_466: ;IMUL_32 dec ebx jz rx_finish xor r13, 05c541c42h @@ -8130,8 +8165,8 @@ rx_body_466: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, 282682508 + movsxd rcx, eax + mov rax, 282682508 imul rax, rcx mov r9, rax @@ -8150,7 +8185,7 @@ rx_body_467: addpd xmm0, xmm9 movaps xmm8, xmm0 -rx_i_468: ;IMUL_32 +rx_i_468: ;IMULH_64 dec ebx jz rx_finish xor r8, 091044dc3h @@ -8162,9 +8197,9 @@ rx_body_468: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, -13394825 - imul rax, rcx + mov rcx, r8 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r8d xor eax, 0ff339c77h @@ -8178,12 +8213,12 @@ rx_i_469: ;MUL_32 mov ecx, r9d test bl, 63 jnz short rx_body_469 - call rx_read_l1 + call rx_read_l2 rx_body_469: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov ecx, eax - mov eax, 294019485 + mov eax, r9d imul rax, rcx mov rcx, rax mov eax, r9d @@ -8198,9 +8233,9 @@ rx_i_470: ;OR_64 mov ecx, r14d test bl, 63 jnz short rx_body_470 - call rx_read_l1 + call rx_read_l2 rx_body_470: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] or rax, r11 mov rcx, rax @@ -8216,9 +8251,9 @@ rx_i_471: ;IMUL_32 mov ecx, r14d test bl, 63 jnz short rx_body_471 - call rx_read_l1 + call rx_read_l2 rx_body_471: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax movsxd rax, r13d @@ -8232,10 +8267,10 @@ rx_i_472: ;JUMP mov ecx, r9d test bl, 63 jnz short rx_body_472 - call rx_read_l2 + call rx_read_l1 rx_body_472: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov r10, rax cmp r10d, 1738497427 @@ -8252,7 +8287,7 @@ rx_i_473: ;MUL_64 rx_body_473: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - 
imul rax, rax, -751043211 + imul rax, rax, r11 mov r12, rax rx_i_474: ;JUMP @@ -8262,10 +8297,10 @@ rx_i_474: ;JUMP mov ecx, r9d test bl, 63 jnz short rx_body_474 - call rx_read_l2 + call rx_read_l1 rx_body_474: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov r15, rax cmp r15d, -233120543 @@ -8278,9 +8313,9 @@ rx_i_475: ;FPSUB mov ecx, r10d test bl, 63 jnz short rx_body_475 - call rx_read_l2 + call rx_read_l1 rx_body_475: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm9 movaps xmm7, xmm0 @@ -8306,9 +8341,9 @@ rx_i_477: ;FPADD mov ecx, r12d test bl, 63 jnz short rx_body_477 - call rx_read_l1 + call rx_read_l2 rx_body_477: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm6, xmm0 @@ -8324,9 +8359,9 @@ rx_i_478: ;MUL_64 mov ecx, r14d test bl, 63 jnz short rx_body_478 - call rx_read_l1 + call rx_read_l2 rx_body_478: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] imul rax, r10 mov r12, rax @@ -8363,7 +8398,7 @@ rx_body_480: addpd xmm0, xmm4 movaps xmm6, xmm0 -rx_i_481: ;IMUL_32 +rx_i_481: ;IMULH_64 dec ebx jz rx_finish xor r14, 0225ba1f9h @@ -8374,9 +8409,9 @@ rx_i_481: ;IMUL_32 rx_body_481: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r13d - imul rax, rcx + mov rcx, r13 + imul rcx + mov rax, rdx mov r12, rax rx_i_482: ;AND_32 @@ -8386,11 +8421,11 @@ rx_i_482: ;AND_32 mov ecx, r14d test bl, 63 jnz short rx_body_482 - call rx_read_l2 + call rx_read_l1 rx_body_482: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - and eax, r12d + and eax, 1304556205 mov r11, rax rx_i_483: ;FPADD @@ -8429,10 +8464,10 @@ rx_i_485: ;JUMP mov ecx, r13d test bl, 63 jnz short rx_body_485 - call rx_read_l2 + call rx_read_l1 rx_body_485: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r15d @@ -8453,7 +8488,7 @@ rx_i_486: ;ADD_64 rx_body_486: and ecx, 2047 mov rax, 
qword ptr [rsi+rcx*8] - add rax, 942846898 + add rax, r8 mov rcx, rax mov eax, r8d xor eax, 03832b3b2h @@ -8471,7 +8506,7 @@ rx_i_487: ;SUB_64 rx_body_487: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - sub rax, -333279706 + sub rax, r9 mov r11, rax rx_i_488: ;IMUL_32 @@ -8481,9 +8516,9 @@ rx_i_488: ;IMUL_32 mov ecx, r12d test bl, 63 jnz short rx_body_488 - call rx_read_l1 + call rx_read_l2 rx_body_488: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] movsxd rcx, eax mov rax, 297357073 @@ -8517,10 +8552,10 @@ rx_i_490: ;ROR_64 mov ecx, r11d test bl, 63 jnz short rx_body_490 - call rx_read_l2 + call rx_read_l1 rx_body_490: xor rbp, rcx - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] mov rcx, r9 ror rax, cl @@ -8537,9 +8572,9 @@ rx_i_491: ;FPADD mov ecx, r8d test bl, 63 jnz short rx_body_491 - call rx_read_l2 + call rx_read_l1 rx_body_491: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] addpd xmm0, xmm9 movaps xmm7, xmm0 @@ -8555,25 +8590,20 @@ rx_i_492: ;IDIV_64 mov ecx, r9d test bl, 63 jnz short rx_body_492 - call rx_read_l2 + call rx_read_l1 rx_body_492: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov edx, r9d - cmp edx, -1 - jne short safe_idiv_492 + ; magic divide by -1779388031 mov rcx, rax - rol rcx, 1 - dec rcx - jz short result_idiv_492 -safe_idiv_492: - mov ecx, 1 - test edx, edx - cmovne ecx, edx - movsxd rcx, ecx - cqo - idiv rcx -result_idiv_492: + mov rdx, 7315366159790064091 + imul rdx + mov rax, rdx + xor edx, edx + sub rax, rcx + sar rax, 30 + sets dl + add rax, rdx mov r12, rax rx_i_493: ;FPSUB @@ -8590,20 +8620,20 @@ rx_body_493: subpd xmm0, xmm9 movaps xmm4, xmm0 -rx_i_494: ;MULH_64 +rx_i_494: ;MUL_32 dec ebx jz rx_finish xor r10, 0b0d50e46h mov ecx, r10d test bl, 63 jnz short rx_body_494 - call rx_read_l2 + call rx_read_l1 rx_body_494: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r11 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r11d + 
imul rax, rcx mov r14, rax rx_i_495: ;FPMUL @@ -8613,9 +8643,9 @@ rx_i_495: ;FPMUL mov ecx, r11d test bl, 63 jnz short rx_body_495 - call rx_read_l1 + call rx_read_l2 rx_body_495: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 @@ -8623,7 +8653,7 @@ rx_body_495: andps xmm0, xmm1 movaps xmm8, xmm0 -rx_i_496: ;DIV_64 +rx_i_496: ;IDIV_64 dec ebx jz rx_finish xor r14, 0fe757b73h @@ -8634,9 +8664,14 @@ rx_i_496: ;DIV_64 rx_body_496: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, -359802064 + ; magic divide by -359802064 + mov rdx, -860153514353783887 + imul rdx + mov rax, rdx xor edx, edx - div rcx + sar rax, 24 + sets dl + add rax, rdx mov r9, rax rx_i_497: ;FPMUL @@ -8678,19 +8713,19 @@ rx_body_498: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm8 -rx_i_499: ;MUL_32 +rx_i_499: ;IMUL_32 dec ebx jz rx_finish xor r12, 08925556bh mov ecx, r12d test bl, 63 jnz short rx_body_499 - call rx_read_l2 + call rx_read_l1 rx_body_499: - and ecx, 32767 + and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, -1795485757 + movsxd rcx, eax + mov rax, -1795485757 imul rax, rcx mov r8, rax @@ -8701,9 +8736,9 @@ rx_i_500: ;FPSQRT mov ecx, r10d test bl, 63 jnz short rx_body_500 - call rx_read_l1 + call rx_read_l2 rx_body_500: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] andps xmm0, xmm10 sqrtpd xmm2, xmm0 @@ -8733,10 +8768,10 @@ rx_i_502: ;RET mov ecx, r10d test bl, 63 jnz short rx_body_502 - call rx_read_l1 + call rx_read_l2 rx_body_502: xor rbp, rcx - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] mov rcx, rax mov eax, r9d @@ -8754,9 +8789,9 @@ rx_i_503: ;FPSUB mov ecx, r13d test bl, 63 jnz short rx_body_503 - call rx_read_l1 + call rx_read_l2 rx_body_503: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm2 movaps xmm9, xmm0 @@ -8790,9 +8825,9 @@ rx_i_505: ;FPSUB mov ecx, r12d test bl, 63 jnz short rx_body_505 - call rx_read_l2 
+ call rx_read_l1 rx_body_505: - and ecx, 32767 + and ecx, 2047 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm4 movaps xmm8, xmm0 @@ -8808,9 +8843,9 @@ rx_i_506: ;FPSUB mov ecx, r9d test bl, 63 jnz short rx_body_506 - call rx_read_l1 + call rx_read_l2 rx_body_506: - and ecx, 2047 + and ecx, 32767 cvtdq2pd xmm0, qword ptr [rsi+rcx*8] subpd xmm0, xmm9 movaps xmm3, xmm0 @@ -8887,12 +8922,11 @@ rx_i_511: ;SHR_64 mov ecx, r11d test bl, 63 jnz short rx_body_511 - call rx_read_l1 + call rx_read_l2 rx_body_511: - and ecx, 2047 + and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r10 - shr rax, cl + shr rax, 56 mov r11, rax jmp rx_i_0 From 1426fcbab5a8f1aa6213203f5713ddacbd70abc6 Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 12 Jan 2019 16:05:09 +0100 Subject: [PATCH 14/35] Print average program code size Fixed assembly for MUL_64 and IMUL_32 Division weight 4 -> 8 --- src/AssemblyGeneratorX86.cpp | 4 +- src/CompiledVirtualMachine.cpp | 3 +- src/CompiledVirtualMachine.hpp | 4 + src/JitCompilerX86.cpp | 18 +- src/JitCompilerX86.hpp | 2 + src/instructionWeights.hpp | 11 +- src/main.cpp | 2 + src/program.inc | 518 +++++++++++++++++++-------------- 8 files changed, 337 insertions(+), 225 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index efa0818..4cb009e 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -222,7 +222,7 @@ namespace RandomX { void AssemblyGeneratorX86::h_MUL_64(Instruction& instr, int i) { genar(instr, i); asmCode << "\timul rax, "; - if ((instr.locb & 7) >= 6) { + if ((instr.locb & 3) == 0) { asmCode << "rax, "; } genbia(instr); @@ -250,7 +250,7 @@ namespace RandomX { void AssemblyGeneratorX86::h_IMUL_32(Instruction& instr, int i) { genar(instr, i); asmCode << "\tmovsxd rcx, eax" << std::endl; - if ((instr.locb & 7) >= 6) { + if ((instr.locb & 3) == 0) { asmCode << "\tmov rax, " << instr.imm32 << std::endl; } else { diff --git a/src/CompiledVirtualMachine.cpp 
b/src/CompiledVirtualMachine.cpp index 7803003..ef78d2f 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -26,7 +26,7 @@ along with RandomX. If not, see. namespace RandomX { CompiledVirtualMachine::CompiledVirtualMachine(bool softAes) : VirtualMachine(softAes) { - + totalSize = 0; } void CompiledVirtualMachine::setDataset(dataset_t ds, bool lightClient) { @@ -48,6 +48,7 @@ namespace RandomX { void CompiledVirtualMachine::execute() { //executeProgram(reg, mem, scratchpad, readDataset); + totalSize += compiler.getCodeSize(); compiler.getProgramFunc()(reg, mem, scratchpad); #ifdef TRACEVM for (int32_t i = InstructionCount - 1; i >= 0; --i) { diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp index cf131d1..a77bdb8 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -44,10 +44,14 @@ namespace RandomX { void* getProgram() { return compiler.getCode(); } + uint64_t getTotalSize() { + return totalSize; + } private: #ifdef TRACEVM convertible_t tracepad[InstructionCount]; #endif JitCompilerX86 compiler; + uint64_t totalSize; }; } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 32bad3a..2a101f0 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -116,6 +116,10 @@ namespace RandomX { const int32_t readDatasetL1Offset = readDatasetL2Offset - readDatasetL1Size; const int32_t epilogueOffset = readDatasetL1Offset - epilogueSize; + size_t JitCompilerX86::getCodeSize() { + return codePos - prologueSize + readDatasetL1Size + readDatasetL2Size; + } + JitCompilerX86::JitCompilerX86() { #ifdef _WIN32 code = (uint8_t*)VirtualAlloc(nullptr, CodeSize, MEM_COMMIT, PAGE_EXECUTE_READWRITE); @@ -196,6 +200,7 @@ namespace RandomX { void JitCompilerX86::genar(Instruction& instr) { gena(instr); emit(0xce048b48); //mov rax,QWORD PTR [rsi+rcx*8] + emit(0xdc580f66); } void JitCompilerX86::genaf(Instruction& instr) { @@ -437,7 +442,7 @@ namespace 
RandomX { void JitCompilerX86::h_DIV_64(Instruction& instr, int i) { genar(instr); - if (instr.locb & 3) { + if (instr.locb & 7) { #ifdef MAGIC_DIVISION if (instr.imm32 != 0) { uint32_t divisor = instr.imm32; @@ -496,7 +501,7 @@ namespace RandomX { void JitCompilerX86::h_IDIV_64(Instruction& instr, int i) { genar(instr); - if (instr.locb & 3) { + if (instr.locb & 7) { #ifdef MAGIC_DIVISION int64_t divisor = instr.imm32; if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { @@ -566,8 +571,8 @@ namespace RandomX { #ifndef MAGIC_DIVISION } #endif - emit(0xc88b480b75fffa83); - emit(0x1274c9ff48c1d148); + emit(0xd8f7480575fffa83); //cmp edx,-1 + emit(uint16_t(0x12eb)); //jmp result emit(0x0fd28500000001b9); emit(0x489948c96348ca45); emit(uint16_t(0xf9f7)); //idiv rcx @@ -766,6 +771,10 @@ namespace RandomX { emitByte(0xc3); //ret } + void JitCompilerX86::h_NOP(Instruction& instr, int i) { + genar(instr); + } + #include "instructionWeights.hpp" #define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x)) @@ -801,6 +810,7 @@ namespace RandomX { INST_HANDLE(JUMP) INST_HANDLE(CALL) INST_HANDLE(RET) + INST_HANDLE(NOP) }; #endif diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index d95cbad..0c0c48c 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -51,6 +51,7 @@ namespace RandomX { uint8_t* getCode() { return code; } + size_t getCodeSize(); private: static InstructionGeneratorX86 engine[256]; uint8_t* code; @@ -114,6 +115,7 @@ namespace RandomX { void h_JUMP(Instruction&, int); void h_CALL(Instruction&, int); void h_RET(Instruction&, int); + void h_NOP(Instruction&, int); }; } \ No newline at end of file diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 7771a35..de027b7 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -24,12 +24,12 @@ along with RandomX. If not, see. 
#define WT_SUB_64 12 #define WT_SUB_32 2 #define WT_MUL_64 23 -#define WT_MULH_64 10 +#define WT_MULH_64 5 #define WT_MUL_32 15 #define WT_IMUL_32 15 -#define WT_IMULH_64 6 -#define WT_DIV_64 4 -#define WT_IDIV_64 4 +#define WT_IMULH_64 3 +#define WT_DIV_64 8 +#define WT_IDIV_64 8 #define WT_AND_64 4 #define WT_AND_32 2 #define WT_OR_64 4 @@ -50,6 +50,7 @@ along with RandomX. If not, see. #define WT_JUMP 11 #define WT_CALL 11 #define WT_RET 12 +#define WT_NOP 0 constexpr int wtSum = WT_ADD_64 + WT_ADD_32 + WT_SUB_64 + WT_SUB_32 + \ @@ -57,7 +58,7 @@ WT_MUL_64 + WT_MULH_64 + WT_MUL_32 + WT_IMUL_32 + WT_IMULH_64 + \ WT_DIV_64 + WT_IDIV_64 + WT_AND_64 + WT_AND_32 + WT_OR_64 + \ WT_OR_32 + WT_XOR_64 + WT_XOR_32 + WT_SHL_64 + WT_SHR_64 + \ WT_SAR_64 + WT_ROL_64 + WT_ROR_64 + WT_FPADD + WT_FPSUB + WT_FPMUL \ -+ WT_FPDIV + WT_FPSQRT + WT_FPROUND + WT_JUMP + WT_CALL + WT_RET; ++ WT_FPDIV + WT_FPSQRT + WT_FPROUND + WT_JUMP + WT_CALL + WT_RET + WT_NOP; static_assert(wtSum == 256, "Sum of instruction weights must be 256"); diff --git a/src/main.cpp b/src/main.cpp index a0ffc0a..6366821 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -270,6 +270,8 @@ int main(int argc, char** argv) { } else { mine(vms[0], std::ref(atomicNonce), std::ref(result), programCount, 0); + if (compiled) + std::cout << "Average program size: " << ((RandomX::CompiledVirtualMachine*)vms[0])->getTotalSize() / programCount << std::endl; } double elapsed = sw.getElapsed(); std::cout << "Calculated result: "; diff --git a/src/program.inc b/src/program.inc index 79a7dda..538f664 100644 --- a/src/program.inc +++ b/src/program.inc @@ -19,7 +19,7 @@ rx_body_0: ja short rx_i_1 call rx_i_30 -rx_i_1: ;DIV_64 +rx_i_1: ;IDIV_64 dec ebx jz rx_finish xor r15, 06afc2fa4h @@ -30,12 +30,19 @@ rx_i_1: ;DIV_64 rx_body_1: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov ecx, 1 mov edx, r10d + cmp edx, -1 + jne short body_idiv_1 + neg rax + jmp short result_idiv_1 +body_idiv_1: + mov ecx, 1 test edx, edx cmovne ecx, edx - 
xor edx, edx - div rcx + movsxd rcx, ecx + cqo + idiv rcx +result_idiv_1: mov r12, rax rx_i_2: ;JUMP @@ -80,7 +87,7 @@ rx_body_3: and eax, 32767 movhpd qword ptr [rsi + rax * 8], xmm8 -rx_i_4: ;MULH_64 +rx_i_4: ;MUL_32 dec ebx jz rx_finish xor r14, 077daefb4h @@ -91,16 +98,16 @@ rx_i_4: ;MULH_64 rx_body_4: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r14 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r14d + imul rax, rcx mov rcx, rax mov eax, r9d xor eax, 06ce10c20h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_5: ;MUL_32 +rx_i_5: ;IMUL_32 dec ebx jz rx_finish xor r15, 0379f9ee0h @@ -112,8 +119,8 @@ rx_body_5: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, 1037420699 + movsxd rcx, eax + movsxd rax, r12d imul rax, rcx mov r12, rax @@ -171,7 +178,7 @@ rx_body_8: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_9: ;DIV_64 +rx_i_9: ;IDIV_64 dec ebx jz rx_finish xor r14, 085121c54h @@ -184,10 +191,13 @@ rx_body_9: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] ; magic divide by 565870810 - mov rcx, 8750690209911200579 - mul rcx + mov rdx, 8750690209911200579 + imul rdx mov rax, rdx - shr rax, 28 + xor edx, edx + sar rax, 28 + sets dl + add rax, rdx mov r10, rax rx_i_10: ;AND_64 @@ -434,10 +444,10 @@ rx_i_23: ;MUL_64 rx_body_23: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, 1283724485 + imul rax, rax, 1283724485 mov r8, rax -rx_i_24: ;IMUL_32 +rx_i_24: ;DIV_64 dec ebx jz rx_finish xor r8, 070d3b8c7h @@ -449,9 +459,12 @@ rx_body_24: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r15d - imul rax, rcx + mov ecx, 1 + mov edx, r15d + test edx, edx + cmovne ecx, edx + xor edx, edx + div rcx mov rcx, rax mov eax, r15d xor eax, 099b77a68h @@ -480,7 +493,7 @@ rx_body_25: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm6 -rx_i_26: ;IMUL_32 +rx_i_26: ;IMULH_64 dec ebx jz rx_finish xor r11, 0e311468ch @@ -491,9 +504,9 @@ rx_i_26: ;IMUL_32 rx_body_26: and ecx, 
2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r13d - imul rax, rcx + mov rcx, 812644844 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r9d xor eax, 0306ff9ech @@ -933,7 +946,7 @@ rx_body_53: je short rx_i_54 ret -rx_i_54: ;IMULH_64 +rx_i_54: ;DIV_64 dec ebx jz rx_finish xor r11, 060638de0h @@ -944,9 +957,11 @@ rx_i_54: ;IMULH_64 rx_body_54: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - imul rcx + ; magic divide by 282209221 + mov rcx, 1096650948274100047 + mul rcx mov rax, rdx + shr rax, 24 mov rcx, rax mov eax, r12d xor eax, 010d22bc5h @@ -974,7 +989,7 @@ rx_body_55: and eax, 2047 movhpd qword ptr [rsi + rax * 8], xmm3 -rx_i_56: ;DIV_64 +rx_i_56: ;IDIV_64 dec ebx jz rx_finish xor r14, 0f1456b8eh @@ -985,13 +1000,16 @@ rx_i_56: ;DIV_64 rx_body_56: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - ; magic divide by 4244198545 - add rax, 1 - sbb rax, 0 - mov rcx, 9333701248213440683 - mul rcx + ; magic divide by -50768751 + mov rcx, rax + mov rdx, 6254795139557318139 + imul rdx mov rax, rdx - shr rax, 31 + xor edx, edx + sub rax, rcx + sar rax, 25 + sets dl + add rax, rdx mov rcx, rax mov eax, r8d xor eax, 0fcf95491h @@ -1009,14 +1027,14 @@ rx_i_57: ;MUL_64 rx_body_57: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, 172123015 + imul rax, rax, 172123015 mov rcx, rax mov eax, r15d xor eax, 0a426387h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_58: ;IMULH_64 +rx_i_58: ;DIV_64 dec ebx jz rx_finish xor r14, 0bcec0ebah @@ -1027,9 +1045,11 @@ rx_i_58: ;IMULH_64 rx_body_58: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r13 - imul rcx + ; magic divide by 1506547423 + mov rcx, 6573653217342526495 + mul rcx mov rax, rdx + shr rax, 29 mov r8, rax rx_i_59: ;FPSUB @@ -1294,7 +1314,7 @@ rx_body_74: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, r13 + imul rax, r13 mov rcx, rax mov eax, r9d xor eax, 0aaaacb32h @@ -1355,7 +1375,7 @@ rx_body_77: je short rx_i_78 ret -rx_i_78: ;MULH_64 
+rx_i_78: ;MUL_32 dec ebx jz rx_finish xor r9, 0edeca680h @@ -1366,9 +1386,9 @@ rx_i_78: ;MULH_64 rx_body_78: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r8d + imul rax, rcx mov r15, rax rx_i_79: ;CALL @@ -1443,7 +1463,7 @@ rx_body_82: cmp r12d, -68969733 jo rx_i_145 -rx_i_83: ;DIV_64 +rx_i_83: ;IDIV_64 dec ebx jz rx_finish xor r10, 0d9b6a533h @@ -1455,10 +1475,13 @@ rx_body_83: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] ; magic divide by 91850728 - mov rcx, 13477737914993774191 - mul rcx + mov rdx, 842358619687110887 + imul rdx mov rax, rdx - shr rax, 26 + xor edx, edx + sar rax, 22 + sets dl + add rax, rdx mov r12, rax rx_i_84: ;SAR_64 @@ -1490,7 +1513,7 @@ rx_i_85: ;MUL_64 rx_body_85: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, 20014507 + imul rax, rax, 20014507 mov r10, rax rx_i_86: ;AND_64 @@ -1661,7 +1684,7 @@ rx_body_95: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_96: ;MUL_32 +rx_i_96: ;IMUL_32 dec ebx jz rx_finish xor r11, 04f912ef8h @@ -1673,8 +1696,8 @@ rx_body_96: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r11d + movsxd rcx, eax + mov rax, -1354397081 imul rax, rcx mov r11, rax @@ -1797,7 +1820,7 @@ rx_body_103: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_104: ;IMUL_32 +rx_i_104: ;DIV_64 dec ebx jz rx_finish xor r11, 075deaf71h @@ -1808,9 +1831,11 @@ rx_i_104: ;IMUL_32 rx_body_104: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, -1913070089 - imul rax, rcx + ; magic divide by 2381897207 + mov rcx, 16631314374404138087 + mul rcx + mov rax, rdx + shr rax, 31 mov rcx, rax mov eax, r15d xor eax, 08df8ddf7h @@ -1992,7 +2017,7 @@ rx_body_113: mov rax, rdx mov r13, rax -rx_i_114: ;IMULH_64 +rx_i_114: ;DIV_64 dec ebx jz rx_finish xor r13, 06e83e2cdh @@ -2003,9 +2028,11 @@ rx_i_114: ;IMULH_64 rx_body_114: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - imul rcx + ; magic divide 
by 770835683 + mov rcx, 12847770974664443757 + mul rcx mov rax, rdx + shr rax, 29 mov r14, rax rx_i_115: ;IDIV_64 @@ -2029,7 +2056,7 @@ rx_body_115: add rax, rdx mov r14, rax -rx_i_116: ;IMUL_32 +rx_i_116: ;DIV_64 dec ebx jz rx_finish xor r10, 0d122702eh @@ -2040,16 +2067,18 @@ rx_i_116: ;IMUL_32 rx_body_116: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, -1850776691 - imul rax, rcx + ; magic divide by 2444190605 + mov rcx, 16207443550472271289 + mul rcx + mov rax, rdx + shr rax, 31 mov rcx, rax mov eax, r8d xor eax, 091af638dh and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_117: ;DIV_64 +rx_i_117: ;IDIV_64 dec ebx jz rx_finish xor r11, 015f2012bh @@ -2060,11 +2089,14 @@ rx_i_117: ;DIV_64 rx_body_117: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - ; magic divide by 3089140324 - mov rcx, 12823658721283834045 - mul rcx + ; magic divide by -1205826972 + mov rdx, -8213052572424165513 + imul rdx mov rax, rdx - shr rax, 31 + xor edx, edx + sar rax, 29 + sets dl + add rax, rdx mov rcx, rax mov eax, r15d xor eax, 0b8208a64h @@ -2181,7 +2213,7 @@ rx_body_124: cmp r11d, 1719505436 jns rx_i_237 -rx_i_125: ;MUL_32 +rx_i_125: ;IMUL_32 dec ebx jz rx_finish xor r8, 0ebec27cdh @@ -2193,8 +2225,8 @@ rx_body_125: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, 1774711622 + movsxd rcx, eax + movsxd rax, r14d imul rax, rcx mov r14, rax @@ -2511,7 +2543,7 @@ rx_body_143: imul rax, rcx mov r9, rax -rx_i_144: ;IMULH_64 +rx_i_144: ;DIV_64 dec ebx jz rx_finish xor r10, 02e59e00ah @@ -2522,12 +2554,15 @@ rx_i_144: ;IMULH_64 rx_body_144: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, -1304483355 - imul rcx - mov rax, rdx + mov ecx, 1 + mov edx, r11d + test edx, edx + cmovne ecx, edx + xor edx, edx + div rcx mov r15, rax -rx_i_145: ;IMULH_64 +rx_i_145: ;DIV_64 dec ebx jz rx_finish xor r13, 08d5c798h @@ -2538,16 +2573,18 @@ rx_i_145: ;IMULH_64 rx_body_145: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r11 
- imul rcx + ; magic divide by 3712555397 + mov rcx, 10670300378317066981 + mul rcx mov rax, rdx + shr rax, 31 mov rcx, rax mov eax, r10d xor eax, 0dd491985h and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_146: ;IMUL_32 +rx_i_146: ;IMULH_64 dec ebx jz rx_finish xor r13, 02327e6e2h @@ -2559,9 +2596,9 @@ rx_body_146: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r12d - imul rax, rcx + mov rcx, r12 + imul rcx + mov rax, rdx mov r10, rax rx_i_147: ;MUL_64 @@ -2576,7 +2613,7 @@ rx_body_147: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, r11 + imul rax, r11 mov rcx, rax mov eax, r12d xor eax, 06a5bda88h @@ -2621,7 +2658,7 @@ rx_body_149: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_150: ;IMULH_64 +rx_i_150: ;DIV_64 dec ebx jz rx_finish xor r9, 01504ca7ah @@ -2632,9 +2669,12 @@ rx_i_150: ;IMULH_64 rx_body_150: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, -933976796 - imul rcx - mov rax, rdx + mov ecx, 1 + mov edx, r8d + test edx, edx + cmovne ecx, edx + xor edx, edx + div rcx mov rcx, rax mov eax, r9d xor eax, 0c854a524h @@ -2872,7 +2912,7 @@ rx_body_163: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_164: ;MULH_64 +rx_i_164: ;MUL_32 dec ebx jz rx_finish xor r12, 01f0c2737h @@ -2884,9 +2924,9 @@ rx_body_164: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r9 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r9d + imul rax, rcx mov rcx, rax mov eax, r13d xor eax, 09aa6da19h @@ -3007,7 +3047,7 @@ rx_body_170: and eax, 32767 movlpd qword ptr [rsi + rax * 8], xmm6 -rx_i_171: ;IMULH_64 +rx_i_171: ;DIV_64 dec ebx jz rx_finish xor r15, 09901e05bh @@ -3018,9 +3058,13 @@ rx_i_171: ;IMULH_64 rx_body_171: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r12 - imul rcx + ; magic divide by 2064150457 + add rax, 1 + sbb rax, 0 + mov rcx, 4797867461985617359 + mul rcx mov rax, rdx + shr rax, 29 mov r12, rax rx_i_172: ;SUB_64 @@ -3049,7 +3093,7 @@ 
rx_body_173: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, -1386172772 + imul rax, rax, -1386172772 mov rcx, rax mov eax, r12d xor eax, 0ad60ae9ch @@ -3371,7 +3415,7 @@ rx_body_192: and eax, 32767 movlpd qword ptr [rsi + rax * 8], xmm8 -rx_i_193: ;MULH_64 +rx_i_193: ;MUL_32 dec ebx jz rx_finish xor r12, 0e9939ach @@ -3382,9 +3426,9 @@ rx_i_193: ;MULH_64 rx_body_193: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r12 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r12d + imul rax, rcx mov rcx, rax mov eax, r15d xor eax, 074e097dch @@ -3656,7 +3700,7 @@ rx_i_208: ;MUL_64 rx_body_208: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, -486588965 + imul rax, rax, -486588965 mov r10, rax rx_i_209: ;XOR_64 @@ -3878,7 +3922,7 @@ rx_body_220: and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_221: ;IMULH_64 +rx_i_221: ;DIV_64 dec ebx jz rx_finish xor r9, 0a3deb512h @@ -3889,9 +3933,12 @@ rx_i_221: ;IMULH_64 rx_body_221: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, 2146087761 - imul rcx - mov rax, rdx + mov ecx, 1 + mov edx, r15d + test edx, edx + cmovne ecx, edx + xor edx, edx + div rcx mov rcx, rax mov eax, r11d xor eax, 07feab351h @@ -3956,7 +4003,7 @@ rx_body_224: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_225: ;IMULH_64 +rx_i_225: ;DIV_64 dec ebx jz rx_finish xor r13, 0c558367eh @@ -3967,9 +4014,12 @@ rx_i_225: ;IMULH_64 rx_body_225: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r10 - imul rcx + ; magic divide by 4264577610 + shr rax, 1 + mov rcx, 9289098447696480965 + mul rcx mov rax, rdx + shr rax, 30 mov rcx, rax mov eax, r12d xor eax, 0fe304a4ah @@ -4030,7 +4080,7 @@ rx_body_228: andps xmm0, xmm10 sqrtpd xmm7, xmm0 -rx_i_229: ;IMUL_32 +rx_i_229: ;IMULH_64 dec ebx jz rx_finish xor r11, 05c535836h @@ -4041,9 +4091,9 @@ rx_i_229: ;IMUL_32 rx_body_229: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r12d - imul rax, rcx + mov rcx, 334017248 + imul rcx + mov rax, 
rdx mov rcx, rax mov eax, r13d xor eax, 013e8b2e0h @@ -4142,7 +4192,7 @@ rx_body_234: andps xmm0, xmm1 movaps xmm4, xmm0 -rx_i_235: ;MUL_32 +rx_i_235: ;IMUL_32 dec ebx jz rx_finish xor r13, 0b6cb9ff2h @@ -4153,8 +4203,8 @@ rx_i_235: ;MUL_32 rx_body_235: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, 212286089 + movsxd rcx, eax + movsxd rax, r12d imul rax, rcx mov rcx, rax mov eax, r15d @@ -4224,7 +4274,7 @@ rx_body_239: add rax, r10 mov r10, rax -rx_i_240: ;IMUL_32 +rx_i_240: ;IMULH_64 dec ebx jz rx_finish xor r9, 0d65d29f9h @@ -4236,9 +4286,9 @@ rx_body_240: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, -423830277 - imul rax, rcx + mov rcx, r14 + imul rcx + mov rax, rdx mov r8, rax rx_i_241: ;FPADD @@ -4259,7 +4309,7 @@ rx_body_241: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm7 -rx_i_242: ;MULH_64 +rx_i_242: ;MUL_32 dec ebx jz rx_finish xor r12, 01119b0f9h @@ -4270,9 +4320,9 @@ rx_i_242: ;MULH_64 rx_body_242: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r12 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r12d + imul rax, rcx mov rcx, rax mov eax, r10d xor eax, 0130882f2h @@ -4331,7 +4381,7 @@ rx_body_245: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_246: ;DIV_64 +rx_i_246: ;IDIV_64 dec ebx jz rx_finish xor r15, 027eeaa2eh @@ -4343,14 +4393,17 @@ rx_body_246: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - ; magic divide by 4138158808 - mov rcx, 9572876028959826425 - mul rcx + ; magic divide by -156808488 + mov rdx, -3947299202596036367 + imul rdx mov rax, rdx - shr rax, 31 + xor edx, edx + sar rax, 25 + sets dl + add rax, rdx mov r12, rax -rx_i_247: ;MUL_32 +rx_i_247: ;IMUL_32 dec ebx jz rx_finish xor r10, 0c4de0296h @@ -4361,8 +4414,8 @@ rx_i_247: ;MUL_32 rx_body_247: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r14d + movsxd rcx, eax + movsxd rax, r14d imul rax, rcx mov rcx, rax mov eax, r9d @@ -4391,7 +4444,7 @@ rx_body_248: and 
eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_249: ;IMUL_32 +rx_i_249: ;IMULH_64 dec ebx jz rx_finish xor r15, 0499552cch @@ -4403,9 +4456,9 @@ rx_body_249: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - movsxd rax, r11d - imul rax, rcx + mov rcx, -508571655 + imul rcx + mov rax, rdx mov rcx, rax mov eax, r13d xor eax, 0e1afcff9h @@ -4957,7 +5010,7 @@ rx_body_279: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm9 -rx_i_280: ;DIV_64 +rx_i_280: ;IDIV_64 dec ebx jz rx_finish xor r12, 066246b43h @@ -4969,10 +5022,13 @@ rx_body_280: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] ; magic divide by 555412224 - mov rcx, 2228867111296024113 - mul rcx + mov rdx, 2228867111296024113 + imul rdx mov rax, rdx - shr rax, 26 + xor edx, edx + sar rax, 26 + sets dl + add rax, rdx mov rcx, rax mov eax, r13d xor eax, 0211aeb00h @@ -5384,7 +5440,7 @@ rx_i_304: ;MUL_64 rx_body_304: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, 2007686513 + imul rax, rax, 2007686513 mov r13, rax rx_i_305: ;MUL_64 @@ -5398,7 +5454,7 @@ rx_i_305: ;MUL_64 rx_body_305: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, r15 + imul rax, r15 mov r10, rax rx_i_306: ;ADD_64 @@ -5443,7 +5499,7 @@ rx_body_308: imul rax, r13 mov r15, rax -rx_i_309: ;IMUL_32 +rx_i_309: ;DIV_64 dec ebx jz rx_finish xor r9, 090c42304h @@ -5454,9 +5510,11 @@ rx_i_309: ;IMUL_32 rx_body_309: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, -1652850028 - imul rax, rcx + ; magic divide by 2642117268 + mov rcx, 14993309243657753043 + mul rcx + mov rax, rdx + shr rax, 31 mov rcx, rax mov eax, r9d xor eax, 09d7b8294h @@ -5776,7 +5834,7 @@ rx_body_326: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_327: ;DIV_64 +rx_i_327: ;IDIV_64 dec ebx jz rx_finish xor r9, 09665f98dh @@ -5789,10 +5847,15 @@ rx_body_327: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] ; magic divide by 1572662125 - mov rcx, 12594593786994192665 - mul rcx + mov rcx, rax + mov rdx, 
-5852150286715358951 + imul rdx mov rax, rdx - shr rax, 30 + xor edx, edx + add rax, rcx + sar rax, 30 + sets dl + add rax, rdx mov r12, rax rx_i_328: ;SHR_64 @@ -5825,7 +5888,7 @@ rx_body_329: je short rx_i_330 ret -rx_i_330: ;MUL_32 +rx_i_330: ;IMUL_32 dec ebx jz rx_finish xor r9, 0f6a93f19h @@ -5837,8 +5900,8 @@ rx_body_330: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, -1349816041 + movsxd rcx, eax + movsxd rax, r13d imul rax, rcx mov rcx, rax mov eax, r11d @@ -6008,7 +6071,7 @@ rx_body_340: addpd xmm0, xmm5 movaps xmm5, xmm0 -rx_i_341: ;MULH_64 +rx_i_341: ;MUL_32 dec ebx jz rx_finish xor r12, 019eb9ea5h @@ -6019,9 +6082,9 @@ rx_i_341: ;MULH_64 rx_body_341: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r15d + imul rax, rcx mov rcx, rax mov eax, r8d xor eax, 024736405h @@ -6230,7 +6293,7 @@ rx_body_353: and eax, 2047 movlpd qword ptr [rsi + rax * 8], xmm7 -rx_i_354: ;MULH_64 +rx_i_354: ;MUL_32 dec ebx jz rx_finish xor r13, 02412fc10h @@ -6241,9 +6304,9 @@ rx_i_354: ;MULH_64 rx_body_354: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r13 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r13d + imul rax, rcx mov r13, rax rx_i_355: ;MUL_64 @@ -6293,7 +6356,7 @@ rx_body_357: add rax, r11 mov r11, rax -rx_i_358: ;IMULH_64 +rx_i_358: ;DIV_64 dec ebx jz rx_finish xor r13, 088fa6e5ah @@ -6304,9 +6367,12 @@ rx_i_358: ;IMULH_64 rx_body_358: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r11 - imul rcx + ; magic divide by 3667831238 + shr rax, 1 + mov rcx, 2700102505175032865 + mul rcx mov rax, rdx + shr rax, 28 mov r9, rax rx_i_359: ;FPSUB @@ -6401,7 +6467,7 @@ rx_body_363: andps xmm0, xmm1 movaps xmm3, xmm0 -rx_i_364: ;MULH_64 +rx_i_364: ;MUL_32 dec ebx jz rx_finish xor r11, 0badaf867h @@ -6412,9 +6478,9 @@ rx_i_364: ;MULH_64 rx_body_364: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r8d 
+ imul rax, rcx mov r8, rax rx_i_365: ;IMUL_32 @@ -6486,7 +6552,7 @@ rx_body_368: sub eax, r10d mov r8, rax -rx_i_369: ;DIV_64 +rx_i_369: ;IDIV_64 dec ebx jz rx_finish xor r9, 053fe22e2h @@ -6498,10 +6564,13 @@ rx_body_369: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] ; magic divide by 470792991 - mov rcx, 1314739240972876203 - mul rcx + mov rdx, 1314739240972876203 + imul rdx mov rax, rdx - shr rax, 25 + xor edx, edx + sar rax, 25 + sets dl + add rax, rdx mov r9, rax rx_i_370: ;FPSUB @@ -6682,7 +6751,7 @@ rx_i_380: ;MUL_64 rx_body_380: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, r10 + imul rax, r10 mov rcx, rax mov eax, r13d xor eax, 0a9fd85e0h @@ -6915,7 +6984,7 @@ rx_body_394: addpd xmm0, xmm9 movaps xmm6, xmm0 -rx_i_395: ;IMULH_64 +rx_i_395: ;DIV_64 dec ebx jz rx_finish xor r8, 04ae4fe8ch @@ -6927,9 +6996,11 @@ rx_body_395: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r13 - imul rcx + ; magic divide by 939698704 + mov rcx, 5269518980991934091 + mul rcx mov rax, rdx + shr rax, 28 mov r8, rax rx_i_396: ;ROR_64 @@ -7058,7 +7129,7 @@ rx_body_402: je short rx_i_403 ret -rx_i_403: ;IMULH_64 +rx_i_403: ;DIV_64 dec ebx jz rx_finish xor r9, 0e59500f7h @@ -7069,9 +7140,11 @@ rx_i_403: ;IMULH_64 rx_body_403: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r12 - imul rcx + ; magic divide by 536056992 + mov rcx, 4618688153536407095 + mul rcx mov rax, rdx + shr rax, 27 mov rcx, rax mov eax, r11d xor eax, 01ff394a0h @@ -7161,7 +7234,7 @@ rx_i_408: ;MUL_64 rx_body_408: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - imul rax, 693109961 + imul rax, rax, 693109961 mov rcx, rax mov eax, r10d xor eax, 0295004c9h @@ -7272,7 +7345,7 @@ rx_body_414: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_415: ;IMULH_64 +rx_i_415: ;DIV_64 dec ebx jz rx_finish xor r8, 08c3e59a1h @@ -7284,9 +7357,13 @@ rx_body_415: xor rbp, rcx and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - imul rcx + ; magic divide by 3756873911 + add 
rax, 1 + sbb rax, 0 + mov rcx, 10544426615208851175 + mul rcx mov rax, rdx + shr rax, 31 mov r9, rax rx_i_416: ;FPADD @@ -7456,7 +7533,7 @@ rx_body_425: imul rax, rcx mov r14, rax -rx_i_426: ;DIV_64 +rx_i_426: ;IDIV_64 dec ebx jz rx_finish xor r12, 09dd55ba0h @@ -7467,18 +7544,21 @@ rx_i_426: ;DIV_64 rx_body_426: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - ; magic divide by 3704238575 - mov rcx, 1336782190693946083 - mul rcx + ; magic divide by -590728721 + mov rdx, -4191230239118101979 + imul rdx mov rax, rdx - shr rax, 28 + xor edx, edx + sar rax, 27 + sets dl + add rax, rdx mov rcx, rax mov eax, r14d xor eax, 0dcca31efh and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_427: ;MULH_64 +rx_i_427: ;MUL_32 dec ebx jz rx_finish xor r11, 0d6cae9aeh @@ -7490,9 +7570,9 @@ rx_body_427: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, -2146332428 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, -2146332428 + imul rax, rcx mov rcx, rax mov eax, r9d xor eax, 0801190f4h @@ -7530,7 +7610,7 @@ rx_i_429: ;MUL_64 rx_body_429: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, r9 + imul rax, r9 mov r15, rax rx_i_430: ;FPADD @@ -7632,7 +7712,7 @@ rx_body_435: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, 1971717631 + imul rax, rax, 1971717631 mov rcx, rax mov eax, r9d xor eax, 0758605ffh @@ -7816,7 +7896,7 @@ rx_body_445: and eax, 2047 mov qword ptr [rsi + rax * 8], rcx -rx_i_446: ;MULH_64 +rx_i_446: ;MUL_32 dec ebx jz rx_finish xor r12, 01734708eh @@ -7828,9 +7908,9 @@ rx_body_446: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r15 - mul rcx - mov rax, rdx + mov ecx, eax + mov eax, r15d + imul rax, rcx mov rcx, rax mov eax, r13d xor eax, 03166163h @@ -7938,7 +8018,7 @@ rx_body_452: je short rx_i_453 ret -rx_i_453: ;IMULH_64 +rx_i_453: ;DIV_64 dec ebx jz rx_finish xor r11, 0a2096aa4h @@ -7949,9 +8029,12 @@ rx_i_453: ;IMULH_64 rx_body_453: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov rcx, 
r14 - imul rcx + ; magic divide by 380157076 + shr rax, 2 + mov rcx, 3256390890604862173 + mul rcx mov rax, rdx + shr rax, 24 mov r8, rax rx_i_454: ;FPADD @@ -8050,7 +8133,7 @@ rx_i_459: ;MUL_64 rx_body_459: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, r9 + imul rax, r9 mov rcx, rax mov eax, r13d xor eax, 016bb0164h @@ -8185,7 +8268,7 @@ rx_body_467: addpd xmm0, xmm9 movaps xmm8, xmm0 -rx_i_468: ;IMULH_64 +rx_i_468: ;DIV_64 dec ebx jz rx_finish xor r8, 091044dc3h @@ -8197,16 +8280,20 @@ rx_body_468: xor rbp, rcx and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r8 - imul rcx + ; magic divide by 4281572471 + add rax, 1 + sbb rax, 0 + mov rcx, 9252227195836753313 + mul rcx mov rax, rdx + shr rax, 31 mov rcx, rax mov eax, r8d xor eax, 0ff339c77h and eax, 32767 mov qword ptr [rsi + rax * 8], rcx -rx_i_469: ;MUL_32 +rx_i_469: ;IMUL_32 dec ebx jz rx_finish xor r9, 0c0186beh @@ -8217,8 +8304,8 @@ rx_i_469: ;MUL_32 rx_body_469: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - mov ecx, eax - mov eax, r9d + movsxd rcx, eax + mov rax, 294019485 imul rax, rcx mov rcx, rax mov eax, r9d @@ -8287,7 +8374,7 @@ rx_i_473: ;MUL_64 rx_body_473: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - imul rax, rax, r11 + imul rax, r11 mov r12, rax rx_i_474: ;JUMP @@ -8398,7 +8485,7 @@ rx_body_480: addpd xmm0, xmm4 movaps xmm6, xmm0 -rx_i_481: ;IMULH_64 +rx_i_481: ;DIV_64 dec ebx jz rx_finish xor r14, 0225ba1f9h @@ -8409,9 +8496,12 @@ rx_i_481: ;IMULH_64 rx_body_481: and ecx, 2047 mov rax, qword ptr [rsi+rcx*8] - mov rcx, r13 - imul rcx + ; magic divide by 2101516912 + shr rax, 4 + mov rcx, 147267437180322377 + mul rcx mov rax, rdx + shr rax, 20 mov r12, rax rx_i_482: ;AND_32 @@ -8509,7 +8599,7 @@ rx_body_487: sub rax, r9 mov r11, rax -rx_i_488: ;IMUL_32 +rx_i_488: ;DIV_64 dec ebx jz rx_finish xor r12, 0d8b1788eh @@ -8520,9 +8610,11 @@ rx_i_488: ;IMUL_32 rx_body_488: and ecx, 32767 mov rax, qword ptr [rsi+rcx*8] - movsxd rcx, eax - mov rax, 297357073 - imul rax, rcx + ; 
magic divide by 297357073 + mov rcx, 16652572300311555393 + mul rcx + mov rax, rdx + shr rax, 28 mov r12, rax rx_i_489: ;JUMP From 67e741ff22d14330f4275b95b7375baa73f546a4 Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 12 Jan 2019 20:27:35 +0100 Subject: [PATCH 15/35] Reduced x86 code size by 512 bytes (and ecx -> and eax) --- src/AssemblyGeneratorX86.cpp | 14 +- src/JitCompilerX86-static.S | 4 +- src/JitCompilerX86-static.asm | 4 +- src/JitCompilerX86.cpp | 11 +- src/asm/program_read.inc | 46 +- src/asm/program_transform_address.inc | 180 +- src/executeProgram-win64.asm | 52 +- src/program.inc | 3372 ++++++++++++------------- 8 files changed, 1841 insertions(+), 1842 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 4cb009e..16b06c7 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -66,34 +66,34 @@ namespace RandomX { void AssemblyGeneratorX86::gena(Instruction& instr, int i) { asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; - asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl; + asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl; asmCode << "\ttest " << regIc8 << ", 63" << std::endl; asmCode << "\tjnz short rx_body_" << i << std::endl; if (instr.loca & 3) { asmCode << "\tcall rx_read_l1" << std::endl; asmCode << "rx_body_" << i << ":" << std::endl; if ((instr.loca & 192) == 0) - asmCode << "\txor " << regMx << ", rcx" << std::endl; - asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl; + asmCode << "\txor " << regMx << ", rax" << std::endl; + asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; } else { asmCode << "\tcall rx_read_l2" << std::endl; asmCode << "rx_body_" << i << ":" << std::endl; if ((instr.loca & 192) == 0) - asmCode << "\txor " << regMx << ", rcx" << std::endl; - asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl; + 
asmCode << "\txor " << regMx << ", rax" << std::endl; + asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; } } void AssemblyGeneratorX86::genar(Instruction& instr, int i) { gena(instr, i); - asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl; + asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rax*8]" << std::endl; } void AssemblyGeneratorX86::genaf(Instruction& instr, int i) { gena(instr, i); - asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl; + asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rax*8]" << std::endl; } void AssemblyGeneratorX86::genbiashift(Instruction& instr, const char* instrx86) { diff --git a/src/JitCompilerX86-static.S b/src/JitCompilerX86-static.S index fdc32b1..875256a 100644 --- a/src/JitCompilerX86-static.S +++ b/src/JitCompilerX86-static.S @@ -48,7 +48,7 @@ DECL(randomx_program_begin): DECL(randomx_program_epilogue): #include "asm/program_epilogue_linux.inc" -#define scratchpad_mask and ecx, 2040 +#define scratchpad_mask and eax, 2040 .align 64 DECL(randomx_program_read_l1): @@ -56,7 +56,7 @@ DECL(randomx_program_read_l1): #undef scratchpad_mask -#define scratchpad_mask and ecx, 32760 +#define scratchpad_mask and eax, 32760 .align 64 DECL(randomx_program_read_l2): diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index 7a2b3c4..48b09ff 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -42,7 +42,7 @@ randomx_program_epilogue PROC randomx_program_epilogue ENDP scratchpad_mask MACRO - and ecx, 2040 + and eax, 2040 ENDM ALIGN 64 @@ -51,7 +51,7 @@ randomx_program_read_l1 PROC randomx_program_read_l1 ENDP scratchpad_mask MACRO - and ecx, 32760 + and eax, 32760 ENDM ALIGN 64 diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 2a101f0..8175485 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -175,7 +175,7 @@ namespace RandomX { emitByte(0xf0 + 
(instr.rega % RegistersCount)); emit(instr.addra); emit(uint16_t(0x8b41)); //mov - emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega + emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega emit(0x753fc3f6); //test bl,0x3f; jne emit(uint16_t(0xe805)); if (instr.loca & 3) { //A.LOC.W @@ -186,9 +186,9 @@ namespace RandomX { } if ((instr.loca & 192) == 0) { //A.LOC.X emit(uint16_t(0x3348)); - emitByte(0xe9); //xor rbp, rcx + emitByte(0xe8); //xor rbp, rax } - emit(uint16_t(0xe181)); //and ecx, + emitByte(0x25); //and eax, if (instr.loca & 3) { emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad } @@ -199,14 +199,13 @@ namespace RandomX { void JitCompilerX86::genar(Instruction& instr) { gena(instr); - emit(0xce048b48); //mov rax,QWORD PTR [rsi+rcx*8] - emit(0xdc580f66); + emit(0xc6048b48); //mov rax,QWORD PTR [rsi+rax*8] } void JitCompilerX86::genaf(Instruction& instr) { gena(instr); emitByte(0xf3); - emit(0xce04e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rcx*8] + emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8] } void JitCompilerX86::genbiashift(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { diff --git a/src/asm/program_read.inc b/src/asm/program_read.inc index adf8e92..8ddf97d 100644 --- a/src/asm/program_read.inc +++ b/src/asm/program_read.inc @@ -1,32 +1,32 @@ - push rcx ;# preserve ecx + push rax ;# preserve eax db 0, 0, 0, 0 ;# TransformAddress placeholder - mov rax, qword ptr [rdi] ;# load the dataset address - xor rbp, rcx ;# modify "mx" + mov rcx, qword ptr [rdi] ;# load the dataset address + xor rbp, rax ;# modify "mx" ;# prefetch cacheline "mx" and rbp, -64 ;# align "mx" to the start of a cache line mov edx, ebp ;# edx = mx - prefetchnta byte ptr [rax+rdx] + prefetchnta byte ptr [rcx+rdx] ;# read cacheline "ma" ror rbp, 32 ;# swap "ma" and "mx" mov edx, ebp ;# edx = ma scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8 - lea rcx, [rsi+rcx*8] ;# scratchpad cache line - lea rax, [rax+rdx] 
;# dataset cache line - mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now) - xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline - mov rdx, qword ptr [rax+8] - xor qword ptr [rcx+8], rdx - mov rdx, qword ptr [rax+16] - xor qword ptr [rcx+16], rdx - mov rdx, qword ptr [rax+24] - xor qword ptr [rcx+24], rdx - mov rdx, qword ptr [rax+32] - xor qword ptr [rcx+32], rdx - mov rdx, qword ptr [rax+40] - xor qword ptr [rcx+40], rdx - mov rdx, qword ptr [rax+48] - xor qword ptr [rcx+48], rdx - mov rdx, qword ptr [rax+56] - xor qword ptr [rcx+56], rdx - pop rcx ;# restore ecx + lea rax, [rsi+rax*8] ;# scratchpad cache line + lea rcx, [rcx+rdx] ;# dataset cache line + mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now) + xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline + mov rdx, qword ptr [rcx+8] + xor qword ptr [rax+8], rdx + mov rdx, qword ptr [rcx+16] + xor qword ptr [rax+16], rdx + mov rdx, qword ptr [rcx+24] + xor qword ptr [rax+24], rdx + mov rdx, qword ptr [rcx+32] + xor qword ptr [rax+32], rdx + mov rdx, qword ptr [rcx+40] + xor qword ptr [rax+40], rdx + mov rdx, qword ptr [rcx+48] + xor qword ptr [rax+48], rdx + mov rdx, qword ptr [rcx+56] + xor qword ptr [rax+56], rdx + pop rax ;# restore eax ret \ No newline at end of file diff --git a/src/asm/program_transform_address.inc b/src/asm/program_transform_address.inc index 8d2a79f..0815e29 100644 --- a/src/asm/program_transform_address.inc +++ b/src/asm/program_transform_address.inc @@ -1,154 +1,154 @@ ;# 90 address transformations ;# forced REX prefix is used to make all transformations 4 bytes long - lea ecx, [rcx+rcx*8+109] + lea eax, [rax+rax*8+109] db 64 - xor ecx, 96 - lea ecx, [rcx+rcx*8-19] + xor eax, 96 + lea eax, [rax+rax*8-19] db 64 - add ecx, -98 + add eax, -98 db 64 - add ecx, -21 + add eax, -21 db 64 - 
xor ecx, -80 - lea ecx, [rcx+rcx*8-92] + xor eax, -80 + lea eax, [rax+rax*8-92] db 64 - add ecx, 113 - lea ecx, [rcx+rcx*8+100] + add eax, 113 + lea eax, [rax+rax*8+100] db 64 - add ecx, -39 + add eax, -39 db 64 - xor ecx, 120 - lea ecx, [rcx+rcx*8-119] + xor eax, 120 + lea eax, [rax+rax*8-119] db 64 - add ecx, -113 + add eax, -113 db 64 - add ecx, 111 + add eax, 111 db 64 - xor ecx, 104 - lea ecx, [rcx+rcx*8-83] - lea ecx, [rcx+rcx*8+127] + xor eax, 104 + lea eax, [rax+rax*8-83] + lea eax, [rax+rax*8+127] db 64 - xor ecx, -112 + xor eax, -112 db 64 - add ecx, 89 + add eax, 89 db 64 - add ecx, -32 + add eax, -32 db 64 - add ecx, 104 + add eax, 104 db 64 - xor ecx, -120 + xor eax, -120 db 64 - xor ecx, 24 - lea ecx, [rcx+rcx*8+9] + xor eax, 24 + lea eax, [rax+rax*8+9] db 64 - add ecx, -31 + add eax, -31 db 64 - xor ecx, -16 + xor eax, -16 db 64 - add ecx, 68 - lea ecx, [rcx+rcx*8-110] + add eax, 68 + lea eax, [rax+rax*8-110] db 64 - xor ecx, 64 + xor eax, 64 db 64 - xor ecx, -40 + xor eax, -40 db 64 - xor ecx, -8 + xor eax, -8 db 64 - add ecx, -10 + add eax, -10 db 64 - xor ecx, -32 + xor eax, -32 db 64 - add ecx, 14 - lea ecx, [rcx+rcx*8-46] + add eax, 14 + lea eax, [rax+rax*8-46] db 64 - xor ecx, -104 - lea ecx, [rcx+rcx*8+36] + xor eax, -104 + lea eax, [rax+rax*8+36] db 64 - add ecx, 100 - lea ecx, [rcx+rcx*8-65] - lea ecx, [rcx+rcx*8+27] - lea ecx, [rcx+rcx*8+91] + add eax, 100 + lea eax, [rax+rax*8-65] + lea eax, [rax+rax*8+27] + lea eax, [rax+rax*8+91] db 64 - add ecx, -101 + add eax, -101 db 64 - add ecx, -94 - lea ecx, [rcx+rcx*8-10] + add eax, -94 + lea eax, [rax+rax*8-10] db 64 - xor ecx, 80 + xor eax, 80 db 64 - add ecx, -108 + add eax, -108 db 64 - add ecx, -58 + add eax, -58 db 64 - xor ecx, 48 - lea ecx, [rcx+rcx*8+73] + xor eax, 48 + lea eax, [rax+rax*8+73] db 64 - xor ecx, -48 + xor eax, -48 db 64 - xor ecx, 32 + xor eax, 32 db 64 - xor ecx, -96 + xor eax, -96 db 64 - add ecx, 118 + add eax, 118 db 64 - add ecx, 91 - lea ecx, [rcx+rcx*8+18] + add 
eax, 91 + lea eax, [rax+rax*8+18] db 64 - add ecx, -11 - lea ecx, [rcx+rcx*8+63] + add eax, -11 + lea eax, [rax+rax*8+63] db 64 - add ecx, 114 - lea ecx, [rcx+rcx*8+45] + add eax, 114 + lea eax, [rax+rax*8+45] db 64 - add ecx, -67 + add eax, -67 db 64 - add ecx, 53 - lea ecx, [rcx+rcx*8-101] - lea ecx, [rcx+rcx*8-1] + add eax, 53 + lea eax, [rax+rax*8-101] + lea eax, [rax+rax*8-1] db 64 - xor ecx, 16 - lea ecx, [rcx+rcx*8-37] - lea ecx, [rcx+rcx*8-28] - lea ecx, [rcx+rcx*8-55] + xor eax, 16 + lea eax, [rax+rax*8-37] + lea eax, [rax+rax*8-28] + lea eax, [rax+rax*8-55] db 64 - xor ecx, -88 + xor eax, -88 db 64 - xor ecx, -72 + xor eax, -72 db 64 - add ecx, 36 + add eax, 36 db 64 - xor ecx, -56 + xor eax, -56 db 64 - add ecx, 116 + add eax, 116 db 64 - xor ecx, 88 + xor eax, 88 db 64 - xor ecx, -128 + xor eax, -128 db 64 - add ecx, 50 + add eax, 50 db 64 - add ecx, 105 + add eax, 105 db 64 - add ecx, -37 + add eax, -37 db 64 - xor ecx, 112 + xor eax, 112 db 64 - xor ecx, 8 + xor eax, 8 db 64 - xor ecx, -24 - lea ecx, [rcx+rcx*8+118] + xor eax, -24 + lea eax, [rax+rax*8+118] db 64 - xor ecx, 72 + xor eax, 72 db 64 - xor ecx, -64 + xor eax, -64 db 64 - add ecx, 40 - lea ecx, [rcx+rcx*8-74] - lea ecx, [rcx+rcx*8+82] - lea ecx, [rcx+rcx*8+54] + add eax, 40 + lea eax, [rax+rax*8-74] + lea eax, [rax+rax*8+82] + lea eax, [rax+rax*8+54] db 64 - xor ecx, 56 + xor eax, 56 db 64 - xor ecx, 40 + xor eax, 40 db 64 - add ecx, 87 \ No newline at end of file + add eax, 87 \ No newline at end of file diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 841bb16..53eec9c 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -222,42 +222,42 @@ TransformAddress MACRO reg32, reg64 ENDM ReadMemoryRandom MACRO spmask -;# IN ecx = random 32-bit address +;# IN eax = random 32-bit address ;# GLOBAL rdi = address of the dataset address ;# GLOBAL rsi = address of the scratchpad ;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma" ;# MODIFY 
rcx, rdx - push rcx ;# preserve ecx - TransformAddress ecx, rcx ;# TransformAddress function - mov rax, qword ptr [rdi] ;# load the dataset address - xor rbp, rcx ;# modify "mx" + push rax ;# preserve eax + TransformAddress eax, rax ;# TransformAddress function + mov rcx, qword ptr [rdi] ;# load the dataset address + xor rbp, rax ;# modify "mx" ; prefetch cacheline "mx" and rbp, -64 ;# align "mx" to the start of a cache line mov edx, ebp ;# edx = mx - prefetchnta byte ptr [rax+rdx] + prefetchnta byte ptr [rcx+rdx] ; read cacheline "ma" ror rbp, 32 ;# swap "ma" and "mx" mov edx, ebp ;# edx = ma - and ecx, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8 - lea rcx, [rsi+rcx*8] ;# scratchpad cache line - lea rax, [rax+rdx] ;# dataset cache line - mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now) - xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline - mov rdx, qword ptr [rax+8] - xor qword ptr [rcx+8], rdx - mov rdx, qword ptr [rax+16] - xor qword ptr [rcx+16], rdx - mov rdx, qword ptr [rax+24] - xor qword ptr [rcx+24], rdx - mov rdx, qword ptr [rax+32] - xor qword ptr [rcx+32], rdx - mov rdx, qword ptr [rax+40] - xor qword ptr [rcx+40], rdx - mov rdx, qword ptr [rax+48] - xor qword ptr [rcx+48], rdx - mov rdx, qword ptr [rax+56] - xor qword ptr [rcx+56], rdx - pop rcx ;# restore ecx + and eax, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8 + lea rax, [rsi+rax*8] ;# scratchpad cache line + lea rcx, [rcx+rdx] ;# dataset cache line + mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now) + xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline + mov rdx, qword ptr [rcx+8] + xor qword ptr [rax+8], rdx + mov rdx, qword ptr [rcx+16] + xor qword ptr [rax+16], rdx + mov rdx, qword ptr [rcx+24] + xor qword ptr 
[rax+24], rdx + mov rdx, qword ptr [rcx+32] + xor qword ptr [rax+32], rdx + mov rdx, qword ptr [rcx+40] + xor qword ptr [rax+40], rdx + mov rdx, qword ptr [rcx+48] + xor qword ptr [rax+48], rdx + mov rdx, qword ptr [rcx+56] + xor qword ptr [rax+56], rdx + pop rax ;# restore eax ret ENDM diff --git a/src/program.inc b/src/program.inc index 538f664..698eeb3 100644 --- a/src/program.inc +++ b/src/program.inc @@ -2,14 +2,14 @@ rx_i_0: ;CALL dec ebx jz rx_finish xor r9, 0ca9788ah - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_0 call rx_read_l2 rx_body_0: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r12d xor eax, 01a8e4171h @@ -23,13 +23,13 @@ rx_i_1: ;IDIV_64 dec ebx jz rx_finish xor r15, 06afc2fa4h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_1 call rx_read_l2 rx_body_1: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov edx, r10d cmp edx, -1 jne short body_idiv_1 @@ -49,14 +49,14 @@ rx_i_2: ;JUMP dec ebx jz rx_finish xor r15, 097210f7bh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_2 call rx_read_l1 rx_body_2: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r9d xor eax, 05060ccf7h @@ -69,14 +69,14 @@ rx_i_3: ;FPDIV dec ebx jz rx_finish xor r13, 082c73195h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_3 call rx_read_l2 rx_body_3: - xor rbp, rcx - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -91,13 +91,13 @@ rx_i_4: ;MUL_32 dec ebx jz rx_finish xor r14, 077daefb4h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_4 call rx_read_l2 rx_body_4: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 
32767 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r14d imul rax, rcx @@ -111,16 +111,16 @@ rx_i_5: ;IMUL_32 dec ebx jz rx_finish xor r15, 0379f9ee0h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_5 call rx_read_l1 rx_body_5: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax - movsxd rax, r12d + mov rax, 1037420699 imul rax, rcx mov r12, rax @@ -128,13 +128,13 @@ rx_i_6: ;MUL_64 dec ebx jz rx_finish xor r8, 03bae7272h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_6 call rx_read_l1 rx_body_6: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r15 mov rcx, rax mov eax, r9d @@ -146,13 +146,13 @@ rx_i_7: ;FPADD dec ebx jz rx_finish xor r10, 0e264ed81h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_7 call rx_read_l2 rx_body_7: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 movaps xmm6, xmm0 mov eax, r14d @@ -164,13 +164,13 @@ rx_i_8: ;XOR_64 dec ebx jz rx_finish xor r13, 068c1e5d2h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_8 call rx_read_l2 rx_body_8: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] xor rax, r11 mov rcx, rax mov eax, r12d @@ -182,14 +182,14 @@ rx_i_9: ;IDIV_64 dec ebx jz rx_finish xor r14, 085121c54h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_9 call rx_read_l2 rx_body_9: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 565870810 mov rdx, 8750690209911200579 imul rdx @@ -204,13 +204,13 @@ rx_i_10: ;AND_64 dec ebx jz rx_finish xor r8, 052efde3eh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_10 call rx_read_l1 rx_body_10: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and 
eax, 2047 + mov rax, qword ptr [rsi+rax*8] and rax, r10 mov r13, rax @@ -218,13 +218,13 @@ rx_i_11: ;FPADD dec ebx jz rx_finish xor r10, 0a9bf8aa1h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_11 call rx_read_l1 rx_body_11: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm5 movaps xmm4, xmm0 mov eax, r12d @@ -236,13 +236,13 @@ rx_i_12: ;FPSQRT dec ebx jz rx_finish xor r10, 0db2691ch - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_12 call rx_read_l1 rx_body_12: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm8, xmm0 mov eax, r8d @@ -254,14 +254,14 @@ rx_i_13: ;FPADD dec ebx jz rx_finish xor r12, 061c0d34dh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_13 call rx_read_l1 rx_body_13: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 movaps xmm9, xmm0 @@ -269,14 +269,14 @@ rx_i_14: ;XOR_64 dec ebx jz rx_finish xor r10, 0e761d1beh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_14 call rx_read_l1 rx_body_14: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] xor rax, r9 mov rcx, rax mov eax, r10d @@ -288,13 +288,13 @@ rx_i_15: ;RET dec ebx jz rx_finish xor r11, 074ddb688h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_15 call rx_read_l1 rx_body_15: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r14d xor eax, 0468b38b8h @@ -308,13 +308,13 @@ rx_i_16: ;ADD_64 dec ebx jz rx_finish xor r14, 06be90627h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_16 call rx_read_l1 rx_body_16: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr 
[rsi+rax*8] add rax, r10 mov rcx, rax mov eax, r9d @@ -326,13 +326,13 @@ rx_i_17: ;FPMUL dec ebx jz rx_finish xor r11, 0fbc6fc35h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_17 call rx_read_l2 rx_body_17: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -347,13 +347,13 @@ rx_i_18: ;FPSUB dec ebx jz rx_finish xor r14, 0c28ca080h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_18 call rx_read_l1 rx_body_18: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm4 movaps xmm3, xmm0 mov eax, r11d @@ -365,14 +365,14 @@ rx_i_19: ;FPSUB dec ebx jz rx_finish xor r13, 0ac009c30h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_19 call rx_read_l1 rx_body_19: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm8 movaps xmm7, xmm0 @@ -380,13 +380,13 @@ rx_i_20: ;FPSUB dec ebx jz rx_finish xor r13, 0ecca967dh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_20 call rx_read_l1 rx_body_20: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 movaps xmm7, xmm0 mov eax, r15d @@ -398,14 +398,14 @@ rx_i_21: ;ROR_64 dec ebx jz rx_finish xor r8, 0977f0284h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_21 call rx_read_l1 rx_body_21: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r9 ror rax, cl mov rcx, rax @@ -418,14 +418,14 @@ rx_i_22: ;ADD_64 dec ebx jz rx_finish xor r13, 080bdfefah - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_22 call rx_read_l2 rx_body_22: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 
32767 + mov rax, qword ptr [rsi+rax*8] add rax, r8 mov rcx, rax mov eax, r10d @@ -437,13 +437,13 @@ rx_i_23: ;MUL_64 dec ebx jz rx_finish xor r15, 0e1e0d3c4h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_23 call rx_read_l1 rx_body_23: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, rax, 1283724485 mov r8, rax @@ -451,14 +451,14 @@ rx_i_24: ;DIV_64 dec ebx jz rx_finish xor r8, 070d3b8c7h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_24 call rx_read_l2 rx_body_24: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov ecx, 1 mov edx, r15d test edx, edx @@ -475,14 +475,14 @@ rx_i_25: ;FPMUL dec ebx jz rx_finish xor r12, 01cf77a04h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_25 call rx_read_l1 rx_body_25: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -497,13 +497,13 @@ rx_i_26: ;IMULH_64 dec ebx jz rx_finish xor r11, 0e311468ch - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_26 call rx_read_l1 rx_body_26: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, 812644844 imul rcx mov rax, rdx @@ -517,13 +517,13 @@ rx_i_27: ;FPMUL dec ebx jz rx_finish xor r12, 01fd9911ah - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_27 call rx_read_l1 rx_body_27: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -534,13 +534,13 @@ rx_i_28: ;AND_32 dec ebx jz rx_finish xor r13, 067df757eh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_28 call rx_read_l1 rx_body_28: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword 
ptr [rsi+rax*8] and eax, 565865719 mov r14, rax @@ -548,13 +548,13 @@ rx_i_29: ;SUB_64 dec ebx jz rx_finish xor r12, 0be2e7c42h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_29 call rx_read_l1 rx_body_29: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, r13 mov r14, rax @@ -562,13 +562,13 @@ rx_i_30: ;FPADD dec ebx jz rx_finish xor r11, 084d067f7h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_30 call rx_read_l1 rx_body_30: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 movaps xmm7, xmm0 @@ -576,14 +576,14 @@ rx_i_31: ;ROR_64 dec ebx jz rx_finish xor r14, 0d352ce37h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_31 call rx_read_l1 rx_body_31: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ror rax, 55 mov r14, rax @@ -591,13 +591,13 @@ rx_i_32: ;AND_32 dec ebx jz rx_finish xor r12, 0a1f248dah - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_32 call rx_read_l2 rx_body_32: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] and eax, r14d mov r9, rax @@ -605,13 +605,13 @@ rx_i_33: ;MUL_64 dec ebx jz rx_finish xor r9, 0554720fch - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_33 call rx_read_l1 rx_body_33: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r15 mov r12, rax @@ -619,14 +619,14 @@ rx_i_34: ;CALL dec ebx jz rx_finish xor r13, 0665e91f1h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_34 call rx_read_l1 rx_body_34: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r15, rax cmp r14d, -380224718 jns short rx_i_35 @@ -636,13 +636,13 @@ rx_i_35: ;CALL dec ebx jz rx_finish xor r15, 
05ef1be79h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_35 call rx_read_l2 rx_body_35: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov r8, rax cmp r9d, -2040787098 jns short rx_i_36 @@ -652,13 +652,13 @@ rx_i_36: ;FPMUL dec ebx jz rx_finish xor r8, 012ec7e3ah - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_36 call rx_read_l1 rx_body_36: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -669,13 +669,13 @@ rx_i_37: ;FPSUB dec ebx jz rx_finish xor r12, 0d0706601h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_37 call rx_read_l2 rx_body_37: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 movaps xmm9, xmm0 mov eax, r9d @@ -687,13 +687,13 @@ rx_i_38: ;SUB_64 dec ebx jz rx_finish xor r9, 064056913h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_38 call rx_read_l1 rx_body_38: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, r14 mov r10, rax @@ -701,14 +701,14 @@ rx_i_39: ;ADD_64 dec ebx jz rx_finish xor r14, 02c1f1eb0h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_39 call rx_read_l2 rx_body_39: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] add rax, r14 mov r14, rax @@ -716,13 +716,13 @@ rx_i_40: ;CALL dec ebx jz rx_finish xor r10, 068fd9009h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_40 call rx_read_l1 rx_body_40: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r9d xor eax, 0b2a27eceh @@ -736,13 +736,13 @@ rx_i_41: ;JUMP dec ebx jz rx_finish xor r9, 037a30933h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_41 
call rx_read_l2 rx_body_41: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov r9, rax cmp r14d, -1070581824 jo rx_i_127 @@ -751,14 +751,14 @@ rx_i_42: ;FPADD dec ebx jz rx_finish xor r15, 0bc1de9f6h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_42 call rx_read_l2 rx_body_42: - xor rbp, rcx - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 movaps xmm6, xmm0 @@ -766,13 +766,13 @@ rx_i_43: ;SUB_64 dec ebx jz rx_finish xor r12, 02b2a2eech - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_43 call rx_read_l2 rx_body_43: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] sub rax, r8 mov rcx, rax mov eax, r11d @@ -784,13 +784,13 @@ rx_i_44: ;SAR_64 dec ebx jz rx_finish xor r11, 0685817abh - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_44 call rx_read_l1 rx_body_44: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r9 sar rax, cl mov r15, rax @@ -799,14 +799,14 @@ rx_i_45: ;FPSUB dec ebx jz rx_finish xor r12, 08cd244ebh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_45 call rx_read_l1 rx_body_45: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 movaps xmm5, xmm0 @@ -814,13 +814,13 @@ rx_i_46: ;ADD_64 dec ebx jz rx_finish xor r8, 06d8f4254h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_46 call rx_read_l1 rx_body_46: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r9 mov rcx, rax mov eax, r8d @@ -832,14 +832,14 @@ rx_i_47: ;JUMP dec ebx jz rx_finish xor r12, 05ba232c6h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_47 call rx_read_l1 rx_body_47: - xor rbp, rcx - and ecx, 2047 - 
mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r13d xor eax, 071ba231h @@ -852,13 +852,13 @@ rx_i_48: ;FPDIV dec ebx jz rx_finish xor r8, 0aaed618fh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_48 call rx_read_l1 rx_body_48: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -873,13 +873,13 @@ rx_i_49: ;FPSUB dec ebx jz rx_finish xor r8, 0f96c6a45h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_49 call rx_read_l1 rx_body_49: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm3 movaps xmm5, xmm0 @@ -887,14 +887,14 @@ rx_i_50: ;AND_64 dec ebx jz rx_finish xor r9, 0da3e4842h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_50 call rx_read_l2 rx_body_50: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] and rax, r10 mov rcx, rax mov eax, r15d @@ -906,13 +906,13 @@ rx_i_51: ;SUB_64 dec ebx jz rx_finish xor r10, 0302b676ah - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_51 call rx_read_l1 rx_body_51: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, r15 mov r15, rax @@ -920,13 +920,13 @@ rx_i_52: ;FPSQRT dec ebx jz rx_finish xor r11, 0fa88f48bh - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_52 call rx_read_l1 rx_body_52: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm7, xmm0 @@ -934,13 +934,13 @@ rx_i_53: ;RET dec ebx jz rx_finish xor r13, 03dff9b9eh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_53 call rx_read_l1 rx_body_53: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr 
[rsi+rax*8] mov r13, rax cmp rsp, rdi je short rx_i_54 @@ -950,13 +950,13 @@ rx_i_54: ;DIV_64 dec ebx jz rx_finish xor r11, 060638de0h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_54 call rx_read_l1 rx_body_54: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 282209221 mov rcx, 1096650948274100047 mul rcx @@ -972,13 +972,13 @@ rx_i_55: ;FPMUL dec ebx jz rx_finish xor r10, 0dda983d4h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_55 call rx_read_l1 rx_body_55: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -993,13 +993,13 @@ rx_i_56: ;IDIV_64 dec ebx jz rx_finish xor r14, 0f1456b8eh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_56 call rx_read_l2 rx_body_56: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] ; magic divide by -50768751 mov rcx, rax mov rdx, 6254795139557318139 @@ -1020,13 +1020,13 @@ rx_i_57: ;MUL_64 dec ebx jz rx_finish xor r9, 010dc4571h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_57 call rx_read_l1 rx_body_57: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, rax, 172123015 mov rcx, rax mov eax, r15d @@ -1038,13 +1038,13 @@ rx_i_58: ;DIV_64 dec ebx jz rx_finish xor r14, 0bcec0ebah - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_58 call rx_read_l1 rx_body_58: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 1506547423 mov rcx, 6573653217342526495 mul rcx @@ -1056,13 +1056,13 @@ rx_i_59: ;FPSUB dec ebx jz rx_finish xor r11, 0980dd402h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_59 call rx_read_l1 rx_body_59: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword 
ptr [rsi+rax*8] subpd xmm0, xmm8 movaps xmm7, xmm0 @@ -1070,13 +1070,13 @@ rx_i_60: ;CALL dec ebx jz rx_finish xor r15, 03de14d1eh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_60 call rx_read_l2 rx_body_60: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r11d xor eax, 07bb60f45h @@ -1090,13 +1090,13 @@ rx_i_61: ;JUMP dec ebx jz rx_finish xor r13, 05058ce64h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_61 call rx_read_l1 rx_body_61: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r11, rax cmp r15d, 1933164545 jns rx_i_120 @@ -1105,13 +1105,13 @@ rx_i_62: ;FPSUB dec ebx jz rx_finish xor r15, 0c3089414h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_62 call rx_read_l1 rx_body_62: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm8 movaps xmm2, xmm0 mov eax, r10d @@ -1123,13 +1123,13 @@ rx_i_63: ;FPSUB dec ebx jz rx_finish xor r9, 065cf272eh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_63 call rx_read_l1 rx_body_63: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 movaps xmm8, xmm0 @@ -1137,13 +1137,13 @@ rx_i_64: ;SUB_64 dec ebx jz rx_finish xor r13, 0ae54dfbfh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_64 call rx_read_l2 rx_body_64: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] sub rax, r15 mov r9, rax @@ -1151,13 +1151,13 @@ rx_i_65: ;JUMP dec ebx jz rx_finish xor r13, 07b366ce6h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_65 call rx_read_l2 rx_body_65: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov r11, rax cmp r8d, 1498056607 js rx_i_129 @@ -1166,14 +1166,14 @@ rx_i_66: ;FPDIV dec 
ebx jz rx_finish xor r15, 015a1b689h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_66 call rx_read_l1 rx_body_66: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1188,13 +1188,13 @@ rx_i_67: ;JUMP dec ebx jz rx_finish xor r14, 088393ba0h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_67 call rx_read_l1 rx_body_67: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r9, rax cmp r13d, 2031541081 jns rx_i_79 @@ -1203,13 +1203,13 @@ rx_i_68: ;FPADD dec ebx jz rx_finish xor r13, 03aa5c3a4h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_68 call rx_read_l2 rx_body_68: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm2 movaps xmm4, xmm0 mov eax, r12d @@ -1221,14 +1221,14 @@ rx_i_69: ;FPADD dec ebx jz rx_finish xor r15, 0376c9c27h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_69 call rx_read_l1 rx_body_69: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm5 movaps xmm8, xmm0 @@ -1236,14 +1236,14 @@ rx_i_70: ;MULH_64 dec ebx jz rx_finish xor r8, 0bbbec3fah - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_70 call rx_read_l1 rx_body_70: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r9 mul rcx mov rax, rdx @@ -1253,13 +1253,13 @@ rx_i_71: ;FPMUL dec ebx jz rx_finish xor r14, 0e9efb350h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_71 call rx_read_l1 rx_body_71: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ 
-1270,13 +1270,13 @@ rx_i_72: ;JUMP dec ebx jz rx_finish xor r13, 0f4e51e28h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_72 call rx_read_l1 rx_body_72: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r11d xor eax, 0da624dd9h @@ -1289,13 +1289,13 @@ rx_i_73: ;FPDIV dec ebx jz rx_finish xor r12, 0c24ddbd4h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_73 call rx_read_l1 rx_body_73: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1306,14 +1306,14 @@ rx_i_74: ;MUL_64 dec ebx jz rx_finish xor r8, 04c4b0c7fh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_74 call rx_read_l1 rx_body_74: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r13 mov rcx, rax mov eax, r9d @@ -1325,13 +1325,13 @@ rx_i_75: ;CALL dec ebx jz rx_finish xor r14, 03bcc02e3h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_75 call rx_read_l1 rx_body_75: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r13, rax cmp r11d, -1160798683 jno short rx_i_76 @@ -1341,13 +1341,13 @@ rx_i_76: ;FPADD dec ebx jz rx_finish xor r11, 04b0ff63eh - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_76 call rx_read_l2 rx_body_76: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 movaps xmm7, xmm0 mov eax, r15d @@ -1359,13 +1359,13 @@ rx_i_77: ;RET dec ebx jz rx_finish xor r14, 0b956b3e8h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_77 call rx_read_l1 rx_body_77: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r11d xor eax, 03a92bc7ah @@ -1379,13 +1379,13 @@ 
rx_i_78: ;MUL_32 dec ebx jz rx_finish xor r9, 0edeca680h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_78 call rx_read_l1 rx_body_78: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r8d imul rax, rcx @@ -1395,13 +1395,13 @@ rx_i_79: ;CALL dec ebx jz rx_finish xor r11, 0fbdddcb5h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_79 call rx_read_l2 rx_body_79: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r11d xor eax, 06b4a7b43h @@ -1415,13 +1415,13 @@ rx_i_80: ;ROR_64 dec ebx jz rx_finish xor r13, 09cec97a1h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_80 call rx_read_l1 rx_body_80: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r11 ror rax, cl mov rcx, rax @@ -1434,13 +1434,13 @@ rx_i_81: ;AND_64 dec ebx jz rx_finish xor r15, 078228167h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_81 call rx_read_l1 rx_body_81: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] and rax, 338325607 mov r8, rax @@ -1448,13 +1448,13 @@ rx_i_82: ;JUMP dec ebx jz rx_finish xor r11, 078cae1ffh - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_82 call rx_read_l1 rx_body_82: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r10d xor eax, 0fbe39afbh @@ -1467,13 +1467,13 @@ rx_i_83: ;IDIV_64 dec ebx jz rx_finish xor r10, 0d9b6a533h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_83 call rx_read_l2 rx_body_83: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 91850728 mov rdx, 842358619687110887 imul rdx @@ -1488,13 +1488,13 @@ rx_i_84: ;SAR_64 dec ebx jz rx_finish xor r15, 0e9e75336h - mov ecx, r15d + mov eax, r15d test 
bl, 63 jnz short rx_body_84 call rx_read_l1 rx_body_84: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sar rax, 45 mov rcx, rax mov eax, r13d @@ -1506,13 +1506,13 @@ rx_i_85: ;MUL_64 dec ebx jz rx_finish xor r13, 04c0d378ah - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_85 call rx_read_l1 rx_body_85: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, rax, 20014507 mov r10, rax @@ -1520,13 +1520,13 @@ rx_i_86: ;AND_64 dec ebx jz rx_finish xor r11, 04386e368h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_86 call rx_read_l1 rx_body_86: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] and rax, r8 mov rcx, rax mov eax, r12d @@ -1538,14 +1538,14 @@ rx_i_87: ;SUB_64 dec ebx jz rx_finish xor r9, 0d75a0ecfh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_87 call rx_read_l1 rx_body_87: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, r12 mov r8, rax @@ -1553,14 +1553,14 @@ rx_i_88: ;ROR_64 dec ebx jz rx_finish xor r9, 031bb7f7ah - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_88 call rx_read_l1 rx_body_88: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r14 ror rax, cl mov r9, rax @@ -1569,13 +1569,13 @@ rx_i_89: ;MUL_64 dec ebx jz rx_finish xor r9, 03b45ecebh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_89 call rx_read_l1 rx_body_89: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r8 mov rcx, rax mov eax, r10d @@ -1587,13 +1587,13 @@ rx_i_90: ;FPADD dec ebx jz rx_finish xor r12, 0ee08e76bh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_90 call rx_read_l2 rx_body_90: - and ecx, 32767 - cvtdq2pd xmm0, 
qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 movaps xmm6, xmm0 @@ -1601,14 +1601,14 @@ rx_i_91: ;FPMUL dec ebx jz rx_finish xor r9, 042e28e94h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_91 call rx_read_l1 rx_body_91: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1619,13 +1619,13 @@ rx_i_92: ;JUMP dec ebx jz rx_finish xor r8, 0729260e1h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_92 call rx_read_l1 rx_body_92: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r12, rax cmp r14d, 1288893603 jge rx_i_170 @@ -1634,14 +1634,14 @@ rx_i_93: ;FPADD dec ebx jz rx_finish xor r8, 0bfcebaf4h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_93 call rx_read_l1 rx_body_93: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm2 movaps xmm2, xmm0 mov eax, r10d @@ -1653,14 +1653,14 @@ rx_i_94: ;CALL dec ebx jz rx_finish xor r13, 0ea326630h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_94 call rx_read_l2 rx_body_94: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov r8, rax cmp r13d, -343122976 jns short rx_i_95 @@ -1670,13 +1670,13 @@ rx_i_95: ;MUL_64 dec ebx jz rx_finish xor r13, 0b5451a2dh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_95 call rx_read_l1 rx_body_95: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r10 mov rcx, rax mov eax, r15d @@ -1688,16 +1688,16 @@ rx_i_96: ;IMUL_32 dec ebx jz rx_finish xor r11, 04f912ef8h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_96 call rx_read_l1 rx_body_96: - 
xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax - mov rax, -1354397081 + movsxd rax, r11d imul rax, rcx mov r11, rax @@ -1705,13 +1705,13 @@ rx_i_97: ;FPDIV dec ebx jz rx_finish xor r15, 0acc45b3bh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_97 call rx_read_l2 rx_body_97: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1726,13 +1726,13 @@ rx_i_98: ;SUB_64 dec ebx jz rx_finish xor r14, 09900a4e8h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_98 call rx_read_l2 rx_body_98: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] sub rax, r15 mov r14, rax @@ -1740,13 +1740,13 @@ rx_i_99: ;FPMUL dec ebx jz rx_finish xor r9, 0841b2984h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_99 call rx_read_l1 rx_body_99: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1761,13 +1761,13 @@ rx_i_100: ;ADD_64 dec ebx jz rx_finish xor r15, 07ebea48fh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_100 call rx_read_l1 rx_body_100: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r9 mov r14, rax @@ -1775,13 +1775,13 @@ rx_i_101: ;SUB_64 dec ebx jz rx_finish xor r10, 0631209d3h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_101 call rx_read_l1 rx_body_101: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, 1732300336 mov r11, rax @@ -1789,13 +1789,13 @@ rx_i_102: ;FPMUL dec ebx jz rx_finish xor r10, 0e50bf07ah - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_102 call rx_read_l1 rx_body_102: - and ecx, 2047 - 
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1806,13 +1806,13 @@ rx_i_103: ;MUL_64 dec ebx jz rx_finish xor r10, 02b7096f1h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_103 call rx_read_l2 rx_body_103: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] imul rax, r13 mov rcx, rax mov eax, r15d @@ -1824,13 +1824,13 @@ rx_i_104: ;DIV_64 dec ebx jz rx_finish xor r11, 075deaf71h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_104 call rx_read_l2 rx_body_104: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 2381897207 mov rcx, 16631314374404138087 mul rcx @@ -1846,14 +1846,14 @@ rx_i_105: ;MUL_32 dec ebx jz rx_finish xor r13, 036a51f72h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_105 call rx_read_l1 rx_body_105: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r15d imul rax, rcx @@ -1867,13 +1867,13 @@ rx_i_106: ;FPMUL dec ebx jz rx_finish xor r11, 07b512986h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_106 call rx_read_l1 rx_body_106: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1888,13 +1888,13 @@ rx_i_107: ;JUMP dec ebx jz rx_finish xor r12, 0f1d2e50h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_107 call rx_read_l1 rx_body_107: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r14d xor eax, 07243ab81h @@ -1907,13 +1907,13 @@ rx_i_108: ;FPMUL dec ebx jz rx_finish xor r9, 07327ba60h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_108 call rx_read_l1 rx_body_108: - and 
ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -1928,14 +1928,14 @@ rx_i_109: ;ROR_64 dec ebx jz rx_finish xor r15, 0594e37deh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_109 call rx_read_l1 rx_body_109: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r10 ror rax, cl mov rcx, rax @@ -1948,13 +1948,13 @@ rx_i_110: ;SHR_64 dec ebx jz rx_finish xor r9, 04cdf5ebah - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_110 call rx_read_l2 rx_body_110: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, r9 shr rax, cl mov rcx, rax @@ -1967,13 +1967,13 @@ rx_i_111: ;CALL dec ebx jz rx_finish xor r8, 02e16c97ch - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_111 call rx_read_l2 rx_body_111: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r12d xor eax, 05d237d0bh @@ -1987,13 +1987,13 @@ rx_i_112: ;SUB_64 dec ebx jz rx_finish xor r12, 0d42ddbd4h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_112 call rx_read_l1 rx_body_112: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, -1025977295 mov rcx, rax mov eax, r14d @@ -2005,13 +2005,13 @@ rx_i_113: ;MULH_64 dec ebx jz rx_finish xor r10, 07a4f8cbbh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_113 call rx_read_l2 rx_body_113: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, r9 mul rcx mov rax, rdx @@ -2021,13 +2021,13 @@ rx_i_114: ;DIV_64 dec ebx jz rx_finish xor r13, 06e83e2cdh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_114 call rx_read_l1 rx_body_114: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + 
and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 770835683 mov rcx, 12847770974664443757 mul rcx @@ -2039,13 +2039,13 @@ rx_i_115: ;IDIV_64 dec ebx jz rx_finish xor r14, 0336c980eh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_115 call rx_read_l1 rx_body_115: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 587029837 mov rdx, 527204905636414983 imul rdx @@ -2060,13 +2060,13 @@ rx_i_116: ;DIV_64 dec ebx jz rx_finish xor r10, 0d122702eh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_116 call rx_read_l1 rx_body_116: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 2444190605 mov rcx, 16207443550472271289 mul rcx @@ -2082,13 +2082,13 @@ rx_i_117: ;IDIV_64 dec ebx jz rx_finish xor r11, 015f2012bh - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_117 call rx_read_l1 rx_body_117: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by -1205826972 mov rdx, -8213052572424165513 imul rdx @@ -2107,13 +2107,13 @@ rx_i_118: ;FPSUB dec ebx jz rx_finish xor r9, 037ddf43dh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_118 call rx_read_l1 rx_body_118: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm5 movaps xmm6, xmm0 @@ -2121,13 +2121,13 @@ rx_i_119: ;FPSUB dec ebx jz rx_finish xor r9, 0bba475f3h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_119 call rx_read_l1 rx_body_119: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm3 movaps xmm5, xmm0 @@ -2135,14 +2135,14 @@ rx_i_120: ;FPADD dec ebx jz rx_finish xor r12, 0e5561e3eh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_120 call rx_read_l2 rx_body_120: - xor rbp, rcx - and ecx, 32767 - 
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm4 movaps xmm8, xmm0 @@ -2150,13 +2150,13 @@ rx_i_121: ;FPSUB dec ebx jz rx_finish xor r9, 03ab8f73h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_121 call rx_read_l2 rx_body_121: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm5 movaps xmm8, xmm0 @@ -2164,13 +2164,13 @@ rx_i_122: ;CALL dec ebx jz rx_finish xor r10, 04e0dbd40h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_122 call rx_read_l2 rx_body_122: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r14d xor eax, 078f6ec29h @@ -2184,13 +2184,13 @@ rx_i_123: ;ADD_32 dec ebx jz rx_finish xor r13, 073e9f58ah - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_123 call rx_read_l1 rx_body_123: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add eax, 1530846772 mov r13, rax @@ -2198,13 +2198,13 @@ rx_i_124: ;JUMP dec ebx jz rx_finish xor r12, 0e3fa3670h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_124 call rx_read_l1 rx_body_124: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r11d xor eax, 0667d921ch @@ -2217,16 +2217,16 @@ rx_i_125: ;IMUL_32 dec ebx jz rx_finish xor r8, 0ebec27cdh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_125 call rx_read_l1 rx_body_125: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax - movsxd rax, r14d + mov rax, 1774711622 imul rax, rcx mov r14, rax @@ -2234,13 +2234,13 @@ rx_i_126: ;FPMUL dec ebx jz rx_finish xor r8, 01feb5264h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_126 call rx_read_l2 rx_body_126: - and ecx, 32767 - 
cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2251,15 +2251,15 @@ rx_i_127: ;IMUL_32 dec ebx jz rx_finish xor r9, 0405f500fh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_127 call rx_read_l1 rx_body_127: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax - movsxd rax, r10d + mov rax, -1027270754 imul rax, rcx mov r8, rax @@ -2267,14 +2267,14 @@ rx_i_128: ;MUL_64 dec ebx jz rx_finish xor r13, 0459f1154h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_128 call rx_read_l1 rx_body_128: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r9 mov r9, rax @@ -2282,13 +2282,13 @@ rx_i_129: ;JUMP dec ebx jz rx_finish xor r9, 081918b4ch - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_129 call rx_read_l1 rx_body_129: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r9, rax cmp r13d, -590624856 jge rx_i_154 @@ -2297,13 +2297,13 @@ rx_i_130: ;IDIV_64 dec ebx jz rx_finish xor r9, 077c3b332h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_130 call rx_read_l1 rx_body_130: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by -281794782 mov rdx, -8786110448882479839 imul rdx @@ -2322,13 +2322,13 @@ rx_i_131: ;RET dec ebx jz rx_finish xor r12, 05792310bh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_131 call rx_read_l1 rx_body_131: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r15d xor eax, 0dff06f75h @@ -2342,13 +2342,13 @@ rx_i_132: ;FPADD dec ebx jz rx_finish xor r10, 0ebc6e10h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_132 call rx_read_l1 rx_body_132: - and 
ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 movaps xmm7, xmm0 @@ -2356,13 +2356,13 @@ rx_i_133: ;OR_64 dec ebx jz rx_finish xor r14, 0822f8b60h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_133 call rx_read_l1 rx_body_133: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] or rax, r13 mov rcx, rax mov eax, r15d @@ -2374,13 +2374,13 @@ rx_i_134: ;ADD_64 dec ebx jz rx_finish xor r10, 0d0f18593h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_134 call rx_read_l1 rx_body_134: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r8 mov r13, rax @@ -2388,13 +2388,13 @@ rx_i_135: ;FPMUL dec ebx jz rx_finish xor r11, 088212ef9h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_135 call rx_read_l1 rx_body_135: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2405,13 +2405,13 @@ rx_i_136: ;FPDIV dec ebx jz rx_finish xor r8, 01ae56e03h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_136 call rx_read_l1 rx_body_136: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2426,14 +2426,14 @@ rx_i_137: ;SHR_64 dec ebx jz rx_finish xor r11, 015a24231h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_137 call rx_read_l2 rx_body_137: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, r9 shr rax, cl mov r11, rax @@ -2442,13 +2442,13 @@ rx_i_138: ;RET dec ebx jz rx_finish xor r13, 02fd380c5h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_138 call rx_read_l1 rx_body_138: - and ecx, 2047 - mov rax, qword ptr 
[rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r10, rax cmp rsp, rdi je short rx_i_139 @@ -2458,14 +2458,14 @@ rx_i_139: ;ADD_64 dec ebx jz rx_finish xor r9, 093172470h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_139 call rx_read_l1 rx_body_139: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r8 mov rcx, rax mov eax, r11d @@ -2477,15 +2477,15 @@ rx_i_140: ;IMUL_32 dec ebx jz rx_finish xor r14, 052543553h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_140 call rx_read_l1 rx_body_140: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax - movsxd rax, r11d + mov rax, -140239781 imul rax, rcx mov r14, rax @@ -2493,14 +2493,14 @@ rx_i_141: ;FPADD dec ebx jz rx_finish xor r8, 02f636da1h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_141 call rx_read_l1 rx_body_141: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm2 movaps xmm9, xmm0 mov eax, r9d @@ -2512,13 +2512,13 @@ rx_i_142: ;JUMP dec ebx jz rx_finish xor r11, 0b11a4f2ch - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_142 call rx_read_l1 rx_body_142: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r10d xor eax, 0516a9452h @@ -2531,13 +2531,13 @@ rx_i_143: ;IMUL_32 dec ebx jz rx_finish xor r15, 037f4b5d0h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_143 call rx_read_l1 rx_body_143: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax movsxd rax, r11d imul rax, rcx @@ -2547,13 +2547,13 @@ rx_i_144: ;DIV_64 dec ebx jz rx_finish xor r10, 02e59e00ah - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_144 call rx_read_l1 rx_body_144: - and 
ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, 1 mov edx, r11d test edx, edx @@ -2566,13 +2566,13 @@ rx_i_145: ;DIV_64 dec ebx jz rx_finish xor r13, 08d5c798h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_145 call rx_read_l1 rx_body_145: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 3712555397 mov rcx, 10670300378317066981 mul rcx @@ -2588,14 +2588,14 @@ rx_i_146: ;IMULH_64 dec ebx jz rx_finish xor r13, 02327e6e2h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_146 call rx_read_l1 rx_body_146: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r12 imul rcx mov rax, rdx @@ -2605,14 +2605,14 @@ rx_i_147: ;MUL_64 dec ebx jz rx_finish xor r13, 03a7df043h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_147 call rx_read_l1 rx_body_147: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r11 mov rcx, rax mov eax, r12d @@ -2624,13 +2624,13 @@ rx_i_148: ;SUB_64 dec ebx jz rx_finish xor r10, 0783e5c4eh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_148 call rx_read_l1 rx_body_148: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, r14 mov rcx, rax mov eax, r10d @@ -2642,13 +2642,13 @@ rx_i_149: ;MUL_32 dec ebx jz rx_finish xor r12, 0aa0f5b2fh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_149 call rx_read_l1 rx_body_149: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r14d imul rax, rcx @@ -2662,13 +2662,13 @@ rx_i_150: ;DIV_64 dec ebx jz rx_finish xor r9, 01504ca7ah - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_150 call rx_read_l1 rx_body_150: - and ecx, 2047 - mov rax, 
qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, 1 mov edx, r8d test edx, edx @@ -2685,13 +2685,13 @@ rx_i_151: ;AND_64 dec ebx jz rx_finish xor r9, 0ea72a7cfh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_151 call rx_read_l1 rx_body_151: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] and rax, -2018584590 mov rcx, rax mov eax, r11d @@ -2703,13 +2703,13 @@ rx_i_152: ;SAR_64 dec ebx jz rx_finish xor r13, 0ad0e7a88h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_152 call rx_read_l1 rx_body_152: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r10 sar rax, cl mov r10, rax @@ -2718,13 +2718,13 @@ rx_i_153: ;FPMUL dec ebx jz rx_finish xor r15, 0fd95ab87h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_153 call rx_read_l1 rx_body_153: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2739,13 +2739,13 @@ rx_i_154: ;MUL_32 dec ebx jz rx_finish xor r10, 0256697b0h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_154 call rx_read_l1 rx_body_154: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, -820047839 imul rax, rcx @@ -2755,13 +2755,13 @@ rx_i_155: ;ROL_64 dec ebx jz rx_finish xor r11, 0d23f3b78h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_155 call rx_read_l2 rx_body_155: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, r10 rol rax, cl mov rcx, rax @@ -2774,13 +2774,13 @@ rx_i_156: ;IMUL_32 dec ebx jz rx_finish xor r10, 098917533h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_156 call rx_read_l1 rx_body_156: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr 
[rsi+rax*8] movsxd rcx, eax movsxd rax, r15d imul rax, rcx @@ -2790,13 +2790,13 @@ rx_i_157: ;ADD_64 dec ebx jz rx_finish xor r10, 0dfac3efch - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_157 call rx_read_l1 rx_body_157: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r12 mov r14, rax @@ -2804,13 +2804,13 @@ rx_i_158: ;ADD_64 dec ebx jz rx_finish xor r15, 0a64de090h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_158 call rx_read_l1 rx_body_158: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r13 mov r10, rax @@ -2818,13 +2818,13 @@ rx_i_159: ;CALL dec ebx jz rx_finish xor r13, 0952a3abbh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_159 call rx_read_l2 rx_body_159: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r13d xor eax, 0ff7d3697h @@ -2838,14 +2838,14 @@ rx_i_160: ;SUB_64 dec ebx jz rx_finish xor r14, 0b1685b90h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_160 call rx_read_l1 rx_body_160: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, r14 mov rcx, rax mov eax, r10d @@ -2857,13 +2857,13 @@ rx_i_161: ;IDIV_64 dec ebx jz rx_finish xor r15, 0ea992531h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_161 call rx_read_l2 rx_body_161: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov edx, r14d cmp edx, -1 jne short body_idiv_161 @@ -2883,13 +2883,13 @@ rx_i_162: ;SHL_64 dec ebx jz rx_finish xor r9, 01fd57a4ah - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_162 call rx_read_l1 rx_body_162: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] shl rax, 7 mov r13, rax @@ -2897,14 +2897,14 @@ rx_i_163: ;SUB_64 
dec ebx jz rx_finish xor r12, 0e3486c0ah - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_163 call rx_read_l1 rx_body_163: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, r8 mov rcx, rax mov eax, r14d @@ -2916,14 +2916,14 @@ rx_i_164: ;MUL_32 dec ebx jz rx_finish xor r12, 01f0c2737h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_164 call rx_read_l1 rx_body_164: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r9d imul rax, rcx @@ -2937,13 +2937,13 @@ rx_i_165: ;RET dec ebx jz rx_finish xor r12, 0debb493eh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_165 call rx_read_l1 rx_body_165: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r9d xor eax, 06450685ch @@ -2957,13 +2957,13 @@ rx_i_166: ;SHR_64 dec ebx jz rx_finish xor r9, 0fe684081h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_166 call rx_read_l1 rx_body_166: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] shr rax, 62 mov rcx, rax mov eax, r13d @@ -2975,13 +2975,13 @@ rx_i_167: ;FPMUL dec ebx jz rx_finish xor r11, 0d10371ch - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_167 call rx_read_l1 rx_body_167: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -2996,13 +2996,13 @@ rx_i_168: ;FPDIV dec ebx jz rx_finish xor r12, 071b15effh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_168 call rx_read_l1 rx_body_168: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3013,13 +3013,13 @@ rx_i_169: 
;CALL dec ebx jz rx_finish xor r11, 072790347h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_169 call rx_read_l1 rx_body_169: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r14d xor eax, 0b353bf8dh @@ -3033,13 +3033,13 @@ rx_i_170: ;FPSQRT dec ebx jz rx_finish xor r8, 04ae8a020h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_170 call rx_read_l1 rx_body_170: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm6, xmm0 mov eax, r14d @@ -3051,13 +3051,13 @@ rx_i_171: ;DIV_64 dec ebx jz rx_finish xor r15, 09901e05bh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_171 call rx_read_l1 rx_body_171: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 2064150457 add rax, 1 sbb rax, 0 @@ -3071,13 +3071,13 @@ rx_i_172: ;SUB_64 dec ebx jz rx_finish xor r13, 050e8c510h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_172 call rx_read_l1 rx_body_172: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, -478081934 mov r12, rax @@ -3085,14 +3085,14 @@ rx_i_173: ;MUL_64 dec ebx jz rx_finish xor r14, 05422cf8fh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_173 call rx_read_l1 rx_body_173: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, rax, -1386172772 mov rcx, rax mov eax, r12d @@ -3104,13 +3104,13 @@ rx_i_174: ;FPDIV dec ebx jz rx_finish xor r12, 0a025c3dbh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_174 call rx_read_l1 rx_body_174: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3125,13 +3125,13 @@ rx_i_175: 
;XOR_32 dec ebx jz rx_finish xor r13, 08f74c11h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_175 call rx_read_l1 rx_body_175: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] xor eax, r8d mov r8, rax @@ -3139,13 +3139,13 @@ rx_i_176: ;SUB_64 dec ebx jz rx_finish xor r9, 01f2ed5f1h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_176 call rx_read_l1 rx_body_176: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, -2101315181 mov r10, rax @@ -3153,13 +3153,13 @@ rx_i_177: ;ADD_64 dec ebx jz rx_finish xor r10, 0d2072c79h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_177 call rx_read_l1 rx_body_177: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, 794235831 mov rcx, rax mov eax, r13d @@ -3171,13 +3171,13 @@ rx_i_178: ;RET dec ebx jz rx_finish xor r15, 0a8e51933h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_178 call rx_read_l2 rx_body_178: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r12d xor eax, 0c366b275h @@ -3191,13 +3191,13 @@ rx_i_179: ;FPADD dec ebx jz rx_finish xor r12, 0934ad492h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_179 call rx_read_l2 rx_body_179: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm2 movaps xmm8, xmm0 @@ -3205,13 +3205,13 @@ rx_i_180: ;AND_32 dec ebx jz rx_finish xor r15, 01cb3ce1fh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_180 call rx_read_l1 rx_body_180: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] and eax, r9d mov rcx, rax mov eax, r9d @@ -3223,14 +3223,14 @@ rx_i_181: ;CALL dec ebx jz rx_finish xor r10, 023c7845fh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short 
rx_body_181 call rx_read_l1 rx_body_181: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r10, rax cmp r12d, -1612576918 jbe short rx_i_182 @@ -3240,14 +3240,14 @@ rx_i_182: ;FPSUB dec ebx jz rx_finish xor r8, 0f8884327h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_182 call rx_read_l1 rx_body_182: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 movaps xmm6, xmm0 @@ -3255,13 +3255,13 @@ rx_i_183: ;ADD_64 dec ebx jz rx_finish xor r13, 013070461h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_183 call rx_read_l1 rx_body_183: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r11 mov r10, rax @@ -3269,14 +3269,14 @@ rx_i_184: ;XOR_32 dec ebx jz rx_finish xor r12, 04764cdf7h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_184 call rx_read_l1 rx_body_184: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] xor eax, r13d mov r12, rax @@ -3284,13 +3284,13 @@ rx_i_185: ;JUMP dec ebx jz rx_finish xor r10, 03c41026fh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_185 call rx_read_l2 rx_body_185: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r9d xor eax, 0a5fae4a3h @@ -3303,14 +3303,14 @@ rx_i_186: ;OR_64 dec ebx jz rx_finish xor r9, 0cded414bh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_186 call rx_read_l1 rx_body_186: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] or rax, -1252263008 mov rcx, rax mov eax, r10d @@ -3322,13 +3322,13 @@ rx_i_187: ;FPMUL dec ebx jz rx_finish xor r13, 05c6d64a8h - mov ecx, r13d + mov eax, r13d test 
bl, 63 jnz short rx_body_187 call rx_read_l1 rx_body_187: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3339,14 +3339,14 @@ rx_i_188: ;FPSUB dec ebx jz rx_finish xor r9, 04659becbh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_188 call rx_read_l2 rx_body_188: - xor rbp, rcx - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm3 movaps xmm4, xmm0 @@ -3354,13 +3354,13 @@ rx_i_189: ;FPDIV dec ebx jz rx_finish xor r11, 0c52741d5h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_189 call rx_read_l1 rx_body_189: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm7 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3371,13 +3371,13 @@ rx_i_190: ;RET dec ebx jz rx_finish xor r12, 0217bf5f3h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_190 call rx_read_l1 rx_body_190: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r13, rax cmp rsp, rdi je short rx_i_191 @@ -3387,13 +3387,13 @@ rx_i_191: ;FPSQRT dec ebx jz rx_finish xor r15, 0884f3526h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_191 call rx_read_l2 rx_body_191: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm6, xmm0 @@ -3401,13 +3401,13 @@ rx_i_192: ;FPSQRT dec ebx jz rx_finish xor r8, 0d76edad3h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_192 call rx_read_l1 rx_body_192: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm8, xmm0 mov eax, r8d @@ -3419,13 +3419,13 @@ rx_i_193: ;MUL_32 dec ebx jz rx_finish xor r12, 0e9939ach - mov ecx, r12d + mov eax, 
r12d test bl, 63 jnz short rx_body_193 call rx_read_l1 rx_body_193: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r12d imul rax, rcx @@ -3439,13 +3439,13 @@ rx_i_194: ;FPMUL dec ebx jz rx_finish xor r12, 0f21ca520h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_194 call rx_read_l1 rx_body_194: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3460,13 +3460,13 @@ rx_i_195: ;SHL_64 dec ebx jz rx_finish xor r10, 09405152ch - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_195 call rx_read_l1 rx_body_195: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] shl rax, 27 mov r9, rax @@ -3474,13 +3474,13 @@ rx_i_196: ;SUB_64 dec ebx jz rx_finish xor r8, 0c2a9f41bh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_196 call rx_read_l1 rx_body_196: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, r8 mov rcx, rax mov eax, r13d @@ -3492,14 +3492,14 @@ rx_i_197: ;MUL_64 dec ebx jz rx_finish xor r12, 0229208efh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_197 call rx_read_l2 rx_body_197: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] imul rax, r15 mov r11, rax @@ -3507,13 +3507,13 @@ rx_i_198: ;MULH_64 dec ebx jz rx_finish xor r14, 0c8d95bbbh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_198 call rx_read_l2 rx_body_198: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, r14 mul rcx mov rax, rdx @@ -3527,13 +3527,13 @@ rx_i_199: ;MULH_64 dec ebx jz rx_finish xor r13, 050049e2eh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_199 call rx_read_l1 rx_body_199: - and ecx, 
2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r10 mul rcx mov rax, rdx @@ -3547,13 +3547,13 @@ rx_i_200: ;FPSUB dec ebx jz rx_finish xor r10, 0c63b99e8h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_200 call rx_read_l1 rx_body_200: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 movaps xmm4, xmm0 mov eax, r12d @@ -3565,14 +3565,14 @@ rx_i_201: ;FPADD dec ebx jz rx_finish xor r8, 0cdda801dh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_201 call rx_read_l1 rx_body_201: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 movaps xmm4, xmm0 mov eax, r12d @@ -3584,13 +3584,13 @@ rx_i_202: ;FPADD dec ebx jz rx_finish xor r13, 0fa44b04ah - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_202 call rx_read_l1 rx_body_202: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 movaps xmm5, xmm0 @@ -3598,13 +3598,13 @@ rx_i_203: ;FPSUB dec ebx jz rx_finish xor r10, 0d73e472ch - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_203 call rx_read_l2 rx_body_203: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 movaps xmm7, xmm0 mov eax, r15d @@ -3616,13 +3616,13 @@ rx_i_204: ;MUL_64 dec ebx jz rx_finish xor r9, 01af8ab1dh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_204 call rx_read_l2 rx_body_204: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] imul rax, r15 mov rcx, rax mov eax, r8d @@ -3634,14 +3634,14 @@ rx_i_205: ;FPMUL dec ebx jz rx_finish xor r14, 094e997c5h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_205 call rx_read_l1 rx_body_205: - xor rbp, rcx - and ecx, 
2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3652,14 +3652,14 @@ rx_i_206: ;FPSUB dec ebx jz rx_finish xor r11, 0e836a177h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_206 call rx_read_l1 rx_body_206: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 movaps xmm4, xmm0 @@ -3667,14 +3667,14 @@ rx_i_207: ;IDIV_64 dec ebx jz rx_finish xor r9, 039ccdd30h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_207 call rx_read_l1 rx_body_207: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 314297476 mov rdx, 1969376361274661135 imul rdx @@ -3693,13 +3693,13 @@ rx_i_208: ;MUL_64 dec ebx jz rx_finish xor r9, 0f4f126c5h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_208 call rx_read_l1 rx_body_208: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, rax, -486588965 mov r10, rax @@ -3707,14 +3707,14 @@ rx_i_209: ;XOR_64 dec ebx jz rx_finish xor r8, 0b84811f1h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_209 call rx_read_l1 rx_body_209: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] xor rax, r15 mov rcx, rax mov eax, r12d @@ -3726,14 +3726,14 @@ rx_i_210: ;MUL_32 dec ebx jz rx_finish xor r12, 0c5efc90ah - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_210 call rx_read_l1 rx_body_210: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r12d imul rax, rcx @@ -3747,13 +3747,13 @@ rx_i_211: ;ROR_64 dec ebx jz rx_finish xor r12, 0ce533072h - mov ecx, r12d + mov 
eax, r12d test bl, 63 jnz short rx_body_211 call rx_read_l1 rx_body_211: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r9 ror rax, cl mov rcx, rax @@ -3766,14 +3766,14 @@ rx_i_212: ;MUL_64 dec ebx jz rx_finish xor r13, 06b465fdbh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_212 call rx_read_l1 rx_body_212: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r13 mov rcx, rax mov eax, r15d @@ -3785,16 +3785,16 @@ rx_i_213: ;IMUL_32 dec ebx jz rx_finish xor r13, 02dd1d503h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_213 call rx_read_l1 rx_body_213: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax - mov rax, 129993589 + movsxd rax, r14d imul rax, rcx mov r14, rax @@ -3802,13 +3802,13 @@ rx_i_214: ;SHL_64 dec ebx jz rx_finish xor r9, 0a159f313h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_214 call rx_read_l1 rx_body_214: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r14 shl rax, cl mov r14, rax @@ -3817,14 +3817,14 @@ rx_i_215: ;ADD_32 dec ebx jz rx_finish xor r15, 08359265eh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_215 call rx_read_l1 rx_body_215: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add eax, r12d mov r10, rax @@ -3832,13 +3832,13 @@ rx_i_216: ;MUL_64 dec ebx jz rx_finish xor r12, 080696de3h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_216 call rx_read_l2 rx_body_216: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] imul rax, r13 mov rcx, rax mov eax, r15d @@ -3850,13 +3850,13 @@ rx_i_217: ;IMUL_32 dec ebx jz rx_finish xor r8, 040d5b526h - mov 
ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_217 call rx_read_l1 rx_body_217: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax movsxd rax, r9d imul rax, rcx @@ -3870,13 +3870,13 @@ rx_i_218: ;FPSQRT dec ebx jz rx_finish xor r11, 083c0bd93h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_218 call rx_read_l1 rx_body_218: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm3, xmm0 mov eax, r11d @@ -3888,13 +3888,13 @@ rx_i_219: ;OR_64 dec ebx jz rx_finish xor r8, 0ca37f668h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_219 call rx_read_l1 rx_body_219: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] or rax, r10 mov rcx, rax mov eax, r15d @@ -3906,13 +3906,13 @@ rx_i_220: ;IMUL_32 dec ebx jz rx_finish xor r9, 0bb44c384h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_220 call rx_read_l1 rx_body_220: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax movsxd rax, r11d imul rax, rcx @@ -3926,13 +3926,13 @@ rx_i_221: ;DIV_64 dec ebx jz rx_finish xor r9, 0a3deb512h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_221 call rx_read_l1 rx_body_221: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, 1 mov edx, r15d test edx, edx @@ -3949,13 +3949,13 @@ rx_i_222: ;FPMUL dec ebx jz rx_finish xor r9, 084a02d64h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_222 call rx_read_l1 rx_body_222: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -3970,14 +3970,14 @@ rx_i_223: ;FPSUB dec ebx jz rx_finish xor r8, 01e5cc085h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_223 
call rx_read_l1 rx_body_223: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm3 movaps xmm2, xmm0 mov eax, r10d @@ -3989,13 +3989,13 @@ rx_i_224: ;XOR_32 dec ebx jz rx_finish xor r12, 053982440h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_224 call rx_read_l2 rx_body_224: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] xor eax, -452933987 mov rcx, rax mov eax, r11d @@ -4007,13 +4007,13 @@ rx_i_225: ;DIV_64 dec ebx jz rx_finish xor r13, 0c558367eh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_225 call rx_read_l1 rx_body_225: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 4264577610 shr rax, 1 mov rcx, 9289098447696480965 @@ -4030,13 +4030,13 @@ rx_i_226: ;JUMP dec ebx jz rx_finish xor r10, 040139b65h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_226 call rx_read_l1 rx_body_226: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r8d xor eax, 0978b2498h @@ -4049,13 +4049,13 @@ rx_i_227: ;FPMUL dec ebx jz rx_finish xor r11, 0fa312dbdh - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_227 call rx_read_l2 rx_body_227: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm7 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4070,13 +4070,13 @@ rx_i_228: ;FPSQRT dec ebx jz rx_finish xor r11, 0b64246c0h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_228 call rx_read_l1 rx_body_228: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm7, xmm0 @@ -4084,13 +4084,13 @@ rx_i_229: ;IMULH_64 dec ebx jz rx_finish xor r11, 05c535836h - mov ecx, r11d + mov eax, r11d test 
bl, 63 jnz short rx_body_229 call rx_read_l1 rx_body_229: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, 334017248 imul rcx mov rax, rdx @@ -4104,13 +4104,13 @@ rx_i_230: ;FPMUL dec ebx jz rx_finish xor r15, 0f394972eh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_230 call rx_read_l1 rx_body_230: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4125,14 +4125,14 @@ rx_i_231: ;RET dec ebx jz rx_finish xor r9, 0bb56428dh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_231 call rx_read_l1 rx_body_231: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r9d xor eax, 0e6c9edaah @@ -4146,13 +4146,13 @@ rx_i_232: ;FPMUL dec ebx jz rx_finish xor r15, 09ab46ab3h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_232 call rx_read_l1 rx_body_232: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4163,14 +4163,14 @@ rx_i_233: ;JUMP dec ebx jz rx_finish xor r13, 08eb2cd76h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_233 call rx_read_l1 rx_body_233: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r14, rax cmp r12d, 392389867 jo rx_i_268 @@ -4179,13 +4179,13 @@ rx_i_234: ;FPDIV dec ebx jz rx_finish xor r15, 0ba687578h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_234 call rx_read_l1 rx_body_234: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4196,15 +4196,15 @@ rx_i_235: ;IMUL_32 dec ebx jz rx_finish xor r13, 0b6cb9ff2h 
- mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_235 call rx_read_l1 rx_body_235: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax - movsxd rax, r12d + mov rax, 212286089 imul rax, rcx mov rcx, rax mov eax, r15d @@ -4216,13 +4216,13 @@ rx_i_236: ;FPADD dec ebx jz rx_finish xor r15, 03ad196ach - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_236 call rx_read_l1 rx_body_236: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm4 movaps xmm3, xmm0 @@ -4230,13 +4230,13 @@ rx_i_237: ;JUMP dec ebx jz rx_finish xor r15, 0fab4600h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_237 call rx_read_l1 rx_body_237: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r11, rax cmp r12d, -121899164 jge rx_i_295 @@ -4245,14 +4245,14 @@ rx_i_238: ;FPADD dec ebx jz rx_finish xor r8, 0158f119fh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_238 call rx_read_l1 rx_body_238: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 movaps xmm7, xmm0 mov eax, r15d @@ -4264,13 +4264,13 @@ rx_i_239: ;ADD_64 dec ebx jz rx_finish xor r13, 044f30b3fh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_239 call rx_read_l1 rx_body_239: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r10 mov r10, rax @@ -4278,14 +4278,14 @@ rx_i_240: ;IMULH_64 dec ebx jz rx_finish xor r9, 0d65d29f9h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_240 call rx_read_l2 rx_body_240: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, r14 imul rcx mov rax, rdx @@ -4295,13 +4295,13 @@ rx_i_241: ;FPADD dec ebx jz 
rx_finish xor r11, 0ce5260adh - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_241 call rx_read_l1 rx_body_241: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 movaps xmm7, xmm0 mov eax, r15d @@ -4313,13 +4313,13 @@ rx_i_242: ;MUL_32 dec ebx jz rx_finish xor r12, 01119b0f9h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_242 call rx_read_l1 rx_body_242: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r12d imul rax, rcx @@ -4333,13 +4333,13 @@ rx_i_243: ;OR_64 dec ebx jz rx_finish xor r12, 0d6c2ce3dh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_243 call rx_read_l1 rx_body_243: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] or rax, r9 mov r14, rax @@ -4347,13 +4347,13 @@ rx_i_244: ;ROR_64 dec ebx jz rx_finish xor r11, 0c6a6248h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_244 call rx_read_l1 rx_body_244: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r14 ror rax, cl mov rcx, rax @@ -4366,14 +4366,14 @@ rx_i_245: ;AND_32 dec ebx jz rx_finish xor r13, 084505739h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_245 call rx_read_l2 rx_body_245: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] and eax, r10d mov rcx, rax mov eax, r12d @@ -4385,14 +4385,14 @@ rx_i_246: ;IDIV_64 dec ebx jz rx_finish xor r15, 027eeaa2eh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_246 call rx_read_l1 rx_body_246: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by -156808488 mov rdx, -3947299202596036367 imul rdx @@ -4407,13 +4407,13 @@ rx_i_247: ;IMUL_32 dec ebx jz 
rx_finish xor r10, 0c4de0296h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_247 call rx_read_l1 rx_body_247: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax movsxd rax, r14d imul rax, rcx @@ -4427,14 +4427,14 @@ rx_i_248: ;MUL_32 dec ebx jz rx_finish xor r8, 0649df46fh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_248 call rx_read_l1 rx_body_248: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r15d imul rax, rcx @@ -4448,14 +4448,14 @@ rx_i_249: ;IMULH_64 dec ebx jz rx_finish xor r15, 0499552cch - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_249 call rx_read_l1 rx_body_249: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, -508571655 imul rcx mov rax, rdx @@ -4469,13 +4469,13 @@ rx_i_250: ;MUL_64 dec ebx jz rx_finish xor r13, 083eafe6fh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_250 call rx_read_l1 rx_body_250: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r8 mov rcx, rax mov eax, r14d @@ -4487,13 +4487,13 @@ rx_i_251: ;FPMUL dec ebx jz rx_finish xor r13, 0a25a4d8ah - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_251 call rx_read_l1 rx_body_251: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4508,13 +4508,13 @@ rx_i_252: ;SHL_64 dec ebx jz rx_finish xor r14, 08a75ad41h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_252 call rx_read_l1 rx_body_252: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] shl rax, 53 mov r14, rax @@ -4522,13 +4522,13 @@ rx_i_253: ;CALL dec ebx jz rx_finish xor r14, 
057f3f596h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_253 call rx_read_l1 rx_body_253: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r13d xor eax, 0654b460bh @@ -4542,13 +4542,13 @@ rx_i_254: ;FPADD dec ebx jz rx_finish xor r14, 04cfb709eh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_254 call rx_read_l1 rx_body_254: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm4 movaps xmm8, xmm0 mov eax, r8d @@ -4560,13 +4560,13 @@ rx_i_255: ;FPADD dec ebx jz rx_finish xor r9, 0b96ec9ech - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_255 call rx_read_l2 rx_body_255: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm5 movaps xmm6, xmm0 mov eax, r14d @@ -4578,14 +4578,14 @@ rx_i_256: ;MULH_64 dec ebx jz rx_finish xor r8, 08375472ch - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_256 call rx_read_l1 rx_body_256: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r15 mul rcx mov rax, rdx @@ -4599,13 +4599,13 @@ rx_i_257: ;FPADD dec ebx jz rx_finish xor r12, 0d75a8c3fh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_257 call rx_read_l1 rx_body_257: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm5 movaps xmm3, xmm0 mov eax, r11d @@ -4617,14 +4617,14 @@ rx_i_258: ;MUL_32 dec ebx jz rx_finish xor r11, 064fdbda0h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_258 call rx_read_l2 rx_body_258: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r14d imul rax, rcx @@ -4638,13 +4638,13 @@ rx_i_259: ;FPADD dec ebx jz 
rx_finish xor r11, 02e36a073h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_259 call rx_read_l1 rx_body_259: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 movaps xmm3, xmm0 @@ -4652,14 +4652,14 @@ rx_i_260: ;FPSUB dec ebx jz rx_finish xor r13, 0f94e9fa9h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_260 call rx_read_l1 rx_body_260: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm5 movaps xmm9, xmm0 @@ -4667,14 +4667,14 @@ rx_i_261: ;FPDIV dec ebx jz rx_finish xor r14, 02346171ch - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_261 call rx_read_l1 rx_body_261: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4689,14 +4689,14 @@ rx_i_262: ;AND_64 dec ebx jz rx_finish xor r10, 01c42baa6h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_262 call rx_read_l1 rx_body_262: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] and rax, -1569587450 mov rcx, rax mov eax, r11d @@ -4708,14 +4708,14 @@ rx_i_263: ;FPMUL dec ebx jz rx_finish xor r11, 0b39b140h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_263 call rx_read_l2 rx_body_263: - xor rbp, rcx - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4726,13 +4726,13 @@ rx_i_264: ;FPMUL dec ebx jz rx_finish xor r11, 01a07d201h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_264 call rx_read_l1 rx_body_264: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr 
[rsi+rax*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4743,14 +4743,14 @@ rx_i_265: ;FPADD dec ebx jz rx_finish xor r13, 07a3eb340h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_265 call rx_read_l1 rx_body_265: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm8 movaps xmm2, xmm0 mov eax, r10d @@ -4762,13 +4762,13 @@ rx_i_266: ;CALL dec ebx jz rx_finish xor r13, 03d0a3a89h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_266 call rx_read_l2 rx_body_266: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov r10, rax cmp r12d, 136160027 jbe short rx_i_267 @@ -4778,13 +4778,13 @@ rx_i_267: ;ROL_64 dec ebx jz rx_finish xor r8, 0c6c7b37h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_267 call rx_read_l1 rx_body_267: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r10 rol rax, cl mov r11, rax @@ -4793,14 +4793,14 @@ rx_i_268: ;JUMP dec ebx jz rx_finish xor r12, 0c2510cebh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_268 call rx_read_l1 rx_body_268: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r13, rax cmp r15d, -2062812966 jl rx_i_381 @@ -4809,13 +4809,13 @@ rx_i_269: ;ROL_64 dec ebx jz rx_finish xor r11, 0c80cc899h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_269 call rx_read_l1 rx_body_269: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] rol rax, 50 mov rcx, rax mov eax, r10d @@ -4827,13 +4827,13 @@ rx_i_270: ;FPMUL dec ebx jz rx_finish xor r11, 0eb355caah - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_270 call rx_read_l1 rx_body_270: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd 
xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -4844,13 +4844,13 @@ rx_i_271: ;MUL_32 dec ebx jz rx_finish xor r13, 0c6f12299h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_271 call rx_read_l1 rx_body_271: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r10d imul rax, rcx @@ -4864,13 +4864,13 @@ rx_i_272: ;AND_64 dec ebx jz rx_finish xor r12, 0695a5dd2h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_272 call rx_read_l1 rx_body_272: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] and rax, r12 mov r13, rax @@ -4878,14 +4878,14 @@ rx_i_273: ;JUMP dec ebx jz rx_finish xor r9, 0d315e4dch - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_273 call rx_read_l1 rx_body_273: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r13d xor eax, 063972038h @@ -4898,13 +4898,13 @@ rx_i_274: ;FPADD dec ebx jz rx_finish xor r15, 0b66ca7e0h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_274 call rx_read_l1 rx_body_274: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm4 movaps xmm6, xmm0 mov eax, r14d @@ -4916,14 +4916,14 @@ rx_i_275: ;IDIV_64 dec ebx jz rx_finish xor r10, 0788eceb7h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_275 call rx_read_l1 rx_body_275: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by -333089764 mov rdx, -7433071640624659213 imul rdx @@ -4938,14 +4938,14 @@ rx_i_276: ;JUMP dec ebx jz rx_finish xor r9, 0c6ac5edah - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_276 call rx_read_l2 rx_body_276: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + 
xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r12d xor eax, 0b65161a6h @@ -4958,13 +4958,13 @@ rx_i_277: ;IMUL_32 dec ebx jz rx_finish xor r11, 0c9549789h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_277 call rx_read_l1 rx_body_277: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax movsxd rax, r10d imul rax, rcx @@ -4978,13 +4978,13 @@ rx_i_278: ;FPSUB dec ebx jz rx_finish xor r9, 0a2bc66c9h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_278 call rx_read_l1 rx_body_278: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 movaps xmm4, xmm0 mov eax, r12d @@ -4996,13 +4996,13 @@ rx_i_279: ;FPADD dec ebx jz rx_finish xor r15, 0f1a91458h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_279 call rx_read_l2 rx_body_279: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm5 movaps xmm9, xmm0 mov eax, r9d @@ -5014,13 +5014,13 @@ rx_i_280: ;IDIV_64 dec ebx jz rx_finish xor r12, 066246b43h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_280 call rx_read_l1 rx_body_280: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 555412224 mov rdx, 2228867111296024113 imul rdx @@ -5039,13 +5039,13 @@ rx_i_281: ;SUB_64 dec ebx jz rx_finish xor r10, 05a762727h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_281 call rx_read_l1 rx_body_281: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, -202979002 mov rcx, rax mov eax, r11d @@ -5057,13 +5057,13 @@ rx_i_282: ;SUB_64 dec ebx jz rx_finish xor r15, 0de1ab603h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_282 call rx_read_l1 rx_body_282: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + 
and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, r12 mov r11, rax @@ -5071,14 +5071,14 @@ rx_i_283: ;ADD_64 dec ebx jz rx_finish xor r9, 0df4d084fh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_283 call rx_read_l2 rx_body_283: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] add rax, r12 mov rcx, rax mov eax, r12d @@ -5090,14 +5090,14 @@ rx_i_284: ;FPADD dec ebx jz rx_finish xor r15, 0e68f36ach - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_284 call rx_read_l1 rx_body_284: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 movaps xmm9, xmm0 mov eax, r9d @@ -5109,13 +5109,13 @@ rx_i_285: ;IMUL_32 dec ebx jz rx_finish xor r8, 09adb333bh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_285 call rx_read_l1 rx_body_285: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax movsxd rax, r8d imul rax, rcx @@ -5125,13 +5125,13 @@ rx_i_286: ;ROL_64 dec ebx jz rx_finish xor r14, 082f5e36ch - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_286 call rx_read_l1 rx_body_286: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r9 rol rax, cl mov rcx, rax @@ -5144,14 +5144,14 @@ rx_i_287: ;IDIV_64 dec ebx jz rx_finish xor r11, 049547c9ch - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_287 call rx_read_l1 rx_body_287: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 1227278330 mov rdx, 8069498232143512385 imul rdx @@ -5170,13 +5170,13 @@ rx_i_288: ;MUL_64 dec ebx jz rx_finish xor r10, 08716ac8bh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_288 call rx_read_l1 rx_body_288: - and ecx, 2047 - mov rax, 
qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r8 mov rcx, rax mov eax, r9d @@ -5188,13 +5188,13 @@ rx_i_289: ;FPMUL dec ebx jz rx_finish xor r14, 0efef52b5h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_289 call rx_read_l1 rx_body_289: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5205,13 +5205,13 @@ rx_i_290: ;FPSUB dec ebx jz rx_finish xor r15, 060665748h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_290 call rx_read_l1 rx_body_290: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm8 movaps xmm9, xmm0 @@ -5219,13 +5219,13 @@ rx_i_291: ;RET dec ebx jz rx_finish xor r13, 0ddf4bd1ah - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_291 call rx_read_l1 rx_body_291: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r14d xor eax, 0768a9d75h @@ -5239,13 +5239,13 @@ rx_i_292: ;ROL_64 dec ebx jz rx_finish xor r13, 05a87cc3dh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_292 call rx_read_l2 rx_body_292: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, r8 rol rax, cl mov r10, rax @@ -5254,13 +5254,13 @@ rx_i_293: ;FPADD dec ebx jz rx_finish xor r9, 0c61f4279h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_293 call rx_read_l1 rx_body_293: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm5 movaps xmm8, xmm0 @@ -5268,13 +5268,13 @@ rx_i_294: ;RET dec ebx jz rx_finish xor r14, 0f3b9d85h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_294 call rx_read_l1 rx_body_294: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] 
mov rcx, rax mov eax, r8d xor eax, 0ef8571b7h @@ -5288,14 +5288,14 @@ rx_i_295: ;FPSUB dec ebx jz rx_finish xor r9, 0f42798fdh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_295 call rx_read_l2 rx_body_295: - xor rbp, rcx - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm8 movaps xmm7, xmm0 @@ -5303,13 +5303,13 @@ rx_i_296: ;FPSQRT dec ebx jz rx_finish xor r14, 018738758h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_296 call rx_read_l2 rx_body_296: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm8, xmm0 @@ -5317,13 +5317,13 @@ rx_i_297: ;ADD_64 dec ebx jz rx_finish xor r15, 0de3b9d9bh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_297 call rx_read_l1 rx_body_297: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r10 mov r14, rax @@ -5331,13 +5331,13 @@ rx_i_298: ;FPSUB dec ebx jz rx_finish xor r14, 084f53637h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_298 call rx_read_l2 rx_body_298: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 movaps xmm6, xmm0 @@ -5345,14 +5345,14 @@ rx_i_299: ;ADD_64 dec ebx jz rx_finish xor r12, 042f4897h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_299 call rx_read_l1 rx_body_299: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r10 mov rcx, rax mov eax, r12d @@ -5364,14 +5364,14 @@ rx_i_300: ;FPSUB dec ebx jz rx_finish xor r12, 095765693h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_300 call rx_read_l1 rx_body_300: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr 
[rsi+rax*8] subpd xmm0, xmm8 movaps xmm2, xmm0 @@ -5379,13 +5379,13 @@ rx_i_301: ;FPMUL dec ebx jz rx_finish xor r8, 0a0ec5eech - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_301 call rx_read_l1 rx_body_301: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm5 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5400,14 +5400,14 @@ rx_i_302: ;ADD_64 dec ebx jz rx_finish xor r15, 0f6f8c345h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_302 call rx_read_l1 rx_body_302: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r10 mov r11, rax @@ -5415,13 +5415,13 @@ rx_i_303: ;FPADD dec ebx jz rx_finish xor r14, 082a3e965h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_303 call rx_read_l2 rx_body_303: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 movaps xmm9, xmm0 mov eax, r9d @@ -5433,13 +5433,13 @@ rx_i_304: ;MUL_64 dec ebx jz rx_finish xor r12, 04940c652h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_304 call rx_read_l1 rx_body_304: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, rax, 2007686513 mov r13, rax @@ -5447,13 +5447,13 @@ rx_i_305: ;MUL_64 dec ebx jz rx_finish xor r11, 03c6c62b8h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_305 call rx_read_l1 rx_body_305: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r15 mov r10, rax @@ -5461,13 +5461,13 @@ rx_i_306: ;ADD_64 dec ebx jz rx_finish xor r15, 08b34cdfch - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_306 call rx_read_l1 rx_body_306: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, 400578979 mov r13, rax @@ -5475,13 
+5475,13 @@ rx_i_307: ;SHL_64 dec ebx jz rx_finish xor r15, 04c36adb1h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_307 call rx_read_l1 rx_body_307: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] shl rax, 33 mov r10, rax @@ -5489,13 +5489,13 @@ rx_i_308: ;MUL_64 dec ebx jz rx_finish xor r11, 0a4213b21h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_308 call rx_read_l1 rx_body_308: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r13 mov r15, rax @@ -5503,13 +5503,13 @@ rx_i_309: ;DIV_64 dec ebx jz rx_finish xor r9, 090c42304h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_309 call rx_read_l2 rx_body_309: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 2642117268 mov rcx, 14993309243657753043 mul rcx @@ -5525,13 +5525,13 @@ rx_i_310: ;FPMUL dec ebx jz rx_finish xor r9, 0f78e1c8ch - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_310 call rx_read_l1 rx_body_310: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5546,13 +5546,13 @@ rx_i_311: ;FPMUL dec ebx jz rx_finish xor r8, 0ff8848cfh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_311 call rx_read_l1 rx_body_311: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5563,13 +5563,13 @@ rx_i_312: ;MUL_32 dec ebx jz rx_finish xor r13, 0b18904cdh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_312 call rx_read_l2 rx_body_312: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r14d imul rax, rcx @@ -5579,13 +5579,13 @@ rx_i_313: ;ROR_64 dec ebx jz rx_finish xor r8, 
0a0d0befh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_313 call rx_read_l1 rx_body_313: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ror rax, 62 mov rcx, rax mov eax, r14d @@ -5597,15 +5597,15 @@ rx_i_314: ;IMUL_32 dec ebx jz rx_finish xor r15, 01e3c65f7h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_314 call rx_read_l2 rx_body_314: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax - movsxd rax, r9d + mov rax, 2143811925 imul rax, rcx mov rcx, rax mov eax, r9d @@ -5617,13 +5617,13 @@ rx_i_315: ;XOR_64 dec ebx jz rx_finish xor r9, 02e36ddafh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_315 call rx_read_l2 rx_body_315: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] xor rax, r15 mov r9, rax @@ -5631,14 +5631,14 @@ rx_i_316: ;RET dec ebx jz rx_finish xor r14, 05b0cb5bbh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_316 call rx_read_l1 rx_body_316: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r8d xor eax, 03602c513h @@ -5652,13 +5652,13 @@ rx_i_317: ;FPADD dec ebx jz rx_finish xor r9, 0c74e7415h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_317 call rx_read_l1 rx_body_317: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm7 movaps xmm5, xmm0 @@ -5666,13 +5666,13 @@ rx_i_318: ;ROR_64 dec ebx jz rx_finish xor r9, 057621d9ah - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_318 call rx_read_l2 rx_body_318: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, r11 ror rax, cl mov rcx, rax @@ -5685,13 +5685,13 @@ rx_i_319: ;SHR_64 dec ebx jz rx_finish xor r13, 08ee02d99h - mov ecx, r13d + 
mov eax, r13d test bl, 63 jnz short rx_body_319 call rx_read_l1 rx_body_319: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] shr rax, 46 mov rcx, rax mov eax, r11d @@ -5703,13 +5703,13 @@ rx_i_320: ;FPADD dec ebx jz rx_finish xor r15, 013461188h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_320 call rx_read_l1 rx_body_320: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm4 movaps xmm2, xmm0 mov eax, r10d @@ -5721,13 +5721,13 @@ rx_i_321: ;IMUL_32 dec ebx jz rx_finish xor r11, 0a7bae383h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_321 call rx_read_l2 rx_body_321: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax movsxd rax, r9d imul rax, rcx @@ -5741,13 +5741,13 @@ rx_i_322: ;CALL dec ebx jz rx_finish xor r14, 08215399bh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_322 call rx_read_l1 rx_body_322: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r11d xor eax, 054292224h @@ -5761,14 +5761,14 @@ rx_i_323: ;MULH_64 dec ebx jz rx_finish xor r14, 07b07664bh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_323 call rx_read_l2 rx_body_323: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, r14 mul rcx mov rax, rdx @@ -5782,13 +5782,13 @@ rx_i_324: ;FPDIV dec ebx jz rx_finish xor r9, 0f956baffh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_324 call rx_read_l1 rx_body_324: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -5803,13 +5803,13 @@ rx_i_325: ;OR_32 dec ebx jz rx_finish xor r11, 0708ab9d1h - mov ecx, r11d + mov eax, r11d test bl, 
63 jnz short rx_body_325 call rx_read_l1 rx_body_325: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] or eax, r8d mov r13, rax @@ -5817,14 +5817,14 @@ rx_i_326: ;MULH_64 dec ebx jz rx_finish xor r11, 0d1b27540h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_326 call rx_read_l1 rx_body_326: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, -1233771581 mul rcx mov rax, rdx @@ -5838,14 +5838,14 @@ rx_i_327: ;IDIV_64 dec ebx jz rx_finish xor r9, 09665f98dh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_327 call rx_read_l1 rx_body_327: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 1572662125 mov rcx, rax mov rdx, -5852150286715358951 @@ -5862,13 +5862,13 @@ rx_i_328: ;SHR_64 dec ebx jz rx_finish xor r12, 0fb9c32adh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_328 call rx_read_l1 rx_body_328: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] shr rax, 18 mov r9, rax @@ -5876,13 +5876,13 @@ rx_i_329: ;RET dec ebx jz rx_finish xor r11, 0e1110623h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_329 call rx_read_l2 rx_body_329: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov r11, rax cmp rsp, rdi je short rx_i_330 @@ -5892,16 +5892,16 @@ rx_i_330: ;IMUL_32 dec ebx jz rx_finish xor r9, 0f6a93f19h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_330 call rx_read_l2 rx_body_330: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax - movsxd rax, r13d + mov rax, -1349816041 imul rax, rcx mov rcx, rax mov eax, r11d @@ -5913,14 +5913,14 @@ rx_i_331: ;FPADD dec ebx jz rx_finish xor r9, 
0bc9bbe4ah - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_331 call rx_read_l2 rx_body_331: - xor rbp, rcx - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 movaps xmm9, xmm0 @@ -5928,14 +5928,14 @@ rx_i_332: ;FPADD dec ebx jz rx_finish xor r12, 0f253cd4eh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_332 call rx_read_l1 rx_body_332: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 movaps xmm3, xmm0 mov eax, r11d @@ -5947,14 +5947,14 @@ rx_i_333: ;OR_64 dec ebx jz rx_finish xor r14, 0f009758bh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_333 call rx_read_l1 rx_body_333: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] or rax, r12 mov r11, rax @@ -5962,14 +5962,14 @@ rx_i_334: ;ADD_64 dec ebx jz rx_finish xor r8, 0dda04168h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_334 call rx_read_l2 rx_body_334: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] add rax, r13 mov r8, rax @@ -5977,13 +5977,13 @@ rx_i_335: ;SUB_64 dec ebx jz rx_finish xor r15, 03e6cfb73h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_335 call rx_read_l2 rx_body_335: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] sub rax, r8 mov rcx, rax mov eax, r12d @@ -5995,14 +5995,14 @@ rx_i_336: ;ROR_64 dec ebx jz rx_finish xor r15, 0aea0a435h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_336 call rx_read_l1 rx_body_336: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ror rax, 42 mov rcx, rax mov eax, r11d @@ -6014,13 +6014,13 @@ 
rx_i_337: ;ADD_64 dec ebx jz rx_finish xor r8, 03d6c4ab2h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_337 call rx_read_l1 rx_body_337: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r12 mov rcx, rax mov eax, r13d @@ -6032,14 +6032,14 @@ rx_i_338: ;MUL_64 dec ebx jz rx_finish xor r12, 0d428a742h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_338 call rx_read_l1 rx_body_338: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r12 mov r11, rax @@ -6047,13 +6047,13 @@ rx_i_339: ;FPADD dec ebx jz rx_finish xor r9, 04596ef73h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_339 call rx_read_l1 rx_body_339: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 movaps xmm2, xmm0 @@ -6061,13 +6061,13 @@ rx_i_340: ;FPADD dec ebx jz rx_finish xor r15, 0e51629cch - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_340 call rx_read_l2 rx_body_340: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm5 movaps xmm5, xmm0 @@ -6075,13 +6075,13 @@ rx_i_341: ;MUL_32 dec ebx jz rx_finish xor r12, 019eb9ea5h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_341 call rx_read_l1 rx_body_341: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r15d imul rax, rcx @@ -6095,13 +6095,13 @@ rx_i_342: ;FPSUB dec ebx jz rx_finish xor r9, 09ccc7abah - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_342 call rx_read_l1 rx_body_342: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 movaps xmm3, xmm0 @@ -6109,14 +6109,14 @@ rx_i_343: ;XOR_64 dec ebx jz rx_finish xor r14, 056f6cf0bh - mov ecx, r14d 
+ mov eax, r14d test bl, 63 jnz short rx_body_343 call rx_read_l1 rx_body_343: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] xor rax, r13 mov rcx, rax mov eax, r15d @@ -6128,13 +6128,13 @@ rx_i_344: ;FPSUB dec ebx jz rx_finish xor r10, 03ef9bcc4h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_344 call rx_read_l1 rx_body_344: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm6 movaps xmm5, xmm0 @@ -6142,13 +6142,13 @@ rx_i_345: ;MULH_64 dec ebx jz rx_finish xor r12, 0bbbcdbach - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_345 call rx_read_l2 rx_body_345: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, r13 mul rcx mov rax, rdx @@ -6162,13 +6162,13 @@ rx_i_346: ;AND_32 dec ebx jz rx_finish xor r12, 0ae9d1e96h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_346 call rx_read_l2 rx_body_346: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] and eax, r15d mov rcx, rax mov eax, r13d @@ -6180,13 +6180,13 @@ rx_i_347: ;ADD_64 dec ebx jz rx_finish xor r14, 070c34d69h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_347 call rx_read_l1 rx_body_347: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r10 mov r13, rax @@ -6194,13 +6194,13 @@ rx_i_348: ;FPSUB dec ebx jz rx_finish xor r13, 0523ff904h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_348 call rx_read_l1 rx_body_348: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm3 movaps xmm9, xmm0 mov eax, r9d @@ -6212,13 +6212,13 @@ rx_i_349: ;OR_64 dec ebx jz rx_finish xor r8, 018e0e5ddh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_349 call rx_read_l1 
rx_body_349: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] or rax, r15 mov r13, rax @@ -6226,13 +6226,13 @@ rx_i_350: ;CALL dec ebx jz rx_finish xor r9, 09bd050f0h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_350 call rx_read_l2 rx_body_350: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r12d xor eax, 0c5901b43h @@ -6246,13 +6246,13 @@ rx_i_351: ;MUL_64 dec ebx jz rx_finish xor r11, 0a3a5906fh - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_351 call rx_read_l1 rx_body_351: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r10 mov r13, rax @@ -6260,14 +6260,14 @@ rx_i_352: ;FPADD dec ebx jz rx_finish xor r10, 0afc9af2bh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_352 call rx_read_l1 rx_body_352: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 movaps xmm2, xmm0 mov eax, r10d @@ -6279,13 +6279,13 @@ rx_i_353: ;FPSUB dec ebx jz rx_finish xor r13, 02e65278bh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_353 call rx_read_l1 rx_body_353: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 movaps xmm7, xmm0 mov eax, r15d @@ -6297,13 +6297,13 @@ rx_i_354: ;MUL_32 dec ebx jz rx_finish xor r13, 02412fc10h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_354 call rx_read_l1 rx_body_354: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r13d imul rax, rcx @@ -6313,13 +6313,13 @@ rx_i_355: ;MUL_64 dec ebx jz rx_finish xor r10, 06bd6e65fh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_355 call rx_read_l2 rx_body_355: - and ecx, 32767 - mov rax, qword ptr 
[rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] imul rax, r14 mov rcx, rax mov eax, r8d @@ -6331,14 +6331,14 @@ rx_i_356: ;MUL_64 dec ebx jz rx_finish xor r10, 01cd85d80h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_356 call rx_read_l1 rx_body_356: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r10 mov r11, rax @@ -6346,13 +6346,13 @@ rx_i_357: ;ADD_64 dec ebx jz rx_finish xor r10, 0f7daed36h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_357 call rx_read_l1 rx_body_357: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r11 mov r11, rax @@ -6360,13 +6360,13 @@ rx_i_358: ;DIV_64 dec ebx jz rx_finish xor r13, 088fa6e5ah - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_358 call rx_read_l1 rx_body_358: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 3667831238 shr rax, 1 mov rcx, 2700102505175032865 @@ -6379,14 +6379,14 @@ rx_i_359: ;FPSUB dec ebx jz rx_finish xor r10, 0714fc2cdh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_359 call rx_read_l1 rx_body_359: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm9 movaps xmm4, xmm0 mov eax, r12d @@ -6398,13 +6398,13 @@ rx_i_360: ;FPMUL dec ebx jz rx_finish xor r10, 0c2d110b5h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_360 call rx_read_l1 rx_body_360: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6415,13 +6415,13 @@ rx_i_361: ;FPDIV dec ebx jz rx_finish xor r15, 01d125a7fh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_361 call rx_read_l1 rx_body_361: - and ecx, 2047 - cvtdq2pd xmm0, 
qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6436,13 +6436,13 @@ rx_i_362: ;SUB_64 dec ebx jz rx_finish xor r9, 0ed8954bdh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_362 call rx_read_l1 rx_body_362: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, r9 mov rcx, rax mov eax, r15d @@ -6454,13 +6454,13 @@ rx_i_363: ;FPMUL dec ebx jz rx_finish xor r12, 09f75887bh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_363 call rx_read_l2 rx_body_363: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm6 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6471,13 +6471,13 @@ rx_i_364: ;MUL_32 dec ebx jz rx_finish xor r11, 0badaf867h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_364 call rx_read_l1 rx_body_364: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r8d imul rax, rcx @@ -6487,13 +6487,13 @@ rx_i_365: ;IMUL_32 dec ebx jz rx_finish xor r15, 02db4444ah - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_365 call rx_read_l1 rx_body_365: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax movsxd rax, r9d imul rax, rcx @@ -6507,13 +6507,13 @@ rx_i_366: ;IMUL_32 dec ebx jz rx_finish xor r12, 0bff7218fh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_366 call rx_read_l1 rx_body_366: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax movsxd rax, r8d imul rax, rcx @@ -6527,14 +6527,14 @@ rx_i_367: ;ROR_64 dec ebx jz rx_finish xor r9, 04d14cb3ah - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_367 call rx_read_l1 rx_body_367: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, 
rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ror rax, 18 mov r12, rax @@ -6542,13 +6542,13 @@ rx_i_368: ;SUB_32 dec ebx jz rx_finish xor r10, 0a14836bah - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_368 call rx_read_l2 rx_body_368: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] sub eax, r10d mov r8, rax @@ -6556,13 +6556,13 @@ rx_i_369: ;IDIV_64 dec ebx jz rx_finish xor r9, 053fe22e2h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_369 call rx_read_l2 rx_body_369: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 470792991 mov rdx, 1314739240972876203 imul rdx @@ -6577,13 +6577,13 @@ rx_i_370: ;FPSUB dec ebx jz rx_finish xor r15, 010e1fb24h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_370 call rx_read_l2 rx_body_370: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm6 movaps xmm6, xmm0 mov eax, r14d @@ -6595,14 +6595,14 @@ rx_i_371: ;FPADD dec ebx jz rx_finish xor r8, 0ebbd5cc9h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_371 call rx_read_l1 rx_body_371: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 movaps xmm5, xmm0 mov eax, r13d @@ -6614,13 +6614,13 @@ rx_i_372: ;SHL_64 dec ebx jz rx_finish xor r10, 098ab79d7h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_372 call rx_read_l1 rx_body_372: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r13 shl rax, cl mov r9, rax @@ -6629,13 +6629,13 @@ rx_i_373: ;FPMUL dec ebx jz rx_finish xor r15, 056438b3h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_373 call rx_read_l1 rx_body_373: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd 
xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm8 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6646,13 +6646,13 @@ rx_i_374: ;FPMUL dec ebx jz rx_finish xor r11, 0dbcce604h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_374 call rx_read_l1 rx_body_374: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -6663,13 +6663,13 @@ rx_i_375: ;ADD_64 dec ebx jz rx_finish xor r9, 0edea6200h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_375 call rx_read_l2 rx_body_375: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] add rax, -332030999 mov rcx, rax mov eax, r12d @@ -6681,13 +6681,13 @@ rx_i_376: ;ADD_64 dec ebx jz rx_finish xor r14, 05e61b279h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_376 call rx_read_l1 rx_body_376: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r9 mov rcx, rax mov eax, r8d @@ -6699,13 +6699,13 @@ rx_i_377: ;FPSUB dec ebx jz rx_finish xor r14, 0fc1fb433h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_377 call rx_read_l1 rx_body_377: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm3 movaps xmm7, xmm0 @@ -6713,13 +6713,13 @@ rx_i_378: ;MUL_32 dec ebx jz rx_finish xor r12, 082aa21ach - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_378 call rx_read_l2 rx_body_378: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r14d imul rax, rcx @@ -6729,13 +6729,13 @@ rx_i_379: ;ROR_64 dec ebx jz rx_finish xor r10, 05dba41fbh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_379 call rx_read_l1 rx_body_379: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r9 ror rax, cl 
mov r13, rax @@ -6744,13 +6744,13 @@ rx_i_380: ;MUL_64 dec ebx jz rx_finish xor r11, 0229e3d6eh - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_380 call rx_read_l2 rx_body_380: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] imul rax, r10 mov rcx, rax mov eax, r13d @@ -6762,14 +6762,14 @@ rx_i_381: ;XOR_32 dec ebx jz rx_finish xor r8, 019816ff9h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_381 call rx_read_l1 rx_body_381: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] xor eax, r14d mov r9, rax @@ -6777,13 +6777,13 @@ rx_i_382: ;ROL_64 dec ebx jz rx_finish xor r14, 036b5b81fh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_382 call rx_read_l2 rx_body_382: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] rol rax, 55 mov r11, rax @@ -6791,13 +6791,13 @@ rx_i_383: ;FPSUB dec ebx jz rx_finish xor r15, 05f798ec3h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_383 call rx_read_l1 rx_body_383: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm4 movaps xmm5, xmm0 mov eax, r13d @@ -6809,13 +6809,13 @@ rx_i_384: ;XOR_64 dec ebx jz rx_finish xor r10, 05b459fd7h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_384 call rx_read_l1 rx_body_384: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] xor rax, 1413715044 mov rcx, rax mov eax, r9d @@ -6827,13 +6827,13 @@ rx_i_385: ;MUL_64 dec ebx jz rx_finish xor r15, 0c91749bbh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_385 call rx_read_l1 rx_body_385: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r12 mov rcx, rax mov eax, r13d @@ -6845,13 +6845,13 @@ rx_i_386: ;FPADD dec ebx jz 
rx_finish xor r9, 0575b4bdch - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_386 call rx_read_l1 rx_body_386: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm8 movaps xmm9, xmm0 @@ -6859,14 +6859,14 @@ rx_i_387: ;SUB_32 dec ebx jz rx_finish xor r9, 0d4f7bc6ah - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_387 call rx_read_l1 rx_body_387: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub eax, r15d mov r9, rax @@ -6874,13 +6874,13 @@ rx_i_388: ;RET dec ebx jz rx_finish xor r8, 08a949356h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_388 call rx_read_l1 rx_body_388: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r11d xor eax, 0a0985cc2h @@ -6894,13 +6894,13 @@ rx_i_389: ;JUMP dec ebx jz rx_finish xor r11, 06531ad2eh - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_389 call rx_read_l2 rx_body_389: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov r14, rax cmp r9d, -350609584 jge rx_i_421 @@ -6909,13 +6909,13 @@ rx_i_390: ;FPADD dec ebx jz rx_finish xor r15, 02914abeah - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_390 call rx_read_l1 rx_body_390: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm4 movaps xmm3, xmm0 @@ -6923,13 +6923,13 @@ rx_i_391: ;FPADD dec ebx jz rx_finish xor r8, 0473a41f0h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_391 call rx_read_l1 rx_body_391: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 movaps xmm6, xmm0 @@ -6937,13 +6937,13 @@ rx_i_392: ;SAR_64 dec ebx jz rx_finish xor r14, 01ebc1f0dh - mov ecx, r14d + mov eax, r14d test bl, 
63 jnz short rx_body_392 call rx_read_l1 rx_body_392: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r9 sar rax, cl mov rcx, rax @@ -6956,13 +6956,13 @@ rx_i_393: ;AND_64 dec ebx jz rx_finish xor r14, 0742e95b1h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_393 call rx_read_l1 rx_body_393: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] and rax, r12 mov rcx, rax mov eax, r13d @@ -6974,13 +6974,13 @@ rx_i_394: ;FPADD dec ebx jz rx_finish xor r12, 0db885c2ch - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_394 call rx_read_l1 rx_body_394: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 movaps xmm6, xmm0 @@ -6988,14 +6988,14 @@ rx_i_395: ;DIV_64 dec ebx jz rx_finish xor r8, 04ae4fe8ch - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_395 call rx_read_l1 rx_body_395: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 939698704 mov rcx, 5269518980991934091 mul rcx @@ -7007,13 +7007,13 @@ rx_i_396: ;ROR_64 dec ebx jz rx_finish xor r10, 07b41862bh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_396 call rx_read_l2 rx_body_396: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] ror rax, 62 mov rcx, rax mov eax, r12d @@ -7025,14 +7025,14 @@ rx_i_397: ;SUB_32 dec ebx jz rx_finish xor r8, 0916f3819h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_397 call rx_read_l2 rx_body_397: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] sub eax, r12d mov rcx, rax mov eax, r10d @@ -7044,13 +7044,13 @@ rx_i_398: ;SHR_64 dec ebx jz rx_finish xor r8, 04eb6fd2ah - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short 
rx_body_398 call rx_read_l2 rx_body_398: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, r8 shr rax, cl mov rcx, rax @@ -7063,14 +7063,14 @@ rx_i_399: ;FPMUL dec ebx jz rx_finish xor r11, 0899a98cfh - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_399 call rx_read_l1 rx_body_399: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -7081,13 +7081,13 @@ rx_i_400: ;AND_64 dec ebx jz rx_finish xor r13, 0aae75db6h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_400 call rx_read_l2 rx_body_400: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] and rax, -1800645748 mov rcx, rax mov eax, r14d @@ -7099,13 +7099,13 @@ rx_i_401: ;FPSUB dec ebx jz rx_finish xor r13, 032e81f25h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_401 call rx_read_l2 rx_body_401: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm4 movaps xmm6, xmm0 mov eax, r14d @@ -7117,13 +7117,13 @@ rx_i_402: ;RET dec ebx jz rx_finish xor r9, 0fa1a07ffh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_402 call rx_read_l1 rx_body_402: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r14, rax cmp rsp, rdi je short rx_i_403 @@ -7133,13 +7133,13 @@ rx_i_403: ;DIV_64 dec ebx jz rx_finish xor r9, 0e59500f7h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_403 call rx_read_l1 rx_body_403: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 536056992 mov rcx, 4618688153536407095 mul rcx @@ -7155,13 +7155,13 @@ rx_i_404: ;MUL_32 dec ebx jz rx_finish xor r15, 05b8ceb2fh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz 
short rx_body_404 call rx_read_l2 rx_body_404: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r8d imul rax, rcx @@ -7171,13 +7171,13 @@ rx_i_405: ;CALL dec ebx jz rx_finish xor r8, 0f61082a3h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_405 call rx_read_l2 rx_body_405: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r12d xor eax, 06b0af6c1h @@ -7191,13 +7191,13 @@ rx_i_406: ;FPDIV dec ebx jz rx_finish xor r9, 0af6886b7h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_406 call rx_read_l1 rx_body_406: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm7 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -7212,14 +7212,14 @@ rx_i_407: ;FPSUB dec ebx jz rx_finish xor r14, 09699566fh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_407 call rx_read_l1 rx_body_407: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm9 movaps xmm8, xmm0 @@ -7227,13 +7227,13 @@ rx_i_408: ;MUL_64 dec ebx jz rx_finish xor r15, 066e79fa6h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_408 call rx_read_l2 rx_body_408: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] imul rax, rax, 693109961 mov rcx, rax mov eax, r10d @@ -7245,14 +7245,14 @@ rx_i_409: ;MUL_64 dec ebx jz rx_finish xor r11, 04b6caa9ah - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_409 call rx_read_l1 rx_body_409: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r15 mov r8, rax @@ -7260,13 +7260,13 @@ rx_i_410: ;RET dec ebx jz rx_finish xor r15, 0d17f245eh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short 
rx_body_410 call rx_read_l2 rx_body_410: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov r8, rax cmp rsp, rdi je short rx_i_411 @@ -7276,13 +7276,13 @@ rx_i_411: ;RET dec ebx jz rx_finish xor r12, 0364f10e7h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_411 call rx_read_l1 rx_body_411: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r12, rax cmp rsp, rdi je short rx_i_412 @@ -7292,13 +7292,13 @@ rx_i_412: ;FPDIV dec ebx jz rx_finish xor r10, 0ac90e7ah - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_412 call rx_read_l1 rx_body_412: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm4 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -7313,13 +7313,13 @@ rx_i_413: ;FPMUL dec ebx jz rx_finish xor r11, 04b6037abh - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_413 call rx_read_l1 rx_body_413: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -7330,14 +7330,14 @@ rx_i_414: ;AND_64 dec ebx jz rx_finish xor r14, 06c01554dh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_414 call rx_read_l2 rx_body_414: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] and rax, -378293327 mov rcx, rax mov eax, r10d @@ -7349,14 +7349,14 @@ rx_i_415: ;DIV_64 dec ebx jz rx_finish xor r8, 08c3e59a1h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_415 call rx_read_l2 rx_body_415: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 3756873911 add rax, 1 sbb rax, 0 @@ -7370,14 +7370,14 @@ rx_i_416: ;FPADD dec ebx jz rx_finish xor r12, 0f3fafde9h - mov ecx, r12d + mov 
eax, r12d test bl, 63 jnz short rx_body_416 call rx_read_l1 rx_body_416: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 movaps xmm5, xmm0 mov eax, r13d @@ -7389,13 +7389,13 @@ rx_i_417: ;SUB_64 dec ebx jz rx_finish xor r10, 03c6481fah - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_417 call rx_read_l1 rx_body_417: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, r12 mov r10, rax @@ -7403,13 +7403,13 @@ rx_i_418: ;MULH_64 dec ebx jz rx_finish xor r10, 02bd61c5fh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_418 call rx_read_l2 rx_body_418: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, r11 mul rcx mov rax, rdx @@ -7419,13 +7419,13 @@ rx_i_419: ;OR_64 dec ebx jz rx_finish xor r9, 0b6ab9d32h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_419 call rx_read_l2 rx_body_419: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] or rax, r14 mov rcx, rax mov eax, r14d @@ -7437,13 +7437,13 @@ rx_i_420: ;ROR_64 dec ebx jz rx_finish xor r9, 0f9690ceah - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_420 call rx_read_l1 rx_body_420: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ror rax, 38 mov r9, rax @@ -7451,14 +7451,14 @@ rx_i_421: ;CALL dec ebx jz rx_finish xor r12, 01ada0f39h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_421 call rx_read_l1 rx_body_421: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r10, rax cmp r8d, -1600409762 jo short rx_i_422 @@ -7468,13 +7468,13 @@ rx_i_422: ;IMUL_32 dec ebx jz rx_finish xor r11, 04dd16ca4h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_422 call 
rx_read_l1 rx_body_422: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax movsxd rax, r10d imul rax, rcx @@ -7484,13 +7484,13 @@ rx_i_423: ;MUL_64 dec ebx jz rx_finish xor r12, 04df5ce05h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_423 call rx_read_l1 rx_body_423: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r10 mov rcx, rax mov eax, r15d @@ -7502,14 +7502,14 @@ rx_i_424: ;FPADD dec ebx jz rx_finish xor r13, 01ad12ce2h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_424 call rx_read_l1 rx_body_424: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm7 movaps xmm9, xmm0 mov eax, r9d @@ -7521,15 +7521,15 @@ rx_i_425: ;IMUL_32 dec ebx jz rx_finish xor r8, 0a3c5391dh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_425 call rx_read_l1 rx_body_425: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax - movsxd rax, r10d + mov rax, 1776029069 imul rax, rcx mov r14, rax @@ -7537,13 +7537,13 @@ rx_i_426: ;IDIV_64 dec ebx jz rx_finish xor r12, 09dd55ba0h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_426 call rx_read_l1 rx_body_426: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by -590728721 mov rdx, -4191230239118101979 imul rdx @@ -7562,14 +7562,14 @@ rx_i_427: ;MUL_32 dec ebx jz rx_finish xor r11, 0d6cae9aeh - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_427 call rx_read_l1 rx_body_427: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, -2146332428 imul rax, rcx @@ -7583,13 +7583,13 @@ rx_i_428: ;RET dec ebx jz rx_finish xor r11, 0f807a961h - mov ecx, 
r11d + mov eax, r11d test bl, 63 jnz short rx_body_428 call rx_read_l1 rx_body_428: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r8d xor eax, 0e3b86b2fh @@ -7603,13 +7603,13 @@ rx_i_429: ;MUL_64 dec ebx jz rx_finish xor r12, 0650a4102h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_429 call rx_read_l1 rx_body_429: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r9 mov r15, rax @@ -7617,14 +7617,14 @@ rx_i_430: ;FPADD dec ebx jz rx_finish xor r14, 019cc0e5h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_430 call rx_read_l1 rx_body_430: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm8 movaps xmm5, xmm0 mov eax, r13d @@ -7636,13 +7636,13 @@ rx_i_431: ;ROR_64 dec ebx jz rx_finish xor r12, 0ed17ab58h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_431 call rx_read_l1 rx_body_431: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r13 ror rax, cl mov r13, rax @@ -7651,13 +7651,13 @@ rx_i_432: ;SUB_64 dec ebx jz rx_finish xor r10, 01c3b321fh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_432 call rx_read_l1 rx_body_432: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, 876274173 mov r8, rax @@ -7665,13 +7665,13 @@ rx_i_433: ;ADD_32 dec ebx jz rx_finish xor r13, 0bbb88499h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_433 call rx_read_l1 rx_body_433: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add eax, 1193456495 mov rcx, rax mov eax, r12d @@ -7683,13 +7683,13 @@ rx_i_434: ;FPDIV dec ebx jz rx_finish xor r13, 0167edabdh - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_434 call 
rx_read_l1 rx_body_434: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -7704,14 +7704,14 @@ rx_i_435: ;MUL_64 dec ebx jz rx_finish xor r15, 0b940480ah - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_435 call rx_read_l1 rx_body_435: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, rax, 1971717631 mov rcx, rax mov eax, r9d @@ -7723,13 +7723,13 @@ rx_i_436: ;FPADD dec ebx jz rx_finish xor r15, 0bfc3ca8bh - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_436 call rx_read_l1 rx_body_436: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm2 movaps xmm7, xmm0 mov eax, r15d @@ -7741,14 +7741,14 @@ rx_i_437: ;FPMUL dec ebx jz rx_finish xor r8, 098a6bcf7h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_437 call rx_read_l1 rx_body_437: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -7759,13 +7759,13 @@ rx_i_438: ;FPMUL dec ebx jz rx_finish xor r10, 0325b38ebh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_438 call rx_read_l1 rx_body_438: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -7776,13 +7776,13 @@ rx_i_439: ;OR_64 dec ebx jz rx_finish xor r13, 05e807e81h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_439 call rx_read_l1 rx_body_439: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] or rax, -1299288575 mov rcx, rax mov eax, r10d @@ -7794,14 +7794,14 @@ rx_i_440: ;CALL dec ebx jz rx_finish xor r10, 062f83728h - mov 
ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_440 call rx_read_l1 rx_body_440: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r9, rax cmp r12d, 2127765370 jns short rx_i_441 @@ -7811,14 +7811,14 @@ rx_i_441: ;ADD_64 dec ebx jz rx_finish xor r14, 0d18ec075h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_441 call rx_read_l1 rx_body_441: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r14 mov rcx, rax mov eax, r9d @@ -7830,13 +7830,13 @@ rx_i_442: ;FPSQRT dec ebx jz rx_finish xor r14, 0a53dd1bh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_442 call rx_read_l1 rx_body_442: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm3, xmm0 @@ -7844,13 +7844,13 @@ rx_i_443: ;RET dec ebx jz rx_finish xor r14, 0232d1285h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_443 call rx_read_l1 rx_body_443: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r9d xor eax, 04f71c419h @@ -7864,13 +7864,13 @@ rx_i_444: ;FPSUB dec ebx jz rx_finish xor r8, 042455dd8h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_444 call rx_read_l1 rx_body_444: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 movaps xmm5, xmm0 mov eax, r13d @@ -7882,13 +7882,13 @@ rx_i_445: ;ADD_64 dec ebx jz rx_finish xor r13, 09ae009b2h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_445 call rx_read_l1 rx_body_445: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r11 mov rcx, rax mov eax, r9d @@ -7900,14 +7900,14 @@ rx_i_446: ;MUL_32 dec ebx jz rx_finish xor r12, 01734708eh - mov ecx, 
r12d + mov eax, r12d test bl, 63 jnz short rx_body_446 call rx_read_l1 rx_body_446: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r15d imul rax, rcx @@ -7921,13 +7921,13 @@ rx_i_447: ;FPADD dec ebx jz rx_finish xor r8, 01596d0e8h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_447 call rx_read_l1 rx_body_447: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm7 movaps xmm5, xmm0 mov eax, r13d @@ -7939,14 +7939,14 @@ rx_i_448: ;FPSUB dec ebx jz rx_finish xor r9, 0390cfdb0h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_448 call rx_read_l1 rx_body_448: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm3 movaps xmm9, xmm0 @@ -7954,13 +7954,13 @@ rx_i_449: ;ROL_64 dec ebx jz rx_finish xor r8, 04f27744bh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_449 call rx_read_l1 rx_body_449: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] rol rax, 28 mov r8, rax @@ -7968,13 +7968,13 @@ rx_i_450: ;SAR_64 dec ebx jz rx_finish xor r8, 04e2c76ffh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_450 call rx_read_l1 rx_body_450: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r12 sar rax, cl mov rcx, rax @@ -7987,14 +7987,14 @@ rx_i_451: ;ADD_64 dec ebx jz rx_finish xor r8, 0c4d99ac9h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_451 call rx_read_l1 rx_body_451: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r10 mov r8, rax @@ -8002,13 +8002,13 @@ rx_i_452: ;RET dec ebx jz rx_finish xor r13, 040130b88h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz 
short rx_body_452 call rx_read_l2 rx_body_452: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r11d xor eax, 0e27dea25h @@ -8022,13 +8022,13 @@ rx_i_453: ;DIV_64 dec ebx jz rx_finish xor r11, 0a2096aa4h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_453 call rx_read_l2 rx_body_453: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 380157076 shr rax, 2 mov rcx, 3256390890604862173 @@ -8041,13 +8041,13 @@ rx_i_454: ;FPADD dec ebx jz rx_finish xor r13, 081314291h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_454 call rx_read_l2 rx_body_454: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 movaps xmm4, xmm0 mov eax, r12d @@ -8059,14 +8059,14 @@ rx_i_455: ;OR_64 dec ebx jz rx_finish xor r8, 059263cdbh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_455 call rx_read_l1 rx_body_455: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] or rax, r9 mov r8, rax @@ -8074,13 +8074,13 @@ rx_i_456: ;AND_64 dec ebx jz rx_finish xor r9, 010e8fe6h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_456 call rx_read_l1 rx_body_456: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] and rax, 401943615 mov rcx, rax mov eax, r9d @@ -8092,14 +8092,14 @@ rx_i_457: ;SUB_64 dec ebx jz rx_finish xor r9, 09de1a3efh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_457 call rx_read_l1 rx_body_457: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, 1482178870 mov rcx, rax mov eax, r10d @@ -8111,13 +8111,13 @@ rx_i_458: ;SAR_64 dec ebx jz rx_finish xor r11, 05c79df6eh - mov ecx, r11d + mov eax, r11d test bl, 63 
jnz short rx_body_458 call rx_read_l2 rx_body_458: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, r8 sar rax, cl mov r14, rax @@ -8126,13 +8126,13 @@ rx_i_459: ;MUL_64 dec ebx jz rx_finish xor r9, 0346f46adh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_459 call rx_read_l1 rx_body_459: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r9 mov rcx, rax mov eax, r13d @@ -8144,13 +8144,13 @@ rx_i_460: ;ADD_32 dec ebx jz rx_finish xor r11, 098ab71fch - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_460 call rx_read_l1 rx_body_460: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add eax, -347784553 mov rcx, rax mov eax, r12d @@ -8162,13 +8162,13 @@ rx_i_461: ;XOR_64 dec ebx jz rx_finish xor r11, 0c814e926h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_461 call rx_read_l1 rx_body_461: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] xor rax, 1659853721 mov rcx, rax mov eax, r12d @@ -8180,13 +8180,13 @@ rx_i_462: ;ADD_64 dec ebx jz rx_finish xor r10, 0c64b4a9eh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_462 call rx_read_l1 rx_body_462: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r8 mov r15, rax @@ -8194,13 +8194,13 @@ rx_i_463: ;ADD_32 dec ebx jz rx_finish xor r9, 08c29341h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_463 call rx_read_l1 rx_body_463: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add eax, r15d mov r10, rax @@ -8208,13 +8208,13 @@ rx_i_464: ;MUL_64 dec ebx jz rx_finish xor r12, 06ff587fdh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_464 call rx_read_l1 rx_body_464: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov 
rax, qword ptr [rsi+rax*8] imul rax, r15 mov rcx, rax mov eax, r13d @@ -8226,13 +8226,13 @@ rx_i_465: ;FPADD dec ebx jz rx_finish xor r12, 0b62c0003h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_465 call rx_read_l1 rx_body_465: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm5 movaps xmm2, xmm0 @@ -8240,16 +8240,16 @@ rx_i_466: ;IMUL_32 dec ebx jz rx_finish xor r13, 05c541c42h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_466 call rx_read_l1 rx_body_466: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax - mov rax, 282682508 + movsxd rax, r13d imul rax, rcx mov r9, rax @@ -8257,14 +8257,14 @@ rx_i_467: ;FPADD dec ebx jz rx_finish xor r8, 0cbb33f81h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_467 call rx_read_l1 rx_body_467: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 movaps xmm8, xmm0 @@ -8272,14 +8272,14 @@ rx_i_468: ;DIV_64 dec ebx jz rx_finish xor r8, 091044dc3h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_468 call rx_read_l1 rx_body_468: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 4281572471 add rax, 1 sbb rax, 0 @@ -8297,15 +8297,15 @@ rx_i_469: ;IMUL_32 dec ebx jz rx_finish xor r9, 0c0186beh - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_469 call rx_read_l2 rx_body_469: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax - mov rax, 294019485 + movsxd rax, r9d imul rax, rcx mov rcx, rax mov eax, r9d @@ -8317,13 +8317,13 @@ rx_i_470: ;OR_64 dec ebx jz rx_finish xor r14, 090849e3eh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_470 
call rx_read_l2 rx_body_470: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] or rax, r11 mov rcx, rax mov eax, r14d @@ -8335,15 +8335,15 @@ rx_i_471: ;IMUL_32 dec ebx jz rx_finish xor r14, 0cedba9b6h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_471 call rx_read_l2 rx_body_471: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax - movsxd rax, r13d + mov rax, 1914863189 imul rax, rcx mov r14, rax @@ -8351,14 +8351,14 @@ rx_i_472: ;JUMP dec ebx jz rx_finish xor r9, 038f4b9d6h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_472 call rx_read_l1 rx_body_472: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r10, rax cmp r10d, 1738497427 jl rx_i_8 @@ -8367,13 +8367,13 @@ rx_i_473: ;MUL_64 dec ebx jz rx_finish xor r14, 01fb7637dh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_473 call rx_read_l1 rx_body_473: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r11 mov r12, rax @@ -8381,14 +8381,14 @@ rx_i_474: ;JUMP dec ebx jz rx_finish xor r9, 0b5c0b4d4h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_474 call rx_read_l1 rx_body_474: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r15, rax cmp r15d, -233120543 jo rx_i_69 @@ -8397,13 +8397,13 @@ rx_i_475: ;FPSUB dec ebx jz rx_finish xor r10, 0910dcdeeh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_475 call rx_read_l1 rx_body_475: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm9 movaps xmm7, xmm0 @@ -8411,13 +8411,13 @@ rx_i_476: ;FPADD dec ebx jz rx_finish xor r8, 07ab3b5a4h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_476 call 
rx_read_l1 rx_body_476: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm2 movaps xmm9, xmm0 @@ -8425,13 +8425,13 @@ rx_i_477: ;FPADD dec ebx jz rx_finish xor r12, 07a29ec63h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_477 call rx_read_l2 rx_body_477: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 movaps xmm6, xmm0 mov eax, r14d @@ -8443,13 +8443,13 @@ rx_i_478: ;MUL_64 dec ebx jz rx_finish xor r14, 02d3d7e7fh - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_478 call rx_read_l2 rx_body_478: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] imul rax, r10 mov r12, rax @@ -8457,13 +8457,13 @@ rx_i_479: ;MUL_64 dec ebx jz rx_finish xor r12, 09b49c793h - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_479 call rx_read_l1 rx_body_479: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] imul rax, r14 mov rcx, rax mov eax, r13d @@ -8475,13 +8475,13 @@ rx_i_480: ;FPADD dec ebx jz rx_finish xor r9, 0a9cc4f01h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_480 call rx_read_l1 rx_body_480: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm4 movaps xmm6, xmm0 @@ -8489,13 +8489,13 @@ rx_i_481: ;DIV_64 dec ebx jz rx_finish xor r14, 0225ba1f9h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_481 call rx_read_l1 rx_body_481: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by 2101516912 shr rax, 4 mov rcx, 147267437180322377 @@ -8508,13 +8508,13 @@ rx_i_482: ;AND_32 dec ebx jz rx_finish xor r14, 044a0f592h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_482 call rx_read_l1 rx_body_482: - and ecx, 2047 - mov rax, 
qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] and eax, 1304556205 mov r11, rax @@ -8522,13 +8522,13 @@ rx_i_483: ;FPADD dec ebx jz rx_finish xor r11, 07f71f219h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_483 call rx_read_l1 rx_body_483: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 movaps xmm6, xmm0 @@ -8536,14 +8536,14 @@ rx_i_484: ;SHR_64 dec ebx jz rx_finish xor r12, 07027bacdh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_484 call rx_read_l1 rx_body_484: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] shr rax, 37 mov r11, rax @@ -8551,14 +8551,14 @@ rx_i_485: ;JUMP dec ebx jz rx_finish xor r13, 03a04647h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_485 call rx_read_l1 rx_body_485: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r15d xor eax, 02112cbaeh @@ -8571,13 +8571,13 @@ rx_i_486: ;ADD_64 dec ebx jz rx_finish xor r15, 0ad072937h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_486 call rx_read_l1 rx_body_486: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] add rax, r8 mov rcx, rax mov eax, r8d @@ -8589,13 +8589,13 @@ rx_i_487: ;SUB_64 dec ebx jz rx_finish xor r11, 07f78ad34h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_487 call rx_read_l1 rx_body_487: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] sub rax, r9 mov r11, rax @@ -8603,13 +8603,13 @@ rx_i_488: ;DIV_64 dec ebx jz rx_finish xor r12, 0d8b1788eh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_488 call rx_read_l2 rx_body_488: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] 
; magic divide by 297357073 mov rcx, 16652572300311555393 mul rcx @@ -8621,14 +8621,14 @@ rx_i_489: ;JUMP dec ebx jz rx_finish xor r10, 0b2ec9f3ah - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_489 call rx_read_l1 rx_body_489: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r8d xor eax, 0bcd0a942h @@ -8641,14 +8641,14 @@ rx_i_490: ;ROR_64 dec ebx jz rx_finish xor r11, 015c7f598h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_490 call rx_read_l1 rx_body_490: - xor rbp, rcx - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, r9 ror rax, cl mov rcx, rax @@ -8661,13 +8661,13 @@ rx_i_491: ;FPADD dec ebx jz rx_finish xor r8, 0902da6bdh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_491 call rx_read_l1 rx_body_491: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 movaps xmm7, xmm0 mov eax, r15d @@ -8679,13 +8679,13 @@ rx_i_492: ;IDIV_64 dec ebx jz rx_finish xor r9, 0491090d9h - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_492 call rx_read_l1 rx_body_492: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by -1779388031 mov rcx, rax mov rdx, 7315366159790064091 @@ -8702,13 +8702,13 @@ rx_i_493: ;FPSUB dec ebx jz rx_finish xor r8, 09de81282h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_493 call rx_read_l1 rx_body_493: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm9 movaps xmm4, xmm0 @@ -8716,13 +8716,13 @@ rx_i_494: ;MUL_32 dec ebx jz rx_finish xor r10, 0b0d50e46h - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_494 call rx_read_l1 rx_body_494: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 
+ mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r11d imul rax, rcx @@ -8732,13 +8732,13 @@ rx_i_495: ;FPMUL dec ebx jz rx_finish xor r11, 0e276cad1h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_495 call rx_read_l2 rx_body_495: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm2 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -8749,13 +8749,13 @@ rx_i_496: ;IDIV_64 dec ebx jz rx_finish xor r14, 0fe757b73h - mov ecx, r14d + mov eax, r14d test bl, 63 jnz short rx_body_496 call rx_read_l1 rx_body_496: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] ; magic divide by -359802064 mov rdx, -860153514353783887 imul rdx @@ -8770,14 +8770,14 @@ rx_i_497: ;FPMUL dec ebx jz rx_finish xor r8, 08d25742eh - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_497 call rx_read_l1 rx_body_497: - xor rbp, rcx - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm3 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -8788,13 +8788,13 @@ rx_i_498: ;FPMUL dec ebx jz rx_finish xor r15, 0e066fd15h - mov ecx, r15d + mov eax, r15d test bl, 63 jnz short rx_body_498 call rx_read_l1 rx_body_498: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 @@ -8809,15 +8809,15 @@ rx_i_499: ;IMUL_32 dec ebx jz rx_finish xor r12, 08925556bh - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_499 call rx_read_l1 rx_body_499: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax - mov rax, -1795485757 + movsxd rax, r13d imul rax, rcx mov r8, rax @@ -8825,13 +8825,13 @@ rx_i_500: ;FPSQRT dec ebx jz rx_finish xor r10, 04bc870ebh - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_500 call rx_read_l2 
rx_body_500: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm2, xmm0 @@ -8839,13 +8839,13 @@ rx_i_501: ;XOR_64 dec ebx jz rx_finish xor r8, 07d46c503h - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_501 call rx_read_l1 rx_body_501: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] xor rax, r10 mov rcx, rax mov eax, r12d @@ -8857,14 +8857,14 @@ rx_i_502: ;RET dec ebx jz rx_finish xor r10, 09e70b20ch - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_502 call rx_read_l2 rx_body_502: - xor rbp, rcx - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + xor rbp, rax + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r9d xor eax, 08d85312h @@ -8878,13 +8878,13 @@ rx_i_503: ;FPSUB dec ebx jz rx_finish xor r13, 0442e4850h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_503 call rx_read_l2 rx_body_503: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 movaps xmm9, xmm0 mov eax, r9d @@ -8896,13 +8896,13 @@ rx_i_504: ;FPADD dec ebx jz rx_finish xor r13, 099d48347h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_504 call rx_read_l1 rx_body_504: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 movaps xmm4, xmm0 mov eax, r12d @@ -8914,13 +8914,13 @@ rx_i_505: ;FPSUB dec ebx jz rx_finish xor r12, 032c0a28ah - mov ecx, r12d + mov eax, r12d test bl, 63 jnz short rx_body_505 call rx_read_l1 rx_body_505: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm4 movaps xmm8, xmm0 mov eax, r8d @@ -8932,13 +8932,13 @@ rx_i_506: ;FPSUB dec ebx jz rx_finish xor r9, 0a973d58ch - mov ecx, r9d + mov eax, r9d test bl, 63 jnz short rx_body_506 call rx_read_l2 
rx_body_506: - and ecx, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 32767 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm9 movaps xmm3, xmm0 @@ -8946,13 +8946,13 @@ rx_i_507: ;RET dec ebx jz rx_finish xor r10, 0d3b7165ch - mov ecx, r10d + mov eax, r10d test bl, 63 jnz short rx_body_507 call rx_read_l1 rx_body_507: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r14, rax cmp rsp, rdi je short rx_i_508 @@ -8962,13 +8962,13 @@ rx_i_508: ;RET dec ebx jz rx_finish xor r13, 0da34d818h - mov ecx, r13d + mov eax, r13d test bl, 63 jnz short rx_body_508 call rx_read_l1 rx_body_508: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov r8, rax cmp rsp, rdi je short rx_i_509 @@ -8978,13 +8978,13 @@ rx_i_509: ;FPROUND dec ebx jz rx_finish xor r11, 01b2873f2h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_509 call rx_read_l1 rx_body_509: - and ecx, 2047 - mov rax, qword ptr [rsi+rcx*8] + and eax, 2047 + mov rax, qword ptr [rsi+rax*8] mov rcx, rax rol rax, 34 and eax, 24576 @@ -8997,13 +8997,13 @@ rx_i_510: ;FPADD dec ebx jz rx_finish xor r8, 0db65513ch - mov ecx, r8d + mov eax, r8d test bl, 63 jnz short rx_body_510 call rx_read_l1 rx_body_510: - and ecx, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rcx*8] + and eax, 2047 + cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm2 movaps xmm9, xmm0 @@ -9011,13 +9011,13 @@ rx_i_511: ;SHR_64 dec ebx jz rx_finish xor r11, 02bd79286h - mov ecx, r11d + mov eax, r11d test bl, 63 jnz short rx_body_511 call rx_read_l2 rx_body_511: - and ecx, 32767 - mov rax, qword ptr [rsi+rcx*8] + and eax, 32767 + mov rax, qword ptr [rsi+rax*8] shr rax, 56 mov r11, rax From 48d85643de0666731037015da02d5fea89d6fc3d Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 13 Jan 2019 13:47:25 +0100 Subject: [PATCH 16/35] Dataset intialization algorithm (AES) --- src/Cache.cpp | 5 -- src/Cache.hpp | 4 +- src/JitCompilerX86.cpp | 2 +- 
src/JitCompilerX86.hpp | 1 - src/VirtualMachine.cpp | 10 ++-- src/common.hpp | 12 ++--- src/dataset.cpp | 117 ++++++++++++++++------------------------- 7 files changed, 58 insertions(+), 93 deletions(-) diff --git a/src/Cache.cpp b/src/Cache.cpp index eb03f9d..bb1758f 100644 --- a/src/Cache.cpp +++ b/src/Cache.cpp @@ -134,11 +134,6 @@ namespace RandomX { //Argon2d memory fill argonFill(seed, seedSize); - //Circular shift of the cache buffer by 512 bytes - //realized by copying the first 512 bytes to the back - //of the buffer and shifting the start by 512 bytes - memcpy(memory + CacheSize, memory, CacheShift); - //AES keys expandAesKeys((__m128i*)seed, keys.data()); } diff --git a/src/Cache.hpp b/src/Cache.hpp index 7a34ee8..4137b97 100644 --- a/src/Cache.hpp +++ b/src/Cache.hpp @@ -47,11 +47,11 @@ namespace RandomX { } const uint8_t* getCache() { - return memory + CacheShift; + return memory; } private: alignas(16) KeysContainer keys; - uint8_t memory[CacheSize + CacheShift]; + uint8_t memory[CacheSize]; void argonFill(const void* seed, size_t seedSize); }; } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 8175485..955d8ba 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -17,7 +17,7 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. 
*/ -//#define MAGIC_DIVISION +#define MAGIC_DIVISION #include "JitCompilerX86.hpp" #include "Pcg32.hpp" #include diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index 0c0c48c..e6a7e6d 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -33,7 +33,6 @@ namespace RandomX { typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int); constexpr uint32_t CodeSize = 64 * 1024; - constexpr uint32_t CacheLineSize = 64; struct CallOffset { CallOffset(int32_t p, int32_t i) : pos(p), index(i) {} diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp index 103d245..6e8cfad 100644 --- a/src/VirtualMachine.cpp +++ b/src/VirtualMachine.cpp @@ -56,7 +56,7 @@ namespace RandomX { if (light) { auto lds = mem.ds.lightDataset = new LightClientDataset(); lds->cache = ds.cache; - lds->block = (uint8_t*)_mm_malloc(DatasetBlockSize, sizeof(__m128i)); + //lds->block = (uint8_t*)_mm_malloc(DatasetBlockSize, sizeof(__m128i)); lds->blockNumber = -1; if (lds->block == nullptr) { throw std::bad_alloc(); @@ -78,13 +78,13 @@ namespace RandomX { if (lightClient) { auto cache = mem.ds.lightDataset->cache; if (softAes) { - for (int i = 0; i < ScratchpadSize / DatasetBlockSize; ++i) { - initBlock(cache->getCache(), ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, cache->getKeys()); + for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { + initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); } } else { - for (int i = 0; i < ScratchpadSize / DatasetBlockSize; ++i) { - initBlock(cache->getCache(), ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, cache->getKeys()); + for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { + initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); } } } diff 
--git a/src/common.hpp b/src/common.hpp index acda52a..3831175 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -34,13 +34,13 @@ namespace RandomX { constexpr int SeedSize = 32; constexpr int ResultSize = 32; - constexpr int CacheBlockSize = 1024; - constexpr int CacheShift = CacheBlockSize / 2; + constexpr int CacheBlockCount = 1024 * 1024; + constexpr int CacheLineSize = 64; constexpr int BlockExpansionRatio = 64; - constexpr uint32_t DatasetBlockSize = BlockExpansionRatio * CacheBlockSize; - constexpr uint32_t DatasetBlockCount = 65536; - constexpr uint32_t CacheSize = DatasetBlockCount * CacheBlockSize; - constexpr uint64_t DatasetSize = (uint64_t)DatasetBlockCount * DatasetBlockSize; + constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount; + constexpr int DatasetIterations = 64; + constexpr uint32_t CacheSize = CacheBlockCount * CacheLineSize; + constexpr uint64_t DatasetSize = (uint64_t)CacheSize * BlockExpansionRatio; constexpr int ArgonIterations = 12; constexpr uint32_t ArgonMemorySize = 65536; //KiB diff --git a/src/dataset.cpp b/src/dataset.cpp index 70561c1..d9c7b3f 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -56,59 +56,55 @@ namespace RandomX { return soft ? 
soft_aesdec(in, key) : _mm_aesdec_si128(in, key); } - template +#define AES_ROUND(i) x0 = aesdec(x0, keys[i]); \ + x1 = aesenc(x1, keys[i]); \ + x2 = aesdec(x2, keys[i]); \ + x3 = aesenc(x3, keys[i]) + + template void initBlock(const uint8_t* in, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) { - __m128i xin, xout; + __m128i x0, x1, x2, x3, iv; + //block number 0..67108863 //Initialization vector = block number extended to 128 bits - xout = _mm_cvtsi32_si128(blockNumber); - //Expand + AES - for (uint32_t i = 0; i < DatasetBlockSize / sizeof(__m128i); ++i) { - if ((i % 32) == 0) { - xin = _mm_set_epi64x(*(uint64_t*)(in + i / 4), 0); - xout = _mm_xor_si128(xin, xout); - } - if (enc) { - xout = aesenc(xout, keys[0]); - xout = aesenc(xout, keys[1]); - xout = aesenc(xout, keys[2]); - xout = aesenc(xout, keys[3]); - xout = aesenc(xout, keys[4]); - xout = aesenc(xout, keys[5]); - xout = aesenc(xout, keys[6]); - xout = aesenc(xout, keys[7]); - xout = aesenc(xout, keys[8]); - xout = aesenc(xout, keys[9]); - } - else { - xout = aesdec(xout, keys[0]); - xout = aesdec(xout, keys[1]); - xout = aesdec(xout, keys[2]); - xout = aesdec(xout, keys[3]); - xout = aesdec(xout, keys[4]); - xout = aesdec(xout, keys[5]); - xout = aesdec(xout, keys[6]); - xout = aesdec(xout, keys[7]); - xout = aesdec(xout, keys[8]); - xout = aesdec(xout, keys[9]); - } - _mm_store_si128((__m128i*)(out + i * sizeof(__m128i)), xout); + iv = _mm_cvtsi32_si128(blockNumber); + uint32_t cacheBlockNumber = blockNumber / BlockExpansionRatio; //0..1048575 + __m128i* cacheCacheLine = (__m128i*)(in + cacheBlockNumber * CacheLineSize); + __m128i* datasetCacheLine = (__m128i*)out; + + x0 = _mm_load_si128(cacheCacheLine + 0); + x1 = _mm_load_si128(cacheCacheLine + 1); + x2 = _mm_load_si128(cacheCacheLine + 2); + x3 = _mm_load_si128(cacheCacheLine + 3); + + x0 = _mm_xor_si128(x0, iv); + x1 = _mm_xor_si128(x1, iv); + x2 = _mm_xor_si128(x2, iv); + x3 = _mm_xor_si128(x3, iv); + + for (auto i = 0; i < 
DatasetIterations; ++i) { + AES_ROUND(0); + AES_ROUND(1); + AES_ROUND(2); + AES_ROUND(3); + AES_ROUND(4); + AES_ROUND(5); + AES_ROUND(6); + AES_ROUND(7); + AES_ROUND(8); + AES_ROUND(9); } - //Shuffle - Pcg32 gen(&xout); - shuffle((uint32_t*)out, DatasetBlockSize, gen); + + _mm_store_si128(datasetCacheLine + 0, x0); + _mm_store_si128(datasetCacheLine + 1, x1); + _mm_store_si128(datasetCacheLine + 2, x2); + _mm_store_si128(datasetCacheLine + 3, x3); } template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); + void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); - - template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); - - template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); + void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); convertible_t datasetRead(addr_t addr, MemoryRegisters& memory) { convertible_t data; @@ -122,37 +118,12 @@ namespace RandomX { return data; } - template - void initBlock(const uint8_t* cache, uint8_t* block, uint32_t blockNumber, const KeysContainer& keys) { - if (blockNumber % 2 == 1) { - initBlock(cache + blockNumber * CacheBlockSize, block, blockNumber, keys); - } - else { - initBlock(cache + blockNumber * CacheBlockSize, block, blockNumber, keys); - } - } - - template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); - - template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); - template convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory) { convertible_t data; LightClientDataset* lds = memory.ds.lightDataset; - auto blockNumber = memory.ma / DatasetBlockSize; - if (lds->blockNumber != blockNumber) { - initBlock(lds->cache->getCache(), (uint8_t*)lds->block, blockNumber, lds->cache->getKeys()); - lds->blockNumber = blockNumber; - } - data.u64 = 
*(uint64_t*)(lds->block + (memory.ma % DatasetBlockSize)); - memory.ma += 8; - memory.mx ^= addr; - if ((memory.mx & 0xFFF8) == 0) { - memory.ma = memory.mx & ~7; - } + auto blockNumber = memory.ma / CacheLineSize; + return data; } @@ -179,7 +150,7 @@ namespace RandomX { template void datasetInit(Cache* cache, dataset_t ds, uint32_t startBlock, uint32_t blockCount) { for (uint32_t i = startBlock; i < startBlock + blockCount; ++i) { - initBlock(cache->getCache(), ds.dataset + i * DatasetBlockSize, i, cache->getKeys()); + initBlock(cache->getCache(), ds.dataset + i * CacheLineSize, i, cache->getKeys()); } } From a7ffe8c19a838c3738bf0c09d3eb6a30118b44f7 Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 13 Jan 2019 21:14:59 +0100 Subject: [PATCH 17/35] Mix dataset cacheline with registers r0-r7 --- src/AssemblyGeneratorX86.cpp | 12 +- src/JitCompilerX86-static.S | 17 +- src/JitCompilerX86-static.asm | 20 +- src/JitCompilerX86-static.hpp | 3 +- src/JitCompilerX86.cpp | 37 +- src/asm/program_read.inc | 28 +- src/executeProgram-win64.asm | 45 +- src/program.inc | 1296 ++++++++++++++++----------------- 8 files changed, 691 insertions(+), 767 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 16b06c7..8a4a0a1 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -67,20 +67,16 @@ namespace RandomX { void AssemblyGeneratorX86::gena(Instruction& instr, int i) { asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl; + if ((instr.loca & 192) == 0) + asmCode << "\txor " << regMx << ", rax" << std::endl; asmCode << "\ttest " << regIc8 << ", 63" << std::endl; asmCode << "\tjnz short rx_body_" << i << std::endl; + asmCode << "\tcall rx_read" << std::endl; + asmCode << "rx_body_" << i << ":" << std::endl; if (instr.loca & 3) { - asmCode << "\tcall rx_read_l1" << 
std::endl; - asmCode << "rx_body_" << i << ":" << std::endl; - if ((instr.loca & 192) == 0) - asmCode << "\txor " << regMx << ", rax" << std::endl; asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; } else { - asmCode << "\tcall rx_read_l2" << std::endl; - asmCode << "rx_body_" << i << ":" << std::endl; - if ((instr.loca & 192) == 0) - asmCode << "\txor " << regMx << ", rax" << std::endl; asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; } } diff --git a/src/JitCompilerX86-static.S b/src/JitCompilerX86-static.S index 875256a..e0e8f62 100644 --- a/src/JitCompilerX86-static.S +++ b/src/JitCompilerX86-static.S @@ -29,8 +29,7 @@ .global DECL(randomx_program_prologue) .global DECL(randomx_program_begin) .global DECL(randomx_program_epilogue) -.global DECL(randomx_program_read_l1) -.global DECL(randomx_program_read_l2) +.global DECL(randomx_program_read) .global DECL(randomx_program_end) .global DECL(randomx_program_transform) @@ -48,22 +47,10 @@ DECL(randomx_program_begin): DECL(randomx_program_epilogue): #include "asm/program_epilogue_linux.inc" -#define scratchpad_mask and eax, 2040 - .align 64 -DECL(randomx_program_read_l1): +DECL(randomx_program_read): #include "asm/program_read.inc" -#undef scratchpad_mask - -#define scratchpad_mask and eax, 32760 - -.align 64 -DECL(randomx_program_read_l2): - #include "asm/program_read.inc" - -#undef scratchpad_mask - .align 64 DECL(randomx_program_end): nop diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index 48b09ff..cbbf658 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -20,8 +20,7 @@ _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE PUBLIC randomx_program_prologue PUBLIC randomx_program_begin PUBLIC randomx_program_epilogue -PUBLIC randomx_program_read_l1 -PUBLIC randomx_program_read_l2 +PUBLIC randomx_program_read PUBLIC randomx_program_end PUBLIC randomx_program_transform @@ -41,23 +40,10 @@ randomx_program_epilogue PROC include 
asm/program_epilogue_win64.inc randomx_program_epilogue ENDP -scratchpad_mask MACRO - and eax, 2040 -ENDM - ALIGN 64 -randomx_program_read_l1 PROC +randomx_program_read PROC include asm/program_read.inc -randomx_program_read_l1 ENDP - -scratchpad_mask MACRO - and eax, 32760 -ENDM - -ALIGN 64 -randomx_program_read_l2 PROC - include asm/program_read.inc -randomx_program_read_l2 ENDP +randomx_program_read ENDP ALIGN 64 randomx_program_end PROC diff --git a/src/JitCompilerX86-static.hpp b/src/JitCompilerX86-static.hpp index f5904ad..e72244a 100644 --- a/src/JitCompilerX86-static.hpp +++ b/src/JitCompilerX86-static.hpp @@ -22,7 +22,6 @@ extern "C" { void randomx_program_begin(); void randomx_program_epilogue(); void randomx_program_transform(); - void randomx_program_read_l1(); - void randomx_program_read_l2(); + void randomx_program_read(); void randomx_program_end(); } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 955d8ba..f76ab74 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -102,22 +102,19 @@ namespace RandomX { const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin; const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue; - const uint8_t* codeReadDatasetL1 = (uint8_t*)&randomx_program_read_l1; - const uint8_t* codeReadDatasetL2 = (uint8_t*)&randomx_program_read_l2; + const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read; const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; const uint32_t* addressTransformations = (uint32_t*)&randomx_program_transform; const int32_t prologueSize = codeProgramBegin - codePrologue; - const int32_t epilogueSize = codeReadDatasetL1 - codeEpilogue; - const int32_t readDatasetL1Size = codeReadDatasetL2 - codeReadDatasetL1; - const int32_t readDatasetL2Size = codeProgramEnd - codeReadDatasetL2; + const int32_t epilogueSize = codeReadDataset - codeEpilogue; + 
const int32_t readDatasetSize = codeProgramEnd - codeReadDataset; - const int32_t readDatasetL2Offset = CodeSize - readDatasetL2Size; - const int32_t readDatasetL1Offset = readDatasetL2Offset - readDatasetL1Size; - const int32_t epilogueOffset = readDatasetL1Offset - epilogueSize; + const int32_t readDatasetOffset = CodeSize - readDatasetSize; + const int32_t epilogueOffset = readDatasetOffset - epilogueSize; size_t JitCompilerX86::getCodeSize() { - return codePos - prologueSize + readDatasetL1Size + readDatasetL2Size; + return codePos - prologueSize + readDatasetSize; } JitCompilerX86::JitCompilerX86() { @@ -131,9 +128,8 @@ namespace RandomX { throw std::runtime_error("mmap failed"); #endif memcpy(code, codePrologue, prologueSize); - memcpy(code + CodeSize - epilogueSize - readDatasetL1Size - readDatasetL2Size, codeEpilogue, epilogueSize); - memcpy(code + CodeSize - readDatasetL1Size - readDatasetL2Size, codeReadDatasetL1, readDatasetL1Size); - memcpy(code + CodeSize - readDatasetL2Size, codeReadDatasetL2, readDatasetL2Size); + memcpy(code + CodeSize - epilogueSize - readDatasetSize, codeEpilogue, epilogueSize); + memcpy(code + CodeSize - readDatasetSize, codeReadDataset, readDatasetSize); } void JitCompilerX86::generateProgram(Pcg32& gen) { @@ -150,10 +146,8 @@ namespace RandomX { emitByte(0xe9); emit(instructionOffsets[0] - (codePos + 4)); fixCallOffsets(); - uint32_t transformL1 = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; - uint32_t transformL2 = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; - *reinterpret_cast(code + readDatasetL1Offset + 1) = transformL1; - *reinterpret_cast(code + readDatasetL2Offset + 1) = transformL2; + uint32_t transform = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; + *reinterpret_cast(code + readDatasetOffset) = transform; } void JitCompilerX86::generateCode(Instruction& instr, int i) { @@ -176,18 +170,13 @@ namespace RandomX { emit(instr.addra); 
emit(uint16_t(0x8b41)); //mov emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega - emit(0x753fc3f6); //test bl,0x3f; jne - emit(uint16_t(0xe805)); - if (instr.loca & 3) { //A.LOC.W - emit(readDatasetL1Offset - (codePos + 4)); - } - else { - emit(readDatasetL2Offset - (codePos + 4)); - } if ((instr.loca & 192) == 0) { //A.LOC.X emit(uint16_t(0x3348)); emitByte(0xe8); //xor rbp, rax } + emit(0x753fc3f6); //test bl,0x3f; jne + emit(uint16_t(0xe805)); + emit(readDatasetOffset - (codePos + 4)); emitByte(0x25); //and eax, if (instr.loca & 3) { emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad diff --git a/src/asm/program_read.inc b/src/asm/program_read.inc index 8ddf97d..c7650ea 100644 --- a/src/asm/program_read.inc +++ b/src/asm/program_read.inc @@ -1,4 +1,3 @@ - push rax ;# preserve eax db 0, 0, 0, 0 ;# TransformAddress placeholder mov rcx, qword ptr [rdi] ;# load the dataset address xor rbp, rax ;# modify "mx" @@ -9,24 +8,13 @@ ;# read cacheline "ma" ror rbp, 32 ;# swap "ma" and "mx" mov edx, ebp ;# edx = ma - scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8 - lea rax, [rsi+rax*8] ;# scratchpad cache line lea rcx, [rcx+rdx] ;# dataset cache line - mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now) - xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline - mov rdx, qword ptr [rcx+8] - xor qword ptr [rax+8], rdx - mov rdx, qword ptr [rcx+16] - xor qword ptr [rax+16], rdx - mov rdx, qword ptr [rcx+24] - xor qword ptr [rax+24], rdx - mov rdx, qword ptr [rcx+32] - xor qword ptr [rax+32], rdx - mov rdx, qword ptr [rcx+40] - xor qword ptr [rax+40], rdx - mov rdx, qword ptr [rcx+48] - xor qword ptr [rax+48], rdx - mov rdx, qword ptr [rcx+56] - xor qword ptr [rax+56], rdx - pop rax ;# restore eax + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword 
ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] ret \ No newline at end of file diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 53eec9c..2cc98fb 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -221,54 +221,33 @@ TransformAddress MACRO reg32, reg64 ;xor reg32, -8 ;# C = all except 0 to 7 ENDM -ReadMemoryRandom MACRO spmask +ALIGN 64 +rx_read: ;# IN eax = random 32-bit address ;# GLOBAL rdi = address of the dataset address ;# GLOBAL rsi = address of the scratchpad ;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma" ;# MODIFY rcx, rdx - push rax ;# preserve eax TransformAddress eax, rax ;# TransformAddress function mov rcx, qword ptr [rdi] ;# load the dataset address xor rbp, rax ;# modify "mx" - ; prefetch cacheline "mx" + ;# prefetch cacheline "mx" and rbp, -64 ;# align "mx" to the start of a cache line mov edx, ebp ;# edx = mx prefetchnta byte ptr [rcx+rdx] - ; read cacheline "ma" + ;# read cacheline "ma" ror rbp, 32 ;# swap "ma" and "mx" mov edx, ebp ;# edx = ma - and eax, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8 - lea rax, [rsi+rax*8] ;# scratchpad cache line lea rcx, [rcx+rdx] ;# dataset cache line - mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now) - xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline - mov rdx, qword ptr [rcx+8] - xor qword ptr [rax+8], rdx - mov rdx, qword ptr [rcx+16] - xor qword ptr [rax+16], rdx - mov rdx, qword ptr [rcx+24] - xor qword ptr [rax+24], rdx - mov rdx, qword ptr [rcx+32] - xor qword ptr [rax+32], rdx - mov rdx, qword ptr [rcx+40] - xor qword ptr [rax+40], rdx - mov rdx, qword ptr [rcx+48] - xor qword ptr [rax+48], rdx - mov rdx, qword ptr [rcx+56] - xor qword ptr [rax+56], rdx - pop rax ;# restore eax + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor 
r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] ret -ENDM - -ALIGN 64 -rx_read_l1: -ReadMemoryRandom 2047 - -ALIGN 64 -rx_read_l2: -ReadMemoryRandom 32767 - executeProgram ENDP _RANDOMX_EXECUTE_PROGRAM ENDS diff --git a/src/program.inc b/src/program.inc index 698eeb3..4437f97 100644 --- a/src/program.inc +++ b/src/program.inc @@ -3,11 +3,11 @@ rx_i_0: ;CALL jz rx_finish xor r9, 0ca9788ah mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_0 - call rx_read_l2 + call rx_read rx_body_0: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -26,7 +26,7 @@ rx_i_1: ;IDIV_64 mov eax, r15d test bl, 63 jnz short rx_body_1 - call rx_read_l2 + call rx_read rx_body_1: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -50,11 +50,11 @@ rx_i_2: ;JUMP jz rx_finish xor r15, 097210f7bh mov eax, r15d + xor rbp, rax test bl, 63 jnz short rx_body_2 - call rx_read_l1 + call rx_read rx_body_2: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -70,11 +70,11 @@ rx_i_3: ;FPDIV jz rx_finish xor r13, 082c73195h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_3 - call rx_read_l2 + call rx_read rx_body_3: - xor rbp, rax and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm9 @@ -94,7 +94,7 @@ rx_i_4: ;MUL_32 mov eax, r14d test bl, 63 jnz short rx_body_4 - call rx_read_l2 + call rx_read rx_body_4: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -112,11 +112,11 @@ rx_i_5: ;IMUL_32 jz rx_finish xor r15, 0379f9ee0h mov eax, r15d + xor rbp, rax test bl, 63 jnz short rx_body_5 - call rx_read_l1 + call rx_read rx_body_5: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax @@ -131,7 +131,7 @@ rx_i_6: ;MUL_64 mov eax, r8d test bl, 63 jnz short rx_body_6 - call rx_read_l1 + call rx_read rx_body_6: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -149,7 +149,7 @@ rx_i_7: ;FPADD mov 
eax, r10d test bl, 63 jnz short rx_body_7 - call rx_read_l2 + call rx_read rx_body_7: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -167,7 +167,7 @@ rx_i_8: ;XOR_64 mov eax, r13d test bl, 63 jnz short rx_body_8 - call rx_read_l2 + call rx_read rx_body_8: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -183,11 +183,11 @@ rx_i_9: ;IDIV_64 jz rx_finish xor r14, 085121c54h mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_9 - call rx_read_l2 + call rx_read rx_body_9: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] ; magic divide by 565870810 @@ -207,7 +207,7 @@ rx_i_10: ;AND_64 mov eax, r8d test bl, 63 jnz short rx_body_10 - call rx_read_l1 + call rx_read rx_body_10: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -221,7 +221,7 @@ rx_i_11: ;FPADD mov eax, r10d test bl, 63 jnz short rx_body_11 - call rx_read_l1 + call rx_read rx_body_11: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -239,7 +239,7 @@ rx_i_12: ;FPSQRT mov eax, r10d test bl, 63 jnz short rx_body_12 - call rx_read_l1 + call rx_read rx_body_12: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -255,11 +255,11 @@ rx_i_13: ;FPADD jz rx_finish xor r12, 061c0d34dh mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_13 - call rx_read_l1 + call rx_read rx_body_13: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 @@ -270,11 +270,11 @@ rx_i_14: ;XOR_64 jz rx_finish xor r10, 0e761d1beh mov eax, r10d + xor rbp, rax test bl, 63 jnz short rx_body_14 - call rx_read_l1 + call rx_read rx_body_14: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] xor rax, r9 @@ -291,7 +291,7 @@ rx_i_15: ;RET mov eax, r11d test bl, 63 jnz short rx_body_15 - call rx_read_l1 + call rx_read rx_body_15: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -311,7 +311,7 @@ rx_i_16: ;ADD_64 mov eax, r14d test bl, 63 jnz short rx_body_16 - call rx_read_l1 + call rx_read rx_body_16: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -329,7 +329,7 @@ rx_i_17: 
;FPMUL mov eax, r11d test bl, 63 jnz short rx_body_17 - call rx_read_l2 + call rx_read rx_body_17: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -350,7 +350,7 @@ rx_i_18: ;FPSUB mov eax, r14d test bl, 63 jnz short rx_body_18 - call rx_read_l1 + call rx_read rx_body_18: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -366,11 +366,11 @@ rx_i_19: ;FPSUB jz rx_finish xor r13, 0ac009c30h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_19 - call rx_read_l1 + call rx_read rx_body_19: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm8 @@ -383,7 +383,7 @@ rx_i_20: ;FPSUB mov eax, r13d test bl, 63 jnz short rx_body_20 - call rx_read_l1 + call rx_read rx_body_20: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -399,11 +399,11 @@ rx_i_21: ;ROR_64 jz rx_finish xor r8, 0977f0284h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_21 - call rx_read_l1 + call rx_read rx_body_21: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, r9 @@ -419,11 +419,11 @@ rx_i_22: ;ADD_64 jz rx_finish xor r13, 080bdfefah mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_22 - call rx_read_l2 + call rx_read rx_body_22: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] add rax, r8 @@ -440,7 +440,7 @@ rx_i_23: ;MUL_64 mov eax, r15d test bl, 63 jnz short rx_body_23 - call rx_read_l1 + call rx_read rx_body_23: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -452,11 +452,11 @@ rx_i_24: ;DIV_64 jz rx_finish xor r8, 070d3b8c7h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_24 - call rx_read_l2 + call rx_read rx_body_24: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov ecx, 1 @@ -476,11 +476,11 @@ rx_i_25: ;FPMUL jz rx_finish xor r12, 01cf77a04h mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_25 - call rx_read_l1 + call rx_read rx_body_25: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm9 @@ -500,7 +500,7 @@ rx_i_26: ;IMULH_64 mov 
eax, r11d test bl, 63 jnz short rx_body_26 - call rx_read_l1 + call rx_read rx_body_26: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -520,7 +520,7 @@ rx_i_27: ;FPMUL mov eax, r12d test bl, 63 jnz short rx_body_27 - call rx_read_l1 + call rx_read rx_body_27: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -537,7 +537,7 @@ rx_i_28: ;AND_32 mov eax, r13d test bl, 63 jnz short rx_body_28 - call rx_read_l1 + call rx_read rx_body_28: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -551,7 +551,7 @@ rx_i_29: ;SUB_64 mov eax, r12d test bl, 63 jnz short rx_body_29 - call rx_read_l1 + call rx_read rx_body_29: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -565,7 +565,7 @@ rx_i_30: ;FPADD mov eax, r11d test bl, 63 jnz short rx_body_30 - call rx_read_l1 + call rx_read rx_body_30: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -577,11 +577,11 @@ rx_i_31: ;ROR_64 jz rx_finish xor r14, 0d352ce37h mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_31 - call rx_read_l1 + call rx_read rx_body_31: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ror rax, 55 @@ -594,7 +594,7 @@ rx_i_32: ;AND_32 mov eax, r12d test bl, 63 jnz short rx_body_32 - call rx_read_l2 + call rx_read rx_body_32: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -608,7 +608,7 @@ rx_i_33: ;MUL_64 mov eax, r9d test bl, 63 jnz short rx_body_33 - call rx_read_l1 + call rx_read rx_body_33: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -620,11 +620,11 @@ rx_i_34: ;CALL jz rx_finish xor r13, 0665e91f1h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_34 - call rx_read_l1 + call rx_read rx_body_34: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r15, rax @@ -639,7 +639,7 @@ rx_i_35: ;CALL mov eax, r15d test bl, 63 jnz short rx_body_35 - call rx_read_l2 + call rx_read rx_body_35: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -655,7 +655,7 @@ rx_i_36: ;FPMUL mov eax, r8d test bl, 63 jnz short rx_body_36 - call rx_read_l1 + call rx_read rx_body_36: and eax, 
2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -672,7 +672,7 @@ rx_i_37: ;FPSUB mov eax, r12d test bl, 63 jnz short rx_body_37 - call rx_read_l2 + call rx_read rx_body_37: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -690,7 +690,7 @@ rx_i_38: ;SUB_64 mov eax, r9d test bl, 63 jnz short rx_body_38 - call rx_read_l1 + call rx_read rx_body_38: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -702,11 +702,11 @@ rx_i_39: ;ADD_64 jz rx_finish xor r14, 02c1f1eb0h mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_39 - call rx_read_l2 + call rx_read rx_body_39: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] add rax, r14 @@ -719,7 +719,7 @@ rx_i_40: ;CALL mov eax, r10d test bl, 63 jnz short rx_body_40 - call rx_read_l1 + call rx_read rx_body_40: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -739,7 +739,7 @@ rx_i_41: ;JUMP mov eax, r9d test bl, 63 jnz short rx_body_41 - call rx_read_l2 + call rx_read rx_body_41: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -752,11 +752,11 @@ rx_i_42: ;FPADD jz rx_finish xor r15, 0bc1de9f6h mov eax, r15d + xor rbp, rax test bl, 63 jnz short rx_body_42 - call rx_read_l2 + call rx_read rx_body_42: - xor rbp, rax and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 @@ -769,7 +769,7 @@ rx_i_43: ;SUB_64 mov eax, r12d test bl, 63 jnz short rx_body_43 - call rx_read_l2 + call rx_read rx_body_43: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -787,7 +787,7 @@ rx_i_44: ;SAR_64 mov eax, r11d test bl, 63 jnz short rx_body_44 - call rx_read_l1 + call rx_read rx_body_44: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -800,11 +800,11 @@ rx_i_45: ;FPSUB jz rx_finish xor r12, 08cd244ebh mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_45 - call rx_read_l1 + call rx_read rx_body_45: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 @@ -817,7 +817,7 @@ rx_i_46: ;ADD_64 mov eax, r8d test bl, 63 jnz short rx_body_46 - call rx_read_l1 + call rx_read rx_body_46: and 
eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -833,11 +833,11 @@ rx_i_47: ;JUMP jz rx_finish xor r12, 05ba232c6h mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_47 - call rx_read_l1 + call rx_read rx_body_47: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -855,7 +855,7 @@ rx_i_48: ;FPDIV mov eax, r8d test bl, 63 jnz short rx_body_48 - call rx_read_l1 + call rx_read rx_body_48: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -876,7 +876,7 @@ rx_i_49: ;FPSUB mov eax, r8d test bl, 63 jnz short rx_body_49 - call rx_read_l1 + call rx_read rx_body_49: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -888,11 +888,11 @@ rx_i_50: ;AND_64 jz rx_finish xor r9, 0da3e4842h mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_50 - call rx_read_l2 + call rx_read rx_body_50: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] and rax, r10 @@ -909,7 +909,7 @@ rx_i_51: ;SUB_64 mov eax, r10d test bl, 63 jnz short rx_body_51 - call rx_read_l1 + call rx_read rx_body_51: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -923,7 +923,7 @@ rx_i_52: ;FPSQRT mov eax, r11d test bl, 63 jnz short rx_body_52 - call rx_read_l1 + call rx_read rx_body_52: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -937,7 +937,7 @@ rx_i_53: ;RET mov eax, r13d test bl, 63 jnz short rx_body_53 - call rx_read_l1 + call rx_read rx_body_53: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -953,7 +953,7 @@ rx_i_54: ;DIV_64 mov eax, r11d test bl, 63 jnz short rx_body_54 - call rx_read_l1 + call rx_read rx_body_54: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -975,7 +975,7 @@ rx_i_55: ;FPMUL mov eax, r10d test bl, 63 jnz short rx_body_55 - call rx_read_l1 + call rx_read rx_body_55: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -996,7 +996,7 @@ rx_i_56: ;IDIV_64 mov eax, r14d test bl, 63 jnz short rx_body_56 - call rx_read_l2 + call rx_read rx_body_56: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -1023,7 +1023,7 @@ rx_i_57: ;MUL_64 mov eax, 
r9d test bl, 63 jnz short rx_body_57 - call rx_read_l1 + call rx_read rx_body_57: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1041,7 +1041,7 @@ rx_i_58: ;DIV_64 mov eax, r14d test bl, 63 jnz short rx_body_58 - call rx_read_l1 + call rx_read rx_body_58: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1059,7 +1059,7 @@ rx_i_59: ;FPSUB mov eax, r11d test bl, 63 jnz short rx_body_59 - call rx_read_l1 + call rx_read rx_body_59: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -1073,7 +1073,7 @@ rx_i_60: ;CALL mov eax, r15d test bl, 63 jnz short rx_body_60 - call rx_read_l2 + call rx_read rx_body_60: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -1093,7 +1093,7 @@ rx_i_61: ;JUMP mov eax, r13d test bl, 63 jnz short rx_body_61 - call rx_read_l1 + call rx_read rx_body_61: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1108,7 +1108,7 @@ rx_i_62: ;FPSUB mov eax, r15d test bl, 63 jnz short rx_body_62 - call rx_read_l1 + call rx_read rx_body_62: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -1126,7 +1126,7 @@ rx_i_63: ;FPSUB mov eax, r9d test bl, 63 jnz short rx_body_63 - call rx_read_l1 + call rx_read rx_body_63: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -1140,7 +1140,7 @@ rx_i_64: ;SUB_64 mov eax, r13d test bl, 63 jnz short rx_body_64 - call rx_read_l2 + call rx_read rx_body_64: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -1154,7 +1154,7 @@ rx_i_65: ;JUMP mov eax, r13d test bl, 63 jnz short rx_body_65 - call rx_read_l2 + call rx_read rx_body_65: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -1167,11 +1167,11 @@ rx_i_66: ;FPDIV jz rx_finish xor r15, 015a1b689h mov eax, r15d + xor rbp, rax test bl, 63 jnz short rx_body_66 - call rx_read_l1 + call rx_read rx_body_66: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm3 @@ -1191,7 +1191,7 @@ rx_i_67: ;JUMP mov eax, r14d test bl, 63 jnz short rx_body_67 - call rx_read_l1 + call rx_read rx_body_67: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1206,7 
+1206,7 @@ rx_i_68: ;FPADD mov eax, r13d test bl, 63 jnz short rx_body_68 - call rx_read_l2 + call rx_read rx_body_68: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -1222,11 +1222,11 @@ rx_i_69: ;FPADD jz rx_finish xor r15, 0376c9c27h mov eax, r15d + xor rbp, rax test bl, 63 jnz short rx_body_69 - call rx_read_l1 + call rx_read rx_body_69: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm5 @@ -1237,11 +1237,11 @@ rx_i_70: ;MULH_64 jz rx_finish xor r8, 0bbbec3fah mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_70 - call rx_read_l1 + call rx_read rx_body_70: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, r9 @@ -1256,7 +1256,7 @@ rx_i_71: ;FPMUL mov eax, r14d test bl, 63 jnz short rx_body_71 - call rx_read_l1 + call rx_read rx_body_71: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -1273,7 +1273,7 @@ rx_i_72: ;JUMP mov eax, r13d test bl, 63 jnz short rx_body_72 - call rx_read_l1 + call rx_read rx_body_72: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1292,7 +1292,7 @@ rx_i_73: ;FPDIV mov eax, r12d test bl, 63 jnz short rx_body_73 - call rx_read_l1 + call rx_read rx_body_73: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -1307,11 +1307,11 @@ rx_i_74: ;MUL_64 jz rx_finish xor r8, 04c4b0c7fh mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_74 - call rx_read_l1 + call rx_read rx_body_74: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r13 @@ -1328,7 +1328,7 @@ rx_i_75: ;CALL mov eax, r14d test bl, 63 jnz short rx_body_75 - call rx_read_l1 + call rx_read rx_body_75: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1344,7 +1344,7 @@ rx_i_76: ;FPADD mov eax, r11d test bl, 63 jnz short rx_body_76 - call rx_read_l2 + call rx_read rx_body_76: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -1362,7 +1362,7 @@ rx_i_77: ;RET mov eax, r14d test bl, 63 jnz short rx_body_77 - call rx_read_l1 + call rx_read rx_body_77: and eax, 2047 mov rax, qword ptr 
[rsi+rax*8] @@ -1382,7 +1382,7 @@ rx_i_78: ;MUL_32 mov eax, r9d test bl, 63 jnz short rx_body_78 - call rx_read_l1 + call rx_read rx_body_78: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1398,7 +1398,7 @@ rx_i_79: ;CALL mov eax, r11d test bl, 63 jnz short rx_body_79 - call rx_read_l2 + call rx_read rx_body_79: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -1418,7 +1418,7 @@ rx_i_80: ;ROR_64 mov eax, r13d test bl, 63 jnz short rx_body_80 - call rx_read_l1 + call rx_read rx_body_80: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1437,7 +1437,7 @@ rx_i_81: ;AND_64 mov eax, r15d test bl, 63 jnz short rx_body_81 - call rx_read_l1 + call rx_read rx_body_81: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1451,7 +1451,7 @@ rx_i_82: ;JUMP mov eax, r11d test bl, 63 jnz short rx_body_82 - call rx_read_l1 + call rx_read rx_body_82: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1470,7 +1470,7 @@ rx_i_83: ;IDIV_64 mov eax, r10d test bl, 63 jnz short rx_body_83 - call rx_read_l2 + call rx_read rx_body_83: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -1491,7 +1491,7 @@ rx_i_84: ;SAR_64 mov eax, r15d test bl, 63 jnz short rx_body_84 - call rx_read_l1 + call rx_read rx_body_84: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1509,7 +1509,7 @@ rx_i_85: ;MUL_64 mov eax, r13d test bl, 63 jnz short rx_body_85 - call rx_read_l1 + call rx_read rx_body_85: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1523,7 +1523,7 @@ rx_i_86: ;AND_64 mov eax, r11d test bl, 63 jnz short rx_body_86 - call rx_read_l1 + call rx_read rx_body_86: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1539,11 +1539,11 @@ rx_i_87: ;SUB_64 jz rx_finish xor r9, 0d75a0ecfh mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_87 - call rx_read_l1 + call rx_read rx_body_87: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub rax, r12 @@ -1554,11 +1554,11 @@ rx_i_88: ;ROR_64 jz rx_finish xor r9, 031bb7f7ah mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_88 - call rx_read_l1 + 
call rx_read rx_body_88: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, r14 @@ -1572,7 +1572,7 @@ rx_i_89: ;MUL_64 mov eax, r9d test bl, 63 jnz short rx_body_89 - call rx_read_l1 + call rx_read rx_body_89: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1590,7 +1590,7 @@ rx_i_90: ;FPADD mov eax, r12d test bl, 63 jnz short rx_body_90 - call rx_read_l2 + call rx_read rx_body_90: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -1602,11 +1602,11 @@ rx_i_91: ;FPMUL jz rx_finish xor r9, 042e28e94h mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_91 - call rx_read_l1 + call rx_read rx_body_91: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm2 @@ -1622,7 +1622,7 @@ rx_i_92: ;JUMP mov eax, r8d test bl, 63 jnz short rx_body_92 - call rx_read_l1 + call rx_read rx_body_92: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1635,11 +1635,11 @@ rx_i_93: ;FPADD jz rx_finish xor r8, 0bfcebaf4h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_93 - call rx_read_l1 + call rx_read rx_body_93: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm2 @@ -1654,11 +1654,11 @@ rx_i_94: ;CALL jz rx_finish xor r13, 0ea326630h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_94 - call rx_read_l2 + call rx_read rx_body_94: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov r8, rax @@ -1673,7 +1673,7 @@ rx_i_95: ;MUL_64 mov eax, r13d test bl, 63 jnz short rx_body_95 - call rx_read_l1 + call rx_read rx_body_95: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1689,11 +1689,11 @@ rx_i_96: ;IMUL_32 jz rx_finish xor r11, 04f912ef8h mov eax, r11d + xor rbp, rax test bl, 63 jnz short rx_body_96 - call rx_read_l1 + call rx_read rx_body_96: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax @@ -1708,7 +1708,7 @@ rx_i_97: ;FPDIV mov eax, r15d test bl, 63 jnz short rx_body_97 - call rx_read_l2 + call rx_read rx_body_97: and eax, 32767 cvtdq2pd xmm0, 
qword ptr [rsi+rax*8] @@ -1729,7 +1729,7 @@ rx_i_98: ;SUB_64 mov eax, r14d test bl, 63 jnz short rx_body_98 - call rx_read_l2 + call rx_read rx_body_98: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -1743,7 +1743,7 @@ rx_i_99: ;FPMUL mov eax, r9d test bl, 63 jnz short rx_body_99 - call rx_read_l1 + call rx_read rx_body_99: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -1764,7 +1764,7 @@ rx_i_100: ;ADD_64 mov eax, r15d test bl, 63 jnz short rx_body_100 - call rx_read_l1 + call rx_read rx_body_100: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1778,7 +1778,7 @@ rx_i_101: ;SUB_64 mov eax, r10d test bl, 63 jnz short rx_body_101 - call rx_read_l1 + call rx_read rx_body_101: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1792,7 +1792,7 @@ rx_i_102: ;FPMUL mov eax, r10d test bl, 63 jnz short rx_body_102 - call rx_read_l1 + call rx_read rx_body_102: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -1809,7 +1809,7 @@ rx_i_103: ;MUL_64 mov eax, r10d test bl, 63 jnz short rx_body_103 - call rx_read_l2 + call rx_read rx_body_103: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -1827,7 +1827,7 @@ rx_i_104: ;DIV_64 mov eax, r11d test bl, 63 jnz short rx_body_104 - call rx_read_l2 + call rx_read rx_body_104: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -1847,11 +1847,11 @@ rx_i_105: ;MUL_32 jz rx_finish xor r13, 036a51f72h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_105 - call rx_read_l1 + call rx_read rx_body_105: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov ecx, eax @@ -1870,7 +1870,7 @@ rx_i_106: ;FPMUL mov eax, r11d test bl, 63 jnz short rx_body_106 - call rx_read_l1 + call rx_read rx_body_106: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -1891,7 +1891,7 @@ rx_i_107: ;JUMP mov eax, r12d test bl, 63 jnz short rx_body_107 - call rx_read_l1 + call rx_read rx_body_107: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -1910,7 +1910,7 @@ rx_i_108: ;FPMUL mov eax, r9d test bl, 63 jnz short rx_body_108 - call 
rx_read_l1 + call rx_read rx_body_108: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -1929,11 +1929,11 @@ rx_i_109: ;ROR_64 jz rx_finish xor r15, 0594e37deh mov eax, r15d + xor rbp, rax test bl, 63 jnz short rx_body_109 - call rx_read_l1 + call rx_read rx_body_109: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, r10 @@ -1951,7 +1951,7 @@ rx_i_110: ;SHR_64 mov eax, r9d test bl, 63 jnz short rx_body_110 - call rx_read_l2 + call rx_read rx_body_110: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -1970,7 +1970,7 @@ rx_i_111: ;CALL mov eax, r8d test bl, 63 jnz short rx_body_111 - call rx_read_l2 + call rx_read rx_body_111: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -1990,7 +1990,7 @@ rx_i_112: ;SUB_64 mov eax, r12d test bl, 63 jnz short rx_body_112 - call rx_read_l1 + call rx_read rx_body_112: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2008,7 +2008,7 @@ rx_i_113: ;MULH_64 mov eax, r10d test bl, 63 jnz short rx_body_113 - call rx_read_l2 + call rx_read rx_body_113: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -2024,7 +2024,7 @@ rx_i_114: ;DIV_64 mov eax, r13d test bl, 63 jnz short rx_body_114 - call rx_read_l1 + call rx_read rx_body_114: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2042,7 +2042,7 @@ rx_i_115: ;IDIV_64 mov eax, r14d test bl, 63 jnz short rx_body_115 - call rx_read_l1 + call rx_read rx_body_115: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2063,7 +2063,7 @@ rx_i_116: ;DIV_64 mov eax, r10d test bl, 63 jnz short rx_body_116 - call rx_read_l1 + call rx_read rx_body_116: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2085,7 +2085,7 @@ rx_i_117: ;IDIV_64 mov eax, r11d test bl, 63 jnz short rx_body_117 - call rx_read_l1 + call rx_read rx_body_117: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2110,7 +2110,7 @@ rx_i_118: ;FPSUB mov eax, r9d test bl, 63 jnz short rx_body_118 - call rx_read_l1 + call rx_read rx_body_118: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -2124,7 +2124,7 @@ rx_i_119: 
;FPSUB mov eax, r9d test bl, 63 jnz short rx_body_119 - call rx_read_l1 + call rx_read rx_body_119: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -2136,11 +2136,11 @@ rx_i_120: ;FPADD jz rx_finish xor r12, 0e5561e3eh mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_120 - call rx_read_l2 + call rx_read rx_body_120: - xor rbp, rax and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm4 @@ -2153,7 +2153,7 @@ rx_i_121: ;FPSUB mov eax, r9d test bl, 63 jnz short rx_body_121 - call rx_read_l2 + call rx_read rx_body_121: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -2167,7 +2167,7 @@ rx_i_122: ;CALL mov eax, r10d test bl, 63 jnz short rx_body_122 - call rx_read_l2 + call rx_read rx_body_122: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -2187,7 +2187,7 @@ rx_i_123: ;ADD_32 mov eax, r13d test bl, 63 jnz short rx_body_123 - call rx_read_l1 + call rx_read rx_body_123: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2201,7 +2201,7 @@ rx_i_124: ;JUMP mov eax, r12d test bl, 63 jnz short rx_body_124 - call rx_read_l1 + call rx_read rx_body_124: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2218,11 +2218,11 @@ rx_i_125: ;IMUL_32 jz rx_finish xor r8, 0ebec27cdh mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_125 - call rx_read_l1 + call rx_read rx_body_125: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax @@ -2237,7 +2237,7 @@ rx_i_126: ;FPMUL mov eax, r8d test bl, 63 jnz short rx_body_126 - call rx_read_l2 + call rx_read rx_body_126: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -2254,7 +2254,7 @@ rx_i_127: ;IMUL_32 mov eax, r9d test bl, 63 jnz short rx_body_127 - call rx_read_l1 + call rx_read rx_body_127: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2268,11 +2268,11 @@ rx_i_128: ;MUL_64 jz rx_finish xor r13, 0459f1154h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_128 - call rx_read_l1 + call rx_read rx_body_128: - xor rbp, rax and eax, 2047 mov rax, qword ptr 
[rsi+rax*8] imul rax, r9 @@ -2285,7 +2285,7 @@ rx_i_129: ;JUMP mov eax, r9d test bl, 63 jnz short rx_body_129 - call rx_read_l1 + call rx_read rx_body_129: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2300,7 +2300,7 @@ rx_i_130: ;IDIV_64 mov eax, r9d test bl, 63 jnz short rx_body_130 - call rx_read_l1 + call rx_read rx_body_130: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2325,7 +2325,7 @@ rx_i_131: ;RET mov eax, r12d test bl, 63 jnz short rx_body_131 - call rx_read_l1 + call rx_read rx_body_131: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2345,7 +2345,7 @@ rx_i_132: ;FPADD mov eax, r10d test bl, 63 jnz short rx_body_132 - call rx_read_l1 + call rx_read rx_body_132: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -2359,7 +2359,7 @@ rx_i_133: ;OR_64 mov eax, r14d test bl, 63 jnz short rx_body_133 - call rx_read_l1 + call rx_read rx_body_133: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2377,7 +2377,7 @@ rx_i_134: ;ADD_64 mov eax, r10d test bl, 63 jnz short rx_body_134 - call rx_read_l1 + call rx_read rx_body_134: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2391,7 +2391,7 @@ rx_i_135: ;FPMUL mov eax, r11d test bl, 63 jnz short rx_body_135 - call rx_read_l1 + call rx_read rx_body_135: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -2408,7 +2408,7 @@ rx_i_136: ;FPDIV mov eax, r8d test bl, 63 jnz short rx_body_136 - call rx_read_l1 + call rx_read rx_body_136: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -2427,11 +2427,11 @@ rx_i_137: ;SHR_64 jz rx_finish xor r11, 015a24231h mov eax, r11d + xor rbp, rax test bl, 63 jnz short rx_body_137 - call rx_read_l2 + call rx_read rx_body_137: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov rcx, r9 @@ -2445,7 +2445,7 @@ rx_i_138: ;RET mov eax, r13d test bl, 63 jnz short rx_body_138 - call rx_read_l1 + call rx_read rx_body_138: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2459,11 +2459,11 @@ rx_i_139: ;ADD_64 jz rx_finish xor r9, 093172470h mov eax, r9d + xor rbp, rax 
test bl, 63 jnz short rx_body_139 - call rx_read_l1 + call rx_read rx_body_139: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r8 @@ -2480,7 +2480,7 @@ rx_i_140: ;IMUL_32 mov eax, r14d test bl, 63 jnz short rx_body_140 - call rx_read_l1 + call rx_read rx_body_140: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2494,11 +2494,11 @@ rx_i_141: ;FPADD jz rx_finish xor r8, 02f636da1h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_141 - call rx_read_l1 + call rx_read rx_body_141: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm2 @@ -2515,7 +2515,7 @@ rx_i_142: ;JUMP mov eax, r11d test bl, 63 jnz short rx_body_142 - call rx_read_l1 + call rx_read rx_body_142: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2534,7 +2534,7 @@ rx_i_143: ;IMUL_32 mov eax, r15d test bl, 63 jnz short rx_body_143 - call rx_read_l1 + call rx_read rx_body_143: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2550,7 +2550,7 @@ rx_i_144: ;DIV_64 mov eax, r10d test bl, 63 jnz short rx_body_144 - call rx_read_l1 + call rx_read rx_body_144: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2569,7 +2569,7 @@ rx_i_145: ;DIV_64 mov eax, r13d test bl, 63 jnz short rx_body_145 - call rx_read_l1 + call rx_read rx_body_145: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2589,11 +2589,11 @@ rx_i_146: ;IMULH_64 jz rx_finish xor r13, 02327e6e2h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_146 - call rx_read_l1 + call rx_read rx_body_146: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, r12 @@ -2606,11 +2606,11 @@ rx_i_147: ;MUL_64 jz rx_finish xor r13, 03a7df043h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_147 - call rx_read_l1 + call rx_read rx_body_147: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r11 @@ -2627,7 +2627,7 @@ rx_i_148: ;SUB_64 mov eax, r10d test bl, 63 jnz short rx_body_148 - call rx_read_l1 + call rx_read rx_body_148: and eax, 2047 mov rax, qword ptr 
[rsi+rax*8] @@ -2645,7 +2645,7 @@ rx_i_149: ;MUL_32 mov eax, r12d test bl, 63 jnz short rx_body_149 - call rx_read_l1 + call rx_read rx_body_149: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2665,7 +2665,7 @@ rx_i_150: ;DIV_64 mov eax, r9d test bl, 63 jnz short rx_body_150 - call rx_read_l1 + call rx_read rx_body_150: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2688,7 +2688,7 @@ rx_i_151: ;AND_64 mov eax, r9d test bl, 63 jnz short rx_body_151 - call rx_read_l1 + call rx_read rx_body_151: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2706,7 +2706,7 @@ rx_i_152: ;SAR_64 mov eax, r13d test bl, 63 jnz short rx_body_152 - call rx_read_l1 + call rx_read rx_body_152: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2721,7 +2721,7 @@ rx_i_153: ;FPMUL mov eax, r15d test bl, 63 jnz short rx_body_153 - call rx_read_l1 + call rx_read rx_body_153: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -2742,7 +2742,7 @@ rx_i_154: ;MUL_32 mov eax, r10d test bl, 63 jnz short rx_body_154 - call rx_read_l1 + call rx_read rx_body_154: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2758,7 +2758,7 @@ rx_i_155: ;ROL_64 mov eax, r11d test bl, 63 jnz short rx_body_155 - call rx_read_l2 + call rx_read rx_body_155: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -2777,7 +2777,7 @@ rx_i_156: ;IMUL_32 mov eax, r10d test bl, 63 jnz short rx_body_156 - call rx_read_l1 + call rx_read rx_body_156: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2793,7 +2793,7 @@ rx_i_157: ;ADD_64 mov eax, r10d test bl, 63 jnz short rx_body_157 - call rx_read_l1 + call rx_read rx_body_157: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2807,7 +2807,7 @@ rx_i_158: ;ADD_64 mov eax, r15d test bl, 63 jnz short rx_body_158 - call rx_read_l1 + call rx_read rx_body_158: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2821,7 +2821,7 @@ rx_i_159: ;CALL mov eax, r13d test bl, 63 jnz short rx_body_159 - call rx_read_l2 + call rx_read rx_body_159: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -2839,11 
+2839,11 @@ rx_i_160: ;SUB_64 jz rx_finish xor r14, 0b1685b90h mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_160 - call rx_read_l1 + call rx_read rx_body_160: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub rax, r14 @@ -2860,7 +2860,7 @@ rx_i_161: ;IDIV_64 mov eax, r15d test bl, 63 jnz short rx_body_161 - call rx_read_l2 + call rx_read rx_body_161: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -2886,7 +2886,7 @@ rx_i_162: ;SHL_64 mov eax, r9d test bl, 63 jnz short rx_body_162 - call rx_read_l1 + call rx_read rx_body_162: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2898,11 +2898,11 @@ rx_i_163: ;SUB_64 jz rx_finish xor r12, 0e3486c0ah mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_163 - call rx_read_l1 + call rx_read rx_body_163: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub rax, r8 @@ -2917,11 +2917,11 @@ rx_i_164: ;MUL_32 jz rx_finish xor r12, 01f0c2737h mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_164 - call rx_read_l1 + call rx_read rx_body_164: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov ecx, eax @@ -2940,7 +2940,7 @@ rx_i_165: ;RET mov eax, r12d test bl, 63 jnz short rx_body_165 - call rx_read_l1 + call rx_read rx_body_165: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2960,7 +2960,7 @@ rx_i_166: ;SHR_64 mov eax, r9d test bl, 63 jnz short rx_body_166 - call rx_read_l1 + call rx_read rx_body_166: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -2978,7 +2978,7 @@ rx_i_167: ;FPMUL mov eax, r11d test bl, 63 jnz short rx_body_167 - call rx_read_l1 + call rx_read rx_body_167: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -2999,7 +2999,7 @@ rx_i_168: ;FPDIV mov eax, r12d test bl, 63 jnz short rx_body_168 - call rx_read_l1 + call rx_read rx_body_168: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -3016,7 +3016,7 @@ rx_i_169: ;CALL mov eax, r11d test bl, 63 jnz short rx_body_169 - call rx_read_l1 + call rx_read rx_body_169: and eax, 2047 mov rax, qword 
ptr [rsi+rax*8] @@ -3036,7 +3036,7 @@ rx_i_170: ;FPSQRT mov eax, r8d test bl, 63 jnz short rx_body_170 - call rx_read_l1 + call rx_read rx_body_170: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -3054,7 +3054,7 @@ rx_i_171: ;DIV_64 mov eax, r15d test bl, 63 jnz short rx_body_171 - call rx_read_l1 + call rx_read rx_body_171: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3074,7 +3074,7 @@ rx_i_172: ;SUB_64 mov eax, r13d test bl, 63 jnz short rx_body_172 - call rx_read_l1 + call rx_read rx_body_172: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3086,11 +3086,11 @@ rx_i_173: ;MUL_64 jz rx_finish xor r14, 05422cf8fh mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_173 - call rx_read_l1 + call rx_read rx_body_173: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, rax, -1386172772 @@ -3107,7 +3107,7 @@ rx_i_174: ;FPDIV mov eax, r12d test bl, 63 jnz short rx_body_174 - call rx_read_l1 + call rx_read rx_body_174: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -3128,7 +3128,7 @@ rx_i_175: ;XOR_32 mov eax, r13d test bl, 63 jnz short rx_body_175 - call rx_read_l1 + call rx_read rx_body_175: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3142,7 +3142,7 @@ rx_i_176: ;SUB_64 mov eax, r9d test bl, 63 jnz short rx_body_176 - call rx_read_l1 + call rx_read rx_body_176: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3156,7 +3156,7 @@ rx_i_177: ;ADD_64 mov eax, r10d test bl, 63 jnz short rx_body_177 - call rx_read_l1 + call rx_read rx_body_177: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3174,7 +3174,7 @@ rx_i_178: ;RET mov eax, r15d test bl, 63 jnz short rx_body_178 - call rx_read_l2 + call rx_read rx_body_178: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -3194,7 +3194,7 @@ rx_i_179: ;FPADD mov eax, r12d test bl, 63 jnz short rx_body_179 - call rx_read_l2 + call rx_read rx_body_179: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -3208,7 +3208,7 @@ rx_i_180: ;AND_32 mov eax, r15d test bl, 63 jnz short rx_body_180 
- call rx_read_l1 + call rx_read rx_body_180: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3224,11 +3224,11 @@ rx_i_181: ;CALL jz rx_finish xor r10, 023c7845fh mov eax, r10d + xor rbp, rax test bl, 63 jnz short rx_body_181 - call rx_read_l1 + call rx_read rx_body_181: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r10, rax @@ -3241,11 +3241,11 @@ rx_i_182: ;FPSUB jz rx_finish xor r8, 0f8884327h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_182 - call rx_read_l1 + call rx_read rx_body_182: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 @@ -3258,7 +3258,7 @@ rx_i_183: ;ADD_64 mov eax, r13d test bl, 63 jnz short rx_body_183 - call rx_read_l1 + call rx_read rx_body_183: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3270,11 +3270,11 @@ rx_i_184: ;XOR_32 jz rx_finish xor r12, 04764cdf7h mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_184 - call rx_read_l1 + call rx_read rx_body_184: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] xor eax, r13d @@ -3287,7 +3287,7 @@ rx_i_185: ;JUMP mov eax, r10d test bl, 63 jnz short rx_body_185 - call rx_read_l2 + call rx_read rx_body_185: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -3304,11 +3304,11 @@ rx_i_186: ;OR_64 jz rx_finish xor r9, 0cded414bh mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_186 - call rx_read_l1 + call rx_read rx_body_186: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] or rax, -1252263008 @@ -3325,7 +3325,7 @@ rx_i_187: ;FPMUL mov eax, r13d test bl, 63 jnz short rx_body_187 - call rx_read_l1 + call rx_read rx_body_187: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -3340,11 +3340,11 @@ rx_i_188: ;FPSUB jz rx_finish xor r9, 04659becbh mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_188 - call rx_read_l2 + call rx_read rx_body_188: - xor rbp, rax and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm3 @@ -3357,7 +3357,7 @@ rx_i_189: ;FPDIV mov eax, r11d test bl, 63 
jnz short rx_body_189 - call rx_read_l1 + call rx_read rx_body_189: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -3374,7 +3374,7 @@ rx_i_190: ;RET mov eax, r12d test bl, 63 jnz short rx_body_190 - call rx_read_l1 + call rx_read rx_body_190: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3390,7 +3390,7 @@ rx_i_191: ;FPSQRT mov eax, r15d test bl, 63 jnz short rx_body_191 - call rx_read_l2 + call rx_read rx_body_191: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -3404,7 +3404,7 @@ rx_i_192: ;FPSQRT mov eax, r8d test bl, 63 jnz short rx_body_192 - call rx_read_l1 + call rx_read rx_body_192: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -3422,7 +3422,7 @@ rx_i_193: ;MUL_32 mov eax, r12d test bl, 63 jnz short rx_body_193 - call rx_read_l1 + call rx_read rx_body_193: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3442,7 +3442,7 @@ rx_i_194: ;FPMUL mov eax, r12d test bl, 63 jnz short rx_body_194 - call rx_read_l1 + call rx_read rx_body_194: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -3463,7 +3463,7 @@ rx_i_195: ;SHL_64 mov eax, r10d test bl, 63 jnz short rx_body_195 - call rx_read_l1 + call rx_read rx_body_195: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3477,7 +3477,7 @@ rx_i_196: ;SUB_64 mov eax, r8d test bl, 63 jnz short rx_body_196 - call rx_read_l1 + call rx_read rx_body_196: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3493,11 +3493,11 @@ rx_i_197: ;MUL_64 jz rx_finish xor r12, 0229208efh mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_197 - call rx_read_l2 + call rx_read rx_body_197: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] imul rax, r15 @@ -3510,7 +3510,7 @@ rx_i_198: ;MULH_64 mov eax, r14d test bl, 63 jnz short rx_body_198 - call rx_read_l2 + call rx_read rx_body_198: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -3530,7 +3530,7 @@ rx_i_199: ;MULH_64 mov eax, r13d test bl, 63 jnz short rx_body_199 - call rx_read_l1 + call rx_read rx_body_199: and eax, 2047 mov rax, qword ptr 
[rsi+rax*8] @@ -3550,7 +3550,7 @@ rx_i_200: ;FPSUB mov eax, r10d test bl, 63 jnz short rx_body_200 - call rx_read_l1 + call rx_read rx_body_200: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -3566,11 +3566,11 @@ rx_i_201: ;FPADD jz rx_finish xor r8, 0cdda801dh mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_201 - call rx_read_l1 + call rx_read rx_body_201: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 @@ -3587,7 +3587,7 @@ rx_i_202: ;FPADD mov eax, r13d test bl, 63 jnz short rx_body_202 - call rx_read_l1 + call rx_read rx_body_202: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -3601,7 +3601,7 @@ rx_i_203: ;FPSUB mov eax, r10d test bl, 63 jnz short rx_body_203 - call rx_read_l2 + call rx_read rx_body_203: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -3619,7 +3619,7 @@ rx_i_204: ;MUL_64 mov eax, r9d test bl, 63 jnz short rx_body_204 - call rx_read_l2 + call rx_read rx_body_204: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -3635,11 +3635,11 @@ rx_i_205: ;FPMUL jz rx_finish xor r14, 094e997c5h mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_205 - call rx_read_l1 + call rx_read rx_body_205: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm8 @@ -3653,11 +3653,11 @@ rx_i_206: ;FPSUB jz rx_finish xor r11, 0e836a177h mov eax, r11d + xor rbp, rax test bl, 63 jnz short rx_body_206 - call rx_read_l1 + call rx_read rx_body_206: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 @@ -3668,11 +3668,11 @@ rx_i_207: ;IDIV_64 jz rx_finish xor r9, 039ccdd30h mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_207 - call rx_read_l1 + call rx_read rx_body_207: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ; magic divide by 314297476 @@ -3696,7 +3696,7 @@ rx_i_208: ;MUL_64 mov eax, r9d test bl, 63 jnz short rx_body_208 - call rx_read_l1 + call rx_read rx_body_208: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ 
-3708,11 +3708,11 @@ rx_i_209: ;XOR_64 jz rx_finish xor r8, 0b84811f1h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_209 - call rx_read_l1 + call rx_read rx_body_209: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] xor rax, r15 @@ -3727,11 +3727,11 @@ rx_i_210: ;MUL_32 jz rx_finish xor r12, 0c5efc90ah mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_210 - call rx_read_l1 + call rx_read rx_body_210: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov ecx, eax @@ -3750,7 +3750,7 @@ rx_i_211: ;ROR_64 mov eax, r12d test bl, 63 jnz short rx_body_211 - call rx_read_l1 + call rx_read rx_body_211: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3767,11 +3767,11 @@ rx_i_212: ;MUL_64 jz rx_finish xor r13, 06b465fdbh mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_212 - call rx_read_l1 + call rx_read rx_body_212: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r13 @@ -3786,11 +3786,11 @@ rx_i_213: ;IMUL_32 jz rx_finish xor r13, 02dd1d503h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_213 - call rx_read_l1 + call rx_read rx_body_213: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax @@ -3805,7 +3805,7 @@ rx_i_214: ;SHL_64 mov eax, r9d test bl, 63 jnz short rx_body_214 - call rx_read_l1 + call rx_read rx_body_214: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3818,11 +3818,11 @@ rx_i_215: ;ADD_32 jz rx_finish xor r15, 08359265eh mov eax, r15d + xor rbp, rax test bl, 63 jnz short rx_body_215 - call rx_read_l1 + call rx_read rx_body_215: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] add eax, r12d @@ -3835,7 +3835,7 @@ rx_i_216: ;MUL_64 mov eax, r12d test bl, 63 jnz short rx_body_216 - call rx_read_l2 + call rx_read rx_body_216: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -3853,7 +3853,7 @@ rx_i_217: ;IMUL_32 mov eax, r8d test bl, 63 jnz short rx_body_217 - call rx_read_l1 + call rx_read rx_body_217: and eax, 2047 mov rax, qword ptr 
[rsi+rax*8] @@ -3873,7 +3873,7 @@ rx_i_218: ;FPSQRT mov eax, r11d test bl, 63 jnz short rx_body_218 - call rx_read_l1 + call rx_read rx_body_218: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -3891,7 +3891,7 @@ rx_i_219: ;OR_64 mov eax, r8d test bl, 63 jnz short rx_body_219 - call rx_read_l1 + call rx_read rx_body_219: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3909,7 +3909,7 @@ rx_i_220: ;IMUL_32 mov eax, r9d test bl, 63 jnz short rx_body_220 - call rx_read_l1 + call rx_read rx_body_220: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3929,7 +3929,7 @@ rx_i_221: ;DIV_64 mov eax, r9d test bl, 63 jnz short rx_body_221 - call rx_read_l1 + call rx_read rx_body_221: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -3952,7 +3952,7 @@ rx_i_222: ;FPMUL mov eax, r9d test bl, 63 jnz short rx_body_222 - call rx_read_l1 + call rx_read rx_body_222: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -3971,11 +3971,11 @@ rx_i_223: ;FPSUB jz rx_finish xor r8, 01e5cc085h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_223 - call rx_read_l1 + call rx_read rx_body_223: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm3 @@ -3992,7 +3992,7 @@ rx_i_224: ;XOR_32 mov eax, r12d test bl, 63 jnz short rx_body_224 - call rx_read_l2 + call rx_read rx_body_224: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -4010,7 +4010,7 @@ rx_i_225: ;DIV_64 mov eax, r13d test bl, 63 jnz short rx_body_225 - call rx_read_l1 + call rx_read rx_body_225: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4033,7 +4033,7 @@ rx_i_226: ;JUMP mov eax, r10d test bl, 63 jnz short rx_body_226 - call rx_read_l1 + call rx_read rx_body_226: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4052,7 +4052,7 @@ rx_i_227: ;FPMUL mov eax, r11d test bl, 63 jnz short rx_body_227 - call rx_read_l2 + call rx_read rx_body_227: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4073,7 +4073,7 @@ rx_i_228: ;FPSQRT mov eax, r11d test bl, 63 jnz short rx_body_228 - call 
rx_read_l1 + call rx_read rx_body_228: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4087,7 +4087,7 @@ rx_i_229: ;IMULH_64 mov eax, r11d test bl, 63 jnz short rx_body_229 - call rx_read_l1 + call rx_read rx_body_229: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4107,7 +4107,7 @@ rx_i_230: ;FPMUL mov eax, r15d test bl, 63 jnz short rx_body_230 - call rx_read_l1 + call rx_read rx_body_230: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4126,11 +4126,11 @@ rx_i_231: ;RET jz rx_finish xor r9, 0bb56428dh mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_231 - call rx_read_l1 + call rx_read rx_body_231: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -4149,7 +4149,7 @@ rx_i_232: ;FPMUL mov eax, r15d test bl, 63 jnz short rx_body_232 - call rx_read_l1 + call rx_read rx_body_232: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4164,11 +4164,11 @@ rx_i_233: ;JUMP jz rx_finish xor r13, 08eb2cd76h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_233 - call rx_read_l1 + call rx_read rx_body_233: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r14, rax @@ -4182,7 +4182,7 @@ rx_i_234: ;FPDIV mov eax, r15d test bl, 63 jnz short rx_body_234 - call rx_read_l1 + call rx_read rx_body_234: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4199,7 +4199,7 @@ rx_i_235: ;IMUL_32 mov eax, r13d test bl, 63 jnz short rx_body_235 - call rx_read_l1 + call rx_read rx_body_235: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4219,7 +4219,7 @@ rx_i_236: ;FPADD mov eax, r15d test bl, 63 jnz short rx_body_236 - call rx_read_l1 + call rx_read rx_body_236: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4233,7 +4233,7 @@ rx_i_237: ;JUMP mov eax, r15d test bl, 63 jnz short rx_body_237 - call rx_read_l1 + call rx_read rx_body_237: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4246,11 +4246,11 @@ rx_i_238: ;FPADD jz rx_finish xor r8, 0158f119fh mov eax, r8d + xor rbp, rax test bl, 63 jnz short 
rx_body_238 - call rx_read_l1 + call rx_read rx_body_238: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 @@ -4267,7 +4267,7 @@ rx_i_239: ;ADD_64 mov eax, r13d test bl, 63 jnz short rx_body_239 - call rx_read_l1 + call rx_read rx_body_239: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4279,11 +4279,11 @@ rx_i_240: ;IMULH_64 jz rx_finish xor r9, 0d65d29f9h mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_240 - call rx_read_l2 + call rx_read rx_body_240: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov rcx, r14 @@ -4298,7 +4298,7 @@ rx_i_241: ;FPADD mov eax, r11d test bl, 63 jnz short rx_body_241 - call rx_read_l1 + call rx_read rx_body_241: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4316,7 +4316,7 @@ rx_i_242: ;MUL_32 mov eax, r12d test bl, 63 jnz short rx_body_242 - call rx_read_l1 + call rx_read rx_body_242: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4336,7 +4336,7 @@ rx_i_243: ;OR_64 mov eax, r12d test bl, 63 jnz short rx_body_243 - call rx_read_l1 + call rx_read rx_body_243: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4350,7 +4350,7 @@ rx_i_244: ;ROR_64 mov eax, r11d test bl, 63 jnz short rx_body_244 - call rx_read_l1 + call rx_read rx_body_244: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4367,11 +4367,11 @@ rx_i_245: ;AND_32 jz rx_finish xor r13, 084505739h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_245 - call rx_read_l2 + call rx_read rx_body_245: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] and eax, r10d @@ -4386,11 +4386,11 @@ rx_i_246: ;IDIV_64 jz rx_finish xor r15, 027eeaa2eh mov eax, r15d + xor rbp, rax test bl, 63 jnz short rx_body_246 - call rx_read_l1 + call rx_read rx_body_246: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ; magic divide by -156808488 @@ -4410,7 +4410,7 @@ rx_i_247: ;IMUL_32 mov eax, r10d test bl, 63 jnz short rx_body_247 - call rx_read_l1 + call rx_read rx_body_247: and eax, 2047 mov rax, qword ptr 
[rsi+rax*8] @@ -4428,11 +4428,11 @@ rx_i_248: ;MUL_32 jz rx_finish xor r8, 0649df46fh mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_248 - call rx_read_l1 + call rx_read rx_body_248: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov ecx, eax @@ -4449,11 +4449,11 @@ rx_i_249: ;IMULH_64 jz rx_finish xor r15, 0499552cch mov eax, r15d + xor rbp, rax test bl, 63 jnz short rx_body_249 - call rx_read_l1 + call rx_read rx_body_249: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, -508571655 @@ -4472,7 +4472,7 @@ rx_i_250: ;MUL_64 mov eax, r13d test bl, 63 jnz short rx_body_250 - call rx_read_l1 + call rx_read rx_body_250: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4490,7 +4490,7 @@ rx_i_251: ;FPMUL mov eax, r13d test bl, 63 jnz short rx_body_251 - call rx_read_l1 + call rx_read rx_body_251: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4511,7 +4511,7 @@ rx_i_252: ;SHL_64 mov eax, r14d test bl, 63 jnz short rx_body_252 - call rx_read_l1 + call rx_read rx_body_252: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4525,7 +4525,7 @@ rx_i_253: ;CALL mov eax, r14d test bl, 63 jnz short rx_body_253 - call rx_read_l1 + call rx_read rx_body_253: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4545,7 +4545,7 @@ rx_i_254: ;FPADD mov eax, r14d test bl, 63 jnz short rx_body_254 - call rx_read_l1 + call rx_read rx_body_254: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4563,7 +4563,7 @@ rx_i_255: ;FPADD mov eax, r9d test bl, 63 jnz short rx_body_255 - call rx_read_l2 + call rx_read rx_body_255: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4579,11 +4579,11 @@ rx_i_256: ;MULH_64 jz rx_finish xor r8, 08375472ch mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_256 - call rx_read_l1 + call rx_read rx_body_256: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, r15 @@ -4602,7 +4602,7 @@ rx_i_257: ;FPADD mov eax, r12d test bl, 63 jnz short rx_body_257 - call rx_read_l1 + call rx_read 
rx_body_257: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4618,11 +4618,11 @@ rx_i_258: ;MUL_32 jz rx_finish xor r11, 064fdbda0h mov eax, r11d + xor rbp, rax test bl, 63 jnz short rx_body_258 - call rx_read_l2 + call rx_read rx_body_258: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov ecx, eax @@ -4641,7 +4641,7 @@ rx_i_259: ;FPADD mov eax, r11d test bl, 63 jnz short rx_body_259 - call rx_read_l1 + call rx_read rx_body_259: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4653,11 +4653,11 @@ rx_i_260: ;FPSUB jz rx_finish xor r13, 0f94e9fa9h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_260 - call rx_read_l1 + call rx_read rx_body_260: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm5 @@ -4668,11 +4668,11 @@ rx_i_261: ;FPDIV jz rx_finish xor r14, 02346171ch mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_261 - call rx_read_l1 + call rx_read rx_body_261: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm3 @@ -4690,11 +4690,11 @@ rx_i_262: ;AND_64 jz rx_finish xor r10, 01c42baa6h mov eax, r10d + xor rbp, rax test bl, 63 jnz short rx_body_262 - call rx_read_l1 + call rx_read rx_body_262: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] and rax, -1569587450 @@ -4709,11 +4709,11 @@ rx_i_263: ;FPMUL jz rx_finish xor r11, 0b39b140h mov eax, r11d + xor rbp, rax test bl, 63 jnz short rx_body_263 - call rx_read_l2 + call rx_read rx_body_263: - xor rbp, rax and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm8 @@ -4729,7 +4729,7 @@ rx_i_264: ;FPMUL mov eax, r11d test bl, 63 jnz short rx_body_264 - call rx_read_l1 + call rx_read rx_body_264: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4744,11 +4744,11 @@ rx_i_265: ;FPADD jz rx_finish xor r13, 07a3eb340h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_265 - call rx_read_l1 + call rx_read rx_body_265: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr 
[rsi+rax*8] addpd xmm0, xmm8 @@ -4765,7 +4765,7 @@ rx_i_266: ;CALL mov eax, r13d test bl, 63 jnz short rx_body_266 - call rx_read_l2 + call rx_read rx_body_266: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -4781,7 +4781,7 @@ rx_i_267: ;ROL_64 mov eax, r8d test bl, 63 jnz short rx_body_267 - call rx_read_l1 + call rx_read rx_body_267: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4794,11 +4794,11 @@ rx_i_268: ;JUMP jz rx_finish xor r12, 0c2510cebh mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_268 - call rx_read_l1 + call rx_read rx_body_268: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r13, rax @@ -4812,7 +4812,7 @@ rx_i_269: ;ROL_64 mov eax, r11d test bl, 63 jnz short rx_body_269 - call rx_read_l1 + call rx_read rx_body_269: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4830,7 +4830,7 @@ rx_i_270: ;FPMUL mov eax, r11d test bl, 63 jnz short rx_body_270 - call rx_read_l1 + call rx_read rx_body_270: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4847,7 +4847,7 @@ rx_i_271: ;MUL_32 mov eax, r13d test bl, 63 jnz short rx_body_271 - call rx_read_l1 + call rx_read rx_body_271: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4867,7 +4867,7 @@ rx_i_272: ;AND_64 mov eax, r12d test bl, 63 jnz short rx_body_272 - call rx_read_l1 + call rx_read rx_body_272: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4879,11 +4879,11 @@ rx_i_273: ;JUMP jz rx_finish xor r9, 0d315e4dch mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_273 - call rx_read_l1 + call rx_read rx_body_273: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -4901,7 +4901,7 @@ rx_i_274: ;FPADD mov eax, r15d test bl, 63 jnz short rx_body_274 - call rx_read_l1 + call rx_read rx_body_274: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4917,11 +4917,11 @@ rx_i_275: ;IDIV_64 jz rx_finish xor r10, 0788eceb7h mov eax, r10d + xor rbp, rax test bl, 63 jnz short rx_body_275 - call rx_read_l1 + call rx_read rx_body_275: - xor rbp, rax 
and eax, 2047 mov rax, qword ptr [rsi+rax*8] ; magic divide by -333089764 @@ -4939,11 +4939,11 @@ rx_i_276: ;JUMP jz rx_finish xor r9, 0c6ac5edah mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_276 - call rx_read_l2 + call rx_read rx_body_276: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -4961,7 +4961,7 @@ rx_i_277: ;IMUL_32 mov eax, r11d test bl, 63 jnz short rx_body_277 - call rx_read_l1 + call rx_read rx_body_277: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -4981,7 +4981,7 @@ rx_i_278: ;FPSUB mov eax, r9d test bl, 63 jnz short rx_body_278 - call rx_read_l1 + call rx_read rx_body_278: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -4999,7 +4999,7 @@ rx_i_279: ;FPADD mov eax, r15d test bl, 63 jnz short rx_body_279 - call rx_read_l2 + call rx_read rx_body_279: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -5017,7 +5017,7 @@ rx_i_280: ;IDIV_64 mov eax, r12d test bl, 63 jnz short rx_body_280 - call rx_read_l1 + call rx_read rx_body_280: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5042,7 +5042,7 @@ rx_i_281: ;SUB_64 mov eax, r10d test bl, 63 jnz short rx_body_281 - call rx_read_l1 + call rx_read rx_body_281: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5060,7 +5060,7 @@ rx_i_282: ;SUB_64 mov eax, r15d test bl, 63 jnz short rx_body_282 - call rx_read_l1 + call rx_read rx_body_282: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5072,11 +5072,11 @@ rx_i_283: ;ADD_64 jz rx_finish xor r9, 0df4d084fh mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_283 - call rx_read_l2 + call rx_read rx_body_283: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] add rax, r12 @@ -5091,11 +5091,11 @@ rx_i_284: ;FPADD jz rx_finish xor r15, 0e68f36ach mov eax, r15d + xor rbp, rax test bl, 63 jnz short rx_body_284 - call rx_read_l1 + call rx_read rx_body_284: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 @@ -5112,7 +5112,7 @@ rx_i_285: ;IMUL_32 mov eax, r8d test bl, 
63 jnz short rx_body_285 - call rx_read_l1 + call rx_read rx_body_285: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5128,7 +5128,7 @@ rx_i_286: ;ROL_64 mov eax, r14d test bl, 63 jnz short rx_body_286 - call rx_read_l1 + call rx_read rx_body_286: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5145,11 +5145,11 @@ rx_i_287: ;IDIV_64 jz rx_finish xor r11, 049547c9ch mov eax, r11d + xor rbp, rax test bl, 63 jnz short rx_body_287 - call rx_read_l1 + call rx_read rx_body_287: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ; magic divide by 1227278330 @@ -5173,7 +5173,7 @@ rx_i_288: ;MUL_64 mov eax, r10d test bl, 63 jnz short rx_body_288 - call rx_read_l1 + call rx_read rx_body_288: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5191,7 +5191,7 @@ rx_i_289: ;FPMUL mov eax, r14d test bl, 63 jnz short rx_body_289 - call rx_read_l1 + call rx_read rx_body_289: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -5208,7 +5208,7 @@ rx_i_290: ;FPSUB mov eax, r15d test bl, 63 jnz short rx_body_290 - call rx_read_l1 + call rx_read rx_body_290: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -5222,7 +5222,7 @@ rx_i_291: ;RET mov eax, r13d test bl, 63 jnz short rx_body_291 - call rx_read_l1 + call rx_read rx_body_291: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5242,7 +5242,7 @@ rx_i_292: ;ROL_64 mov eax, r13d test bl, 63 jnz short rx_body_292 - call rx_read_l2 + call rx_read rx_body_292: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -5257,7 +5257,7 @@ rx_i_293: ;FPADD mov eax, r9d test bl, 63 jnz short rx_body_293 - call rx_read_l1 + call rx_read rx_body_293: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -5271,7 +5271,7 @@ rx_i_294: ;RET mov eax, r14d test bl, 63 jnz short rx_body_294 - call rx_read_l1 + call rx_read rx_body_294: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5289,11 +5289,11 @@ rx_i_295: ;FPSUB jz rx_finish xor r9, 0f42798fdh mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_295 - call rx_read_l2 + call rx_read 
rx_body_295: - xor rbp, rax and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm8 @@ -5306,7 +5306,7 @@ rx_i_296: ;FPSQRT mov eax, r14d test bl, 63 jnz short rx_body_296 - call rx_read_l2 + call rx_read rx_body_296: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -5320,7 +5320,7 @@ rx_i_297: ;ADD_64 mov eax, r15d test bl, 63 jnz short rx_body_297 - call rx_read_l1 + call rx_read rx_body_297: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5334,7 +5334,7 @@ rx_i_298: ;FPSUB mov eax, r14d test bl, 63 jnz short rx_body_298 - call rx_read_l2 + call rx_read rx_body_298: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -5346,11 +5346,11 @@ rx_i_299: ;ADD_64 jz rx_finish xor r12, 042f4897h mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_299 - call rx_read_l1 + call rx_read rx_body_299: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r10 @@ -5365,11 +5365,11 @@ rx_i_300: ;FPSUB jz rx_finish xor r12, 095765693h mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_300 - call rx_read_l1 + call rx_read rx_body_300: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm8 @@ -5382,7 +5382,7 @@ rx_i_301: ;FPMUL mov eax, r8d test bl, 63 jnz short rx_body_301 - call rx_read_l1 + call rx_read rx_body_301: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -5401,11 +5401,11 @@ rx_i_302: ;ADD_64 jz rx_finish xor r15, 0f6f8c345h mov eax, r15d + xor rbp, rax test bl, 63 jnz short rx_body_302 - call rx_read_l1 + call rx_read rx_body_302: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r10 @@ -5418,7 +5418,7 @@ rx_i_303: ;FPADD mov eax, r14d test bl, 63 jnz short rx_body_303 - call rx_read_l2 + call rx_read rx_body_303: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -5436,7 +5436,7 @@ rx_i_304: ;MUL_64 mov eax, r12d test bl, 63 jnz short rx_body_304 - call rx_read_l1 + call rx_read rx_body_304: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5450,7 +5450,7 @@ 
rx_i_305: ;MUL_64 mov eax, r11d test bl, 63 jnz short rx_body_305 - call rx_read_l1 + call rx_read rx_body_305: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5464,7 +5464,7 @@ rx_i_306: ;ADD_64 mov eax, r15d test bl, 63 jnz short rx_body_306 - call rx_read_l1 + call rx_read rx_body_306: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5478,7 +5478,7 @@ rx_i_307: ;SHL_64 mov eax, r15d test bl, 63 jnz short rx_body_307 - call rx_read_l1 + call rx_read rx_body_307: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5492,7 +5492,7 @@ rx_i_308: ;MUL_64 mov eax, r11d test bl, 63 jnz short rx_body_308 - call rx_read_l1 + call rx_read rx_body_308: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5506,7 +5506,7 @@ rx_i_309: ;DIV_64 mov eax, r9d test bl, 63 jnz short rx_body_309 - call rx_read_l2 + call rx_read rx_body_309: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -5528,7 +5528,7 @@ rx_i_310: ;FPMUL mov eax, r9d test bl, 63 jnz short rx_body_310 - call rx_read_l1 + call rx_read rx_body_310: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -5549,7 +5549,7 @@ rx_i_311: ;FPMUL mov eax, r8d test bl, 63 jnz short rx_body_311 - call rx_read_l1 + call rx_read rx_body_311: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -5566,7 +5566,7 @@ rx_i_312: ;MUL_32 mov eax, r13d test bl, 63 jnz short rx_body_312 - call rx_read_l2 + call rx_read rx_body_312: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -5582,7 +5582,7 @@ rx_i_313: ;ROR_64 mov eax, r8d test bl, 63 jnz short rx_body_313 - call rx_read_l1 + call rx_read rx_body_313: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5600,7 +5600,7 @@ rx_i_314: ;IMUL_32 mov eax, r15d test bl, 63 jnz short rx_body_314 - call rx_read_l2 + call rx_read rx_body_314: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -5620,7 +5620,7 @@ rx_i_315: ;XOR_64 mov eax, r9d test bl, 63 jnz short rx_body_315 - call rx_read_l2 + call rx_read rx_body_315: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -5632,11 +5632,11 @@ rx_i_316: ;RET jz 
rx_finish xor r14, 05b0cb5bbh mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_316 - call rx_read_l1 + call rx_read rx_body_316: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -5655,7 +5655,7 @@ rx_i_317: ;FPADD mov eax, r9d test bl, 63 jnz short rx_body_317 - call rx_read_l1 + call rx_read rx_body_317: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -5669,7 +5669,7 @@ rx_i_318: ;ROR_64 mov eax, r9d test bl, 63 jnz short rx_body_318 - call rx_read_l2 + call rx_read rx_body_318: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -5688,7 +5688,7 @@ rx_i_319: ;SHR_64 mov eax, r13d test bl, 63 jnz short rx_body_319 - call rx_read_l1 + call rx_read rx_body_319: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5706,7 +5706,7 @@ rx_i_320: ;FPADD mov eax, r15d test bl, 63 jnz short rx_body_320 - call rx_read_l1 + call rx_read rx_body_320: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -5724,7 +5724,7 @@ rx_i_321: ;IMUL_32 mov eax, r11d test bl, 63 jnz short rx_body_321 - call rx_read_l2 + call rx_read rx_body_321: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -5744,7 +5744,7 @@ rx_i_322: ;CALL mov eax, r14d test bl, 63 jnz short rx_body_322 - call rx_read_l1 + call rx_read rx_body_322: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5762,11 +5762,11 @@ rx_i_323: ;MULH_64 jz rx_finish xor r14, 07b07664bh mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_323 - call rx_read_l2 + call rx_read rx_body_323: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov rcx, r14 @@ -5785,7 +5785,7 @@ rx_i_324: ;FPDIV mov eax, r9d test bl, 63 jnz short rx_body_324 - call rx_read_l1 + call rx_read rx_body_324: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -5806,7 +5806,7 @@ rx_i_325: ;OR_32 mov eax, r11d test bl, 63 jnz short rx_body_325 - call rx_read_l1 + call rx_read rx_body_325: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5818,11 +5818,11 @@ rx_i_326: ;MULH_64 jz rx_finish xor r11, 0d1b27540h mov eax, 
r11d + xor rbp, rax test bl, 63 jnz short rx_body_326 - call rx_read_l1 + call rx_read rx_body_326: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, -1233771581 @@ -5839,11 +5839,11 @@ rx_i_327: ;IDIV_64 jz rx_finish xor r9, 09665f98dh mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_327 - call rx_read_l1 + call rx_read rx_body_327: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ; magic divide by 1572662125 @@ -5865,7 +5865,7 @@ rx_i_328: ;SHR_64 mov eax, r12d test bl, 63 jnz short rx_body_328 - call rx_read_l1 + call rx_read rx_body_328: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -5879,7 +5879,7 @@ rx_i_329: ;RET mov eax, r11d test bl, 63 jnz short rx_body_329 - call rx_read_l2 + call rx_read rx_body_329: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -5893,11 +5893,11 @@ rx_i_330: ;IMUL_32 jz rx_finish xor r9, 0f6a93f19h mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_330 - call rx_read_l2 + call rx_read rx_body_330: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax @@ -5914,11 +5914,11 @@ rx_i_331: ;FPADD jz rx_finish xor r9, 0bc9bbe4ah mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_331 - call rx_read_l2 + call rx_read rx_body_331: - xor rbp, rax and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 @@ -5929,11 +5929,11 @@ rx_i_332: ;FPADD jz rx_finish xor r12, 0f253cd4eh mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_332 - call rx_read_l1 + call rx_read rx_body_332: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 @@ -5948,11 +5948,11 @@ rx_i_333: ;OR_64 jz rx_finish xor r14, 0f009758bh mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_333 - call rx_read_l1 + call rx_read rx_body_333: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] or rax, r12 @@ -5963,11 +5963,11 @@ rx_i_334: ;ADD_64 jz rx_finish xor r8, 0dda04168h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_334 - call 
rx_read_l2 + call rx_read rx_body_334: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] add rax, r13 @@ -5980,7 +5980,7 @@ rx_i_335: ;SUB_64 mov eax, r15d test bl, 63 jnz short rx_body_335 - call rx_read_l2 + call rx_read rx_body_335: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -5996,11 +5996,11 @@ rx_i_336: ;ROR_64 jz rx_finish xor r15, 0aea0a435h mov eax, r15d + xor rbp, rax test bl, 63 jnz short rx_body_336 - call rx_read_l1 + call rx_read rx_body_336: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ror rax, 42 @@ -6017,7 +6017,7 @@ rx_i_337: ;ADD_64 mov eax, r8d test bl, 63 jnz short rx_body_337 - call rx_read_l1 + call rx_read rx_body_337: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6033,11 +6033,11 @@ rx_i_338: ;MUL_64 jz rx_finish xor r12, 0d428a742h mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_338 - call rx_read_l1 + call rx_read rx_body_338: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r12 @@ -6050,7 +6050,7 @@ rx_i_339: ;FPADD mov eax, r9d test bl, 63 jnz short rx_body_339 - call rx_read_l1 + call rx_read rx_body_339: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6064,7 +6064,7 @@ rx_i_340: ;FPADD mov eax, r15d test bl, 63 jnz short rx_body_340 - call rx_read_l2 + call rx_read rx_body_340: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6078,7 +6078,7 @@ rx_i_341: ;MUL_32 mov eax, r12d test bl, 63 jnz short rx_body_341 - call rx_read_l1 + call rx_read rx_body_341: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6098,7 +6098,7 @@ rx_i_342: ;FPSUB mov eax, r9d test bl, 63 jnz short rx_body_342 - call rx_read_l1 + call rx_read rx_body_342: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6110,11 +6110,11 @@ rx_i_343: ;XOR_64 jz rx_finish xor r14, 056f6cf0bh mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_343 - call rx_read_l1 + call rx_read rx_body_343: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] xor rax, r13 @@ -6131,7 +6131,7 @@ 
rx_i_344: ;FPSUB mov eax, r10d test bl, 63 jnz short rx_body_344 - call rx_read_l1 + call rx_read rx_body_344: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6145,7 +6145,7 @@ rx_i_345: ;MULH_64 mov eax, r12d test bl, 63 jnz short rx_body_345 - call rx_read_l2 + call rx_read rx_body_345: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -6165,7 +6165,7 @@ rx_i_346: ;AND_32 mov eax, r12d test bl, 63 jnz short rx_body_346 - call rx_read_l2 + call rx_read rx_body_346: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -6183,7 +6183,7 @@ rx_i_347: ;ADD_64 mov eax, r14d test bl, 63 jnz short rx_body_347 - call rx_read_l1 + call rx_read rx_body_347: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6197,7 +6197,7 @@ rx_i_348: ;FPSUB mov eax, r13d test bl, 63 jnz short rx_body_348 - call rx_read_l1 + call rx_read rx_body_348: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6215,7 +6215,7 @@ rx_i_349: ;OR_64 mov eax, r8d test bl, 63 jnz short rx_body_349 - call rx_read_l1 + call rx_read rx_body_349: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6229,7 +6229,7 @@ rx_i_350: ;CALL mov eax, r9d test bl, 63 jnz short rx_body_350 - call rx_read_l2 + call rx_read rx_body_350: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -6249,7 +6249,7 @@ rx_i_351: ;MUL_64 mov eax, r11d test bl, 63 jnz short rx_body_351 - call rx_read_l1 + call rx_read rx_body_351: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6261,11 +6261,11 @@ rx_i_352: ;FPADD jz rx_finish xor r10, 0afc9af2bh mov eax, r10d + xor rbp, rax test bl, 63 jnz short rx_body_352 - call rx_read_l1 + call rx_read rx_body_352: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 @@ -6282,7 +6282,7 @@ rx_i_353: ;FPSUB mov eax, r13d test bl, 63 jnz short rx_body_353 - call rx_read_l1 + call rx_read rx_body_353: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6300,7 +6300,7 @@ rx_i_354: ;MUL_32 mov eax, r13d test bl, 63 jnz short rx_body_354 - call rx_read_l1 + call rx_read 
rx_body_354: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6316,7 +6316,7 @@ rx_i_355: ;MUL_64 mov eax, r10d test bl, 63 jnz short rx_body_355 - call rx_read_l2 + call rx_read rx_body_355: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -6332,11 +6332,11 @@ rx_i_356: ;MUL_64 jz rx_finish xor r10, 01cd85d80h mov eax, r10d + xor rbp, rax test bl, 63 jnz short rx_body_356 - call rx_read_l1 + call rx_read rx_body_356: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r10 @@ -6349,7 +6349,7 @@ rx_i_357: ;ADD_64 mov eax, r10d test bl, 63 jnz short rx_body_357 - call rx_read_l1 + call rx_read rx_body_357: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6363,7 +6363,7 @@ rx_i_358: ;DIV_64 mov eax, r13d test bl, 63 jnz short rx_body_358 - call rx_read_l1 + call rx_read rx_body_358: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6380,11 +6380,11 @@ rx_i_359: ;FPSUB jz rx_finish xor r10, 0714fc2cdh mov eax, r10d + xor rbp, rax test bl, 63 jnz short rx_body_359 - call rx_read_l1 + call rx_read rx_body_359: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm9 @@ -6401,7 +6401,7 @@ rx_i_360: ;FPMUL mov eax, r10d test bl, 63 jnz short rx_body_360 - call rx_read_l1 + call rx_read rx_body_360: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6418,7 +6418,7 @@ rx_i_361: ;FPDIV mov eax, r15d test bl, 63 jnz short rx_body_361 - call rx_read_l1 + call rx_read rx_body_361: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6439,7 +6439,7 @@ rx_i_362: ;SUB_64 mov eax, r9d test bl, 63 jnz short rx_body_362 - call rx_read_l1 + call rx_read rx_body_362: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6457,7 +6457,7 @@ rx_i_363: ;FPMUL mov eax, r12d test bl, 63 jnz short rx_body_363 - call rx_read_l2 + call rx_read rx_body_363: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6474,7 +6474,7 @@ rx_i_364: ;MUL_32 mov eax, r11d test bl, 63 jnz short rx_body_364 - call rx_read_l1 + call rx_read rx_body_364: and eax, 2047 
mov rax, qword ptr [rsi+rax*8] @@ -6490,7 +6490,7 @@ rx_i_365: ;IMUL_32 mov eax, r15d test bl, 63 jnz short rx_body_365 - call rx_read_l1 + call rx_read rx_body_365: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6510,7 +6510,7 @@ rx_i_366: ;IMUL_32 mov eax, r12d test bl, 63 jnz short rx_body_366 - call rx_read_l1 + call rx_read rx_body_366: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6528,11 +6528,11 @@ rx_i_367: ;ROR_64 jz rx_finish xor r9, 04d14cb3ah mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_367 - call rx_read_l1 + call rx_read rx_body_367: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ror rax, 18 @@ -6545,7 +6545,7 @@ rx_i_368: ;SUB_32 mov eax, r10d test bl, 63 jnz short rx_body_368 - call rx_read_l2 + call rx_read rx_body_368: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -6559,7 +6559,7 @@ rx_i_369: ;IDIV_64 mov eax, r9d test bl, 63 jnz short rx_body_369 - call rx_read_l2 + call rx_read rx_body_369: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -6580,7 +6580,7 @@ rx_i_370: ;FPSUB mov eax, r15d test bl, 63 jnz short rx_body_370 - call rx_read_l2 + call rx_read rx_body_370: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6596,11 +6596,11 @@ rx_i_371: ;FPADD jz rx_finish xor r8, 0ebbd5cc9h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_371 - call rx_read_l1 + call rx_read rx_body_371: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 @@ -6617,7 +6617,7 @@ rx_i_372: ;SHL_64 mov eax, r10d test bl, 63 jnz short rx_body_372 - call rx_read_l1 + call rx_read rx_body_372: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6632,7 +6632,7 @@ rx_i_373: ;FPMUL mov eax, r15d test bl, 63 jnz short rx_body_373 - call rx_read_l1 + call rx_read rx_body_373: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6649,7 +6649,7 @@ rx_i_374: ;FPMUL mov eax, r11d test bl, 63 jnz short rx_body_374 - call rx_read_l1 + call rx_read rx_body_374: and eax, 2047 cvtdq2pd xmm0, qword ptr 
[rsi+rax*8] @@ -6666,7 +6666,7 @@ rx_i_375: ;ADD_64 mov eax, r9d test bl, 63 jnz short rx_body_375 - call rx_read_l2 + call rx_read rx_body_375: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -6684,7 +6684,7 @@ rx_i_376: ;ADD_64 mov eax, r14d test bl, 63 jnz short rx_body_376 - call rx_read_l1 + call rx_read rx_body_376: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6702,7 +6702,7 @@ rx_i_377: ;FPSUB mov eax, r14d test bl, 63 jnz short rx_body_377 - call rx_read_l1 + call rx_read rx_body_377: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6716,7 +6716,7 @@ rx_i_378: ;MUL_32 mov eax, r12d test bl, 63 jnz short rx_body_378 - call rx_read_l2 + call rx_read rx_body_378: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -6732,7 +6732,7 @@ rx_i_379: ;ROR_64 mov eax, r10d test bl, 63 jnz short rx_body_379 - call rx_read_l1 + call rx_read rx_body_379: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6747,7 +6747,7 @@ rx_i_380: ;MUL_64 mov eax, r11d test bl, 63 jnz short rx_body_380 - call rx_read_l2 + call rx_read rx_body_380: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -6763,11 +6763,11 @@ rx_i_381: ;XOR_32 jz rx_finish xor r8, 019816ff9h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_381 - call rx_read_l1 + call rx_read rx_body_381: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] xor eax, r14d @@ -6780,7 +6780,7 @@ rx_i_382: ;ROL_64 mov eax, r14d test bl, 63 jnz short rx_body_382 - call rx_read_l2 + call rx_read rx_body_382: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -6794,7 +6794,7 @@ rx_i_383: ;FPSUB mov eax, r15d test bl, 63 jnz short rx_body_383 - call rx_read_l1 + call rx_read rx_body_383: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6812,7 +6812,7 @@ rx_i_384: ;XOR_64 mov eax, r10d test bl, 63 jnz short rx_body_384 - call rx_read_l1 + call rx_read rx_body_384: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6830,7 +6830,7 @@ rx_i_385: ;MUL_64 mov eax, r15d test bl, 63 jnz short rx_body_385 - call rx_read_l1 + 
call rx_read rx_body_385: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6848,7 +6848,7 @@ rx_i_386: ;FPADD mov eax, r9d test bl, 63 jnz short rx_body_386 - call rx_read_l1 + call rx_read rx_body_386: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6860,11 +6860,11 @@ rx_i_387: ;SUB_32 jz rx_finish xor r9, 0d4f7bc6ah mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_387 - call rx_read_l1 + call rx_read rx_body_387: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub eax, r15d @@ -6877,7 +6877,7 @@ rx_i_388: ;RET mov eax, r8d test bl, 63 jnz short rx_body_388 - call rx_read_l1 + call rx_read rx_body_388: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6897,7 +6897,7 @@ rx_i_389: ;JUMP mov eax, r11d test bl, 63 jnz short rx_body_389 - call rx_read_l2 + call rx_read rx_body_389: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -6912,7 +6912,7 @@ rx_i_390: ;FPADD mov eax, r15d test bl, 63 jnz short rx_body_390 - call rx_read_l1 + call rx_read rx_body_390: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6926,7 +6926,7 @@ rx_i_391: ;FPADD mov eax, r8d test bl, 63 jnz short rx_body_391 - call rx_read_l1 + call rx_read rx_body_391: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6940,7 +6940,7 @@ rx_i_392: ;SAR_64 mov eax, r14d test bl, 63 jnz short rx_body_392 - call rx_read_l1 + call rx_read rx_body_392: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6959,7 +6959,7 @@ rx_i_393: ;AND_64 mov eax, r14d test bl, 63 jnz short rx_body_393 - call rx_read_l1 + call rx_read rx_body_393: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -6977,7 +6977,7 @@ rx_i_394: ;FPADD mov eax, r12d test bl, 63 jnz short rx_body_394 - call rx_read_l1 + call rx_read rx_body_394: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -6989,11 +6989,11 @@ rx_i_395: ;DIV_64 jz rx_finish xor r8, 04ae4fe8ch mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_395 - call rx_read_l1 + call rx_read rx_body_395: - xor rbp, rax and eax, 2047 mov rax, qword 
ptr [rsi+rax*8] ; magic divide by 939698704 @@ -7010,7 +7010,7 @@ rx_i_396: ;ROR_64 mov eax, r10d test bl, 63 jnz short rx_body_396 - call rx_read_l2 + call rx_read rx_body_396: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -7026,11 +7026,11 @@ rx_i_397: ;SUB_32 jz rx_finish xor r8, 0916f3819h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_397 - call rx_read_l2 + call rx_read rx_body_397: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] sub eax, r12d @@ -7047,7 +7047,7 @@ rx_i_398: ;SHR_64 mov eax, r8d test bl, 63 jnz short rx_body_398 - call rx_read_l2 + call rx_read rx_body_398: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -7064,11 +7064,11 @@ rx_i_399: ;FPMUL jz rx_finish xor r11, 0899a98cfh mov eax, r11d + xor rbp, rax test bl, 63 jnz short rx_body_399 - call rx_read_l1 + call rx_read rx_body_399: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm2 @@ -7084,7 +7084,7 @@ rx_i_400: ;AND_64 mov eax, r13d test bl, 63 jnz short rx_body_400 - call rx_read_l2 + call rx_read rx_body_400: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -7102,7 +7102,7 @@ rx_i_401: ;FPSUB mov eax, r13d test bl, 63 jnz short rx_body_401 - call rx_read_l2 + call rx_read rx_body_401: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -7120,7 +7120,7 @@ rx_i_402: ;RET mov eax, r9d test bl, 63 jnz short rx_body_402 - call rx_read_l1 + call rx_read rx_body_402: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7136,7 +7136,7 @@ rx_i_403: ;DIV_64 mov eax, r9d test bl, 63 jnz short rx_body_403 - call rx_read_l1 + call rx_read rx_body_403: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7158,7 +7158,7 @@ rx_i_404: ;MUL_32 mov eax, r15d test bl, 63 jnz short rx_body_404 - call rx_read_l2 + call rx_read rx_body_404: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -7174,7 +7174,7 @@ rx_i_405: ;CALL mov eax, r8d test bl, 63 jnz short rx_body_405 - call rx_read_l2 + call rx_read rx_body_405: and eax, 32767 mov rax, qword ptr 
[rsi+rax*8] @@ -7194,7 +7194,7 @@ rx_i_406: ;FPDIV mov eax, r9d test bl, 63 jnz short rx_body_406 - call rx_read_l1 + call rx_read rx_body_406: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -7213,11 +7213,11 @@ rx_i_407: ;FPSUB jz rx_finish xor r14, 09699566fh mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_407 - call rx_read_l1 + call rx_read rx_body_407: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm9 @@ -7230,7 +7230,7 @@ rx_i_408: ;MUL_64 mov eax, r15d test bl, 63 jnz short rx_body_408 - call rx_read_l2 + call rx_read rx_body_408: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -7246,11 +7246,11 @@ rx_i_409: ;MUL_64 jz rx_finish xor r11, 04b6caa9ah mov eax, r11d + xor rbp, rax test bl, 63 jnz short rx_body_409 - call rx_read_l1 + call rx_read rx_body_409: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r15 @@ -7263,7 +7263,7 @@ rx_i_410: ;RET mov eax, r15d test bl, 63 jnz short rx_body_410 - call rx_read_l2 + call rx_read rx_body_410: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -7279,7 +7279,7 @@ rx_i_411: ;RET mov eax, r12d test bl, 63 jnz short rx_body_411 - call rx_read_l1 + call rx_read rx_body_411: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7295,7 +7295,7 @@ rx_i_412: ;FPDIV mov eax, r10d test bl, 63 jnz short rx_body_412 - call rx_read_l1 + call rx_read rx_body_412: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -7316,7 +7316,7 @@ rx_i_413: ;FPMUL mov eax, r11d test bl, 63 jnz short rx_body_413 - call rx_read_l1 + call rx_read rx_body_413: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -7331,11 +7331,11 @@ rx_i_414: ;AND_64 jz rx_finish xor r14, 06c01554dh mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_414 - call rx_read_l2 + call rx_read rx_body_414: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] and rax, -378293327 @@ -7350,11 +7350,11 @@ rx_i_415: ;DIV_64 jz rx_finish xor r8, 08c3e59a1h mov eax, r8d + xor rbp, rax test bl, 63 
jnz short rx_body_415 - call rx_read_l2 + call rx_read rx_body_415: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] ; magic divide by 3756873911 @@ -7371,11 +7371,11 @@ rx_i_416: ;FPADD jz rx_finish xor r12, 0f3fafde9h mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_416 - call rx_read_l1 + call rx_read rx_body_416: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 @@ -7392,7 +7392,7 @@ rx_i_417: ;SUB_64 mov eax, r10d test bl, 63 jnz short rx_body_417 - call rx_read_l1 + call rx_read rx_body_417: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7406,7 +7406,7 @@ rx_i_418: ;MULH_64 mov eax, r10d test bl, 63 jnz short rx_body_418 - call rx_read_l2 + call rx_read rx_body_418: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -7422,7 +7422,7 @@ rx_i_419: ;OR_64 mov eax, r9d test bl, 63 jnz short rx_body_419 - call rx_read_l2 + call rx_read rx_body_419: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -7440,7 +7440,7 @@ rx_i_420: ;ROR_64 mov eax, r9d test bl, 63 jnz short rx_body_420 - call rx_read_l1 + call rx_read rx_body_420: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7452,11 +7452,11 @@ rx_i_421: ;CALL jz rx_finish xor r12, 01ada0f39h mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_421 - call rx_read_l1 + call rx_read rx_body_421: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r10, rax @@ -7471,7 +7471,7 @@ rx_i_422: ;IMUL_32 mov eax, r11d test bl, 63 jnz short rx_body_422 - call rx_read_l1 + call rx_read rx_body_422: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7487,7 +7487,7 @@ rx_i_423: ;MUL_64 mov eax, r12d test bl, 63 jnz short rx_body_423 - call rx_read_l1 + call rx_read rx_body_423: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7503,11 +7503,11 @@ rx_i_424: ;FPADD jz rx_finish xor r13, 01ad12ce2h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_424 - call rx_read_l1 + call rx_read rx_body_424: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr 
[rsi+rax*8] addpd xmm0, xmm7 @@ -7524,7 +7524,7 @@ rx_i_425: ;IMUL_32 mov eax, r8d test bl, 63 jnz short rx_body_425 - call rx_read_l1 + call rx_read rx_body_425: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7540,7 +7540,7 @@ rx_i_426: ;IDIV_64 mov eax, r12d test bl, 63 jnz short rx_body_426 - call rx_read_l1 + call rx_read rx_body_426: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7563,11 +7563,11 @@ rx_i_427: ;MUL_32 jz rx_finish xor r11, 0d6cae9aeh mov eax, r11d + xor rbp, rax test bl, 63 jnz short rx_body_427 - call rx_read_l1 + call rx_read rx_body_427: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov ecx, eax @@ -7586,7 +7586,7 @@ rx_i_428: ;RET mov eax, r11d test bl, 63 jnz short rx_body_428 - call rx_read_l1 + call rx_read rx_body_428: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7606,7 +7606,7 @@ rx_i_429: ;MUL_64 mov eax, r12d test bl, 63 jnz short rx_body_429 - call rx_read_l1 + call rx_read rx_body_429: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7618,11 +7618,11 @@ rx_i_430: ;FPADD jz rx_finish xor r14, 019cc0e5h mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_430 - call rx_read_l1 + call rx_read rx_body_430: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm8 @@ -7639,7 +7639,7 @@ rx_i_431: ;ROR_64 mov eax, r12d test bl, 63 jnz short rx_body_431 - call rx_read_l1 + call rx_read rx_body_431: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7654,7 +7654,7 @@ rx_i_432: ;SUB_64 mov eax, r10d test bl, 63 jnz short rx_body_432 - call rx_read_l1 + call rx_read rx_body_432: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7668,7 +7668,7 @@ rx_i_433: ;ADD_32 mov eax, r13d test bl, 63 jnz short rx_body_433 - call rx_read_l1 + call rx_read rx_body_433: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7686,7 +7686,7 @@ rx_i_434: ;FPDIV mov eax, r13d test bl, 63 jnz short rx_body_434 - call rx_read_l1 + call rx_read rx_body_434: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -7705,11 
+7705,11 @@ rx_i_435: ;MUL_64 jz rx_finish xor r15, 0b940480ah mov eax, r15d + xor rbp, rax test bl, 63 jnz short rx_body_435 - call rx_read_l1 + call rx_read rx_body_435: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, rax, 1971717631 @@ -7726,7 +7726,7 @@ rx_i_436: ;FPADD mov eax, r15d test bl, 63 jnz short rx_body_436 - call rx_read_l1 + call rx_read rx_body_436: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -7742,11 +7742,11 @@ rx_i_437: ;FPMUL jz rx_finish xor r8, 098a6bcf7h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_437 - call rx_read_l1 + call rx_read rx_body_437: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm3 @@ -7762,7 +7762,7 @@ rx_i_438: ;FPMUL mov eax, r10d test bl, 63 jnz short rx_body_438 - call rx_read_l1 + call rx_read rx_body_438: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -7779,7 +7779,7 @@ rx_i_439: ;OR_64 mov eax, r13d test bl, 63 jnz short rx_body_439 - call rx_read_l1 + call rx_read rx_body_439: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7795,11 +7795,11 @@ rx_i_440: ;CALL jz rx_finish xor r10, 062f83728h mov eax, r10d + xor rbp, rax test bl, 63 jnz short rx_body_440 - call rx_read_l1 + call rx_read rx_body_440: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r9, rax @@ -7812,11 +7812,11 @@ rx_i_441: ;ADD_64 jz rx_finish xor r14, 0d18ec075h mov eax, r14d + xor rbp, rax test bl, 63 jnz short rx_body_441 - call rx_read_l1 + call rx_read rx_body_441: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r14 @@ -7833,7 +7833,7 @@ rx_i_442: ;FPSQRT mov eax, r14d test bl, 63 jnz short rx_body_442 - call rx_read_l1 + call rx_read rx_body_442: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -7847,7 +7847,7 @@ rx_i_443: ;RET mov eax, r14d test bl, 63 jnz short rx_body_443 - call rx_read_l1 + call rx_read rx_body_443: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7867,7 +7867,7 @@ rx_i_444: ;FPSUB mov eax, r8d test 
bl, 63 jnz short rx_body_444 - call rx_read_l1 + call rx_read rx_body_444: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -7885,7 +7885,7 @@ rx_i_445: ;ADD_64 mov eax, r13d test bl, 63 jnz short rx_body_445 - call rx_read_l1 + call rx_read rx_body_445: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7901,11 +7901,11 @@ rx_i_446: ;MUL_32 jz rx_finish xor r12, 01734708eh mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_446 - call rx_read_l1 + call rx_read rx_body_446: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov ecx, eax @@ -7924,7 +7924,7 @@ rx_i_447: ;FPADD mov eax, r8d test bl, 63 jnz short rx_body_447 - call rx_read_l1 + call rx_read rx_body_447: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -7940,11 +7940,11 @@ rx_i_448: ;FPSUB jz rx_finish xor r9, 0390cfdb0h mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_448 - call rx_read_l1 + call rx_read rx_body_448: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm3 @@ -7957,7 +7957,7 @@ rx_i_449: ;ROL_64 mov eax, r8d test bl, 63 jnz short rx_body_449 - call rx_read_l1 + call rx_read rx_body_449: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7971,7 +7971,7 @@ rx_i_450: ;SAR_64 mov eax, r8d test bl, 63 jnz short rx_body_450 - call rx_read_l1 + call rx_read rx_body_450: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -7988,11 +7988,11 @@ rx_i_451: ;ADD_64 jz rx_finish xor r8, 0c4d99ac9h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_451 - call rx_read_l1 + call rx_read rx_body_451: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r10 @@ -8005,7 +8005,7 @@ rx_i_452: ;RET mov eax, r13d test bl, 63 jnz short rx_body_452 - call rx_read_l2 + call rx_read rx_body_452: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -8025,7 +8025,7 @@ rx_i_453: ;DIV_64 mov eax, r11d test bl, 63 jnz short rx_body_453 - call rx_read_l2 + call rx_read rx_body_453: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -8044,7 +8044,7 @@ 
rx_i_454: ;FPADD mov eax, r13d test bl, 63 jnz short rx_body_454 - call rx_read_l2 + call rx_read rx_body_454: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8060,11 +8060,11 @@ rx_i_455: ;OR_64 jz rx_finish xor r8, 059263cdbh mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_455 - call rx_read_l1 + call rx_read rx_body_455: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] or rax, r9 @@ -8077,7 +8077,7 @@ rx_i_456: ;AND_64 mov eax, r9d test bl, 63 jnz short rx_body_456 - call rx_read_l1 + call rx_read rx_body_456: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8093,11 +8093,11 @@ rx_i_457: ;SUB_64 jz rx_finish xor r9, 09de1a3efh mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_457 - call rx_read_l1 + call rx_read rx_body_457: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub rax, 1482178870 @@ -8114,7 +8114,7 @@ rx_i_458: ;SAR_64 mov eax, r11d test bl, 63 jnz short rx_body_458 - call rx_read_l2 + call rx_read rx_body_458: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -8129,7 +8129,7 @@ rx_i_459: ;MUL_64 mov eax, r9d test bl, 63 jnz short rx_body_459 - call rx_read_l1 + call rx_read rx_body_459: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8147,7 +8147,7 @@ rx_i_460: ;ADD_32 mov eax, r11d test bl, 63 jnz short rx_body_460 - call rx_read_l1 + call rx_read rx_body_460: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8165,7 +8165,7 @@ rx_i_461: ;XOR_64 mov eax, r11d test bl, 63 jnz short rx_body_461 - call rx_read_l1 + call rx_read rx_body_461: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8183,7 +8183,7 @@ rx_i_462: ;ADD_64 mov eax, r10d test bl, 63 jnz short rx_body_462 - call rx_read_l1 + call rx_read rx_body_462: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8197,7 +8197,7 @@ rx_i_463: ;ADD_32 mov eax, r9d test bl, 63 jnz short rx_body_463 - call rx_read_l1 + call rx_read rx_body_463: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8211,7 +8211,7 @@ rx_i_464: ;MUL_64 mov eax, r12d test bl, 63 jnz 
short rx_body_464 - call rx_read_l1 + call rx_read rx_body_464: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8229,7 +8229,7 @@ rx_i_465: ;FPADD mov eax, r12d test bl, 63 jnz short rx_body_465 - call rx_read_l1 + call rx_read rx_body_465: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8241,11 +8241,11 @@ rx_i_466: ;IMUL_32 jz rx_finish xor r13, 05c541c42h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_466 - call rx_read_l1 + call rx_read rx_body_466: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax @@ -8258,11 +8258,11 @@ rx_i_467: ;FPADD jz rx_finish xor r8, 0cbb33f81h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_467 - call rx_read_l1 + call rx_read rx_body_467: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 @@ -8273,11 +8273,11 @@ rx_i_468: ;DIV_64 jz rx_finish xor r8, 091044dc3h mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_468 - call rx_read_l1 + call rx_read rx_body_468: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ; magic divide by 4281572471 @@ -8300,7 +8300,7 @@ rx_i_469: ;IMUL_32 mov eax, r9d test bl, 63 jnz short rx_body_469 - call rx_read_l2 + call rx_read rx_body_469: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -8320,7 +8320,7 @@ rx_i_470: ;OR_64 mov eax, r14d test bl, 63 jnz short rx_body_470 - call rx_read_l2 + call rx_read rx_body_470: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -8338,7 +8338,7 @@ rx_i_471: ;IMUL_32 mov eax, r14d test bl, 63 jnz short rx_body_471 - call rx_read_l2 + call rx_read rx_body_471: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -8352,11 +8352,11 @@ rx_i_472: ;JUMP jz rx_finish xor r9, 038f4b9d6h mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_472 - call rx_read_l1 + call rx_read rx_body_472: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r10, rax @@ -8370,7 +8370,7 @@ rx_i_473: ;MUL_64 mov eax, r14d test bl, 63 jnz short rx_body_473 - call rx_read_l1 + call 
rx_read rx_body_473: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8382,11 +8382,11 @@ rx_i_474: ;JUMP jz rx_finish xor r9, 0b5c0b4d4h mov eax, r9d + xor rbp, rax test bl, 63 jnz short rx_body_474 - call rx_read_l1 + call rx_read rx_body_474: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r15, rax @@ -8400,7 +8400,7 @@ rx_i_475: ;FPSUB mov eax, r10d test bl, 63 jnz short rx_body_475 - call rx_read_l1 + call rx_read rx_body_475: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8414,7 +8414,7 @@ rx_i_476: ;FPADD mov eax, r8d test bl, 63 jnz short rx_body_476 - call rx_read_l1 + call rx_read rx_body_476: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8428,7 +8428,7 @@ rx_i_477: ;FPADD mov eax, r12d test bl, 63 jnz short rx_body_477 - call rx_read_l2 + call rx_read rx_body_477: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8446,7 +8446,7 @@ rx_i_478: ;MUL_64 mov eax, r14d test bl, 63 jnz short rx_body_478 - call rx_read_l2 + call rx_read rx_body_478: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -8460,7 +8460,7 @@ rx_i_479: ;MUL_64 mov eax, r12d test bl, 63 jnz short rx_body_479 - call rx_read_l1 + call rx_read rx_body_479: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8478,7 +8478,7 @@ rx_i_480: ;FPADD mov eax, r9d test bl, 63 jnz short rx_body_480 - call rx_read_l1 + call rx_read rx_body_480: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8492,7 +8492,7 @@ rx_i_481: ;DIV_64 mov eax, r14d test bl, 63 jnz short rx_body_481 - call rx_read_l1 + call rx_read rx_body_481: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8511,7 +8511,7 @@ rx_i_482: ;AND_32 mov eax, r14d test bl, 63 jnz short rx_body_482 - call rx_read_l1 + call rx_read rx_body_482: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8525,7 +8525,7 @@ rx_i_483: ;FPADD mov eax, r11d test bl, 63 jnz short rx_body_483 - call rx_read_l1 + call rx_read rx_body_483: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8537,11 +8537,11 @@ rx_i_484: ;SHR_64 jz 
rx_finish xor r12, 07027bacdh mov eax, r12d + xor rbp, rax test bl, 63 jnz short rx_body_484 - call rx_read_l1 + call rx_read rx_body_484: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] shr rax, 37 @@ -8552,11 +8552,11 @@ rx_i_485: ;JUMP jz rx_finish xor r13, 03a04647h mov eax, r13d + xor rbp, rax test bl, 63 jnz short rx_body_485 - call rx_read_l1 + call rx_read rx_body_485: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -8574,7 +8574,7 @@ rx_i_486: ;ADD_64 mov eax, r15d test bl, 63 jnz short rx_body_486 - call rx_read_l1 + call rx_read rx_body_486: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8592,7 +8592,7 @@ rx_i_487: ;SUB_64 mov eax, r11d test bl, 63 jnz short rx_body_487 - call rx_read_l1 + call rx_read rx_body_487: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8606,7 +8606,7 @@ rx_i_488: ;DIV_64 mov eax, r12d test bl, 63 jnz short rx_body_488 - call rx_read_l2 + call rx_read rx_body_488: and eax, 32767 mov rax, qword ptr [rsi+rax*8] @@ -8622,11 +8622,11 @@ rx_i_489: ;JUMP jz rx_finish xor r10, 0b2ec9f3ah mov eax, r10d + xor rbp, rax test bl, 63 jnz short rx_body_489 - call rx_read_l1 + call rx_read rx_body_489: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -8642,11 +8642,11 @@ rx_i_490: ;ROR_64 jz rx_finish xor r11, 015c7f598h mov eax, r11d + xor rbp, rax test bl, 63 jnz short rx_body_490 - call rx_read_l1 + call rx_read rx_body_490: - xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, r9 @@ -8664,7 +8664,7 @@ rx_i_491: ;FPADD mov eax, r8d test bl, 63 jnz short rx_body_491 - call rx_read_l1 + call rx_read rx_body_491: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8682,7 +8682,7 @@ rx_i_492: ;IDIV_64 mov eax, r9d test bl, 63 jnz short rx_body_492 - call rx_read_l1 + call rx_read rx_body_492: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8705,7 +8705,7 @@ rx_i_493: ;FPSUB mov eax, r8d test bl, 63 jnz short rx_body_493 - call rx_read_l1 + call rx_read 
rx_body_493: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8719,7 +8719,7 @@ rx_i_494: ;MUL_32 mov eax, r10d test bl, 63 jnz short rx_body_494 - call rx_read_l1 + call rx_read rx_body_494: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8735,7 +8735,7 @@ rx_i_495: ;FPMUL mov eax, r11d test bl, 63 jnz short rx_body_495 - call rx_read_l2 + call rx_read rx_body_495: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8752,7 +8752,7 @@ rx_i_496: ;IDIV_64 mov eax, r14d test bl, 63 jnz short rx_body_496 - call rx_read_l1 + call rx_read rx_body_496: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8771,11 +8771,11 @@ rx_i_497: ;FPMUL jz rx_finish xor r8, 08d25742eh mov eax, r8d + xor rbp, rax test bl, 63 jnz short rx_body_497 - call rx_read_l1 + call rx_read rx_body_497: - xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm3 @@ -8791,7 +8791,7 @@ rx_i_498: ;FPMUL mov eax, r15d test bl, 63 jnz short rx_body_498 - call rx_read_l1 + call rx_read rx_body_498: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8812,7 +8812,7 @@ rx_i_499: ;IMUL_32 mov eax, r12d test bl, 63 jnz short rx_body_499 - call rx_read_l1 + call rx_read rx_body_499: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8828,7 +8828,7 @@ rx_i_500: ;FPSQRT mov eax, r10d test bl, 63 jnz short rx_body_500 - call rx_read_l2 + call rx_read rx_body_500: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8842,7 +8842,7 @@ rx_i_501: ;XOR_64 mov eax, r8d test bl, 63 jnz short rx_body_501 - call rx_read_l1 + call rx_read rx_body_501: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8858,11 +8858,11 @@ rx_i_502: ;RET jz rx_finish xor r10, 09e70b20ch mov eax, r10d + xor rbp, rax test bl, 63 jnz short rx_body_502 - call rx_read_l2 + call rx_read rx_body_502: - xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -8881,7 +8881,7 @@ rx_i_503: ;FPSUB mov eax, r13d test bl, 63 jnz short rx_body_503 - call rx_read_l2 + call rx_read rx_body_503: and eax, 
32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8899,7 +8899,7 @@ rx_i_504: ;FPADD mov eax, r13d test bl, 63 jnz short rx_body_504 - call rx_read_l1 + call rx_read rx_body_504: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8917,7 +8917,7 @@ rx_i_505: ;FPSUB mov eax, r12d test bl, 63 jnz short rx_body_505 - call rx_read_l1 + call rx_read rx_body_505: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8935,7 +8935,7 @@ rx_i_506: ;FPSUB mov eax, r9d test bl, 63 jnz short rx_body_506 - call rx_read_l2 + call rx_read rx_body_506: and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -8949,7 +8949,7 @@ rx_i_507: ;RET mov eax, r10d test bl, 63 jnz short rx_body_507 - call rx_read_l1 + call rx_read rx_body_507: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8965,7 +8965,7 @@ rx_i_508: ;RET mov eax, r13d test bl, 63 jnz short rx_body_508 - call rx_read_l1 + call rx_read rx_body_508: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -8981,7 +8981,7 @@ rx_i_509: ;FPROUND mov eax, r11d test bl, 63 jnz short rx_body_509 - call rx_read_l1 + call rx_read rx_body_509: and eax, 2047 mov rax, qword ptr [rsi+rax*8] @@ -9000,7 +9000,7 @@ rx_i_510: ;FPADD mov eax, r8d test bl, 63 jnz short rx_body_510 - call rx_read_l1 + call rx_read rx_body_510: and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] @@ -9014,7 +9014,7 @@ rx_i_511: ;SHR_64 mov eax, r11d test bl, 63 jnz short rx_body_511 - call rx_read_l2 + call rx_read rx_body_511: and eax, 32767 mov rax, qword ptr [rsi+rax*8] From 8b1102ee05200c0b22b8c89b6b204142725e1958 Mon Sep 17 00:00:00 2001 From: tevador Date: Tue, 15 Jan 2019 00:01:11 +0100 Subject: [PATCH 18/35] Interpreter + async mode --- src/AddressTransform.cpp | 292 ++++++++++++++++++++++++++++++ src/AssemblyGeneratorX86.cpp | 4 +- src/Cache.hpp | 2 +- src/CompiledVirtualMachine.cpp | 13 +- src/CompiledVirtualMachine.hpp | 5 +- src/InterpretedVirtualMachine.cpp | 246 +++++++++++++++---------- src/InterpretedVirtualMachine.hpp | 31 +++- src/JitCompilerX86.cpp | 6 
+- src/LightClientAsyncWorker.cpp | 94 ++++++++++ src/LightClientAsyncWorker.hpp | 52 ++++++ src/VirtualMachine.cpp | 56 +----- src/VirtualMachine.hpp | 9 +- src/common.hpp | 23 ++- src/dataset.cpp | 48 +++-- src/dataset.hpp | 6 +- src/instructions.hpp | 8 +- src/instructionsPortable.cpp | 6 +- src/main.cpp | 16 +- 18 files changed, 702 insertions(+), 215 deletions(-) create mode 100644 src/AddressTransform.cpp create mode 100644 src/LightClientAsyncWorker.cpp create mode 100644 src/LightClientAsyncWorker.hpp diff --git a/src/AddressTransform.cpp b/src/AddressTransform.cpp new file mode 100644 index 0000000..b8070a0 --- /dev/null +++ b/src/AddressTransform.cpp @@ -0,0 +1,292 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see<http://www.gnu.org/licenses/>.
+*/ + +#include "common.hpp" +#include "InterpretedVirtualMachine.hpp" + +#include +#include +#include + +namespace RandomX { + + class Mul9Transform : public ITransform { + public: + Mul9Transform(int32_t cc) : c(cc) { + std::ostringstream oss; + oss << "mul9_" << std::hex << (cc & 255); + name = oss.str(); + } + int32_t apply(int32_t x) const override { + return 9 * x + c; + } + const char* getName() const override { + return name.c_str(); + } + std::ostream& printAsm(std::ostream& os) const override { + os << "lea ecx, [rcx+rcx*8" << std::showpos << c << "]" << std::noshowpos << std::endl; + return os; + } + std::ostream& printCxx(std::ostream& os) const override { + os << "static const Mul9Transform " << name << "(" << c << ");" << std::endl; + return os; + } + private: + int32_t c; + std::string name; + }; + + class AddTransform : public ITransform { + public: + AddTransform(int32_t cc) : c(cc) { + std::ostringstream oss; + oss << "add_" << std::hex << (cc & 255); + name = oss.str(); + } + int32_t apply(int32_t x) const override { + return x + c; + } + const char* getName() const override { + return name.c_str(); + } + std::ostream& printAsm(std::ostream& os) const override { + os << "db 64" << std::endl; + os << "add ecx, " << c << std::endl; + return os; + } + std::ostream& printCxx(std::ostream& os) const override { + os << "static const AddTransform " << name << "(" << c << ");" << std::endl; + return os; + } + private: + int32_t c; + std::string name; + }; + + class XorTransform : public ITransform { + public: + XorTransform(int32_t cc) : c(cc) { + std::ostringstream oss; + oss << "xor_" << std::hex << (cc & 255); + name = oss.str(); + } + int32_t apply(int32_t x) const override { + return x ^ c; + } + const char* getName() const override { + return name.c_str(); + } + std::ostream& printAsm(std::ostream& os) const override { + os << "db 64" << std::endl; + os << "xor ecx, " << c << std::endl; + return os; + } + std::ostream& printCxx(std::ostream& os) 
const override { + os << "static const XorTransform " << name << "(" << c << ");" << std::endl; + return os; + } + private: + int32_t c; + std::string name; + }; + + static const Mul9Transform mul9_6d(109); + static const XorTransform xor_60(96); + static const Mul9Transform mul9_ed(-19); + static const AddTransform add_9e(-98); + static const AddTransform add_eb(-21); + static const XorTransform xor_b0(-80); + static const Mul9Transform mul9_a4(-92); + static const AddTransform add_71(113); + static const Mul9Transform mul9_64(100); + static const AddTransform add_d9(-39); + static const XorTransform xor_78(120); + static const Mul9Transform mul9_89(-119); + static const AddTransform add_8f(-113); + static const AddTransform add_6f(111); + static const XorTransform xor_68(104); + static const Mul9Transform mul9_ad(-83); + static const Mul9Transform mul9_7f(127); + static const XorTransform xor_90(-112); + static const AddTransform add_59(89); + static const AddTransform add_e0(-32); + static const AddTransform add_68(104); + static const XorTransform xor_88(-120); + static const XorTransform xor_18(24); + static const Mul9Transform mul9_9(9); + static const AddTransform add_e1(-31); + static const XorTransform xor_f0(-16); + static const AddTransform add_44(68); + static const Mul9Transform mul9_92(-110); + static const XorTransform xor_40(64); + static const XorTransform xor_d8(-40); + static const XorTransform xor_f8(-8); + static const AddTransform add_f6(-10); + static const XorTransform xor_e0(-32); + static const AddTransform add_e(14); + static const Mul9Transform mul9_d2(-46); + static const XorTransform xor_98(-104); + static const Mul9Transform mul9_24(36); + static const AddTransform add_64(100); + static const Mul9Transform mul9_bf(-65); + static const Mul9Transform mul9_1b(27); + static const Mul9Transform mul9_5b(91); + static const AddTransform add_9b(-101); + static const AddTransform add_a2(-94); + static const Mul9Transform mul9_f6(-10); + static 
const XorTransform xor_50(80); + static const AddTransform add_94(-108); + static const AddTransform add_c6(-58); + static const XorTransform xor_30(48); + static const Mul9Transform mul9_49(73); + static const XorTransform xor_d0(-48); + static const XorTransform xor_20(32); + static const XorTransform xor_a0(-96); + static const AddTransform add_76(118); + static const AddTransform add_5b(91); + static const Mul9Transform mul9_12(18); + static const AddTransform add_f5(-11); + static const Mul9Transform mul9_3f(63); + static const AddTransform add_72(114); + static const Mul9Transform mul9_2d(45); + static const AddTransform add_bd(-67); + static const AddTransform add_35(53); + static const Mul9Transform mul9_9b(-101); + static const Mul9Transform mul9_ff(-1); + static const XorTransform xor_10(16); + static const Mul9Transform mul9_db(-37); + static const Mul9Transform mul9_e4(-28); + static const Mul9Transform mul9_c9(-55); + static const XorTransform xor_a8(-88); + static const XorTransform xor_b8(-72); + static const AddTransform add_24(36); + static const XorTransform xor_c8(-56); + static const AddTransform add_74(116); + static const XorTransform xor_58(88); + static const XorTransform xor_80(-128); + static const AddTransform add_32(50); + static const AddTransform add_69(105); + static const AddTransform add_db(-37); + static const XorTransform xor_70(112); + static const XorTransform xor_8(8); + static const XorTransform xor_e8(-24); + static const Mul9Transform mul9_76(118); + static const XorTransform xor_48(72); + static const XorTransform xor_c0(-64); + static const AddTransform add_28(40); + static const Mul9Transform mul9_b6(-74); + static const Mul9Transform mul9_52(82); + static const Mul9Transform mul9_36(54); + static const XorTransform xor_38(56); + static const XorTransform xor_28(40); + static const AddTransform add_57(87); + + const ITransform* InterpretedVirtualMachine::addressTransformations[TransformationCount] = { + 
(ITransform*)&mul9_6d, + (ITransform*)&xor_60, + (ITransform*)&mul9_ed, + (ITransform*)&add_9e, + (ITransform*)&add_eb, + (ITransform*)&xor_b0, + (ITransform*)&mul9_a4, + (ITransform*)&add_71, + (ITransform*)&mul9_64, + (ITransform*)&add_d9, + (ITransform*)&xor_78, + (ITransform*)&mul9_89, + (ITransform*)&add_8f, + (ITransform*)&add_6f, + (ITransform*)&xor_68, + (ITransform*)&mul9_ad, + (ITransform*)&mul9_7f, + (ITransform*)&xor_90, + (ITransform*)&add_59, + (ITransform*)&add_e0, + (ITransform*)&add_68, + (ITransform*)&xor_88, + (ITransform*)&xor_18, + (ITransform*)&mul9_9, + (ITransform*)&add_e1, + (ITransform*)&xor_f0, + (ITransform*)&add_44, + (ITransform*)&mul9_92, + (ITransform*)&xor_40, + (ITransform*)&xor_d8, + (ITransform*)&xor_f8, + (ITransform*)&add_f6, + (ITransform*)&xor_e0, + (ITransform*)&add_e, + (ITransform*)&mul9_d2, + (ITransform*)&xor_98, + (ITransform*)&mul9_24, + (ITransform*)&add_64, + (ITransform*)&mul9_bf, + (ITransform*)&mul9_1b, + (ITransform*)&mul9_5b, + (ITransform*)&add_9b, + (ITransform*)&add_a2, + (ITransform*)&mul9_f6, + (ITransform*)&xor_50, + (ITransform*)&add_94, + (ITransform*)&add_c6, + (ITransform*)&xor_30, + (ITransform*)&mul9_49, + (ITransform*)&xor_d0, + (ITransform*)&xor_20, + (ITransform*)&xor_a0, + (ITransform*)&add_76, + (ITransform*)&add_5b, + (ITransform*)&mul9_12, + (ITransform*)&add_f5, + (ITransform*)&mul9_3f, + (ITransform*)&add_72, + (ITransform*)&mul9_2d, + (ITransform*)&add_bd, + (ITransform*)&add_35, + (ITransform*)&mul9_9b, + (ITransform*)&mul9_ff, + (ITransform*)&xor_10, + (ITransform*)&mul9_db, + (ITransform*)&mul9_e4, + (ITransform*)&mul9_c9, + (ITransform*)&xor_a8, + (ITransform*)&xor_b8, + (ITransform*)&add_24, + (ITransform*)&xor_c8, + (ITransform*)&add_74, + (ITransform*)&xor_58, + (ITransform*)&xor_80, + (ITransform*)&add_32, + (ITransform*)&add_69, + (ITransform*)&add_db, + (ITransform*)&xor_70, + (ITransform*)&xor_8, + (ITransform*)&xor_e8, + (ITransform*)&mul9_76, + (ITransform*)&xor_48, + 
(ITransform*)&xor_c0, + (ITransform*)&add_28, + (ITransform*)&mul9_b6, + (ITransform*)&mul9_52, + (ITransform*)&mul9_36, + (ITransform*)&xor_38, + (ITransform*)&xor_28, + (ITransform*)&add_57, + }; +} \ No newline at end of file diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 8a4a0a1..25ae7ef 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -67,12 +67,12 @@ namespace RandomX { void AssemblyGeneratorX86::gena(Instruction& instr, int i) { asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl; - if ((instr.loca & 192) == 0) - asmCode << "\txor " << regMx << ", rax" << std::endl; asmCode << "\ttest " << regIc8 << ", 63" << std::endl; asmCode << "\tjnz short rx_body_" << i << std::endl; asmCode << "\tcall rx_read" << std::endl; asmCode << "rx_body_" << i << ":" << std::endl; + if ((instr.loca & 192) == 0) + asmCode << "\txor " << regMx << ", rax" << std::endl; if (instr.loca & 3) { asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; } diff --git a/src/Cache.hpp b/src/Cache.hpp index 4137b97..8a2b93a 100644 --- a/src/Cache.hpp +++ b/src/Cache.hpp @@ -46,7 +46,7 @@ namespace RandomX { return keys; } - const uint8_t* getCache() { + const uint8_t* getCache() const { return memory; } private: diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index ef78d2f..28a3cca 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -25,15 +25,16 @@ along with RandomX. If not, see. 
namespace RandomX { - CompiledVirtualMachine::CompiledVirtualMachine(bool softAes) : VirtualMachine(softAes) { + CompiledVirtualMachine::CompiledVirtualMachine() { totalSize = 0; } - void CompiledVirtualMachine::setDataset(dataset_t ds, bool lightClient) { - if (lightClient) { - throw std::runtime_error("Compiled VM does not support light-client mode"); - } - VirtualMachine::setDataset(ds, lightClient); + void CompiledVirtualMachine::setDataset(dataset_t ds) { + mem.ds = ds; + } + + void CompiledVirtualMachine::initializeScratchpad(uint32_t index) { + memcpy(scratchpad, mem.ds.dataset + ScratchpadSize * index, ScratchpadSize); } void CompiledVirtualMachine::initializeProgram(const void* seed) { diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp index a77bdb8..98b0b78 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -37,8 +37,9 @@ namespace RandomX { void operator delete(void* ptr) { _mm_free(ptr); } - CompiledVirtualMachine(bool softAes); - void setDataset(dataset_t ds, bool light = false) override; + CompiledVirtualMachine(); + void setDataset(dataset_t ds) override; + void initializeScratchpad(uint32_t index) override; void initializeProgram(const void* seed) override; virtual void execute() override; void* getProgram() { diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index a6a3a0c..54d2279 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -21,11 +21,15 @@ along with RandomX. If not, see. 
#include "InterpretedVirtualMachine.hpp" #include "Pcg32.hpp" #include "instructions.hpp" +#include "dataset.hpp" +#include "Cache.hpp" +#include "LightClientAsyncWorker.hpp" #include #include #include #include #include +#include #ifdef STATS #include #endif @@ -38,6 +42,57 @@ constexpr bool fpuCheck = false; namespace RandomX { + InterpretedVirtualMachine::~InterpretedVirtualMachine() { + if (asyncWorker) { + delete mem.ds.asyncWorker; + } + } + + void InterpretedVirtualMachine::setDataset(dataset_t ds) { + if (asyncWorker) { + if (softAes) { + mem.ds.asyncWorker = new LightClientAsyncWorker(ds.cache); + } + else { + mem.ds.asyncWorker = new LightClientAsyncWorker(ds.cache); + } + readDataset = &datasetReadLightAsync; + } + else { + mem.ds = ds; + if (softAes) { + readDataset = &datasetReadLight; + } + else { + readDataset = &datasetReadLight; + } + } + } + + void InterpretedVirtualMachine::initializeScratchpad(uint32_t index) { + uint32_t startingBlock = (ScratchpadSize / CacheLineSize) * index; + if (asyncWorker) { + ILightClientAsyncWorker* worker = mem.ds.asyncWorker; + const uint32_t blocksPerThread = (ScratchpadSize / CacheLineSize) / 2; + worker->prepareBlocks(scratchpad, startingBlock, blocksPerThread); //async first half + worker->getBlocks(scratchpad + ScratchpadLength / 2, startingBlock + blocksPerThread, blocksPerThread); //sync second half + worker->sync(); + } + else { + auto cache = mem.ds.cache; + if (softAes) { + for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { + initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); + } + } + else { + for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { + initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); + } + } + } + } + void InterpretedVirtualMachine::initializeProgram(const void* seed) { Pcg32 gen(seed); for (unsigned i = 0; i 
< sizeof(reg) / sizeof(Pcg32::result_type); ++i) { @@ -50,6 +105,7 @@ namespace RandomX { } //std::cout << reg; p.initialize(gen); + currentTransform = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7; mem.mx = *(((uint32_t*)seed) + 5); pc = 0; @@ -74,61 +130,61 @@ namespace RandomX { #endif } - convertible_t InterpretedVirtualMachine::loada(Instruction& inst) { - convertible_t& rega = reg.r[inst.rega % RegistersCount]; - rega.i64 ^= inst.addra; //sign-extend addra + convertible_t InterpretedVirtualMachine::loada(Instruction& instr) { + convertible_t& rega = reg.r[instr.rega % RegistersCount]; + rega.i64 ^= instr.addra; //sign-extend addra addr_t addr = rega.u32; - switch (inst.loca & 7) - { - case 0: - case 1: - case 2: - case 3: - return readDataset(addr, mem); - case 4: - return scratchpad[addr % ScratchpadL2]; + if ((ic % 64) == 0) { + addr = currentTransform->apply(addr); +#ifdef STATS + datasetAccess[mem.ma / (DatasetBlockCount / 256) / CacheLineSize]++; +#endif + readDataset(addr, mem, reg); + } - case 5: - case 6: - case 7: - return scratchpad[addr % ScratchpadL1]; + if ((instr.loca & 192) == 0) { + mem.mx ^= addr; + } + + if (instr.loca & 3) { + return scratchpad[addr % ScratchpadL1]; + } + else { + return scratchpad[addr % ScratchpadL2]; } } - convertible_t InterpretedVirtualMachine::loadbr1(Instruction& inst) { - switch (inst.locb & 7) - { - case 0: - case 1: - case 2: - case 3: - case 4: - case 5: - return reg.r[inst.regb % RegistersCount]; - case 6: - case 7: - convertible_t temp; - temp.i64 = inst.imm32; //sign-extend imm32 - return temp; + convertible_t InterpretedVirtualMachine::loadbia(Instruction& instr) { + if (instr.locb & 3) { + return reg.r[instr.regb % RegistersCount]; + } + else { + convertible_t temp; + temp.i64 = instr.imm32; //sign-extend imm32 + return temp; } } - convertible_t InterpretedVirtualMachine::loadbr0(Instruction& inst) { - switch (inst.locb & 7) - { - case 0: 
- case 1: - case 2: - case 3: - return reg.r[inst.regb % RegistersCount]; - case 4: - case 5: - case 6: - case 7: - convertible_t temp; - temp.u64 = inst.imm8; - return temp; + convertible_t InterpretedVirtualMachine::loadbiashift(Instruction& instr) { + if (instr.locb & 1) { + return reg.r[instr.regb % RegistersCount]; + } + else { + convertible_t temp; + temp.u64 = instr.imm8; + return temp; + } + } + + convertible_t InterpretedVirtualMachine::loadbiadiv(Instruction& instr) { + if (instr.locb & 3) { + convertible_t temp; + temp.u64 = instr.imm32; + return temp; + } + else { + return reg.r[instr.regb % RegistersCount]; } } @@ -174,26 +230,6 @@ namespace RandomX { } } - void InterpretedVirtualMachine::writecflo(Instruction& inst, fpu_reg_t& regc) { - addr_t addr; - switch (inst.locc & 7) - { - case 4: - addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc; - scratchpad[addr % ScratchpadL2] = regc.lo; - break; - - case 5: - case 6: - case 7: - addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc; - scratchpad[addr % ScratchpadL1] = regc.lo; - - default: - break; - } - } - #define ALU_RETIRE(x) x(a, b, c); \ if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl; @@ -242,7 +278,7 @@ namespace RandomX { #define ALU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ INC_COUNT(x) \ convertible_t a = loada(inst); \ - convertible_t b = loadbr1(inst); \ + convertible_t b = loadbia(inst); \ convertible_t& c = getcr(inst); \ ALU_RETIRE(x) \ } @@ -250,7 +286,15 @@ namespace RandomX { #define ALU_INST_SR(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ INC_COUNT(x) \ convertible_t a = loada(inst); \ - convertible_t b = loadbr0(inst); \ + convertible_t b = loadbiashift(inst); \ + convertible_t& c = getcr(inst); \ + ALU_RETIRE(x) \ + } + +#define ALU_INST_DIV(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ + INC_COUNT(x) \ + convertible_t a = loada(inst); \ + convertible_t b = 
loadbiadiv(inst); \ convertible_t& c = getcr(inst); \ ALU_RETIRE(x) \ } @@ -282,8 +326,8 @@ namespace RandomX { ALU_INST(MUL_32) ALU_INST(IMUL_32) ALU_INST(IMULH_64) - ALU_INST(DIV_64) - ALU_INST(IDIV_64) + ALU_INST_DIV(DIV_64) + ALU_INST_DIV(IDIV_64) ALU_INST(AND_64) ALU_INST(AND_32) ALU_INST(OR_64) @@ -301,42 +345,68 @@ namespace RandomX { FPU_INST(FPSUB) FPU_INST(FPMUL) FPU_INST(FPDIV) - FPU_INST_NB(FPSQRT) - FPU_INST_NB(FPROUND) + + void InterpretedVirtualMachine::h_FPROUND(Instruction& inst) { + convertible_t a = loada(inst); + convertible_t& c = getcr(inst); + c.u64 = a.u64; + if (trace) std::cout << std::hex << a.u64 << std::endl; + FPROUND(a, inst.imm8); + } + + void InterpretedVirtualMachine::h_JUMP(Instruction& inst) { + convertible_t a = loada(inst); + convertible_t& c = getcr(inst); + c.u64 = a.u64; + if (trace) std::cout << std::hex << a.u64 << std::endl; + if (JMP_COND(inst.locb, reg.r[inst.regb % RegistersCount], inst.imm32)) { +#ifdef STATS + count_JUMP_taken++; + count_jump_taken[inst.locb & 7]++; +#endif + pc += (inst.imm8 & 127) + 1; + pc = pc % ProgramLength; + } +#ifdef STATS + else { + count_JUMP_not_taken++; + count_jump_not_taken[inst.locb & 7]++; + } +#endif + } void InterpretedVirtualMachine::h_CALL(Instruction& inst) { convertible_t a = loada(inst); + convertible_t& c = getcr(inst); + c.u64 = a.u64; + if (trace) std::cout << std::hex << a.u64 << std::endl; if (JMP_COND(inst.locb, reg.r[inst.regb % RegistersCount], inst.imm32)) { #ifdef STATS count_CALL_taken++; count_jump_taken[inst.locb & 7]++; count_retdepth = std::max(0, count_retdepth - 1); #endif - stackPush(a); stackPush(pc); #ifdef STATS count_max_stack = std::max(count_max_stack, (int)stack.size()); #endif pc += (inst.imm8 & 127) + 1; pc = pc % ProgramLength; - if (trace) std::cout << std::hex << a.u64 << std::endl; } - else { - convertible_t& c = getcr(inst); #ifdef STATS + else { count_CALL_not_taken++; count_jump_not_taken[inst.locb & 7]++; -#endif - c.u64 = a.u64; - if (trace) 
std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl; } +#endif } void InterpretedVirtualMachine::h_RET(Instruction& inst) { convertible_t a = loada(inst); - convertible_t b = loadbr1(inst); convertible_t& c = getcr(inst); + c.u64 = a.u64; + if (trace) std::cout << std::hex << a.u64 << std::endl; if (stack.size() > 0) { #ifdef STATS count_RET_taken++; @@ -344,22 +414,13 @@ namespace RandomX { count_retdepth_max = std::max(count_retdepth_max, count_retdepth); #endif auto raddr = stackPopAddress(); - auto retval = stackPopValue(); - c.u64 = a.u64 ^ retval.u64; pc = raddr; } - else { #ifdef STATS - if (stack.size() == 0) - count_RET_stack_empty++; - else { - count_RET_not_taken++; - count_jump_not_taken[inst.locb & 7]++; - } -#endif - c.u64 = a.u64; + else { + count_RET_stack_empty++; } - if (trace) std::cout << std::hex << /*a.u64 << " " <<*/ c.u64 << std::endl; +#endif } #include "instructionWeights.hpp" @@ -394,6 +455,7 @@ namespace RandomX { INST_HANDLE(FPDIV) INST_HANDLE(FPSQRT) INST_HANDLE(FPROUND) + INST_HANDLE(JUMP) INST_HANDLE(CALL) INST_HANDLE(RET) }; diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index 8c34936..7745cad 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -25,23 +25,37 @@ along with RandomX. If not, see. 
namespace RandomX { + class ITransform { + public: + virtual int32_t apply(int32_t) const = 0; + virtual const char* getName() const = 0; + virtual std::ostream& printAsm(std::ostream&) const = 0; + virtual std::ostream& printCxx(std::ostream&) const = 0; + }; + class InterpretedVirtualMachine; typedef void(InterpretedVirtualMachine::*InstructionHandler)(Instruction&); class InterpretedVirtualMachine : public VirtualMachine { public: - InterpretedVirtualMachine(bool softAes) : VirtualMachine(softAes) {} - virtual void initializeProgram(const void* seed) override; - virtual void execute() override; + InterpretedVirtualMachine(bool soft, bool async) : softAes(soft), asyncWorker(async) {} + ~InterpretedVirtualMachine(); + void setDataset(dataset_t ds) override; + void initializeScratchpad(uint32_t index) override; + void initializeProgram(const void* seed) override; + void execute() override; const Program& getProgam() { return p; } private: static InstructionHandler engine[256]; + static const ITransform* addressTransformations[TransformationCount]; + bool softAes, asyncWorker; Program p; std::vector stack; uint64_t pc, ic; + const ITransform* currentTransform; #ifdef STATS int count_ADD_64 = 0; int count_ADD_32 = 0; @@ -71,11 +85,12 @@ namespace RandomX { int count_FPDIV = 0; int count_FPSQRT = 0; int count_FPROUND = 0; + int count_JUMP_taken = 0; + int count_JUMP_not_taken = 0; int count_CALL_taken = 0; int count_CALL_not_taken = 0; int count_RET_stack_empty = 0; int count_RET_taken = 0; - int count_RET_not_taken = 0; int count_jump_taken[8] = { 0 }; int count_jump_not_taken[8] = { 0 }; int count_max_stack = 0; @@ -89,14 +104,15 @@ namespace RandomX { int count_FPSUB_nop2 = 0; int count_FPMUL_nop = 0; int count_FPMUL_nop2 = 0; + int datasetAccess[256] = { 0 }; #endif convertible_t loada(Instruction&); - convertible_t loadbr0(Instruction&); - convertible_t loadbr1(Instruction&); + convertible_t loadbiashift(Instruction&); + convertible_t loadbiadiv(Instruction&); + 
convertible_t loadbia(Instruction&); convertible_t& getcr(Instruction&); void writecf(Instruction&, fpu_reg_t&); - void writecflo(Instruction&, fpu_reg_t&); void stackPush(convertible_t& c) { stack.push_back(c); @@ -148,6 +164,7 @@ namespace RandomX { void h_FPDIV(Instruction&); void h_FPSQRT(Instruction&); void h_FPROUND(Instruction&); + void h_JUMP(Instruction&); void h_CALL(Instruction&); void h_RET(Instruction&); }; diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index f76ab74..b41f7b5 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -170,13 +170,13 @@ namespace RandomX { emit(instr.addra); emit(uint16_t(0x8b41)); //mov emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega + emit(0x753fc3f6); //test bl,0x3f; jne + emit(uint16_t(0xe805)); + emit(readDatasetOffset - (codePos + 4)); if ((instr.loca & 192) == 0) { //A.LOC.X emit(uint16_t(0x3348)); emitByte(0xe8); //xor rbp, rax } - emit(0x753fc3f6); //test bl,0x3f; jne - emit(uint16_t(0xe805)); - emit(readDatasetOffset - (codePos + 4)); emitByte(0x25); //and eax, if (instr.loca & 3) { emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad diff --git a/src/LightClientAsyncWorker.cpp b/src/LightClientAsyncWorker.cpp new file mode 100644 index 0000000..c069f3f --- /dev/null +++ b/src/LightClientAsyncWorker.cpp @@ -0,0 +1,94 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. 
+*/ + +#include "LightClientAsyncWorker.hpp" +#include "dataset.hpp" +#include "Cache.hpp" + +namespace RandomX { + + template + LightClientAsyncWorker::LightClientAsyncWorker(const Cache* c) : ILightClientAsyncWorker(c), output(nullptr), hasWork(false), workerThread(&LightClientAsyncWorker::runWorker, this) { + + } + + template + void LightClientAsyncWorker::prepareBlock(addr_t addr) { + { + std::lock_guard lk(mutex); + startBlock = addr / CacheLineSize; + blockCount = 1; + output = currentLine.data(); + hasWork = true; + } + notifier.notify_all(); + } + + template + const uint64_t* LightClientAsyncWorker::getBlock(addr_t addr) { + uint32_t currentBlock = addr / CacheLineSize; + if (currentBlock != startBlock || output != currentLine.data()) { + initBlock(cache->getCache(), (uint8_t*)currentLine.data(), currentBlock, cache->getKeys()); + } + else { + sync(); + } + return currentLine.data(); + } + + template + void LightClientAsyncWorker::prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { + { + std::lock_guard lk(mutex); + startBlock = startBlock; + blockCount = blockCount; + output = out; + hasWork = true; + } + notifier.notify_all(); + } + + template + void LightClientAsyncWorker::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { + for (uint32_t i = 0; i < blockCount; ++i) { + initBlock(cache->getCache(), (uint8_t*)out + CacheLineSize * i, startBlock + i, cache->getKeys()); + } + } + + template + void LightClientAsyncWorker::sync() { + std::unique_lock lk(mutex); + notifier.wait(lk, [this] { return !hasWork; }); + } + + template + void LightClientAsyncWorker::runWorker() { + for (;;) { + std::unique_lock lk(mutex); + notifier.wait(lk, [this] { return hasWork; }); + getBlocks(output, startBlock, blockCount); + hasWork = false; + lk.unlock(); + notifier.notify_all(); + } + } + + template class LightClientAsyncWorker; + template class LightClientAsyncWorker; +} \ No newline at end of file diff --git a/src/LightClientAsyncWorker.hpp 
b/src/LightClientAsyncWorker.hpp new file mode 100644 index 0000000..7596fd5 --- /dev/null +++ b/src/LightClientAsyncWorker.hpp @@ -0,0 +1,52 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. +*/ + +#include "common.hpp" + +#include +#include +#include +#include + +namespace RandomX { + + class Cache; + + using DatasetLine = std::array; + + template + class LightClientAsyncWorker : public ILightClientAsyncWorker { + public: + LightClientAsyncWorker(const Cache*); + void prepareBlock(addr_t) final; + void prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) final; + const uint64_t* getBlock(addr_t) final; + void getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) final; + void sync() final; + private: + void runWorker(); + std::condition_variable notifier; + std::mutex mutex; + DatasetLine currentLine; + void* output; + uint32_t startBlock, blockCount; + bool hasWork; + std::thread workerThread; + }; +} \ No newline at end of file diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp index 6e8cfad..7a2be48 100644 --- a/src/VirtualMachine.cpp +++ b/src/VirtualMachine.cpp @@ -19,8 +19,6 @@ along with RandomX. If not, see. 
#include "VirtualMachine.hpp" #include "common.hpp" -#include "dataset.hpp" -#include "Cache.hpp" #include "t1ha/t1ha.h" #include "blake2/blake2.h" #include @@ -37,62 +35,10 @@ std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) { namespace RandomX { - VirtualMachine::VirtualMachine(bool softAes) : softAes(softAes), lightClient(false) { + VirtualMachine::VirtualMachine() { mem.ds.dataset = nullptr; } - VirtualMachine::~VirtualMachine() { - if (lightClient) { - delete mem.ds.lightDataset->block; - delete mem.ds.lightDataset; - } - } - - void VirtualMachine::setDataset(dataset_t ds, bool light) { - if (mem.ds.dataset != nullptr) { - throw std::runtime_error("Dataset is already initialized"); - } - lightClient = light; - if (light) { - auto lds = mem.ds.lightDataset = new LightClientDataset(); - lds->cache = ds.cache; - //lds->block = (uint8_t*)_mm_malloc(DatasetBlockSize, sizeof(__m128i)); - lds->blockNumber = -1; - if (lds->block == nullptr) { - throw std::bad_alloc(); - } - if (softAes) { - readDataset = &datasetReadLight; - } - else { - readDataset = &datasetReadLight; - } - } - else { - mem.ds = ds; - readDataset = &datasetRead; - } - } - - void VirtualMachine::initializeScratchpad(uint32_t index) { - if (lightClient) { - auto cache = mem.ds.lightDataset->cache; - if (softAes) { - for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { - initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); - } - } - else { - for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { - initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); - } - } - } - else { - memcpy(scratchpad, mem.ds.dataset + ScratchpadSize * index, ScratchpadSize); - } - } - void VirtualMachine::getResult(void* out) { constexpr size_t smallStateLength = sizeof(RegisterFile) / sizeof(uint64_t) + 2; uint64_t 
smallState[smallStateLength]; diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp index bbcfec3..78f7cf6 100644 --- a/src/VirtualMachine.hpp +++ b/src/VirtualMachine.hpp @@ -25,10 +25,10 @@ namespace RandomX { class VirtualMachine { public: - VirtualMachine(bool softAes); - virtual ~VirtualMachine(); - virtual void setDataset(dataset_t ds, bool light = false); - void initializeScratchpad(uint32_t index); + VirtualMachine(); + virtual ~VirtualMachine() {} + virtual void setDataset(dataset_t ds) = 0; + virtual void initializeScratchpad(uint32_t index) = 0; virtual void initializeProgram(const void* seed) = 0; virtual void execute() = 0; void getResult(void*); @@ -36,7 +36,6 @@ namespace RandomX { return reg; } protected: - bool softAes, lightClient; DatasetReadFunc readDataset; alignas(16) RegisterFile reg; MemoryRegisters mem; diff --git a/src/common.hpp b/src/common.hpp index 3831175..62fae70 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -38,7 +38,7 @@ namespace RandomX { constexpr int CacheLineSize = 64; constexpr int BlockExpansionRatio = 64; constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount; - constexpr int DatasetIterations = 64; + constexpr int DatasetIterations = 16; constexpr uint32_t CacheSize = CacheBlockCount * CacheLineSize; constexpr uint64_t DatasetSize = (uint64_t)CacheSize * BlockExpansionRatio; @@ -86,16 +86,25 @@ namespace RandomX { return i % RandomX::ProgramLength; } - struct LightClientDataset { - Cache* cache; - uint8_t* block; - uint32_t blockNumber; + class ILightClientAsyncWorker { + public: + virtual void prepareBlock(addr_t) = 0; + virtual void prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) = 0; + virtual const uint64_t* getBlock(addr_t) = 0; + virtual void getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) = 0; + virtual void sync() = 0; + const Cache* getCache() { + return cache; + } + protected: + ILightClientAsyncWorker(const Cache* c) : cache(c) {} + const Cache* 
cache; }; union dataset_t { uint8_t* dataset; Cache* cache; - LightClientDataset* lightDataset; + ILightClientAsyncWorker* asyncWorker; }; struct MemoryRegisters { @@ -112,7 +121,7 @@ namespace RandomX { static_assert(sizeof(RegisterFile) == 3 * RegistersCount * sizeof(convertible_t), "Invalid alignment of struct RandomX::RegisterFile"); - typedef convertible_t(*DatasetReadFunc)(addr_t, MemoryRegisters&); + typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, RegisterFile&); typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*); diff --git a/src/dataset.cpp b/src/dataset.cpp index d9c7b3f..ae31963 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -30,7 +30,7 @@ along with RandomX. If not, see. #if defined(__SSE2__) #include -#define PREFETCH(memory) _mm_prefetch((const char *)((memory).ds.dataset + (memory).ma), _MM_HINT_T0) +#define PREFETCH(memory) _mm_prefetch((const char *)((memory).ds.dataset + (memory).ma), _MM_HINT_NTA) #else #define PREFETCH(memory) #endif @@ -106,32 +106,44 @@ namespace RandomX { template void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); - convertible_t datasetRead(addr_t addr, MemoryRegisters& memory) { - convertible_t data; - data.u64 = *(uint64_t*)(memory.ds.dataset + memory.ma); - memory.ma += 8; + void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { + uint64_t* datasetLine = (uint64_t*)(memory.ds.dataset + memory.ma); memory.mx ^= addr; - if ((memory.mx & 0xFFF8) == 0) { - memory.ma = memory.mx & ~7; - PREFETCH(memory); - } - return data; + memory.mx &= -64; //align to cache line + std::swap(memory.mx, memory.ma); + PREFETCH(memory); + for (int i = 0; i < RegistersCount; ++i) + reg.r[i].u64 ^= datasetLine[i]; } template - convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory) { - convertible_t data; - LightClientDataset* lds = memory.ds.lightDataset; - auto blockNumber = memory.ma / CacheLineSize; - - return data; + void datasetReadLight(addr_t 
addr, MemoryRegisters& memory, RegisterFile& reg) { + Cache* cache = memory.ds.cache; + uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; + initBlock(cache->getCache(), (uint8_t*)datasetLine, memory.ma / CacheLineSize, cache->getKeys()); + for (int i = 0; i < RegistersCount; ++i) + reg.r[i].u64 ^= datasetLine[i]; + memory.mx ^= addr; + memory.mx &= -64; //align to cache line + std::swap(memory.mx, memory.ma); } template - convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory); + void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); template - convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory); + void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); + + void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { + ILightClientAsyncWorker* aw = memory.ds.asyncWorker; + const uint64_t* datasetLine = aw->getBlock(memory.ma); + for (int i = 0; i < RegistersCount; ++i) + reg.r[i].u64 ^= datasetLine[i]; + memory.mx ^= addr; + memory.mx &= -64; //align to cache line + std::swap(memory.mx, memory.ma); + aw->prepareBlock(memory.ma); + } void datasetAlloc(dataset_t& ds, bool largePages) { if (sizeof(size_t) <= 4) diff --git a/src/dataset.hpp b/src/dataset.hpp index 5f9836c..0103271 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -40,12 +40,14 @@ namespace RandomX { template void datasetInit(Cache* cache, dataset_t ds, uint32_t startBlock, uint32_t blockCount); - convertible_t datasetRead(addr_t addr, MemoryRegisters& memory); + void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile&); template void datasetInitCache(const void* seed, dataset_t& dataset); template - convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory); + void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile&); + + void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); } diff --git a/src/instructions.hpp 
b/src/instructions.hpp index 2321be6..dc5d4ee 100644 --- a/src/instructions.hpp +++ b/src/instructions.hpp @@ -22,12 +22,6 @@ along with RandomX. If not, see. namespace RandomX { - //Clears the 11 least-significant bits before conversion. This is done so the number - //fits exactly into the 52-bit mantissa without rounding. - inline double convertSigned52(int64_t x) { - return (double)(x & -2048L); - } - extern "C" { void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c); void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c); @@ -53,11 +47,11 @@ namespace RandomX { void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c); bool JMP_COND(uint8_t, convertible_t&, int32_t); void FPINIT(); + void FPROUND(convertible_t, uint8_t); void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); void FPSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); void FPMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); void FPDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); void FPSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - void FPROUND(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); } } \ No newline at end of file diff --git a/src/instructionsPortable.cpp b/src/instructionsPortable.cpp index 790506b..78bdb6f 100644 --- a/src/instructionsPortable.cpp +++ b/src/instructionsPortable.cpp @@ -370,9 +370,9 @@ namespace RandomX { #endif } - void FPROUND(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { - c.lo.f64 = convertSigned52(a.i64); - switch (a.u64 & 3) { + void FPROUND(convertible_t a, uint8_t rot) { + uint64_t flag = ror64(a.u64, rot); + switch (flag & 3) { case RoundDown: #ifdef DEBUG std::cout << "Round FE_DOWNWARD (" << FE_DOWNWARD << ") = " << diff --git a/src/main.cpp b/src/main.cpp index 6366821..3295500 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -162,7 +162,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash } int main(int argc, char** argv) { - bool softAes, lightClient, genAsm, compiled, help, largePages; 
+ bool softAes, lightClient, genAsm, compiled, help, largePages, async; int programCount, threadCount; readOption("--help", argc, argv, help); @@ -178,6 +178,7 @@ int main(int argc, char** argv) { readIntOption("--threads", argc, argv, threadCount, 1); readIntOption("--nonces", argc, argv, programCount, 1000); readOption("--largePages", argc, argv, largePages); + readOption("--async", argc, argv, async); if (genAsm) { generateAsm(programCount); @@ -250,12 +251,12 @@ int main(int argc, char** argv) { for (int i = 0; i < threadCount; ++i) { RandomX::VirtualMachine* vm; if (compiled) { - vm = new RandomX::CompiledVirtualMachine(softAes); + vm = new RandomX::CompiledVirtualMachine(); } else { - vm = new RandomX::InterpretedVirtualMachine(softAes); + vm = new RandomX::InterpretedVirtualMachine(softAes, async); } - vm->setDataset(dataset, lightClient); + vm->setDataset(dataset); vms.push_back(vm); } std::cout << "Running benchmark (" << programCount << " programs) ..." << std::endl; @@ -278,7 +279,12 @@ int main(int argc, char** argv) { result.print(std::cout); if(programCount == 1000) std::cout << "Reference result: 3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl; - std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl; + if (lightClient) { + std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per program" << std::endl; + } + else { + std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl; + } } catch (std::exception& e) { std::cout << "ERROR: " << e.what() << std::endl; From 4fb168e249d22e565ea9dadd02c2c6b7dda93736 Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 18 Jan 2019 17:57:47 +0100 Subject: [PATCH 19/35] Large page support for cache Bug fixes --- makefile | 13 ++++++++++++- src/Cache.hpp | 24 ++++++++++++++++++++++-- src/JitCompilerX86-static.asm | 4 ++++ src/JitCompilerX86.cpp | 4 ++++ src/LightClientAsyncWorker.cpp | 22 
+++++++++++++++++++--- src/LightClientAsyncWorker.hpp | 10 +++++++++- src/Stopwatch.hpp | 4 ++-- src/common.hpp | 21 +++++++++++---------- src/dataset.cpp | 22 +++++++++++++++++----- src/dataset.hpp | 5 ++++- src/executeProgram-win64.asm | 4 ++++ src/main.cpp | 29 ++++++++++++++++++++++------- src/virtualMemory.cpp | 6 +++++- 13 files changed, 135 insertions(+), 33 deletions(-) diff --git a/makefile b/makefile index d0a969c..0dcd7de 100644 --- a/makefile +++ b/makefile @@ -11,7 +11,7 @@ SRCDIR=src OBJDIR=obj LDFLAGS=-lpthread TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) -ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o divideByConstantCodegen.o) +ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o AddressTransform.o) ifeq ($(PLATFORM),x86_64) ROBJS += $(OBJDIR)/JitCompilerX86-static.o endif @@ -27,6 +27,11 @@ debug: CCFLAGS += -g debug: LDFLAGS += -g debug: $(BINDIR)/randomx +profile: CXXFLAGS += -pg +profile: CCFLAGS += -pg +profile: LDFLAGS += -pg +profile: $(BINDIR)/randomx + test: CXXFLAGS += -O0 test: $(BINDIR)/AluFpuTest @@ -38,6 +43,9 @@ $(BINDIR)/AluFpuTest: $(TOBJS) | $(BINDIR) $(OBJDIR)/TestAluFpu.o: $(addprefix $(SRCDIR)/,TestAluFpu.cpp instructions.hpp Pcg32.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/TestAluFpu.cpp -o $@ + +$(OBJDIR)/AddressTransform.o: $(addprefix $(SRCDIR)/,AddressTransform.cpp InterpretedVirtualMachine.hpp common.hpp) | $(OBJDIR) + $(CXX) $(CXXFLAGS) -c $(SRCDIR)/AddressTransform.cpp -o $@ 
$(OBJDIR)/argon2_core.o: $(addprefix $(SRCDIR)/,argon2_core.c argon2_core.h blake2/blake2.h blake2/blake2-impl.h) | $(OBJDIR) $(CC) $(CCFLAGS) -c $(SRCDIR)/argon2_core.c -o $@ @@ -74,6 +82,9 @@ $(OBJDIR)/Instruction.o: $(addprefix $(SRCDIR)/,Instruction.cpp Instruction.hpp $(OBJDIR)/InterpretedVirtualMachine.o: $(addprefix $(SRCDIR)/,InterpretedVirtualMachine.cpp InterpretedVirtualMachine.hpp Pcg32.hpp instructions.hpp instructionWeights.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/InterpretedVirtualMachine.cpp -o $@ + +$(OBJDIR)/LightClientAsyncWorker.o: $(addprefix $(SRCDIR)/,LightClientAsyncWorker.cpp LightClientAsyncWorker.hpp common.hpp) | $(OBJDIR) + $(CXX) $(CXXFLAGS) -c $(SRCDIR)/LightClientAsyncWorker.cpp -o $@ $(OBJDIR)/main.o: $(addprefix $(SRCDIR)/,main.cpp InterpretedVirtualMachine.hpp Stopwatch.hpp blake2/blake2.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/main.cpp -o $@ diff --git a/src/Cache.hpp b/src/Cache.hpp index 8a2b93a..bc3d6ed 100644 --- a/src/Cache.hpp +++ b/src/Cache.hpp @@ -23,12 +23,32 @@ along with RandomX. If not, see. 
#include #include "common.hpp" #include "dataset.hpp" +#include "virtualMemory.hpp" namespace RandomX { class Cache { public: - void* operator new(size_t size) { + static void* alloc(bool largePages) { + if (largePages) { + return allocLargePagesMemory(sizeof(Cache)); + } + else { + void* ptr = _mm_malloc(sizeof(Cache), sizeof(__m128i)); + if (ptr == nullptr) + throw std::bad_alloc(); + return ptr; + } + } + static void dealloc(Cache* cache, bool largePages) { + if (largePages) { + //allocLargePagesMemory(sizeof(Cache)); + } + else { + _mm_free(cache); + } + } + /*void* operator new(size_t size) { void* ptr = _mm_malloc(size, sizeof(__m128i)); if (ptr == nullptr) throw std::bad_alloc(); @@ -37,7 +57,7 @@ namespace RandomX { void operator delete(void* ptr) { _mm_free(ptr); - } + }*/ template void initialize(const void* seed, size_t seedSize); diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index cbbf658..031c2e4 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -15,6 +15,8 @@ ;# You should have received a copy of the GNU General Public License ;# along with RandomX. If not, see. +IFDEF RAX + _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE PUBLIC randomx_program_prologue @@ -57,4 +59,6 @@ randomx_program_transform ENDP _RANDOMX_JITX86_STATIC ENDS +ENDIF + END \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index b41f7b5..f828d0a 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -46,6 +46,10 @@ namespace RandomX { void JitCompilerX86::generateProgram(Pcg32& gen) { } + + size_t JitCompilerX86::getCodeSize() { + return 0; + } #else /* diff --git a/src/LightClientAsyncWorker.cpp b/src/LightClientAsyncWorker.cpp index c069f3f..32aa508 100644 --- a/src/LightClientAsyncWorker.cpp +++ b/src/LightClientAsyncWorker.cpp @@ -24,12 +24,19 @@ along with RandomX. If not, see. 
namespace RandomX { template - LightClientAsyncWorker::LightClientAsyncWorker(const Cache* c) : ILightClientAsyncWorker(c), output(nullptr), hasWork(false), workerThread(&LightClientAsyncWorker::runWorker, this) { + LightClientAsyncWorker::LightClientAsyncWorker(const Cache* c) : ILightClientAsyncWorker(c), output(nullptr), hasWork(false), +#ifdef TRACE + sw(true), +#endif + workerThread(&LightClientAsyncWorker::runWorker, this) { } template void LightClientAsyncWorker::prepareBlock(addr_t addr) { +#ifdef TRACE + std::cout << sw.getElapsed() << ": prepareBlock-enter " << addr << std::endl; +#endif { std::lock_guard lk(mutex); startBlock = addr / CacheLineSize; @@ -37,6 +44,9 @@ namespace RandomX { output = currentLine.data(); hasWork = true; } +#ifdef TRACE + std::cout << sw.getElapsed() << ": prepareBlock-notify " << startBlock << "/" << blockCount << std::endl; +#endif notifier.notify_all(); } @@ -54,10 +64,13 @@ namespace RandomX { template void LightClientAsyncWorker::prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { +#ifdef TRACE + std::cout << sw.getElapsed() << ": prepareBlocks-enter " << startBlock << "/" << blockCount << std::endl; +#endif { std::lock_guard lk(mutex); - startBlock = startBlock; - blockCount = blockCount; + this->startBlock = startBlock; + this->blockCount = blockCount; output = out; hasWork = true; } @@ -79,6 +92,9 @@ namespace RandomX { template void LightClientAsyncWorker::runWorker() { +#ifdef TRACE + std::cout << sw.getElapsed() << ": runWorker-enter " << std::endl; +#endif for (;;) { std::unique_lock lk(mutex); notifier.wait(lk, [this] { return hasWork; }); diff --git a/src/LightClientAsyncWorker.hpp b/src/LightClientAsyncWorker.hpp index 7596fd5..29571e5 100644 --- a/src/LightClientAsyncWorker.hpp +++ b/src/LightClientAsyncWorker.hpp @@ -17,12 +17,17 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. 
*/ +//#define TRACE #include "common.hpp" #include #include #include #include +#ifdef TRACE +#include "Stopwatch.hpp" +#include +#endif namespace RandomX { @@ -43,10 +48,13 @@ namespace RandomX { void runWorker(); std::condition_variable notifier; std::mutex mutex; - DatasetLine currentLine; + alignas(16) DatasetLine currentLine; void* output; uint32_t startBlock, blockCount; bool hasWork; +#ifdef TRACE + Stopwatch sw; +#endif std::thread workerThread; }; } \ No newline at end of file diff --git a/src/Stopwatch.hpp b/src/Stopwatch.hpp index 4f3a5a1..931bc02 100644 --- a/src/Stopwatch.hpp +++ b/src/Stopwatch.hpp @@ -53,7 +53,7 @@ public: isRunning = false; } } - double getElapsed() { + double getElapsed() const { return getElapsedNanosec() / 1e+9; } private: @@ -63,7 +63,7 @@ private: uint64_t elapsed; bool isRunning; - uint64_t getElapsedNanosec() { + uint64_t getElapsedNanosec() const { uint64_t elns = elapsed; if (isRunning) { chrono_t endMark = std::chrono::high_resolution_clock::now(); diff --git a/src/common.hpp b/src/common.hpp index 62fae70..fea337f 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -34,20 +34,21 @@ namespace RandomX { constexpr int SeedSize = 32; constexpr int ResultSize = 32; - constexpr int CacheBlockCount = 1024 * 1024; - constexpr int CacheLineSize = 64; - constexpr int BlockExpansionRatio = 64; - constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount; - constexpr int DatasetIterations = 16; - constexpr uint32_t CacheSize = CacheBlockCount * CacheLineSize; - constexpr uint64_t DatasetSize = (uint64_t)CacheSize * BlockExpansionRatio; - - constexpr int ArgonIterations = 12; - constexpr uint32_t ArgonMemorySize = 65536; //KiB + constexpr int ArgonIterations = 6; + constexpr uint32_t ArgonMemorySize = 131072; //KiB constexpr int ArgonLanes = 1; const char ArgonSalt[] = "Monero\x1A$"; constexpr int ArgonSaltSize = sizeof(ArgonSalt) - 1; + constexpr int CacheLineSize = 64; + constexpr uint64_t DatasetSize = 4ULL * 1024 * 
1024 * 1024; //4 GiB + constexpr uint32_t CacheSize = ArgonMemorySize * 1024; + constexpr int CacheBlockCount = CacheSize / CacheLineSize; + constexpr int BlockExpansionRatio = DatasetSize / CacheSize; + constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount; + constexpr int DatasetIterations = 32; + + #ifdef TRACE constexpr bool trace = true; #else diff --git a/src/dataset.cpp b/src/dataset.cpp index ae31963..2ef6e7f 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -67,7 +67,7 @@ namespace RandomX { //block number 0..67108863 //Initialization vector = block number extended to 128 bits iv = _mm_cvtsi32_si128(blockNumber); - uint32_t cacheBlockNumber = blockNumber / BlockExpansionRatio; //0..1048575 + uint32_t cacheBlockNumber = blockNumber / BlockExpansionRatio; //0..2097151 __m128i* cacheCacheLine = (__m128i*)(in + cacheBlockNumber * CacheLineSize); __m128i* datasetCacheLine = (__m128i*)out; @@ -173,14 +173,26 @@ namespace RandomX { void datasetInit(Cache*, dataset_t, uint32_t, uint32_t); template - void datasetInitCache(const void* seed, dataset_t& ds) { - ds.cache = new Cache(); + void datasetInitCache(const void* seed, dataset_t& ds, bool largePages) { + ds.cache = new(Cache::alloc(largePages)) Cache(); ds.cache->initialize(seed, SeedSize); } template - void datasetInitCache(const void*, dataset_t&); + void datasetInitCache(const void*, dataset_t&, bool); template - void datasetInitCache(const void*, dataset_t&); + void datasetInitCache(const void*, dataset_t&, bool); + + template + void aesBench(uint32_t blockCount) { + alignas(16) KeysContainer keys; + alignas(16) uint8_t buffer[CacheLineSize]; + for (uint32_t block = 0; block < blockCount; ++block) { + initBlock(buffer, buffer, 0, keys); + } + } + + template void aesBench(uint32_t blockCount); + template void aesBench(uint32_t blockCount); } diff --git a/src/dataset.hpp b/src/dataset.hpp index 0103271..bdd34d3 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -43,11 +43,14 @@ 
namespace RandomX { void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile&); template - void datasetInitCache(const void* seed, dataset_t& dataset); + void datasetInitCache(const void* seed, dataset_t& dataset, bool largePages); template void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile&); void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); + + template + void aesBench(uint32_t blockCount); } diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 2cc98fb..2da88b5 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -15,6 +15,8 @@ ;# You should have received a copy of the GNU General Public License ;# along with RandomX. If not, see. +IFDEF RAX + _RANDOMX_EXECUTE_PROGRAM SEGMENT PAGE READ EXECUTE PUBLIC executeProgram @@ -252,4 +254,6 @@ executeProgram ENDP _RANDOMX_EXECUTE_PROGRAM ENDS +ENDIF + END diff --git a/src/main.cpp b/src/main.cpp index 3295500..db3850e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -162,7 +162,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash } int main(int argc, char** argv) { - bool softAes, lightClient, genAsm, compiled, help, largePages, async; + bool softAes, lightClient, genAsm, compiled, help, largePages, async, aesBench; int programCount, threadCount; readOption("--help", argc, argv, help); @@ -179,29 +179,44 @@ int main(int argc, char** argv) { readIntOption("--nonces", argc, argv, programCount, 1000); readOption("--largePages", argc, argv, largePages); readOption("--async", argc, argv, async); + readOption("--aesBench", argc, argv, aesBench); if (genAsm) { generateAsm(programCount); return 0; } + if (softAes) + std::cout << "Using software AES." 
<< std::endl; + + if(aesBench) { + programCount *= 10; + Stopwatch sw(true); + if (softAes) { + RandomX::aesBench(programCount); + } + else { + RandomX::aesBench(programCount); + } + sw.stop(); + std::cout << "AES performance: " << programCount / sw.getElapsed() << " blocks/s" << std::endl; + return 0; + } + std::atomic atomicNonce(0); AtomicHash result; std::vector vms; std::vector threads; RandomX::dataset_t dataset; - if (softAes) - std::cout << "Using software AES." << std::endl; std::cout << "Initializing..." << std::endl; - try { Stopwatch sw(true); if (softAes) { - RandomX::datasetInitCache(seed, dataset); + RandomX::datasetInitCache(seed, dataset, largePages); } else { - RandomX::datasetInitCache(seed, dataset); + RandomX::datasetInitCache(seed, dataset, largePages); } if (RandomX::trace) { std::cout << "Keys: " << std::endl; @@ -243,7 +258,7 @@ int main(int argc, char** argv) { RandomX::datasetInit(cache, dataset, 0, RandomX::DatasetBlockCount); } } - delete cache; + RandomX::Cache::dealloc(cache, largePages); threads.clear(); std::cout << "Dataset (4 GiB) initialized in " << sw.getElapsed() << " s" << std::endl; } diff --git a/src/virtualMemory.cpp b/src/virtualMemory.cpp index e6e44fc..f324e95 100644 --- a/src/virtualMemory.cpp +++ b/src/virtualMemory.cpp @@ -88,11 +88,15 @@ void* allocExecutableMemory(std::size_t bytes) { return mem; } +constexpr std::size_t align(std::size_t pos, uint32_t align) { + return ((pos - 1) / align + 1) * align; +} + void* allocLargePagesMemory(std::size_t bytes) { void* mem; #ifdef _WIN32 setPrivilege("SeLockMemoryPrivilege", 1); - mem = VirtualAlloc(NULL, bytes, MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE); + mem = VirtualAlloc(NULL, align(bytes, 2 * 1024 * 1024), MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE); if (mem == nullptr) throw std::runtime_error(getErrorMessage("allocLargePagesMemory - VirtualAlloc")); #else From 89bc68d093b872f8479f593f89148226961e4b93 Mon Sep 17 00:00:00 2001 From: 
tevador Date: Fri, 18 Jan 2019 18:44:06 +0100 Subject: [PATCH 20/35] Memory-bound dataset initialization --- src/common.hpp | 6 ++--- src/dataset.cpp | 61 +++++++++++++++++++++++++------------------------ 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/src/common.hpp b/src/common.hpp index fea337f..e0d4116 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -34,8 +34,8 @@ namespace RandomX { constexpr int SeedSize = 32; constexpr int ResultSize = 32; - constexpr int ArgonIterations = 6; - constexpr uint32_t ArgonMemorySize = 131072; //KiB + constexpr int ArgonIterations = 3; + constexpr uint32_t ArgonMemorySize = 262144; //KiB constexpr int ArgonLanes = 1; const char ArgonSalt[] = "Monero\x1A$"; constexpr int ArgonSaltSize = sizeof(ArgonSalt) - 1; @@ -46,7 +46,7 @@ namespace RandomX { constexpr int CacheBlockCount = CacheSize / CacheLineSize; constexpr int BlockExpansionRatio = DatasetSize / CacheSize; constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount; - constexpr int DatasetIterations = 32; + constexpr int DatasetIterations = 10; #ifdef TRACE diff --git a/src/dataset.cpp b/src/dataset.cpp index 2ef6e7f..e2b4b54 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -62,42 +62,43 @@ namespace RandomX { x3 = aesenc(x3, keys[i]) template - void initBlock(const uint8_t* in, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) { - __m128i x0, x1, x2, x3, iv; - //block number 0..67108863 - //Initialization vector = block number extended to 128 bits - iv = _mm_cvtsi32_si128(blockNumber); - uint32_t cacheBlockNumber = blockNumber / BlockExpansionRatio; //0..2097151 - __m128i* cacheCacheLine = (__m128i*)(in + cacheBlockNumber * CacheLineSize); - __m128i* datasetCacheLine = (__m128i*)out; + void initBlock(const uint8_t* intermediate, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) { + __m128i x0, x1, x2, x3; - x0 = _mm_load_si128(cacheCacheLine + 0); - x1 = _mm_load_si128(cacheCacheLine + 1); - x2 = 
_mm_load_si128(cacheCacheLine + 2); - x3 = _mm_load_si128(cacheCacheLine + 3); + __m128i* xit = (__m128i*)intermediate; + __m128i* xout = (__m128i*)out; - x0 = _mm_xor_si128(x0, iv); - x1 = _mm_xor_si128(x1, iv); - x2 = _mm_xor_si128(x2, iv); - x3 = _mm_xor_si128(x3, iv); + x0 = _mm_cvtsi32_si128(blockNumber); + constexpr int mask = (CacheSize / CacheLineSize) - 1; for (auto i = 0; i < DatasetIterations; ++i) { - AES_ROUND(0); - AES_ROUND(1); - AES_ROUND(2); - AES_ROUND(3); - AES_ROUND(4); - AES_ROUND(5); - AES_ROUND(6); - AES_ROUND(7); - AES_ROUND(8); - AES_ROUND(9); + x0 = aesenc(x0, keys[0]); + x0 = aesenc(x0, keys[1]); + x1 = aesenc(x0, keys[2]); + x1 = aesenc(x1, keys[3]); + x2 = aesenc(x1, keys[4]); + x2 = aesenc(x2, keys[5]); + x3 = aesenc(x2, keys[6]); + x3 = aesenc(x3, keys[7]); + + int index = _mm_cvtsi128_si32(x3); + index &= mask; + + __m128i t0 = _mm_load_si128(xit + 4 * index + 0); + __m128i t1 = _mm_load_si128(xit + 4 * index + 1); + __m128i t2 = _mm_load_si128(xit + 4 * index + 2); + __m128i t3 = _mm_load_si128(xit + 4 * index + 3); + + x0 = _mm_xor_si128(x0, t0); + x1 = _mm_xor_si128(x1, t1); + x2 = _mm_xor_si128(x2, t2); + x3 = _mm_xor_si128(x3, t3); } - _mm_store_si128(datasetCacheLine + 0, x0); - _mm_store_si128(datasetCacheLine + 1, x1); - _mm_store_si128(datasetCacheLine + 2, x2); - _mm_store_si128(datasetCacheLine + 3, x3); + _mm_store_si128(xout + 0, x0); + _mm_store_si128(xout + 1, x1); + _mm_store_si128(xout + 2, x2); + _mm_store_si128(xout + 3, x3); } template From 93c324709bfad810d0884d17a9af7ea9aab3133d Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 18 Jan 2019 19:06:46 +0100 Subject: [PATCH 21/35] Related to previous changes --- src/main.cpp | 2 +- src/program.inc | 272 ++++++++++++++++++++++++------------------------ 2 files changed, 137 insertions(+), 137 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index db3850e..6ac64b7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -229,7 +229,7 @@ int main(int argc, char** argv) { 
std::cout << std::endl; } if (lightClient) { - std::cout << "Cache (64 MiB) initialized in " << sw.getElapsed() << " s" << std::endl; + std::cout << "Cache (256 MiB) initialized in " << sw.getElapsed() << " s" << std::endl; } else { RandomX::Cache* cache = dataset.cache; diff --git a/src/program.inc b/src/program.inc index 4437f97..afc9573 100644 --- a/src/program.inc +++ b/src/program.inc @@ -3,11 +3,11 @@ rx_i_0: ;CALL jz rx_finish xor r9, 0ca9788ah mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_0 call rx_read rx_body_0: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -50,11 +50,11 @@ rx_i_2: ;JUMP jz rx_finish xor r15, 097210f7bh mov eax, r15d - xor rbp, rax test bl, 63 jnz short rx_body_2 call rx_read rx_body_2: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -70,11 +70,11 @@ rx_i_3: ;FPDIV jz rx_finish xor r13, 082c73195h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_3 call rx_read rx_body_3: + xor rbp, rax and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm9 @@ -112,11 +112,11 @@ rx_i_5: ;IMUL_32 jz rx_finish xor r15, 0379f9ee0h mov eax, r15d - xor rbp, rax test bl, 63 jnz short rx_body_5 call rx_read rx_body_5: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax @@ -183,11 +183,11 @@ rx_i_9: ;IDIV_64 jz rx_finish xor r14, 085121c54h mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_9 call rx_read rx_body_9: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] ; magic divide by 565870810 @@ -255,11 +255,11 @@ rx_i_13: ;FPADD jz rx_finish xor r12, 061c0d34dh mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_13 call rx_read rx_body_13: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 @@ -270,11 +270,11 @@ rx_i_14: ;XOR_64 jz rx_finish xor r10, 0e761d1beh mov eax, r10d - xor rbp, rax test bl, 63 jnz short rx_body_14 call rx_read rx_body_14: + xor rbp, rax and eax, 2047 mov rax, qword 
ptr [rsi+rax*8] xor rax, r9 @@ -366,11 +366,11 @@ rx_i_19: ;FPSUB jz rx_finish xor r13, 0ac009c30h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_19 call rx_read rx_body_19: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm8 @@ -399,11 +399,11 @@ rx_i_21: ;ROR_64 jz rx_finish xor r8, 0977f0284h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_21 call rx_read rx_body_21: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, r9 @@ -419,11 +419,11 @@ rx_i_22: ;ADD_64 jz rx_finish xor r13, 080bdfefah mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_22 call rx_read rx_body_22: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] add rax, r8 @@ -452,11 +452,11 @@ rx_i_24: ;DIV_64 jz rx_finish xor r8, 070d3b8c7h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_24 call rx_read rx_body_24: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov ecx, 1 @@ -476,11 +476,11 @@ rx_i_25: ;FPMUL jz rx_finish xor r12, 01cf77a04h mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_25 call rx_read rx_body_25: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm9 @@ -577,11 +577,11 @@ rx_i_31: ;ROR_64 jz rx_finish xor r14, 0d352ce37h mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_31 call rx_read rx_body_31: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ror rax, 55 @@ -620,11 +620,11 @@ rx_i_34: ;CALL jz rx_finish xor r13, 0665e91f1h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_34 call rx_read rx_body_34: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r15, rax @@ -702,11 +702,11 @@ rx_i_39: ;ADD_64 jz rx_finish xor r14, 02c1f1eb0h mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_39 call rx_read rx_body_39: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] add rax, r14 @@ -752,11 +752,11 @@ rx_i_42: ;FPADD jz rx_finish xor r15, 0bc1de9f6h mov eax, r15d - xor rbp, rax test 
bl, 63 jnz short rx_body_42 call rx_read rx_body_42: + xor rbp, rax and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 @@ -800,11 +800,11 @@ rx_i_45: ;FPSUB jz rx_finish xor r12, 08cd244ebh mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_45 call rx_read rx_body_45: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 @@ -833,11 +833,11 @@ rx_i_47: ;JUMP jz rx_finish xor r12, 05ba232c6h mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_47 call rx_read rx_body_47: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -888,11 +888,11 @@ rx_i_50: ;AND_64 jz rx_finish xor r9, 0da3e4842h mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_50 call rx_read rx_body_50: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] and rax, r10 @@ -1167,11 +1167,11 @@ rx_i_66: ;FPDIV jz rx_finish xor r15, 015a1b689h mov eax, r15d - xor rbp, rax test bl, 63 jnz short rx_body_66 call rx_read rx_body_66: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm3 @@ -1222,11 +1222,11 @@ rx_i_69: ;FPADD jz rx_finish xor r15, 0376c9c27h mov eax, r15d - xor rbp, rax test bl, 63 jnz short rx_body_69 call rx_read rx_body_69: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm5 @@ -1237,11 +1237,11 @@ rx_i_70: ;MULH_64 jz rx_finish xor r8, 0bbbec3fah mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_70 call rx_read rx_body_70: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, r9 @@ -1307,11 +1307,11 @@ rx_i_74: ;MUL_64 jz rx_finish xor r8, 04c4b0c7fh mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_74 call rx_read rx_body_74: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r13 @@ -1539,11 +1539,11 @@ rx_i_87: ;SUB_64 jz rx_finish xor r9, 0d75a0ecfh mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_87 call rx_read rx_body_87: + xor rbp, rax and eax, 2047 mov rax, qword ptr 
[rsi+rax*8] sub rax, r12 @@ -1554,11 +1554,11 @@ rx_i_88: ;ROR_64 jz rx_finish xor r9, 031bb7f7ah mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_88 call rx_read rx_body_88: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, r14 @@ -1602,11 +1602,11 @@ rx_i_91: ;FPMUL jz rx_finish xor r9, 042e28e94h mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_91 call rx_read rx_body_91: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm2 @@ -1635,11 +1635,11 @@ rx_i_93: ;FPADD jz rx_finish xor r8, 0bfcebaf4h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_93 call rx_read rx_body_93: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm2 @@ -1654,11 +1654,11 @@ rx_i_94: ;CALL jz rx_finish xor r13, 0ea326630h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_94 call rx_read rx_body_94: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov r8, rax @@ -1689,11 +1689,11 @@ rx_i_96: ;IMUL_32 jz rx_finish xor r11, 04f912ef8h mov eax, r11d - xor rbp, rax test bl, 63 jnz short rx_body_96 call rx_read rx_body_96: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax @@ -1847,11 +1847,11 @@ rx_i_105: ;MUL_32 jz rx_finish xor r13, 036a51f72h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_105 call rx_read rx_body_105: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov ecx, eax @@ -1929,11 +1929,11 @@ rx_i_109: ;ROR_64 jz rx_finish xor r15, 0594e37deh mov eax, r15d - xor rbp, rax test bl, 63 jnz short rx_body_109 call rx_read rx_body_109: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, r10 @@ -2136,11 +2136,11 @@ rx_i_120: ;FPADD jz rx_finish xor r12, 0e5561e3eh mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_120 call rx_read rx_body_120: + xor rbp, rax and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm4 @@ -2218,11 +2218,11 @@ rx_i_125: ;IMUL_32 jz rx_finish xor r8, 
0ebec27cdh mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_125 call rx_read rx_body_125: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax @@ -2268,11 +2268,11 @@ rx_i_128: ;MUL_64 jz rx_finish xor r13, 0459f1154h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_128 call rx_read rx_body_128: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r9 @@ -2427,11 +2427,11 @@ rx_i_137: ;SHR_64 jz rx_finish xor r11, 015a24231h mov eax, r11d - xor rbp, rax test bl, 63 jnz short rx_body_137 call rx_read rx_body_137: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov rcx, r9 @@ -2459,11 +2459,11 @@ rx_i_139: ;ADD_64 jz rx_finish xor r9, 093172470h mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_139 call rx_read rx_body_139: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r8 @@ -2494,11 +2494,11 @@ rx_i_141: ;FPADD jz rx_finish xor r8, 02f636da1h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_141 call rx_read rx_body_141: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm2 @@ -2589,11 +2589,11 @@ rx_i_146: ;IMULH_64 jz rx_finish xor r13, 02327e6e2h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_146 call rx_read rx_body_146: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, r12 @@ -2606,11 +2606,11 @@ rx_i_147: ;MUL_64 jz rx_finish xor r13, 03a7df043h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_147 call rx_read rx_body_147: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r11 @@ -2839,11 +2839,11 @@ rx_i_160: ;SUB_64 jz rx_finish xor r14, 0b1685b90h mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_160 call rx_read rx_body_160: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub rax, r14 @@ -2898,11 +2898,11 @@ rx_i_163: ;SUB_64 jz rx_finish xor r12, 0e3486c0ah mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_163 call rx_read 
rx_body_163: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub rax, r8 @@ -2917,11 +2917,11 @@ rx_i_164: ;MUL_32 jz rx_finish xor r12, 01f0c2737h mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_164 call rx_read rx_body_164: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov ecx, eax @@ -3086,11 +3086,11 @@ rx_i_173: ;MUL_64 jz rx_finish xor r14, 05422cf8fh mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_173 call rx_read rx_body_173: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, rax, -1386172772 @@ -3224,11 +3224,11 @@ rx_i_181: ;CALL jz rx_finish xor r10, 023c7845fh mov eax, r10d - xor rbp, rax test bl, 63 jnz short rx_body_181 call rx_read rx_body_181: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r10, rax @@ -3241,11 +3241,11 @@ rx_i_182: ;FPSUB jz rx_finish xor r8, 0f8884327h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_182 call rx_read rx_body_182: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 @@ -3270,11 +3270,11 @@ rx_i_184: ;XOR_32 jz rx_finish xor r12, 04764cdf7h mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_184 call rx_read rx_body_184: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] xor eax, r13d @@ -3304,11 +3304,11 @@ rx_i_186: ;OR_64 jz rx_finish xor r9, 0cded414bh mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_186 call rx_read rx_body_186: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] or rax, -1252263008 @@ -3340,11 +3340,11 @@ rx_i_188: ;FPSUB jz rx_finish xor r9, 04659becbh mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_188 call rx_read rx_body_188: + xor rbp, rax and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm3 @@ -3493,11 +3493,11 @@ rx_i_197: ;MUL_64 jz rx_finish xor r12, 0229208efh mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_197 call rx_read rx_body_197: + xor rbp, rax and eax, 32767 mov rax, qword ptr 
[rsi+rax*8] imul rax, r15 @@ -3566,11 +3566,11 @@ rx_i_201: ;FPADD jz rx_finish xor r8, 0cdda801dh mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_201 call rx_read rx_body_201: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 @@ -3635,11 +3635,11 @@ rx_i_205: ;FPMUL jz rx_finish xor r14, 094e997c5h mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_205 call rx_read rx_body_205: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm8 @@ -3653,11 +3653,11 @@ rx_i_206: ;FPSUB jz rx_finish xor r11, 0e836a177h mov eax, r11d - xor rbp, rax test bl, 63 jnz short rx_body_206 call rx_read rx_body_206: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 @@ -3668,11 +3668,11 @@ rx_i_207: ;IDIV_64 jz rx_finish xor r9, 039ccdd30h mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_207 call rx_read rx_body_207: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ; magic divide by 314297476 @@ -3708,11 +3708,11 @@ rx_i_209: ;XOR_64 jz rx_finish xor r8, 0b84811f1h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_209 call rx_read rx_body_209: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] xor rax, r15 @@ -3727,11 +3727,11 @@ rx_i_210: ;MUL_32 jz rx_finish xor r12, 0c5efc90ah mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_210 call rx_read rx_body_210: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov ecx, eax @@ -3767,11 +3767,11 @@ rx_i_212: ;MUL_64 jz rx_finish xor r13, 06b465fdbh mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_212 call rx_read rx_body_212: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r13 @@ -3786,11 +3786,11 @@ rx_i_213: ;IMUL_32 jz rx_finish xor r13, 02dd1d503h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_213 call rx_read rx_body_213: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax @@ -3818,11 +3818,11 @@ rx_i_215: 
;ADD_32 jz rx_finish xor r15, 08359265eh mov eax, r15d - xor rbp, rax test bl, 63 jnz short rx_body_215 call rx_read rx_body_215: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] add eax, r12d @@ -3971,11 +3971,11 @@ rx_i_223: ;FPSUB jz rx_finish xor r8, 01e5cc085h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_223 call rx_read rx_body_223: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm3 @@ -4126,11 +4126,11 @@ rx_i_231: ;RET jz rx_finish xor r9, 0bb56428dh mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_231 call rx_read rx_body_231: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -4164,11 +4164,11 @@ rx_i_233: ;JUMP jz rx_finish xor r13, 08eb2cd76h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_233 call rx_read rx_body_233: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r14, rax @@ -4246,11 +4246,11 @@ rx_i_238: ;FPADD jz rx_finish xor r8, 0158f119fh mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_238 call rx_read rx_body_238: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 @@ -4279,11 +4279,11 @@ rx_i_240: ;IMULH_64 jz rx_finish xor r9, 0d65d29f9h mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_240 call rx_read rx_body_240: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov rcx, r14 @@ -4367,11 +4367,11 @@ rx_i_245: ;AND_32 jz rx_finish xor r13, 084505739h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_245 call rx_read rx_body_245: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] and eax, r10d @@ -4386,11 +4386,11 @@ rx_i_246: ;IDIV_64 jz rx_finish xor r15, 027eeaa2eh mov eax, r15d - xor rbp, rax test bl, 63 jnz short rx_body_246 call rx_read rx_body_246: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ; magic divide by -156808488 @@ -4428,11 +4428,11 @@ rx_i_248: ;MUL_32 jz rx_finish xor r8, 0649df46fh mov eax, r8d - xor rbp, rax test 
bl, 63 jnz short rx_body_248 call rx_read rx_body_248: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov ecx, eax @@ -4449,11 +4449,11 @@ rx_i_249: ;IMULH_64 jz rx_finish xor r15, 0499552cch mov eax, r15d - xor rbp, rax test bl, 63 jnz short rx_body_249 call rx_read rx_body_249: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, -508571655 @@ -4579,11 +4579,11 @@ rx_i_256: ;MULH_64 jz rx_finish xor r8, 08375472ch mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_256 call rx_read rx_body_256: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, r15 @@ -4618,11 +4618,11 @@ rx_i_258: ;MUL_32 jz rx_finish xor r11, 064fdbda0h mov eax, r11d - xor rbp, rax test bl, 63 jnz short rx_body_258 call rx_read rx_body_258: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov ecx, eax @@ -4653,11 +4653,11 @@ rx_i_260: ;FPSUB jz rx_finish xor r13, 0f94e9fa9h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_260 call rx_read rx_body_260: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm5 @@ -4668,11 +4668,11 @@ rx_i_261: ;FPDIV jz rx_finish xor r14, 02346171ch mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_261 call rx_read rx_body_261: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm3 @@ -4690,11 +4690,11 @@ rx_i_262: ;AND_64 jz rx_finish xor r10, 01c42baa6h mov eax, r10d - xor rbp, rax test bl, 63 jnz short rx_body_262 call rx_read rx_body_262: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] and rax, -1569587450 @@ -4709,11 +4709,11 @@ rx_i_263: ;FPMUL jz rx_finish xor r11, 0b39b140h mov eax, r11d - xor rbp, rax test bl, 63 jnz short rx_body_263 call rx_read rx_body_263: + xor rbp, rax and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm8 @@ -4744,11 +4744,11 @@ rx_i_265: ;FPADD jz rx_finish xor r13, 07a3eb340h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_265 call rx_read rx_body_265: + xor 
rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm8 @@ -4794,11 +4794,11 @@ rx_i_268: ;JUMP jz rx_finish xor r12, 0c2510cebh mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_268 call rx_read rx_body_268: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r13, rax @@ -4879,11 +4879,11 @@ rx_i_273: ;JUMP jz rx_finish xor r9, 0d315e4dch mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_273 call rx_read rx_body_273: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -4917,11 +4917,11 @@ rx_i_275: ;IDIV_64 jz rx_finish xor r10, 0788eceb7h mov eax, r10d - xor rbp, rax test bl, 63 jnz short rx_body_275 call rx_read rx_body_275: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ; magic divide by -333089764 @@ -4939,11 +4939,11 @@ rx_i_276: ;JUMP jz rx_finish xor r9, 0c6ac5edah mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_276 call rx_read rx_body_276: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -5072,11 +5072,11 @@ rx_i_283: ;ADD_64 jz rx_finish xor r9, 0df4d084fh mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_283 call rx_read rx_body_283: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] add rax, r12 @@ -5091,11 +5091,11 @@ rx_i_284: ;FPADD jz rx_finish xor r15, 0e68f36ach mov eax, r15d - xor rbp, rax test bl, 63 jnz short rx_body_284 call rx_read rx_body_284: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 @@ -5145,11 +5145,11 @@ rx_i_287: ;IDIV_64 jz rx_finish xor r11, 049547c9ch mov eax, r11d - xor rbp, rax test bl, 63 jnz short rx_body_287 call rx_read rx_body_287: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ; magic divide by 1227278330 @@ -5289,11 +5289,11 @@ rx_i_295: ;FPSUB jz rx_finish xor r9, 0f42798fdh mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_295 call rx_read rx_body_295: + xor rbp, rax and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] 
subpd xmm0, xmm8 @@ -5346,11 +5346,11 @@ rx_i_299: ;ADD_64 jz rx_finish xor r12, 042f4897h mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_299 call rx_read rx_body_299: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r10 @@ -5365,11 +5365,11 @@ rx_i_300: ;FPSUB jz rx_finish xor r12, 095765693h mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_300 call rx_read rx_body_300: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm8 @@ -5401,11 +5401,11 @@ rx_i_302: ;ADD_64 jz rx_finish xor r15, 0f6f8c345h mov eax, r15d - xor rbp, rax test bl, 63 jnz short rx_body_302 call rx_read rx_body_302: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r10 @@ -5632,11 +5632,11 @@ rx_i_316: ;RET jz rx_finish xor r14, 05b0cb5bbh mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_316 call rx_read rx_body_316: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -5762,11 +5762,11 @@ rx_i_323: ;MULH_64 jz rx_finish xor r14, 07b07664bh mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_323 call rx_read rx_body_323: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov rcx, r14 @@ -5818,11 +5818,11 @@ rx_i_326: ;MULH_64 jz rx_finish xor r11, 0d1b27540h mov eax, r11d - xor rbp, rax test bl, 63 jnz short rx_body_326 call rx_read rx_body_326: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, -1233771581 @@ -5839,11 +5839,11 @@ rx_i_327: ;IDIV_64 jz rx_finish xor r9, 09665f98dh mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_327 call rx_read rx_body_327: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ; magic divide by 1572662125 @@ -5893,11 +5893,11 @@ rx_i_330: ;IMUL_32 jz rx_finish xor r9, 0f6a93f19h mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_330 call rx_read rx_body_330: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax @@ -5914,11 +5914,11 @@ rx_i_331: ;FPADD jz 
rx_finish xor r9, 0bc9bbe4ah mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_331 call rx_read rx_body_331: + xor rbp, rax and eax, 32767 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 @@ -5929,11 +5929,11 @@ rx_i_332: ;FPADD jz rx_finish xor r12, 0f253cd4eh mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_332 call rx_read rx_body_332: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 @@ -5948,11 +5948,11 @@ rx_i_333: ;OR_64 jz rx_finish xor r14, 0f009758bh mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_333 call rx_read rx_body_333: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] or rax, r12 @@ -5963,11 +5963,11 @@ rx_i_334: ;ADD_64 jz rx_finish xor r8, 0dda04168h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_334 call rx_read rx_body_334: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] add rax, r13 @@ -5996,11 +5996,11 @@ rx_i_336: ;ROR_64 jz rx_finish xor r15, 0aea0a435h mov eax, r15d - xor rbp, rax test bl, 63 jnz short rx_body_336 call rx_read rx_body_336: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ror rax, 42 @@ -6033,11 +6033,11 @@ rx_i_338: ;MUL_64 jz rx_finish xor r12, 0d428a742h mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_338 call rx_read rx_body_338: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r12 @@ -6110,11 +6110,11 @@ rx_i_343: ;XOR_64 jz rx_finish xor r14, 056f6cf0bh mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_343 call rx_read rx_body_343: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] xor rax, r13 @@ -6261,11 +6261,11 @@ rx_i_352: ;FPADD jz rx_finish xor r10, 0afc9af2bh mov eax, r10d - xor rbp, rax test bl, 63 jnz short rx_body_352 call rx_read rx_body_352: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 @@ -6332,11 +6332,11 @@ rx_i_356: ;MUL_64 jz rx_finish xor r10, 01cd85d80h mov eax, r10d - xor rbp, rax test bl, 63 jnz 
short rx_body_356 call rx_read rx_body_356: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r10 @@ -6380,11 +6380,11 @@ rx_i_359: ;FPSUB jz rx_finish xor r10, 0714fc2cdh mov eax, r10d - xor rbp, rax test bl, 63 jnz short rx_body_359 call rx_read rx_body_359: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm9 @@ -6528,11 +6528,11 @@ rx_i_367: ;ROR_64 jz rx_finish xor r9, 04d14cb3ah mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_367 call rx_read rx_body_367: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ror rax, 18 @@ -6596,11 +6596,11 @@ rx_i_371: ;FPADD jz rx_finish xor r8, 0ebbd5cc9h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_371 call rx_read rx_body_371: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 @@ -6763,11 +6763,11 @@ rx_i_381: ;XOR_32 jz rx_finish xor r8, 019816ff9h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_381 call rx_read rx_body_381: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] xor eax, r14d @@ -6860,11 +6860,11 @@ rx_i_387: ;SUB_32 jz rx_finish xor r9, 0d4f7bc6ah mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_387 call rx_read rx_body_387: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub eax, r15d @@ -6989,11 +6989,11 @@ rx_i_395: ;DIV_64 jz rx_finish xor r8, 04ae4fe8ch mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_395 call rx_read rx_body_395: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ; magic divide by 939698704 @@ -7026,11 +7026,11 @@ rx_i_397: ;SUB_32 jz rx_finish xor r8, 0916f3819h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_397 call rx_read rx_body_397: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] sub eax, r12d @@ -7064,11 +7064,11 @@ rx_i_399: ;FPMUL jz rx_finish xor r11, 0899a98cfh mov eax, r11d - xor rbp, rax test bl, 63 jnz short rx_body_399 call rx_read rx_body_399: + xor rbp, rax and eax, 2047 
cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm2 @@ -7213,11 +7213,11 @@ rx_i_407: ;FPSUB jz rx_finish xor r14, 09699566fh mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_407 call rx_read rx_body_407: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm9 @@ -7246,11 +7246,11 @@ rx_i_409: ;MUL_64 jz rx_finish xor r11, 04b6caa9ah mov eax, r11d - xor rbp, rax test bl, 63 jnz short rx_body_409 call rx_read rx_body_409: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r15 @@ -7331,11 +7331,11 @@ rx_i_414: ;AND_64 jz rx_finish xor r14, 06c01554dh mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_414 call rx_read rx_body_414: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] and rax, -378293327 @@ -7350,11 +7350,11 @@ rx_i_415: ;DIV_64 jz rx_finish xor r8, 08c3e59a1h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_415 call rx_read rx_body_415: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] ; magic divide by 3756873911 @@ -7371,11 +7371,11 @@ rx_i_416: ;FPADD jz rx_finish xor r12, 0f3fafde9h mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_416 call rx_read rx_body_416: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 @@ -7452,11 +7452,11 @@ rx_i_421: ;CALL jz rx_finish xor r12, 01ada0f39h mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_421 call rx_read rx_body_421: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r10, rax @@ -7503,11 +7503,11 @@ rx_i_424: ;FPADD jz rx_finish xor r13, 01ad12ce2h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_424 call rx_read rx_body_424: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm7 @@ -7563,11 +7563,11 @@ rx_i_427: ;MUL_32 jz rx_finish xor r11, 0d6cae9aeh mov eax, r11d - xor rbp, rax test bl, 63 jnz short rx_body_427 call rx_read rx_body_427: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov ecx, eax 
@@ -7618,11 +7618,11 @@ rx_i_430: ;FPADD jz rx_finish xor r14, 019cc0e5h mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_430 call rx_read rx_body_430: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm8 @@ -7705,11 +7705,11 @@ rx_i_435: ;MUL_64 jz rx_finish xor r15, 0b940480ah mov eax, r15d - xor rbp, rax test bl, 63 jnz short rx_body_435 call rx_read rx_body_435: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, rax, 1971717631 @@ -7742,11 +7742,11 @@ rx_i_437: ;FPMUL jz rx_finish xor r8, 098a6bcf7h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_437 call rx_read rx_body_437: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm3 @@ -7795,11 +7795,11 @@ rx_i_440: ;CALL jz rx_finish xor r10, 062f83728h mov eax, r10d - xor rbp, rax test bl, 63 jnz short rx_body_440 call rx_read rx_body_440: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r9, rax @@ -7812,11 +7812,11 @@ rx_i_441: ;ADD_64 jz rx_finish xor r14, 0d18ec075h mov eax, r14d - xor rbp, rax test bl, 63 jnz short rx_body_441 call rx_read rx_body_441: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r14 @@ -7901,11 +7901,11 @@ rx_i_446: ;MUL_32 jz rx_finish xor r12, 01734708eh mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_446 call rx_read rx_body_446: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov ecx, eax @@ -7940,11 +7940,11 @@ rx_i_448: ;FPSUB jz rx_finish xor r9, 0390cfdb0h mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_448 call rx_read rx_body_448: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm3 @@ -7988,11 +7988,11 @@ rx_i_451: ;ADD_64 jz rx_finish xor r8, 0c4d99ac9h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_451 call rx_read rx_body_451: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r10 @@ -8060,11 +8060,11 @@ rx_i_455: ;OR_64 jz rx_finish xor r8, 059263cdbh 
mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_455 call rx_read rx_body_455: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] or rax, r9 @@ -8093,11 +8093,11 @@ rx_i_457: ;SUB_64 jz rx_finish xor r9, 09de1a3efh mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_457 call rx_read rx_body_457: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub rax, 1482178870 @@ -8241,11 +8241,11 @@ rx_i_466: ;IMUL_32 jz rx_finish xor r13, 05c541c42h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_466 call rx_read rx_body_466: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax @@ -8258,11 +8258,11 @@ rx_i_467: ;FPADD jz rx_finish xor r8, 0cbb33f81h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_467 call rx_read rx_body_467: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 @@ -8273,11 +8273,11 @@ rx_i_468: ;DIV_64 jz rx_finish xor r8, 091044dc3h mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_468 call rx_read rx_body_468: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] ; magic divide by 4281572471 @@ -8352,11 +8352,11 @@ rx_i_472: ;JUMP jz rx_finish xor r9, 038f4b9d6h mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_472 call rx_read rx_body_472: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r10, rax @@ -8382,11 +8382,11 @@ rx_i_474: ;JUMP jz rx_finish xor r9, 0b5c0b4d4h mov eax, r9d - xor rbp, rax test bl, 63 jnz short rx_body_474 call rx_read rx_body_474: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov r15, rax @@ -8537,11 +8537,11 @@ rx_i_484: ;SHR_64 jz rx_finish xor r12, 07027bacdh mov eax, r12d - xor rbp, rax test bl, 63 jnz short rx_body_484 call rx_read rx_body_484: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] shr rax, 37 @@ -8552,11 +8552,11 @@ rx_i_485: ;JUMP jz rx_finish xor r13, 03a04647h mov eax, r13d - xor rbp, rax test bl, 63 jnz short rx_body_485 call rx_read rx_body_485: 
+ xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -8622,11 +8622,11 @@ rx_i_489: ;JUMP jz rx_finish xor r10, 0b2ec9f3ah mov eax, r10d - xor rbp, rax test bl, 63 jnz short rx_body_489 call rx_read rx_body_489: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, rax @@ -8642,11 +8642,11 @@ rx_i_490: ;ROR_64 jz rx_finish xor r11, 015c7f598h mov eax, r11d - xor rbp, rax test bl, 63 jnz short rx_body_490 call rx_read rx_body_490: + xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] mov rcx, r9 @@ -8771,11 +8771,11 @@ rx_i_497: ;FPMUL jz rx_finish xor r8, 08d25742eh mov eax, r8d - xor rbp, rax test bl, 63 jnz short rx_body_497 call rx_read rx_body_497: + xor rbp, rax and eax, 2047 cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm3 @@ -8858,11 +8858,11 @@ rx_i_502: ;RET jz rx_finish xor r10, 09e70b20ch mov eax, r10d - xor rbp, rax test bl, 63 jnz short rx_body_502 call rx_read rx_body_502: + xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] mov rcx, rax From 16db6070256e97554a10c953722a29ec17431e52 Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 18 Jan 2019 23:51:18 +0100 Subject: [PATCH 22/35] Scratchpad size increased to 1 MiB New AES-based scratchpad hashing function --- makefile | 5 +- src/AssemblyGeneratorX86.cpp | 95 +-- src/JitCompilerX86.cpp | 61 +- src/VirtualMachine.cpp | 8 +- src/common.hpp | 7 +- src/dataset.cpp | 24 +- src/dataset.hpp | 1 - src/hashAes1Rx4.cpp | 73 ++ src/hashAes1Rx4.hpp | 23 + src/main.cpp | 2 +- src/program.inc | 1539 ++++++++++++++++------------------ src/softAes.h | 10 + 12 files changed, 923 insertions(+), 925 deletions(-) create mode 100644 src/hashAes1Rx4.cpp create mode 100644 src/hashAes1Rx4.hpp diff --git a/makefile b/makefile index 0dcd7de..f805724 100644 --- a/makefile +++ b/makefile @@ -11,7 +11,7 @@ SRCDIR=src OBJDIR=obj LDFLAGS=-lpthread TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) -ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o 
AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o AddressTransform.o) +ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o AddressTransform.o hashAes1Rx4.o) ifeq ($(PLATFORM),x86_64) ROBJS += $(OBJDIR)/JitCompilerX86-static.o endif @@ -68,6 +68,9 @@ $(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp) | $(OBJDIR)/divideByConstantCodegen.o: $(addprefix $(SRCDIR)/,divideByConstantCodegen.c divideByConstantCodegen.h) | $(OBJDIR) $(CC) $(CCFLAGS) -c $(SRCDIR)/divideByConstantCodegen.c -o $@ +$(OBJDIR)/hashAes1Rx4.o: $(addprefix $(SRCDIR)/,hashAes1Rx4.cpp softAes.h) | $(OBJDIR) + $(CXX) $(CXXFLAGS) -c $(SRCDIR)/hashAes1Rx4.cpp -o $@ + $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@ diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 25ae7ef..4a35dfb 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -73,11 +73,16 @@ namespace RandomX { asmCode << "rx_body_" << i << ":" << std::endl; if ((instr.loca & 192) == 0) asmCode << "\txor " << regMx << ", rax" << std::endl; - if (instr.loca & 3) { - asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; + if (instr.loca & 15) { + if (instr.loca & 3) { + asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; + } + else { + asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; + } } else 
{ - asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; + asmCode << "\tand eax, " << (ScratchpadL3 - 1) << std::endl; } } @@ -123,40 +128,32 @@ namespace RandomX { } void AssemblyGeneratorX86::gencr(Instruction& instr, bool rax = true) { - switch (instr.locc & 7) - { - case 0: - if(rax) - asmCode << "\tmov rcx, rax" << std::endl; - asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; - asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; - asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; - asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rax * 8], rcx" << std::endl; - if (trace) { - asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rcx" << std::endl; - } - return; - - case 1: - case 2: - case 3: - if (rax) - asmCode << "\tmov rcx, rax" << std::endl; - asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; - asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; - asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; - asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rax * 8], rcx" << std::endl; - if (trace) { - asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rcx" << std::endl; - } - return; - - default: + if (instr.locc & 16) { //write to register asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", " << (rax ? "rax" : "rcx") << std::endl; if (trace) { asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], " << (rax ? 
"rax" : "rcx") << std::endl; } - return; + } + else { //write to scratchpad + if (rax) + asmCode << "\tmov rcx, rax" << std::endl; + asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; + asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; + if (instr.locc & 15) { + if (instr.locc & 3) { + asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; + } + else { + asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; + } + } + else { + asmCode << "\tand eax, " << (ScratchpadL3 - 1) << std::endl; + } + asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rax * 8], rcx" << std::endl; + if (trace) { + asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rcx" << std::endl; + } } } @@ -164,23 +161,21 @@ namespace RandomX { if(move) asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; const char* store = (instr.locc & 128) ? "movhpd" : "movlpd"; - switch (instr.locc & 7) - { - case 4: - asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; - asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; - asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; - asmCode << "\t" << store << " qword ptr [" << regScratchpadAddr << " + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl; - break; - - case 5: - case 6: - case 7: - asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; - asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; - asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; - asmCode << "\t" << store << " qword ptr [" << regScratchpadAddr << " + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl; - break; + if (instr.locc & 16) { //write to scratchpad + asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; + asmCode << "\txor eax, 0" << std::hex << instr.addrc 
<< "h" << std::dec << std::endl; + if (instr.locc & 15) { + if (instr.locc & 3) { + asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; + } + else { + asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; + } + } + else { + asmCode << "\tand eax, " << (ScratchpadL3 - 1) << std::endl; + } + asmCode << "\t" << store << " qword ptr [" << regScratchpadAddr << " + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl; } if (trace) { asmCode << "\t" << store << " qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], " << regF[instr.regc % RegistersCount] << std::endl; diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index f828d0a..070d13a 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -182,11 +182,16 @@ namespace RandomX { emitByte(0xe8); //xor rbp, rax } emitByte(0x25); //and eax, - if (instr.loca & 3) { - emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad + if (instr.loca & 15) { + if (instr.loca & 3) { + emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad + } + else { + emit(ScratchpadL2 - 1); //first 256 KiB of scratchpad + } } else { - emit(ScratchpadL2 - 1); //whole scratchpad + emit(ScratchpadL3 - 1); //whole scratchpad } } @@ -266,27 +271,27 @@ namespace RandomX { } void JitCompilerX86::gencr(Instruction& instr, bool rax = true) { - switch (instr.locc & 7) - { - case 0: - scratchpadStoreR(instr, ScratchpadL2, rax); - break; - - case 1: - case 2: - case 3: - scratchpadStoreR(instr, ScratchpadL1, rax); - break; - - default: - emit(uint16_t(0x8b4c)); //mov - if (rax) { - emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax + if (instr.locc & 16) { //write to register + emit(uint16_t(0x8b4c)); //mov + if (rax) { + emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax + } + else { + emitByte(0xc1 + 8 * (instr.regc % RegistersCount)); //regc, rcx + } + } + else { + if (instr.locc & 15) { + if (instr.locc & 3) { + scratchpadStoreR(instr, ScratchpadL1, rax); } else { - 
emitByte(0xc1 + 8 * (instr.regc % RegistersCount)); //regc, rcx + scratchpadStoreR(instr, ScratchpadL2, rax); } - break; + } + else { + scratchpadStoreR(instr, ScratchpadL3, rax); + } } } @@ -314,13 +319,17 @@ namespace RandomX { } emit(uint16_t(0x280f)); //movaps emitByte(0xc0 + 8 * regc); // regc, xmm0 - if (instr.locc & 4) //C.LOC.R - { - if (instr.locc & 3) { //C.LOC.W - scratchpadStoreF(instr, regc, ScratchpadL1, (instr.locc & 128)); //first 16 KiB of scratchpad + if (instr.locc & 16) { //write to scratchpad + if (instr.locc & 15) { + if (instr.locc & 3) { //C.LOC.W + scratchpadStoreF(instr, regc, ScratchpadL1, (instr.locc & 128)); //first 16 KiB of scratchpad + } + else { + scratchpadStoreF(instr, regc, ScratchpadL2, (instr.locc & 128)); //first 256 KiB of scratchpad + } } else { - scratchpadStoreF(instr, regc, ScratchpadL2, (instr.locc & 128)); //whole scratchpad + scratchpadStoreF(instr, regc, ScratchpadL3, (instr.locc & 128)); //whole scratchpad } } } diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp index 7a2be48..0cdc007 100644 --- a/src/VirtualMachine.cpp +++ b/src/VirtualMachine.cpp @@ -19,7 +19,7 @@ along with RandomX. If not, see. 
#include "VirtualMachine.hpp" #include "common.hpp" -#include "t1ha/t1ha.h" +#include "hashAes1Rx4.hpp" #include "blake2/blake2.h" #include #include @@ -40,10 +40,10 @@ namespace RandomX { } void VirtualMachine::getResult(void* out) { - constexpr size_t smallStateLength = sizeof(RegisterFile) / sizeof(uint64_t) + 2; - uint64_t smallState[smallStateLength]; + constexpr size_t smallStateLength = sizeof(RegisterFile) / sizeof(uint64_t) + 8; + alignas(16) uint64_t smallState[smallStateLength]; memcpy(smallState, ®, sizeof(RegisterFile)); - smallState[smallStateLength - 1] = t1ha2_atonce128(&smallState[smallStateLength - 2], scratchpad, ScratchpadSize, reg.r[0].u64); + hashAes1Rx4(scratchpad, ScratchpadSize, smallState + 24); blake2b(out, ResultSize, smallState, sizeof(smallState), nullptr, 0); } } \ No newline at end of file diff --git a/src/common.hpp b/src/common.hpp index e0d4116..cffa53c 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -74,10 +74,11 @@ namespace RandomX { constexpr int ProgramLength = 512; constexpr uint32_t InstructionCount = 1024 * 1024; - constexpr uint32_t ScratchpadSize = 256 * 1024; + constexpr uint32_t ScratchpadSize = 1024 * 1024; constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t); - constexpr uint32_t ScratchpadL1 = ScratchpadSize / 16 / sizeof(convertible_t); - constexpr uint32_t ScratchpadL2 = ScratchpadSize / sizeof(convertible_t); + constexpr uint32_t ScratchpadL1 = ScratchpadSize / 64 / sizeof(convertible_t); + constexpr uint32_t ScratchpadL2 = ScratchpadSize / 4 / sizeof(convertible_t); + constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(convertible_t); constexpr uint32_t TransformationCount = 90; constexpr int RegistersCount = 8; diff --git a/src/dataset.cpp b/src/dataset.cpp index e2b4b54..6029611 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -27,6 +27,7 @@ along with RandomX. If not, see. 
#include "Pcg32.hpp" #include "Cache.hpp" #include "virtualMemory.hpp" +#include "softAes.h" #if defined(__SSE2__) #include @@ -46,21 +47,6 @@ namespace RandomX { } } - template - static inline __m128i aesenc(__m128i in, __m128i key) { - return soft ? soft_aesenc(in, key) : _mm_aesenc_si128(in, key); - } - - template - static inline __m128i aesdec(__m128i in, __m128i key) { - return soft ? soft_aesdec(in, key) : _mm_aesdec_si128(in, key); - } - -#define AES_ROUND(i) x0 = aesdec(x0, keys[i]); \ - x1 = aesenc(x1, keys[i]); \ - x2 = aesdec(x2, keys[i]); \ - x3 = aesenc(x3, keys[i]) - template void initBlock(const uint8_t* intermediate, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) { __m128i x0, x1, x2, x3; @@ -73,13 +59,13 @@ namespace RandomX { for (auto i = 0; i < DatasetIterations; ++i) { x0 = aesenc(x0, keys[0]); - x0 = aesenc(x0, keys[1]); + //x0 = aesenc(x0, keys[1]); x1 = aesenc(x0, keys[2]); - x1 = aesenc(x1, keys[3]); + //x1 = aesenc(x1, keys[3]); x2 = aesenc(x1, keys[4]); - x2 = aesenc(x2, keys[5]); + //x2 = aesenc(x2, keys[5]); x3 = aesenc(x2, keys[6]); - x3 = aesenc(x3, keys[7]); + //x3 = aesenc(x3, keys[7]); int index = _mm_cvtsi128_si32(x3); index &= mask; diff --git a/src/dataset.hpp b/src/dataset.hpp index bdd34d3..312b924 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -23,7 +23,6 @@ along with RandomX. If not, see. #include #include "intrinPortable.h" #include "common.hpp" -#include "softAes.h" namespace RandomX { diff --git a/src/hashAes1Rx4.cpp b/src/hashAes1Rx4.cpp new file mode 100644 index 0000000..1f25335 --- /dev/null +++ b/src/hashAes1Rx4.cpp @@ -0,0 +1,73 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. 
+ +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. +*/ + +#include "softAes.h" + +template +void hashAes1Rx4(const void *input, size_t inputSize, void *hash) { + const uint8_t* inptr = (uint8_t*)input; + const uint8_t* inputEnd = inptr + inputSize; + + __m128i state0, state1, state2, state3; + __m128i in0, in1, in2, in3; + + //intial state + state0 = _mm_set_epi32(0x9d04b0ae, 0x59943385, 0x30ac8d93, 0x3fe49f5d); + state1 = _mm_set_epi32(0x8a39ebf1, 0xddc10935, 0xa724ecd3, 0x7b0c6064); + state2 = _mm_set_epi32(0x7ec70420, 0xdf01edda, 0x7c12ecf7, 0xfb5382e3); + state3 = _mm_set_epi32(0x94a9d201, 0x5082d1c8, 0xb2e74109, 0x7728b705); + + //process 64 bytes at a time in 4 lanes + while (inptr < inputEnd) { + in0 = _mm_load_si128((__m128i*)inptr + 0); + in1 = _mm_load_si128((__m128i*)inptr + 1); + in2 = _mm_load_si128((__m128i*)inptr + 2); + in3 = _mm_load_si128((__m128i*)inptr + 3); + + state0 = aesenc(state0, in0); + state1 = aesdec(state1, in1); + state2 = aesenc(state2, in2); + state3 = aesdec(state3, in3); + + inptr += 64; + } + + //two extra rounds to achieve full diffusion + __m128i xkey0 = _mm_set_epi32(0x4ff637c5, 0x053bd705, 0x8231a744, 0xc3767b17); + __m128i xkey1 = _mm_set_epi32(0x6594a1a6, 0xa8879d58, 0xb01da200, 0x8a8fae2e); + + state0 = aesenc(state0, xkey0); + state1 = aesdec(state1, xkey0); + state2 = aesenc(state2, xkey0); + state3 = aesdec(state3, xkey0); + + state0 = aesenc(state0, xkey1); + state1 = aesdec(state1, xkey1); + state2 = aesenc(state2, xkey1); + state3 = aesdec(state3, xkey1); + + //output hash + _mm_store_si128((__m128i*)hash + 0, state0); + _mm_store_si128((__m128i*)hash + 1, state1); + _mm_store_si128((__m128i*)hash + 2, state2); + 
_mm_store_si128((__m128i*)hash + 3, state3); +} + +template void hashAes1Rx4(const void *input, size_t inputSize, void *hash); +template void hashAes1Rx4(const void *input, size_t inputSize, void *hash); diff --git a/src/hashAes1Rx4.hpp b/src/hashAes1Rx4.hpp new file mode 100644 index 0000000..a9af1fc --- /dev/null +++ b/src/hashAes1Rx4.hpp @@ -0,0 +1,23 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. 
+*/ + +#include "softAes.h" + +template +void hashAes1Rx4(const void *input, size_t inputSize, void *hash); \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 6ac64b7..5edb0df 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -145,7 +145,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash //std::cout << "Thread " << thread << " nonce " << nonce << std::endl; *noncePtr = nonce; blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); - int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 63) << 8); + int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8); vm->initializeScratchpad(spIndex); vm->initializeProgram(hash); //dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt"); diff --git a/src/program.inc b/src/program.inc index afc9573..f06ca58 100644 --- a/src/program.inc +++ b/src/program.inc @@ -10,11 +10,7 @@ rx_body_0: xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r12d - xor eax, 01a8e4171h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r12, rax cmp r11d, 445530481 ja short rx_i_1 call rx_i_30 @@ -28,7 +24,7 @@ rx_i_1: ;IDIV_64 jnz short rx_body_1 call rx_read rx_body_1: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] mov edx, r10d cmp edx, -1 @@ -82,10 +78,6 @@ rx_body_3: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm8, xmm0 - mov eax, r8d - xor eax, 06bb1a0b2h - and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm8 rx_i_4: ;MUL_32 dec ebx @@ -101,11 +93,7 @@ rx_body_4: mov ecx, eax mov eax, r14d imul rax, rcx - mov rcx, rax - mov eax, r9d - xor eax, 06ce10c20h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r9, rax rx_i_5: ;IMUL_32 dec ebx @@ -122,7 +110,11 @@ rx_body_5: movsxd rcx, eax mov rax, 1037420699 imul rax, rcx - mov r12, rax + mov rcx, rax + mov eax, r12d + xor eax, 03dd5c89bh + and eax, 32767 + mov qword ptr [rsi + rax * 
8], rcx rx_i_6: ;MUL_64 dec ebx @@ -139,7 +131,7 @@ rx_body_6: mov rcx, rax mov eax, r9d xor eax, 098a649d1h - and eax, 32767 + and eax, 131071 mov qword ptr [rsi + rax * 8], rcx rx_i_7: ;FPADD @@ -155,10 +147,6 @@ rx_body_7: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 movaps xmm6, xmm0 - mov eax, r14d - xor eax, 057c8c41bh - and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_8: ;XOR_64 dec ebx @@ -169,7 +157,7 @@ rx_i_8: ;XOR_64 jnz short rx_body_8 call rx_read rx_body_8: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] xor rax, r11 mov rcx, rax @@ -212,7 +200,11 @@ rx_body_10: and eax, 2047 mov rax, qword ptr [rsi+rax*8] and rax, r10 - mov r13, rax + mov rcx, rax + mov eax, r13d + xor eax, 0d49dbd9fh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_11: ;FPADD dec ebx @@ -245,10 +237,6 @@ rx_body_12: cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm8, xmm0 - mov eax, r8d - xor eax, 096dc67c9h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_13: ;FPADD dec ebx @@ -264,6 +252,10 @@ rx_body_13: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 movaps xmm9, xmm0 + mov eax, r9d + xor eax, 04f2f223ch + and eax, 2047 + movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_14: ;XOR_64 dec ebx @@ -278,11 +270,7 @@ rx_body_14: and eax, 2047 mov rax, qword ptr [rsi+rax*8] xor rax, r9 - mov rcx, rax - mov eax, r10d - xor eax, 03c1a72f8h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r10, rax rx_i_15: ;RET dec ebx @@ -295,11 +283,7 @@ rx_i_15: ;RET rx_body_15: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r14d - xor eax, 0468b38b8h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r14, rax cmp rsp, rdi je short rx_i_16 ret @@ -319,7 +303,7 @@ rx_body_16: mov rcx, rax mov eax, r9d xor eax, 0d7e75aeh - and eax, 32767 + and eax, 131071 mov qword ptr [rsi + rax * 8], rcx rx_i_17: ;FPMUL @@ -338,10 +322,6 @@ rx_body_17: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm4, 
xmm0 - mov eax, r12d - xor eax, 0f77ffe16h - and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_18: ;FPSUB dec ebx @@ -389,10 +369,6 @@ rx_body_20: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 movaps xmm7, xmm0 - mov eax, r15d - xor eax, 0aad81365h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_21: ;ROR_64 dec ebx @@ -408,11 +384,7 @@ rx_body_21: mov rax, qword ptr [rsi+rax*8] mov rcx, r9 ror rax, cl - mov rcx, rax - mov eax, r15d - xor eax, 0db5e0aafh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r15, rax rx_i_22: ;ADD_64 dec ebx @@ -424,14 +396,10 @@ rx_i_22: ;ADD_64 call rx_read rx_body_22: xor rbp, rax - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] add rax, r8 - mov rcx, rax - mov eax, r10d - xor eax, 0cfa09799h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r10, rax rx_i_23: ;MUL_64 dec ebx @@ -445,7 +413,11 @@ rx_body_23: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, rax, 1283724485 - mov r8, rax + mov rcx, rax + mov eax, r8d + xor eax, 04c8414c5h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_24: ;DIV_64 dec ebx @@ -457,7 +429,7 @@ rx_i_24: ;DIV_64 call rx_read rx_body_24: xor rbp, rax - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] mov ecx, 1 mov edx, r15d @@ -507,11 +479,7 @@ rx_body_26: mov rcx, 812644844 imul rcx mov rax, rdx - mov rcx, rax - mov eax, r9d - xor eax, 0306ff9ech - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r9, rax rx_i_27: ;FPMUL dec ebx @@ -542,7 +510,11 @@ rx_body_28: and eax, 2047 mov rax, qword ptr [rsi+rax*8] and eax, 565865719 - mov r14, rax + mov rcx, rax + mov eax, r14d + xor eax, 021ba6cf7h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_29: ;SUB_64 dec ebx @@ -556,7 +528,11 @@ rx_body_29: and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub rax, r13 - mov r14, rax + mov rcx, rax + mov eax, r14d + xor eax, 073e1a073h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_30: ;FPADD dec ebx @@ 
-585,7 +561,11 @@ rx_body_31: and eax, 2047 mov rax, qword ptr [rsi+rax*8] ror rax, 55 - mov r14, rax + mov rcx, rax + mov eax, r14d + xor eax, 01e2da792h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_32: ;AND_32 dec ebx @@ -596,7 +576,7 @@ rx_i_32: ;AND_32 jnz short rx_body_32 call rx_read rx_body_32: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] and eax, r14d mov r9, rax @@ -627,7 +607,11 @@ rx_body_34: xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov r15, rax + mov rcx, rax + mov eax, r15d + xor eax, 0e9563b32h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp r14d, -380224718 jns short rx_i_35 call rx_i_108 @@ -643,7 +627,11 @@ rx_i_35: ;CALL rx_body_35: and eax, 32767 mov rax, qword ptr [rsi+rax*8] - mov r8, rax + mov rcx, rax + mov eax, r8d + xor eax, 0865c0f66h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp r9d, -2040787098 jns short rx_i_36 call rx_i_58 @@ -678,10 +666,6 @@ rx_body_37: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 movaps xmm9, xmm0 - mov eax, r9d - xor eax, 0bca81c78h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_38: ;SUB_64 dec ebx @@ -695,7 +679,11 @@ rx_body_38: and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub rax, r14 - mov r10, rax + mov rcx, rax + mov eax, r10d + xor eax, 087c32de2h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_39: ;ADD_64 dec ebx @@ -707,10 +695,14 @@ rx_i_39: ;ADD_64 call rx_read rx_body_39: xor rbp, rax - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] add rax, r14 - mov r14, rax + mov rcx, rax + mov eax, r14d + xor eax, 0f4101ad9h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_40: ;CALL dec ebx @@ -774,11 +766,7 @@ rx_body_43: and eax, 32767 mov rax, qword ptr [rsi+rax*8] sub rax, r8 - mov rcx, rax - mov eax, r11d - xor eax, 064f3e4bfh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r11, rax rx_i_44: ;SAR_64 dec ebx @@ -793,7 +781,11 @@ rx_body_44: mov rax, qword ptr [rsi+rax*8] 
mov rcx, r9 sar rax, cl - mov r15, rax + mov rcx, rax + mov eax, r15d + xor eax, 0372116f6h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_45: ;FPSUB dec ebx @@ -809,6 +801,10 @@ rx_body_45: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 movaps xmm5, xmm0 + mov eax, r13d + xor eax, 0977132cdh + and eax, 2047 + movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_46: ;ADD_64 dec ebx @@ -822,11 +818,7 @@ rx_body_46: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r9 - mov rcx, rax - mov eax, r8d - xor eax, 0e9f58436h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r8, rax rx_i_47: ;JUMP dec ebx @@ -864,10 +856,6 @@ rx_body_48: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm9, xmm0 - mov eax, r9d - xor eax, 020e5d9e9h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_49: ;FPSUB dec ebx @@ -896,11 +884,7 @@ rx_body_50: and eax, 32767 mov rax, qword ptr [rsi+rax*8] and rax, r10 - mov rcx, rax - mov eax, r15d - xor eax, 06ac56a2ah - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r15, rax rx_i_51: ;SUB_64 dec ebx @@ -914,7 +898,11 @@ rx_body_51: and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub rax, r15 - mov r15, rax + mov rcx, rax + mov eax, r15d + xor eax, 018fd1fbfh + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx rx_i_52: ;FPSQRT dec ebx @@ -941,7 +929,11 @@ rx_i_53: ;RET rx_body_53: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov r13, rax + mov rcx, rax + mov eax, r13d + xor eax, 078ed00edh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi je short rx_i_54 ret @@ -984,10 +976,6 @@ rx_body_55: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm3, xmm0 - mov eax, r11d - xor eax, 07c79cddh - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_56: ;IDIV_64 dec ebx @@ -1010,11 +998,7 @@ rx_body_56: sar rax, 25 sets dl add rax, rdx - mov rcx, rax - mov eax, r8d - xor eax, 0fcf95491h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r8, rax rx_i_57: ;MUL_64 dec ebx @@ -1075,7 +1059,7 @@ 
rx_i_60: ;CALL jnz short rx_body_60 call rx_read rx_body_60: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r11d @@ -1132,6 +1116,10 @@ rx_body_63: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 movaps xmm8, xmm0 + mov eax, r8d + xor eax, 0be13d69eh + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_64: ;SUB_64 dec ebx @@ -1158,7 +1146,11 @@ rx_i_65: ;JUMP rx_body_65: and eax, 32767 mov rax, qword ptr [rsi+rax*8] - mov r11, rax + mov rcx, rax + mov eax, r11d + xor eax, 0594a879fh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp r8d, 1498056607 js rx_i_129 @@ -1195,7 +1187,11 @@ rx_i_67: ;JUMP rx_body_67: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov r9, rax + mov rcx, rax + mov eax, r9d + xor eax, 07916db59h + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx cmp r13d, 2031541081 jns rx_i_79 @@ -1265,6 +1261,10 @@ rx_body_71: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm7, xmm0 + mov eax, r15d + xor eax, 056660eedh + and eax, 131071 + movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_72: ;JUMP dec ebx @@ -1277,11 +1277,7 @@ rx_i_72: ;JUMP rx_body_72: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r11d - xor eax, 0da624dd9h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r11, rax cmp r9d, -631091751 jno rx_i_191 @@ -1389,7 +1385,11 @@ rx_body_78: mov ecx, eax mov eax, r8d imul rax, rcx - mov r15, rax + mov rcx, rax + mov eax, r15d + xor eax, 0697e6195h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_79: ;CALL dec ebx @@ -1402,11 +1402,7 @@ rx_i_79: ;CALL rx_body_79: and eax, 32767 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r11d - xor eax, 06b4a7b43h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r11, rax cmp r13d, 1800043331 ja short rx_i_80 call rx_i_93 @@ -1442,7 +1438,11 @@ rx_body_81: and eax, 2047 mov rax, qword ptr [rsi+rax*8] and rax, 338325607 - mov r8, rax + mov rcx, rax + mov eax, r8d + xor eax, 0142a7067h + and eax, 
2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_82: ;JUMP dec ebx @@ -1455,11 +1455,7 @@ rx_i_82: ;JUMP rx_body_82: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r10d - xor eax, 0fbe39afbh - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r10, rax cmp r12d, -68969733 jo rx_i_145 @@ -1499,7 +1495,7 @@ rx_body_84: mov rcx, rax mov eax, r13d xor eax, 0ec5c52e6h - and eax, 32767 + and eax, 131071 mov qword ptr [rsi + rax * 8], rcx rx_i_85: ;MUL_64 @@ -1514,7 +1510,11 @@ rx_body_85: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, rax, 20014507 - mov r10, rax + mov rcx, rax + mov eax, r10d + xor eax, 013165abh + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx rx_i_86: ;AND_64 dec ebx @@ -1528,11 +1528,7 @@ rx_body_86: and eax, 2047 mov rax, qword ptr [rsi+rax*8] and rax, r8 - mov rcx, rax - mov eax, r12d - xor eax, 0a90410e4h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r12, rax rx_i_87: ;SUB_64 dec ebx @@ -1577,11 +1573,7 @@ rx_body_89: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r8 - mov rcx, rax - mov eax, r10d - xor eax, 0e67532afh - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r10, rax rx_i_90: ;FPADD dec ebx @@ -1661,7 +1653,11 @@ rx_body_94: xor rbp, rax and eax, 32767 mov rax, qword ptr [rsi+rax*8] - mov r8, rax + mov rcx, rax + mov eax, r8d + xor eax, 0eb8c5be0h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp r13d, -343122976 jns short rx_i_95 call rx_i_157 @@ -1678,11 +1674,7 @@ rx_body_95: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r10 - mov rcx, rax - mov eax, r15d - xor eax, 01023aa04h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r15, rax rx_i_96: ;IMUL_32 dec ebx @@ -1710,17 +1702,13 @@ rx_i_97: ;FPDIV jnz short rx_body_97 call rx_read rx_body_97: - and eax, 32767 + and eax, 131071 cvtdq2pd xmm0, qword ptr [rsi+rax*8] divpd xmm0, xmm9 movaps xmm1, xmm0 cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm5, xmm0 - mov eax, r13d - xor eax, 
0c477e850h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_98: ;SUB_64 dec ebx @@ -1734,7 +1722,11 @@ rx_body_98: and eax, 32767 mov rax, qword ptr [rsi+rax*8] sub rax, r15 - mov r14, rax + mov rcx, rax + mov eax, r14d + xor eax, 0d067d49ah + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_99: ;FPMUL dec ebx @@ -1836,11 +1828,7 @@ rx_body_104: mul rcx mov rax, rdx shr rax, 31 - mov rcx, rax - mov eax, r15d - xor eax, 08df8ddf7h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r15, rax rx_i_105: ;MUL_32 dec ebx @@ -1857,11 +1845,7 @@ rx_body_105: mov ecx, eax mov eax, r15d imul rax, rcx - mov rcx, rax - mov eax, r14d - xor eax, 09c8724edh - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r14, rax rx_i_106: ;FPMUL dec ebx @@ -1919,10 +1903,6 @@ rx_body_108: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm9, xmm0 - mov eax, r9d - xor eax, 0678b65beh - and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_109: ;ROR_64 dec ebx @@ -1953,7 +1933,7 @@ rx_i_110: ;SHR_64 jnz short rx_body_110 call rx_read rx_body_110: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] mov rcx, r9 shr rax, cl @@ -1972,12 +1952,12 @@ rx_i_111: ;CALL jnz short rx_body_111 call rx_read rx_body_111: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r12d xor eax, 05d237d0bh - and eax, 32767 + and eax, 131071 mov qword ptr [rsi + rax * 8], rcx cmp r14d, 1562606859 jl short rx_i_112 @@ -1995,11 +1975,7 @@ rx_body_112: and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub rax, -1025977295 - mov rcx, rax - mov eax, r14d - xor eax, 0c2d8d431h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r14, rax rx_i_113: ;MULH_64 dec ebx @@ -2015,7 +1991,11 @@ rx_body_113: mov rcx, r9 mul rcx mov rax, rdx - mov r13, rax + mov rcx, rax + mov eax, r13d + xor eax, 0dea3f7e3h + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx rx_i_114: ;DIV_64 dec ebx @@ -2075,7 +2055,7 @@ rx_body_116: mov rcx, rax mov eax, r8d 
xor eax, 091af638dh - and eax, 32767 + and eax, 131071 mov qword ptr [rsi + rax * 8], rcx rx_i_117: ;IDIV_64 @@ -2097,11 +2077,7 @@ rx_body_117: sar rax, 29 sets dl add rax, rdx - mov rcx, rax - mov eax, r15d - xor eax, 0b8208a64h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r15, rax rx_i_118: ;FPSUB dec ebx @@ -2130,6 +2106,10 @@ rx_body_119: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm3 movaps xmm5, xmm0 + mov eax, r13d + xor eax, 02401488h + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_120: ;FPADD dec ebx @@ -2155,7 +2135,7 @@ rx_i_121: ;FPSUB jnz short rx_body_121 call rx_read rx_body_121: - and eax, 32767 + and eax, 131071 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm5 movaps xmm8, xmm0 @@ -2169,7 +2149,7 @@ rx_i_122: ;CALL jnz short rx_body_122 call rx_read rx_body_122: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] mov rcx, rax mov eax, r14d @@ -2205,11 +2185,7 @@ rx_i_124: ;JUMP rx_body_124: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r11d - xor eax, 0667d921ch - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r11, rax cmp r11d, 1719505436 jns rx_i_237 @@ -2228,7 +2204,11 @@ rx_body_125: movsxd rcx, eax mov rax, 1774711622 imul rax, rcx - mov r14, rax + mov rcx, rax + mov eax, r14d + xor eax, 069c7f346h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_126: ;FPMUL dec ebx @@ -2276,7 +2256,11 @@ rx_body_128: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r9 - mov r9, rax + mov rcx, rax + mov eax, r9d + xor eax, 0cb2ee635h + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx rx_i_129: ;JUMP dec ebx @@ -2329,11 +2313,7 @@ rx_i_131: ;RET rx_body_131: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r15d - xor eax, 0dff06f75h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r15, rax cmp rsp, rdi je short rx_i_132 ret @@ -2351,6 +2331,10 @@ rx_body_132: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 movaps 
xmm7, xmm0 + mov eax, r15d + xor eax, 0b0c38959h + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_133: ;OR_64 dec ebx @@ -2364,11 +2348,7 @@ rx_body_133: and eax, 2047 mov rax, qword ptr [rsi+rax*8] or rax, r13 - mov rcx, rax - mov eax, r15d - xor eax, 0c45d2c34h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r15, rax rx_i_134: ;ADD_64 dec ebx @@ -2382,7 +2362,11 @@ rx_body_134: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r8 - mov r13, rax + mov rcx, rax + mov eax, r13d + xor eax, 05a5de2cbh + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx rx_i_135: ;FPMUL dec ebx @@ -2400,6 +2384,10 @@ rx_body_135: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm8, xmm0 + mov eax, r8d + xor eax, 0b29f3d2ah + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_136: ;FPDIV dec ebx @@ -2417,10 +2405,6 @@ rx_body_136: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm5, xmm0 - mov eax, r13d - xor eax, 0efd7799dh - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_137: ;SHR_64 dec ebx @@ -2449,7 +2433,11 @@ rx_i_138: ;RET rx_body_138: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov r10, rax + mov rcx, rax + mov eax, r10d + xor eax, 08e1fd158h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi je short rx_i_139 ret @@ -2467,11 +2455,7 @@ rx_body_139: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r8 - mov rcx, rax - mov eax, r11d - xor eax, 01eb7d4f2h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r11, rax rx_i_140: ;IMUL_32 dec ebx @@ -2487,7 +2471,11 @@ rx_body_140: movsxd rcx, eax mov rax, -140239781 imul rax, rcx - mov r14, rax + mov rcx, rax + mov eax, r14d + xor eax, 0f7a41c5bh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_141: ;FPADD dec ebx @@ -2522,7 +2510,7 @@ rx_body_142: mov rcx, rax mov eax, r10d xor eax, 0516a9452h - and eax, 32767 + and eax, 131071 mov qword ptr [rsi + rax * 8], rcx cmp r12d, 1365939282 js rx_i_257 @@ -2614,11 +2602,7 @@ rx_body_147: and eax, 2047 mov rax, 
qword ptr [rsi+rax*8] imul rax, r11 - mov rcx, rax - mov eax, r12d - xor eax, 06a5bda88h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r12, rax rx_i_148: ;SUB_64 dec ebx @@ -2652,11 +2636,7 @@ rx_body_149: mov ecx, eax mov eax, r14d imul rax, rcx - mov rcx, rax - mov eax, r8d - xor eax, 09046b787h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r8, rax rx_i_150: ;DIV_64 dec ebx @@ -2693,11 +2673,7 @@ rx_body_151: and eax, 2047 mov rax, qword ptr [rsi+rax*8] and rax, -2018584590 - mov rcx, rax - mov eax, r11d - xor eax, 087aed7f2h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r11, rax rx_i_152: ;SAR_64 dec ebx @@ -2730,10 +2706,6 @@ rx_body_153: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm8, xmm0 - mov eax, r8d - xor eax, 09111c981h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm8 rx_i_154: ;MUL_32 dec ebx @@ -2749,7 +2721,11 @@ rx_body_154: mov ecx, eax mov eax, -820047839 imul rax, rcx - mov r10, rax + mov rcx, rax + mov eax, r10d + xor eax, 0cf1f1021h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_155: ;ROL_64 dec ebx @@ -2764,11 +2740,7 @@ rx_body_155: mov rax, qword ptr [rsi+rax*8] mov rcx, r10 rol rax, cl - mov rcx, rax - mov eax, r13d - xor eax, 01c5d3ebeh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r13, rax rx_i_156: ;IMUL_32 dec ebx @@ -2784,7 +2756,11 @@ rx_body_156: movsxd rcx, eax movsxd rax, r15d imul rax, rcx - mov r15, rax + mov rcx, rax + mov eax, r15d + xor eax, 0b803e8a9h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_157: ;ADD_64 dec ebx @@ -2812,7 +2788,11 @@ rx_body_158: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r13 - mov r10, rax + mov rcx, rax + mov eax, r10d + xor eax, 04984392fh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_159: ;CALL dec ebx @@ -2823,13 +2803,9 @@ rx_i_159: ;CALL jnz short rx_body_159 call rx_read rx_body_159: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r13d - xor eax, 
0ff7d3697h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r13, rax cmp r15d, -8571241 ja short rx_i_160 call rx_i_181 @@ -2862,7 +2838,7 @@ rx_i_161: ;IDIV_64 jnz short rx_body_161 call rx_read rx_body_161: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] mov edx, r14d cmp edx, -1 @@ -2877,7 +2853,11 @@ body_idiv_161: cqo idiv rcx result_idiv_161: - mov r8, rax + mov rcx, rax + mov eax, r8d + xor eax, 0db9043dah + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx rx_i_162: ;SHL_64 dec ebx @@ -2891,7 +2871,11 @@ rx_body_162: and eax, 2047 mov rax, qword ptr [rsi+rax*8] shl rax, 7 - mov r13, rax + mov rcx, rax + mov eax, r13d + xor eax, 0170a46d8h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_163: ;SUB_64 dec ebx @@ -2906,11 +2890,7 @@ rx_body_163: and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub rax, r8 - mov rcx, rax - mov eax, r14d - xor eax, 082c34b08h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r14, rax rx_i_164: ;MUL_32 dec ebx @@ -2927,11 +2907,7 @@ rx_body_164: mov ecx, eax mov eax, r9d imul rax, rcx - mov rcx, rax - mov eax, r13d - xor eax, 09aa6da19h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r13, rax rx_i_165: ;RET dec ebx @@ -2944,11 +2920,7 @@ rx_i_165: ;RET rx_body_165: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r9d - xor eax, 06450685ch - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r9, rax cmp rsp, rdi je short rx_i_166 ret @@ -2968,7 +2940,7 @@ rx_body_166: mov rcx, rax mov eax, r13d xor eax, 0bb67f8abh - and eax, 32767 + and eax, 131071 mov qword ptr [rsi + rax * 8], rcx rx_i_167: ;FPMUL @@ -2987,10 +2959,6 @@ rx_body_167: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm2, xmm0 - mov eax, r10d - xor eax, 02a58510fh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm2 rx_i_168: ;FPDIV dec ebx @@ -3008,6 +2976,10 @@ rx_body_168: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm7, xmm0 + mov eax, r15d + xor eax, 08d1a76f8h + and eax, 131071 
+ movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_169: ;CALL dec ebx @@ -3042,10 +3014,6 @@ rx_body_170: cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm6, xmm0 - mov eax, r14d - xor eax, 0a4256a99h - and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_171: ;DIV_64 dec ebx @@ -3065,7 +3033,11 @@ rx_body_171: mul rcx mov rax, rdx shr rax, 29 - mov r12, rax + mov rcx, rax + mov eax, r12d + xor eax, 07b086fb9h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_172: ;SUB_64 dec ebx @@ -3094,11 +3066,7 @@ rx_body_173: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, rax, -1386172772 - mov rcx, rax - mov eax, r12d - xor eax, 0ad60ae9ch - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r12, rax rx_i_174: ;FPDIV dec ebx @@ -3161,11 +3129,7 @@ rx_body_177: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, 794235831 - mov rcx, rax - mov eax, r13d - xor eax, 02f5713b7h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r13, rax rx_i_178: ;RET dec ebx @@ -3250,6 +3214,10 @@ rx_body_182: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 movaps xmm6, xmm0 + mov eax, r14d + xor eax, 07c8d12a5h + and eax, 131071 + movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_183: ;ADD_64 dec ebx @@ -3278,7 +3246,11 @@ rx_body_184: and eax, 2047 mov rax, qword ptr [rsi+rax*8] xor eax, r13d - mov r12, rax + mov rcx, rax + mov eax, r12d + xor eax, 02f185447h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_185: ;JUMP dec ebx @@ -3392,7 +3364,7 @@ rx_i_191: ;FPSQRT jnz short rx_body_191 call rx_read rx_body_191: - and eax, 32767 + and eax, 131071 cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm6, xmm0 @@ -3410,10 +3382,6 @@ rx_body_192: cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm8, xmm0 - mov eax, r8d - xor eax, 0f8fd2040h - and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_193: ;MUL_32 dec ebx @@ -3451,10 +3419,6 @@ rx_body_194: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm5, xmm0 - 
mov eax, r13d - xor eax, 040eb9f47h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_195: ;SHL_64 dec ebx @@ -3485,7 +3449,7 @@ rx_body_196: mov rcx, rax mov eax, r13d xor eax, 08e47b269h - and eax, 32767 + and eax, 131071 mov qword ptr [rsi + rax * 8], rcx rx_i_197: ;MUL_64 @@ -3501,7 +3465,11 @@ rx_body_197: and eax, 32767 mov rax, qword ptr [rsi+rax*8] imul rax, r15 - mov r11, rax + mov rcx, rax + mov eax, r11d + xor eax, 0b1d1e60dh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_198: ;MULH_64 dec ebx @@ -3517,11 +3485,7 @@ rx_body_198: mov rcx, r14 mul rcx mov rax, rdx - mov rcx, rax - mov eax, r8d - xor eax, 01149cba0h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r8, rax rx_i_199: ;MULH_64 dec ebx @@ -3537,11 +3501,7 @@ rx_body_199: mov rcx, r10 mul rcx mov rax, rdx - mov rcx, rax - mov eax, r10d - xor eax, 0d0e71e9ah - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r10, rax rx_i_200: ;FPSUB dec ebx @@ -3556,10 +3516,6 @@ rx_body_200: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 movaps xmm4, xmm0 - mov eax, r12d - xor eax, 0b05ce8abh - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_201: ;FPADD dec ebx @@ -3624,11 +3580,7 @@ rx_body_204: and eax, 32767 mov rax, qword ptr [rsi+rax*8] imul rax, r15 - mov rcx, rax - mov eax, r8d - xor eax, 0eb8fc30fh - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r8, rax rx_i_205: ;FPMUL dec ebx @@ -3662,6 +3614,10 @@ rx_body_206: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 movaps xmm4, xmm0 + mov eax, r12d + xor eax, 0d01fb731h + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_207: ;IDIV_64 dec ebx @@ -3701,7 +3657,11 @@ rx_body_208: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, rax, -486588965 - mov r10, rax + mov rcx, rax + mov eax, r10d + xor eax, 0e2ff3ddbh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_209: ;XOR_64 dec ebx @@ -3740,7 +3700,7 @@ rx_body_210: mov rcx, rax mov eax, r15d xor eax, 0c2c6bee0h 
- and eax, 32767 + and eax, 131071 mov qword ptr [rsi + rax * 8], rcx rx_i_211: ;ROR_64 @@ -3775,11 +3735,7 @@ rx_body_212: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r13 - mov rcx, rax - mov eax, r15d - xor eax, 067d81043h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r15, rax rx_i_213: ;IMUL_32 dec ebx @@ -3796,7 +3752,11 @@ rx_body_213: movsxd rcx, eax movsxd rax, r14d imul rax, rcx - mov r14, rax + mov rcx, rax + mov eax, r14d + xor eax, 07bf8b75h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_214: ;SHL_64 dec ebx @@ -3811,7 +3771,11 @@ rx_body_214: mov rax, qword ptr [rsi+rax*8] mov rcx, r14 shl rax, cl - mov r14, rax + mov rcx, rax + mov eax, r14d + xor eax, 0936ebe0bh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_215: ;ADD_32 dec ebx @@ -3826,7 +3790,11 @@ rx_body_215: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add eax, r12d - mov r10, rax + mov rcx, rax + mov eax, r10d + xor eax, 01194f02bh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_216: ;MUL_64 dec ebx @@ -3837,7 +3805,7 @@ rx_i_216: ;MUL_64 jnz short rx_body_216 call rx_read rx_body_216: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] imul rax, r13 mov rcx, rax @@ -3860,11 +3828,7 @@ rx_body_217: movsxd rcx, eax movsxd rax, r9d imul rax, rcx - mov rcx, rax - mov eax, r10d - xor eax, 017e667h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r10, rax rx_i_218: ;FPSQRT dec ebx @@ -3879,10 +3843,6 @@ rx_body_218: cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm3, xmm0 - mov eax, r11d - xor eax, 0dd192e86h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm3 rx_i_219: ;OR_64 dec ebx @@ -3916,11 +3876,7 @@ rx_body_220: movsxd rcx, eax movsxd rax, r11d imul rax, rcx - mov rcx, rax - mov eax, r11d - xor eax, 0903fd173h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r11, rax rx_i_221: ;DIV_64 dec ebx @@ -3939,11 +3895,7 @@ rx_body_221: cmovne ecx, edx xor edx, edx div rcx - mov rcx, rax - 
mov eax, r11d - xor eax, 07feab351h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r11, rax rx_i_222: ;FPMUL dec ebx @@ -3961,10 +3913,6 @@ rx_body_222: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm7, xmm0 - mov eax, r15d - xor eax, 0d7601963h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_223: ;FPSUB dec ebx @@ -4037,11 +3985,7 @@ rx_i_226: ;JUMP rx_body_226: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r8d - xor eax, 0978b2498h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r8, rax cmp r8d, -1752488808 jno rx_i_328 @@ -4079,6 +4023,10 @@ rx_body_228: cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm7, xmm0 + mov eax, r15d + xor eax, 0ffdff798h + and eax, 2047 + movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_229: ;IMULH_64 dec ebx @@ -4094,11 +4042,7 @@ rx_body_229: mov rcx, 334017248 imul rcx mov rax, rdx - mov rcx, rax - mov eax, r13d - xor eax, 013e8b2e0h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r13, rax rx_i_230: ;FPMUL dec ebx @@ -4133,11 +4077,7 @@ rx_body_231: xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r9d - xor eax, 0e6c9edaah - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r9, rax cmp rsp, rdi je short rx_i_232 ret @@ -4158,6 +4098,10 @@ rx_body_232: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm7, xmm0 + mov eax, r15d + xor eax, 07e732935h + and eax, 2047 + movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_233: ;JUMP dec ebx @@ -4206,11 +4150,7 @@ rx_body_235: movsxd rcx, eax mov rax, 212286089 imul rax, rcx - mov rcx, rax - mov eax, r15d - xor eax, 0ca73a89h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r15, rax rx_i_236: ;FPADD dec ebx @@ -4255,10 +4195,6 @@ rx_body_238: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 movaps xmm7, xmm0 - mov eax, r15d - xor eax, 0331bbf8h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_239: ;ADD_64 dec ebx @@ -4272,7 +4208,11 @@ rx_body_239: and 
eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r10 - mov r10, rax + mov rcx, rax + mov eax, r10d + xor eax, 0e42cdf41h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_240: ;IMULH_64 dec ebx @@ -4284,12 +4224,16 @@ rx_i_240: ;IMULH_64 call rx_read rx_body_240: xor rbp, rax - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] mov rcx, r14 imul rcx mov rax, rdx - mov r8, rax + mov rcx, rax + mov eax, r8d + xor eax, 0e6bcdcfbh + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx rx_i_241: ;FPADD dec ebx @@ -4304,10 +4248,6 @@ rx_body_241: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 movaps xmm7, xmm0 - mov eax, r15d - xor eax, 0bc2423ebh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_242: ;MUL_32 dec ebx @@ -4323,11 +4263,7 @@ rx_body_242: mov ecx, eax mov eax, r12d imul rax, rcx - mov rcx, rax - mov eax, r10d - xor eax, 0130882f2h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r10, rax rx_i_243: ;OR_64 dec ebx @@ -4372,7 +4308,7 @@ rx_i_245: ;AND_32 call rx_read rx_body_245: xor rbp, rax - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] and eax, r10d mov rcx, rax @@ -4459,11 +4395,7 @@ rx_body_249: mov rcx, -508571655 imul rcx mov rax, rdx - mov rcx, rax - mov eax, r13d - xor eax, 0e1afcff9h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r13, rax rx_i_250: ;MUL_64 dec ebx @@ -4477,11 +4409,7 @@ rx_body_250: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r8 - mov rcx, rax - mov eax, r14d - xor eax, 031115b87h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r14, rax rx_i_251: ;FPMUL dec ebx @@ -4499,10 +4427,6 @@ rx_body_251: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm4, xmm0 - mov eax, r12d - xor eax, 05ed767a3h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_252: ;SHL_64 dec ebx @@ -4516,7 +4440,11 @@ rx_body_252: and eax, 2047 mov rax, qword ptr [rsi+rax*8] shl rax, 53 - mov r14, rax + mov rcx, rax + mov eax, r14d + xor eax, 0b178001h + and 
eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_253: ;CALL dec ebx @@ -4551,10 +4479,6 @@ rx_body_254: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm4 movaps xmm8, xmm0 - mov eax, r8d - xor eax, 0c251872eh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_255: ;FPADD dec ebx @@ -4589,11 +4513,7 @@ rx_body_256: mov rcx, r15 mul rcx mov rax, rdx - mov rcx, rax - mov eax, r9d - xor eax, 0f8942c0h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r9, rax rx_i_257: ;FPADD dec ebx @@ -4608,10 +4528,6 @@ rx_body_257: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm5 movaps xmm3, xmm0 - mov eax, r11d - xor eax, 0373b1b6fh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_258: ;MUL_32 dec ebx @@ -4647,6 +4563,10 @@ rx_body_259: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 movaps xmm3, xmm0 + mov eax, r11d + xor eax, 06c1856f0h + and eax, 32767 + movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_260: ;FPSUB dec ebx @@ -4714,7 +4634,7 @@ rx_i_263: ;FPMUL call rx_read rx_body_263: xor rbp, rax - and eax, 32767 + and eax, 131071 cvtdq2pd xmm0, qword ptr [rsi+rax*8] mulpd xmm0, xmm8 movaps xmm1, xmm0 @@ -4801,7 +4721,11 @@ rx_body_268: xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov r13, rax + mov rcx, rax + mov eax, r13d + xor eax, 0850bf8dah + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp r15d, -2062812966 jl rx_i_381 @@ -4817,11 +4741,7 @@ rx_body_269: and eax, 2047 mov rax, qword ptr [rsi+rax*8] rol rax, 50 - mov rcx, rax - mov eax, r10d - xor eax, 01ba81447h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r10, rax rx_i_270: ;FPMUL dec ebx @@ -4839,6 +4759,10 @@ rx_body_270: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm7, xmm0 + mov eax, r15d + xor eax, 03981662bh + and eax, 32767 + movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_271: ;MUL_32 dec ebx @@ -4872,7 +4796,11 @@ rx_body_272: and eax, 2047 mov rax, qword ptr [rsi+rax*8] and rax, r12 - mov r13, rax + mov rcx, rax + mov eax, r13d + 
xor eax, 0d45957b7h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_273: ;JUMP dec ebx @@ -4886,11 +4814,7 @@ rx_body_273: xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r13d - xor eax, 063972038h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r13, rax cmp r12d, 1670848568 jl rx_i_372 @@ -4907,10 +4831,6 @@ rx_body_274: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm4 movaps xmm6, xmm0 - mov eax, r14d - xor eax, 06a2b2b5bh - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_275: ;IDIV_64 dec ebx @@ -4944,13 +4864,9 @@ rx_i_276: ;JUMP call rx_read rx_body_276: xor rbp, rax - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r12d - xor eax, 0b65161a6h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r12, rax cmp r11d, -1236180570 jns rx_i_404 @@ -4968,11 +4884,7 @@ rx_body_277: movsxd rcx, eax movsxd rax, r10d imul rax, rcx - mov rcx, rax - mov eax, r9d - xor eax, 01aca20a3h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r9, rax rx_i_278: ;FPSUB dec ebx @@ -4987,10 +4899,6 @@ rx_body_278: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 movaps xmm4, xmm0 - mov eax, r12d - xor eax, 02d00ad10h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_279: ;FPADD dec ebx @@ -5119,7 +5027,11 @@ rx_body_285: movsxd rcx, eax movsxd rax, r8d imul rax, rcx - mov r14, rax + mov rcx, rax + mov eax, r14d + xor eax, 09308cd6dh + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx rx_i_286: ;ROL_64 dec ebx @@ -5134,11 +5046,7 @@ rx_body_286: mov rax, qword ptr [rsi+rax*8] mov rcx, r9 rol rax, cl - mov rcx, rax - mov eax, r15d - xor eax, 0546e75d1h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r15, rax rx_i_287: ;IDIV_64 dec ebx @@ -5160,11 +5068,7 @@ rx_body_287: sar rax, 29 sets dl add rax, rdx - mov rcx, rax - mov eax, r8d - xor eax, 04926c7fah - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r8, rax rx_i_288: 
;MUL_64 dec ebx @@ -5214,6 +5118,10 @@ rx_body_290: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm8 movaps xmm9, xmm0 + mov eax, r9d + xor eax, 02f4d18d7h + and eax, 2047 + movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_291: ;RET dec ebx @@ -5229,7 +5137,7 @@ rx_body_291: mov rcx, rax mov eax, r14d xor eax, 0768a9d75h - and eax, 32767 + and eax, 131071 mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi je short rx_i_292 @@ -5248,7 +5156,11 @@ rx_body_292: mov rax, qword ptr [rsi+rax*8] mov rcx, r8 rol rax, cl - mov r10, rax + mov rcx, rax + mov eax, r10d + xor eax, 035600fe9h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_293: ;FPADD dec ebx @@ -5263,6 +5175,10 @@ rx_body_293: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm5 movaps xmm8, xmm0 + mov eax, r8d + xor eax, 014844990h + and eax, 2047 + movhpd qword ptr [rsi + rax * 8], xmm8 rx_i_294: ;RET dec ebx @@ -5275,11 +5191,7 @@ rx_i_294: ;RET rx_body_294: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r8d - xor eax, 0ef8571b7h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r8, rax cmp rsp, rdi je short rx_i_295 ret @@ -5294,10 +5206,14 @@ rx_i_295: ;FPSUB call rx_read rx_body_295: xor rbp, rax - and eax, 32767 + and eax, 131071 cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm8 movaps xmm7, xmm0 + mov eax, r15d + xor eax, 08a66e69fh + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_296: ;FPSQRT dec ebx @@ -5308,10 +5224,14 @@ rx_i_296: ;FPSQRT jnz short rx_body_296 call rx_read rx_body_296: - and eax, 32767 + and eax, 131071 cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm8, xmm0 + mov eax, r8d + xor eax, 0f3a594cah + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_297: ;ADD_64 dec ebx @@ -5340,6 +5260,10 @@ rx_body_298: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 movaps xmm6, xmm0 + mov eax, r14d + xor eax, 0d10f7c42h + and eax, 2047 + movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_299: ;ADD_64 dec ebx @@ 
-5354,11 +5278,7 @@ rx_body_299: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r10 - mov rcx, rax - mov eax, r12d - xor eax, 01468af4h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r12, rax rx_i_300: ;FPSUB dec ebx @@ -5391,10 +5311,6 @@ rx_body_301: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm7, xmm0 - mov eax, r15d - xor eax, 0433cf2d6h - and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_302: ;ADD_64 dec ebx @@ -5409,7 +5325,11 @@ rx_body_302: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r10 - mov r11, rax + mov rcx, rax + mov eax, r11d + xor eax, 0afbbe406h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_303: ;FPADD dec ebx @@ -5424,10 +5344,6 @@ rx_body_303: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm3 movaps xmm9, xmm0 - mov eax, r9d - xor eax, 0bb9ee490h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_304: ;MUL_64 dec ebx @@ -5455,7 +5371,11 @@ rx_body_305: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r15 - mov r10, rax + mov rcx, rax + mov eax, r10d + xor eax, 0fc12db20h + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx rx_i_306: ;ADD_64 dec ebx @@ -5469,7 +5389,11 @@ rx_body_306: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, 400578979 - mov r13, rax + mov rcx, rax + mov eax, r13d + xor eax, 017e059a3h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_307: ;SHL_64 dec ebx @@ -5497,7 +5421,11 @@ rx_body_308: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r13 - mov r15, rax + mov rcx, rax + mov eax, r15d + xor eax, 0c2d34e82h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_309: ;DIV_64 dec ebx @@ -5515,11 +5443,7 @@ rx_body_309: mul rcx mov rax, rdx shr rax, 31 - mov rcx, rax - mov eax, r9d - xor eax, 09d7b8294h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r9, rax rx_i_310: ;FPMUL dec ebx @@ -5537,10 +5461,6 @@ rx_body_310: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm7, xmm0 - mov eax, r15d - xor eax, 07c9816c0h - and eax, 
32767 - movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_311: ;FPMUL dec ebx @@ -5573,7 +5493,11 @@ rx_body_312: mov ecx, eax mov eax, r14d imul rax, rcx - mov r10, rax + mov rcx, rax + mov eax, r10d + xor eax, 0bb93ffb8h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_313: ;ROR_64 dec ebx @@ -5587,11 +5511,7 @@ rx_body_313: and eax, 2047 mov rax, qword ptr [rsi+rax*8] ror rax, 62 - mov rcx, rax - mov eax, r14d - xor eax, 09500d514h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r14, rax rx_i_314: ;IMUL_32 dec ebx @@ -5661,6 +5581,10 @@ rx_body_317: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm7 movaps xmm5, xmm0 + mov eax, r13d + xor eax, 0b5bc8h + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_318: ;ROR_64 dec ebx @@ -5693,11 +5617,7 @@ rx_body_319: and eax, 2047 mov rax, qword ptr [rsi+rax*8] shr rax, 46 - mov rcx, rax - mov eax, r11d - xor eax, 01f931a08h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r11, rax rx_i_320: ;FPADD dec ebx @@ -5712,10 +5632,6 @@ rx_body_320: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm4 movaps xmm2, xmm0 - mov eax, r10d - xor eax, 02bdc7349h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm2 rx_i_321: ;IMUL_32 dec ebx @@ -5748,11 +5664,7 @@ rx_i_322: ;CALL rx_body_322: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r11d - xor eax, 054292224h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r11, rax cmp r11d, 1411981860 jno short rx_i_323 call rx_i_343 @@ -5772,11 +5684,7 @@ rx_body_323: mov rcx, r14 mul rcx mov rax, rdx - mov rcx, rax - mov eax, r14d - xor eax, 0d675c533h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r14, rax rx_i_324: ;FPDIV dec ebx @@ -5794,10 +5702,6 @@ rx_body_324: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm9, xmm0 - mov eax, r9d - xor eax, 0944856d4h - and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_325: ;OR_32 dec ebx @@ -5811,7 +5715,11 @@ rx_body_325: and eax, 2047 mov rax, qword ptr 
[rsi+rax*8] or eax, r8d - mov r13, rax + mov rcx, rax + mov eax, r13d + xor eax, 0ef376c54h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_326: ;MULH_64 dec ebx @@ -5828,11 +5736,7 @@ rx_body_326: mov rcx, -1233771581 mul rcx mov rax, rdx - mov rcx, rax - mov eax, r9d - xor eax, 0b67623c3h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r9, rax rx_i_327: ;IDIV_64 dec ebx @@ -5870,7 +5774,11 @@ rx_body_328: and eax, 2047 mov rax, qword ptr [rsi+rax*8] shr rax, 18 - mov r9, rax + mov rcx, rax + mov eax, r9d + xor eax, 04d159415h + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx rx_i_329: ;RET dec ebx @@ -5881,7 +5789,7 @@ rx_i_329: ;RET jnz short rx_body_329 call rx_read rx_body_329: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] mov r11, rax cmp rsp, rdi @@ -5903,11 +5811,7 @@ rx_body_330: movsxd rcx, eax mov rax, -1349816041 imul rax, rcx - mov rcx, rax - mov eax, r11d - xor eax, 0af8b7117h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r11, rax rx_i_331: ;FPADD dec ebx @@ -5938,10 +5842,6 @@ rx_body_332: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 movaps xmm3, xmm0 - mov eax, r11d - xor eax, 0116c919eh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_333: ;OR_64 dec ebx @@ -5956,7 +5856,11 @@ rx_body_333: and eax, 2047 mov rax, qword ptr [rsi+rax*8] or rax, r12 - mov r11, rax + mov rcx, rax + mov eax, r11d + xor eax, 0f58fcaa8h + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx rx_i_334: ;ADD_64 dec ebx @@ -5985,11 +5889,7 @@ rx_body_335: and eax, 32767 mov rax, qword ptr [rsi+rax*8] sub rax, r8 - mov rcx, rax - mov eax, r12d - xor eax, 07ffe4218h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r12, rax rx_i_336: ;ROR_64 dec ebx @@ -6041,7 +5941,11 @@ rx_body_338: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r12 - mov r11, rax + mov rcx, rax + mov eax, r11d + xor eax, 0184d2abbh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_339: ;FPADD dec ebx @@ 
-6066,10 +5970,14 @@ rx_i_340: ;FPADD jnz short rx_body_340 call rx_read rx_body_340: - and eax, 32767 + and eax, 131071 cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm5 movaps xmm5, xmm0 + mov eax, r13d + xor eax, 038b653beh + and eax, 2047 + movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_341: ;MUL_32 dec ebx @@ -6085,11 +5993,7 @@ rx_body_341: mov ecx, eax mov eax, r15d imul rax, rcx - mov rcx, rax - mov eax, r8d - xor eax, 024736405h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r8, rax rx_i_342: ;FPSUB dec ebx @@ -6137,6 +6041,10 @@ rx_body_344: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm6 movaps xmm5, xmm0 + mov eax, r13d + xor eax, 0627d9feah + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_345: ;MULH_64 dec ebx @@ -6147,7 +6055,7 @@ rx_i_345: ;MULH_64 jnz short rx_body_345 call rx_read rx_body_345: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] mov rcx, r13 mul rcx @@ -6170,11 +6078,7 @@ rx_body_346: and eax, 32767 mov rax, qword ptr [rsi+rax*8] and eax, r15d - mov rcx, rax - mov eax, r13d - xor eax, 0ed2d3987h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r13, rax rx_i_347: ;ADD_64 dec ebx @@ -6188,7 +6092,11 @@ rx_body_347: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r10 - mov r13, rax + mov rcx, rax + mov eax, r13d + xor eax, 0d529429ah + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_348: ;FPSUB dec ebx @@ -6220,7 +6128,11 @@ rx_body_349: and eax, 2047 mov rax, qword ptr [rsi+rax*8] or rax, r15 - mov r13, rax + mov rcx, rax + mov eax, r13d + xor eax, 05c449453h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_350: ;CALL dec ebx @@ -6233,11 +6145,7 @@ rx_i_350: ;CALL rx_body_350: and eax, 32767 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r12d - xor eax, 0c5901b43h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r12, rax cmp r9d, -980411581 ja short rx_i_351 call rx_i_352 @@ -6254,7 +6162,11 @@ rx_body_351: and eax, 2047 mov rax, qword 
ptr [rsi+rax*8] imul rax, r10 - mov r13, rax + mov rcx, rax + mov eax, r13d + xor eax, 0985ba4h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_352: ;FPADD dec ebx @@ -6288,10 +6200,6 @@ rx_body_353: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm2 movaps xmm7, xmm0 - mov eax, r15d - xor eax, 0b3c9f7aeh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm7 rx_i_354: ;MUL_32 dec ebx @@ -6307,7 +6215,11 @@ rx_body_354: mov ecx, eax mov eax, r13d imul rax, rcx - mov r13, rax + mov rcx, rax + mov eax, r13d + xor eax, 049cc2e0ch + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_355: ;MUL_64 dec ebx @@ -6321,11 +6233,7 @@ rx_body_355: and eax, 32767 mov rax, qword ptr [rsi+rax*8] imul rax, r14 - mov rcx, rax - mov eax, r8d - xor eax, 0c1062b3ch - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r8, rax rx_i_356: ;MUL_64 dec ebx @@ -6410,6 +6318,10 @@ rx_body_360: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm8, xmm0 + mov eax, r8d + xor eax, 0c41a4103h + and eax, 131071 + movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_361: ;FPDIV dec ebx @@ -6427,10 +6339,6 @@ rx_body_361: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm6, xmm0 - mov eax, r14d - xor eax, 0ad0b81f5h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_362: ;SUB_64 dec ebx @@ -6481,7 +6389,11 @@ rx_body_364: mov ecx, eax mov eax, r8d imul rax, rcx - mov r8, rax + mov rcx, rax + mov eax, r8d + xor eax, 0bb8ee9ch + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_365: ;IMUL_32 dec ebx @@ -6497,11 +6409,7 @@ rx_body_365: movsxd rcx, eax movsxd rax, r9d imul rax, rcx - mov rcx, rax - mov eax, r12d - xor eax, 0bfd87d37h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r12, rax rx_i_366: ;IMUL_32 dec ebx @@ -6517,11 +6425,7 @@ rx_body_366: movsxd rcx, eax movsxd rax, r8d imul rax, rcx - mov rcx, rax - mov eax, r15d - xor eax, 0c3d6bcb7h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r15, rax rx_i_367: ;ROR_64 dec ebx @@ -6536,7 +6440,11 @@ 
rx_body_367: and eax, 2047 mov rax, qword ptr [rsi+rax*8] ror rax, 18 - mov r12, rax + mov rcx, rax + mov eax, r12d + xor eax, 0ad9b92e8h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_368: ;SUB_32 dec ebx @@ -6586,10 +6494,6 @@ rx_body_370: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm6 movaps xmm6, xmm0 - mov eax, r14d - xor eax, 0a120e0edh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_371: ;FPADD dec ebx @@ -6658,6 +6562,10 @@ rx_body_374: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm2, xmm0 + mov eax, r10d + xor eax, 03507e810h + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm2 rx_i_375: ;ADD_64 dec ebx @@ -6668,14 +6576,10 @@ rx_i_375: ;ADD_64 jnz short rx_body_375 call rx_read rx_body_375: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] add rax, -332030999 - mov rcx, rax - mov eax, r12d - xor eax, 0ec359be9h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r12, rax rx_i_376: ;ADD_64 dec ebx @@ -6718,7 +6622,7 @@ rx_i_378: ;MUL_32 jnz short rx_body_378 call rx_read rx_body_378: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r14d @@ -6738,7 +6642,11 @@ rx_body_379: mov rax, qword ptr [rsi+rax*8] mov rcx, r9 ror rax, cl - mov r13, rax + mov rcx, rax + mov eax, r13d + xor eax, 03a2dc429h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_380: ;MUL_64 dec ebx @@ -6771,7 +6679,11 @@ rx_body_381: and eax, 2047 mov rax, qword ptr [rsi+rax*8] xor eax, r14d - mov r9, rax + mov rcx, rax + mov eax, r9d + xor eax, 032349ff8h + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx rx_i_382: ;ROL_64 dec ebx @@ -6785,7 +6697,11 @@ rx_body_382: and eax, 32767 mov rax, qword ptr [rsi+rax*8] rol rax, 55 - mov r11, rax + mov rcx, rax + mov eax, r11d + xor eax, 0a6a2e0b1h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_383: ;FPSUB dec ebx @@ -6800,10 +6716,6 @@ rx_body_383: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm4 movaps xmm5, xmm0 - mov eax, 
r13d - xor eax, 0c9f5cc22h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm5 rx_i_384: ;XOR_64 dec ebx @@ -6817,11 +6729,7 @@ rx_body_384: and eax, 2047 mov rax, qword ptr [rsi+rax*8] xor rax, 1413715044 - mov rcx, rax - mov eax, r9d - xor eax, 054439464h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r9, rax rx_i_385: ;MUL_64 dec ebx @@ -6868,7 +6776,11 @@ rx_body_387: and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub eax, r15d - mov r9, rax + mov rcx, rax + mov eax, r9d + xor eax, 028cbb7adh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_388: ;RET dec ebx @@ -6881,11 +6793,7 @@ rx_i_388: ;RET rx_body_388: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r11d - xor eax, 0a0985cc2h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r11, rax cmp rsp, rdi je short rx_i_389 ret @@ -6901,7 +6809,11 @@ rx_i_389: ;JUMP rx_body_389: and eax, 32767 mov rax, qword ptr [rsi+rax*8] - mov r14, rax + mov rcx, rax + mov eax, r14d + xor eax, 0eb1a1f50h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp r9d, -350609584 jge rx_i_421 @@ -6918,6 +6830,10 @@ rx_body_390: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm4 movaps xmm3, xmm0 + mov eax, r11d + xor eax, 0e5c5acbbh + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_391: ;FPADD dec ebx @@ -6946,11 +6862,7 @@ rx_body_392: mov rax, qword ptr [rsi+rax*8] mov rcx, r9 sar rax, cl - mov rcx, rax - mov eax, r13d - xor eax, 08c4a0f0dh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r13, rax rx_i_393: ;AND_64 dec ebx @@ -6964,11 +6876,7 @@ rx_body_393: and eax, 2047 mov rax, qword ptr [rsi+rax*8] and rax, r12 - mov rcx, rax - mov eax, r13d - xor eax, 020ec085ch - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r13, rax rx_i_394: ;FPADD dec ebx @@ -7001,7 +6909,11 @@ rx_body_395: mul rcx mov rax, rdx shr rax, 28 - mov r8, rax + mov rcx, rax + mov eax, r8d + xor eax, 03802aa10h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx 
rx_i_396: ;ROR_64 dec ebx @@ -7034,11 +6946,7 @@ rx_body_397: and eax, 32767 mov rax, qword ptr [rsi+rax*8] sub eax, r12d - mov rcx, rax - mov eax, r10d - xor eax, 0146db5dfh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r10, rax rx_i_398: ;SHR_64 dec ebx @@ -7089,11 +6997,7 @@ rx_body_400: and eax, 32767 mov rax, qword ptr [rsi+rax*8] and rax, -1800645748 - mov rcx, rax - mov eax, r14d - xor eax, 094ac538ch - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r14, rax rx_i_401: ;FPSUB dec ebx @@ -7160,12 +7064,16 @@ rx_i_404: ;MUL_32 jnz short rx_body_404 call rx_read rx_body_404: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] mov ecx, eax mov eax, r8d imul rax, rcx - mov r15, rax + mov rcx, rax + mov eax, r15d + xor eax, 08f83c4f1h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_405: ;CALL dec ebx @@ -7178,11 +7086,7 @@ rx_i_405: ;CALL rx_body_405: and eax, 32767 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r12d - xor eax, 06b0af6c1h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r12, rax cmp r10d, 1795880641 ja short rx_i_406 call rx_i_494 @@ -7203,10 +7107,6 @@ rx_body_406: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm9, xmm0 - mov eax, r9d - xor eax, 09862adefh - and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_407: ;FPSUB dec ebx @@ -7232,14 +7132,10 @@ rx_i_408: ;MUL_64 jnz short rx_body_408 call rx_read rx_body_408: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] imul rax, rax, 693109961 - mov rcx, rax - mov eax, r10d - xor eax, 0295004c9h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r10, rax rx_i_409: ;MUL_64 dec ebx @@ -7254,7 +7150,11 @@ rx_body_409: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r15 - mov r8, rax + mov rcx, rax + mov eax, r8d + xor eax, 05a68b80fh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_410: ;RET dec ebx @@ -7283,7 +7183,11 @@ rx_i_411: ;RET rx_body_411: and eax, 2047 mov rax, qword ptr 
[rsi+rax*8] - mov r12, rax + mov rcx, rax + mov eax, r12d + xor eax, 0b492f6bah + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp rsp, rdi je short rx_i_412 ret @@ -7304,10 +7208,6 @@ rx_body_412: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm3, xmm0 - mov eax, r11d - xor eax, 0bbd2640ah - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm3 rx_i_413: ;FPMUL dec ebx @@ -7325,6 +7225,10 @@ rx_body_413: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm4, xmm0 + mov eax, r12d + xor eax, 043989376h + and eax, 131071 + movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_414: ;AND_64 dec ebx @@ -7339,11 +7243,7 @@ rx_body_414: and eax, 32767 mov rax, qword ptr [rsi+rax*8] and rax, -378293327 - mov rcx, rax - mov eax, r10d - xor eax, 0e973b3b1h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r10, rax rx_i_415: ;DIV_64 dec ebx @@ -7397,7 +7297,11 @@ rx_body_417: and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub rax, r12 - mov r10, rax + mov rcx, rax + mov eax, r10d + xor eax, 0dfa7569ch + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_418: ;MULH_64 dec ebx @@ -7408,7 +7312,7 @@ rx_i_418: ;MULH_64 jnz short rx_body_418 call rx_read rx_body_418: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] mov rcx, r11 mul rcx @@ -7424,7 +7328,7 @@ rx_i_419: ;OR_64 jnz short rx_body_419 call rx_read rx_body_419: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] or rax, r14 mov rcx, rax @@ -7445,7 +7349,11 @@ rx_body_420: and eax, 2047 mov rax, qword ptr [rsi+rax*8] ror rax, 38 - mov r9, rax + mov rcx, rax + mov eax, r9d + xor eax, 08f7bb3ech + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_421: ;CALL dec ebx @@ -7478,7 +7386,11 @@ rx_body_422: movsxd rcx, eax movsxd rax, r10d imul rax, rcx - mov r13, rax + mov rcx, rax + mov eax, r13d + xor eax, 07c614e2h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_423: ;MUL_64 dec ebx @@ -7492,11 +7404,7 @@ rx_body_423: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r10 - 
mov rcx, rax - mov eax, r15d - xor eax, 0a5d40d0ah - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r15, rax rx_i_424: ;FPADD dec ebx @@ -7512,10 +7420,6 @@ rx_body_424: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm7 movaps xmm9, xmm0 - mov eax, r9d - xor eax, 0565ae8aah - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_425: ;IMUL_32 dec ebx @@ -7531,7 +7435,11 @@ rx_body_425: movsxd rcx, eax mov rax, 1776029069 imul rax, rcx - mov r14, rax + mov rcx, rax + mov eax, r14d + xor eax, 069dc0d8dh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_426: ;IDIV_64 dec ebx @@ -7552,11 +7460,7 @@ rx_body_426: sar rax, 27 sets dl add rax, rdx - mov rcx, rax - mov eax, r14d - xor eax, 0dcca31efh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r14, rax rx_i_427: ;MUL_32 dec ebx @@ -7573,11 +7477,7 @@ rx_body_427: mov ecx, eax mov eax, -2146332428 imul rax, rcx - mov rcx, rax - mov eax, r9d - xor eax, 0801190f4h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r9, rax rx_i_428: ;RET dec ebx @@ -7590,11 +7490,7 @@ rx_i_428: ;RET rx_body_428: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r8d - xor eax, 0e3b86b2fh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r8, rax cmp rsp, rdi je short rx_i_429 ret @@ -7611,7 +7507,11 @@ rx_body_429: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r9 - mov r15, rax + mov rcx, rax + mov eax, r15d + xor eax, 076a3ad84h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_430: ;FPADD dec ebx @@ -7695,10 +7595,6 @@ rx_body_434: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm9, xmm0 - mov eax, r9d - xor eax, 08c1cfc74h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm9 rx_i_435: ;MUL_64 dec ebx @@ -7732,10 +7628,6 @@ rx_body_436: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm2 movaps xmm7, xmm0 - mov eax, r15d - xor eax, 0bfa76c43h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm7 rx_i_437: ;FPMUL dec ebx @@ -7771,6 +7663,10 
@@ rx_body_438: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm4, xmm0 + mov eax, r12d + xor eax, 0b7c490eeh + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm4 rx_i_439: ;OR_64 dec ebx @@ -7784,11 +7680,7 @@ rx_body_439: and eax, 2047 mov rax, qword ptr [rsi+rax*8] or rax, -1299288575 - mov rcx, rax - mov eax, r10d - xor eax, 0b28e6e01h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r10, rax rx_i_440: ;CALL dec ebx @@ -7802,7 +7694,11 @@ rx_body_440: xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov r9, rax + mov rcx, rax + mov eax, r9d + xor eax, 07ed31f7ah + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx cmp r12d, 2127765370 jns short rx_i_441 call rx_i_41 @@ -7823,7 +7719,7 @@ rx_body_441: mov rcx, rax mov eax, r9d xor eax, 01f93242ch - and eax, 32767 + and eax, 131071 mov qword ptr [rsi + rax * 8], rcx rx_i_442: ;FPSQRT @@ -7851,11 +7747,7 @@ rx_i_443: ;RET rx_body_443: and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r9d - xor eax, 04f71c419h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r9, rax cmp rsp, rdi je short rx_i_444 ret @@ -7873,10 +7765,6 @@ rx_body_444: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm7 movaps xmm5, xmm0 - mov eax, r13d - xor eax, 0ce416070h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm5 rx_i_445: ;ADD_64 dec ebx @@ -7890,11 +7778,7 @@ rx_body_445: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r11 - mov rcx, rax - mov eax, r9d - xor eax, 084d1f575h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r9, rax rx_i_446: ;MUL_32 dec ebx @@ -7962,7 +7846,11 @@ rx_body_449: and eax, 2047 mov rax, qword ptr [rsi+rax*8] rol rax, 28 - mov r8, rax + mov rcx, rax + mov eax, r8d + xor eax, 089e19790h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_450: ;SAR_64 dec ebx @@ -7977,11 +7865,7 @@ rx_body_450: mov rax, qword ptr [rsi+rax*8] mov rcx, r12 sar rax, cl - mov rcx, rax - mov eax, r8d - xor eax, 0f6de92ach - and eax, 2047 - mov qword 
ptr [rsi + rax * 8], rcx + mov r8, rax rx_i_451: ;ADD_64 dec ebx @@ -7996,7 +7880,11 @@ rx_body_451: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r10 - mov r8, rax + mov rcx, rax + mov eax, r8d + xor eax, 0eedd10b3h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_452: ;RET dec ebx @@ -8009,11 +7897,7 @@ rx_i_452: ;RET rx_body_452: and eax, 32767 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r11d - xor eax, 0e27dea25h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r11, rax cmp rsp, rdi je short rx_i_453 ret @@ -8027,7 +7911,7 @@ rx_i_453: ;DIV_64 jnz short rx_body_453 call rx_read rx_body_453: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] ; magic divide by 380157076 shr rax, 2 @@ -8035,7 +7919,11 @@ rx_body_453: mul rcx mov rax, rdx shr rax, 24 - mov r8, rax + mov rcx, rax + mov eax, r8d + xor eax, 016a8bc94h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_454: ;FPADD dec ebx @@ -8050,10 +7938,6 @@ rx_body_454: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 movaps xmm4, xmm0 - mov eax, r12d - xor eax, 07e41c60fh - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_455: ;OR_64 dec ebx @@ -8082,11 +7966,7 @@ rx_body_456: and eax, 2047 mov rax, qword ptr [rsi+rax*8] and rax, 401943615 - mov rcx, rax - mov eax, r9d - xor eax, 017f52c3fh - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r9, rax rx_i_457: ;SUB_64 dec ebx @@ -8116,11 +7996,15 @@ rx_i_458: ;SAR_64 jnz short rx_body_458 call rx_read rx_body_458: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] mov rcx, r8 sar rax, cl - mov r14, rax + mov rcx, rax + mov eax, r14d + xor eax, 028f0a8ch + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_459: ;MUL_64 dec ebx @@ -8152,11 +8036,7 @@ rx_body_460: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add eax, -347784553 - mov rcx, rax - mov eax, r12d - xor eax, 0eb453a97h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r12, rax rx_i_461: ;XOR_64 
dec ebx @@ -8188,7 +8068,11 @@ rx_body_462: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add rax, r8 - mov r15, rax + mov rcx, rax + mov eax, r15d + xor eax, 098a05350h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_463: ;ADD_32 dec ebx @@ -8202,7 +8086,11 @@ rx_body_463: and eax, 2047 mov rax, qword ptr [rsi+rax*8] add eax, r15d - mov r10, rax + mov rcx, rax + mov eax, r10d + xor eax, 0c8204c90h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_464: ;MUL_64 dec ebx @@ -8216,11 +8104,7 @@ rx_body_464: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r15 - mov rcx, rax - mov eax, r13d - xor eax, 0d0673df8h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r13, rax rx_i_465: ;FPADD dec ebx @@ -8235,6 +8119,10 @@ rx_body_465: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm5 movaps xmm2, xmm0 + mov eax, r10d + xor eax, 0d11c1242h + and eax, 32767 + movhpd qword ptr [rsi + rax * 8], xmm2 rx_i_466: ;IMUL_32 dec ebx @@ -8287,11 +8175,7 @@ rx_body_468: mul rcx mov rax, rdx shr rax, 31 - mov rcx, rax - mov eax, r8d - xor eax, 0ff339c77h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r8, rax rx_i_469: ;IMUL_32 dec ebx @@ -8302,7 +8186,7 @@ rx_i_469: ;IMUL_32 jnz short rx_body_469 call rx_read rx_body_469: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] movsxd rcx, eax movsxd rax, r9d @@ -8322,14 +8206,10 @@ rx_i_470: ;OR_64 jnz short rx_body_470 call rx_read rx_body_470: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] or rax, r11 - mov rcx, rax - mov eax, r14d - xor eax, 090d56b4ch - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx + mov r14, rax rx_i_471: ;IMUL_32 dec ebx @@ -8420,6 +8300,10 @@ rx_body_476: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm2 movaps xmm9, xmm0 + mov eax, r9d + xor eax, 0b01bb14ch + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_477: ;FPADD dec ebx @@ -8434,10 +8318,6 @@ rx_body_477: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 
movaps xmm6, xmm0 - mov eax, r14d - xor eax, 0e81fc7a6h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm6 rx_i_478: ;MUL_64 dec ebx @@ -8448,7 +8328,7 @@ rx_i_478: ;MUL_64 jnz short rx_body_478 call rx_read rx_body_478: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] imul rax, r10 mov r12, rax @@ -8465,11 +8345,7 @@ rx_body_479: and eax, 2047 mov rax, qword ptr [rsi+rax*8] imul rax, r14 - mov rcx, rax - mov eax, r13d - xor eax, 0c42735ech - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r13, rax rx_i_480: ;FPADD dec ebx @@ -8516,7 +8392,11 @@ rx_body_482: and eax, 2047 mov rax, qword ptr [rsi+rax*8] and eax, 1304556205 - mov r11, rax + mov rcx, rax + mov eax, r11d + xor eax, 04dc1f2adh + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_483: ;FPADD dec ebx @@ -8531,6 +8411,10 @@ rx_body_483: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm6 movaps xmm6, xmm0 + mov eax, r14d + xor eax, 0545908cah + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm6 rx_i_484: ;SHR_64 dec ebx @@ -8545,7 +8429,11 @@ rx_body_484: and eax, 2047 mov rax, qword ptr [rsi+rax*8] shr rax, 37 - mov r11, rax + mov rcx, rax + mov eax, r11d + xor eax, 074a50ee0h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_485: ;JUMP dec ebx @@ -8562,7 +8450,7 @@ rx_body_485: mov rcx, rax mov eax, r15d xor eax, 02112cbaeh - and eax, 32767 + and eax, 131071 mov qword ptr [rsi + rax * 8], rcx cmp r8d, 554879918 jno rx_i_58 @@ -8597,7 +8485,11 @@ rx_body_487: and eax, 2047 mov rax, qword ptr [rsi+rax*8] sub rax, r9 - mov r11, rax + mov rcx, rax + mov eax, r11d + xor eax, 0ec228e26h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_488: ;DIV_64 dec ebx @@ -8629,11 +8521,7 @@ rx_body_489: xor rbp, rax and eax, 2047 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r8d - xor eax, 0bcd0a942h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r8, rax cmp r15d, -1127175870 jge rx_i_75 @@ -8651,11 +8539,7 @@ rx_body_490: mov rax, qword ptr 
[rsi+rax*8] mov rcx, r9 ror rax, cl - mov rcx, rax - mov eax, r15d - xor eax, 0ab8bd68h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r15, rax rx_i_491: ;FPADD dec ebx @@ -8696,7 +8580,11 @@ rx_body_492: sar rax, 30 sets dl add rax, rdx - mov r12, rax + mov rcx, rax + mov eax, r12d + xor eax, 095f0b181h + and eax, 2047 + mov qword ptr [rsi + rax * 8], rcx rx_i_493: ;FPSUB dec ebx @@ -8711,6 +8599,10 @@ rx_body_493: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm9 movaps xmm4, xmm0 + mov eax, r12d + xor eax, 02feb2fd7h + and eax, 2047 + movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_494: ;MUL_32 dec ebx @@ -8744,6 +8636,10 @@ rx_body_495: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm8, xmm0 + mov eax, r8d + xor eax, 02d12bd27h + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_496: ;IDIV_64 dec ebx @@ -8800,10 +8696,6 @@ rx_body_498: cmpeqpd xmm1, xmm1 andps xmm0, xmm1 movaps xmm8, xmm0 - mov eax, r8d - xor eax, 09dc5a1f9h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm8 rx_i_499: ;IMUL_32 dec ebx @@ -8834,6 +8726,10 @@ rx_body_500: cvtdq2pd xmm0, qword ptr [rsi+rax*8] andps xmm0, xmm10 sqrtpd xmm2, xmm0 + mov eax, r10d + xor eax, 04a250342h + and eax, 32767 + movhpd qword ptr [rsi + rax * 8], xmm2 rx_i_501: ;XOR_64 dec ebx @@ -8863,13 +8759,9 @@ rx_i_502: ;RET call rx_read rx_body_502: xor rbp, rax - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r9d - xor eax, 08d85312h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx + mov r9, rax cmp rsp, rdi je short rx_i_503 ret @@ -8905,10 +8797,6 @@ rx_body_504: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm9 movaps xmm4, xmm0 - mov eax, r12d - xor eax, 0be8cbb18h - and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm4 rx_i_505: ;FPSUB dec ebx @@ -8941,6 +8829,10 @@ rx_body_506: cvtdq2pd xmm0, qword ptr [rsi+rax*8] subpd xmm0, xmm9 movaps xmm3, xmm0 + mov eax, r11d + xor eax, 05e890759h + and eax, 2047 + movlpd qword ptr [rsi + rax * 
8], xmm3 rx_i_507: ;RET dec ebx @@ -8991,7 +8883,10 @@ rx_body_509: or eax, 40896 mov dword ptr [rsp - 8], eax ldmxcsr dword ptr [rsp - 8] - mov r10, rcx + mov eax, r10d + xor eax, 06cd84each + and eax, 32767 + mov qword ptr [rsi + rax * 8], rcx rx_i_510: ;FPADD dec ebx @@ -9006,6 +8901,10 @@ rx_body_510: cvtdq2pd xmm0, qword ptr [rsi+rax*8] addpd xmm0, xmm2 movaps xmm9, xmm0 + mov eax, r9d + xor eax, 097614097h + and eax, 2047 + movlpd qword ptr [rsi + rax * 8], xmm9 rx_i_511: ;SHR_64 dec ebx @@ -9016,7 +8915,7 @@ rx_i_511: ;SHR_64 jnz short rx_body_511 call rx_read rx_body_511: - and eax, 32767 + and eax, 131071 mov rax, qword ptr [rsi+rax*8] shr rax, 56 mov r11, rax diff --git a/src/softAes.h b/src/softAes.h index 1f7bd99..e4b675e 100644 --- a/src/softAes.h +++ b/src/softAes.h @@ -26,3 +26,13 @@ __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon); __m128i soft_aesenc(__m128i in, __m128i key); __m128i soft_aesdec(__m128i in, __m128i key); + +template<bool soft> +inline __m128i aesenc(__m128i in, __m128i key) { + return soft ? soft_aesenc(in, key) : _mm_aesenc_si128(in, key); +} + +template<bool soft> +inline __m128i aesdec(__m128i in, __m128i key) { + return soft ? 
soft_aesdec(in, key) : _mm_aesdec_si128(in, key); +} \ No newline at end of file From bd0dba88a8f165e59633a2308845751c13a2c30e Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 20 Jan 2019 00:44:01 +0100 Subject: [PATCH 23/35] 4 scratchpad segments --- src/CompiledVirtualMachine.cpp | 7 +++- src/CompiledVirtualMachine.hpp | 2 +- src/InterpretedVirtualMachine.cpp | 2 +- src/InterpretedVirtualMachine.hpp | 2 +- src/JitCompilerX86.cpp | 28 +++++++-------- src/VirtualMachine.cpp | 9 +++-- src/VirtualMachine.hpp | 9 +++-- src/asm/program_epilogue_store.inc | 16 ++++----- src/asm/program_prologue_load.inc | 56 ++++++------------------------ src/main.cpp | 26 +++++++++++--- 10 files changed, 75 insertions(+), 82 deletions(-) diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index 28a3cca..5e87b50 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -33,7 +33,7 @@ namespace RandomX { mem.ds = ds; } - void CompiledVirtualMachine::initializeScratchpad(uint32_t index) { + void CompiledVirtualMachine::initializeScratchpad(uint8_t* scratchpad, int32_t index) { memcpy(scratchpad, mem.ds.dataset + ScratchpadSize * index, ScratchpadSize); } @@ -42,6 +42,11 @@ namespace RandomX { for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) { *(((uint32_t*)&reg) + i) = gen(); } + FPINIT(); + for (int i = 0; i < RegistersCount; ++i) { + reg.f[i].lo.f64 = (double)reg.f[i].lo.i64; + reg.f[i].hi.f64 = (double)reg.f[i].hi.i64; + } compiler.generateProgram(gen); mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7; mem.mx = *(((uint32_t*)seed) + 5); diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp index 98b0b78..f969732 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -39,7 +39,7 @@ namespace RandomX { } CompiledVirtualMachine(); void setDataset(dataset_t ds) override; - void initializeScratchpad(uint32_t index) override; + void initializeScratchpad(uint8_t* 
scratchpad, int32_t index) override; void initializeProgram(const void* seed) override; virtual void execute() override; void* getProgram() { diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 54d2279..d7e4fc4 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -69,7 +69,7 @@ namespace RandomX { } } - void InterpretedVirtualMachine::initializeScratchpad(uint32_t index) { + void InterpretedVirtualMachine::initializeScratchpad(uint8_t* scratchpad, int32_t index) { uint32_t startingBlock = (ScratchpadSize / CacheLineSize) * index; if (asyncWorker) { ILightClientAsyncWorker* worker = mem.ds.asyncWorker; diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index 7745cad..fba081a 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -42,7 +42,7 @@ namespace RandomX { InterpretedVirtualMachine(bool soft, bool async) : softAes(soft), asyncWorker(async) {} ~InterpretedVirtualMachine(); void setDataset(dataset_t ds) override; - void initializeScratchpad(uint32_t index) override; + void initializeScratchpad(uint8_t* scratchpad, int32_t index) override; void initializeProgram(const void* seed) override; void execute() override; const Program& getProgam() { diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 070d13a..ee91fc3 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -182,17 +182,17 @@ namespace RandomX { emitByte(0xe8); //xor rbp, rax } emitByte(0x25); //and eax, - if (instr.loca & 15) { + //if (instr.loca & 15) { if (instr.loca & 3) { emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad } else { emit(ScratchpadL2 - 1); //first 256 KiB of scratchpad } - } + /*} else { emit(ScratchpadL3 - 1); //whole scratchpad - } + }*/ } void JitCompilerX86::genar(Instruction& instr) { @@ -271,7 +271,7 @@ namespace RandomX { } void JitCompilerX86::gencr(Instruction& instr, bool rax = true) { - if (instr.locc & 16) 
{ //write to register emit(uint16_t(0x8b4c)); //mov if (rax) { emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax @@ -281,17 +281,17 @@ namespace RandomX { } } else { - if (instr.locc & 15) { - if (instr.locc & 3) { + //if (instr.locc & 7) { + if (instr.locc & 1) { scratchpadStoreR(instr, ScratchpadL1, rax); } else { scratchpadStoreR(instr, ScratchpadL2, rax); } - } + /*} else { scratchpadStoreR(instr, ScratchpadL3, rax); - } + }*/ } } @@ -319,18 +319,18 @@ namespace RandomX { } emit(uint16_t(0x280f)); //movaps emitByte(0xc0 + 8 * regc); // regc, xmm0 - if (instr.locc & 16) { //write to scratchpad - if (instr.locc & 15) { - if (instr.locc & 3) { //C.LOC.W + if (instr.locc & 8) { //write to scratchpad + //if (instr.locc & 7) { + if (instr.locc & 1) { //C.LOC.W scratchpadStoreF(instr, regc, ScratchpadL1, (instr.locc & 128)); //first 16 KiB of scratchpad } else { scratchpadStoreF(instr, regc, ScratchpadL2, (instr.locc & 128)); //first 256 KiB of scratchpad } - } - else { + //} + /*else { scratchpadStoreF(instr, regc, ScratchpadL3, (instr.locc & 128)); //whole scratchpad - } + }*/ } } diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp index 0cdc007..01de3d9 100644 --- a/src/VirtualMachine.cpp +++ b/src/VirtualMachine.cpp @@ -39,11 +39,16 @@ namespace RandomX { mem.ds.dataset = nullptr; } - void VirtualMachine::getResult(void* out) { + void VirtualMachine::getResult(void* scratchpad, size_t scratchpadSize, void* out) { constexpr size_t smallStateLength = sizeof(RegisterFile) / sizeof(uint64_t) + 8; alignas(16) uint64_t smallState[smallStateLength]; memcpy(smallState, &reg, sizeof(RegisterFile)); - hashAes1Rx4(scratchpad, ScratchpadSize, smallState + 24); + if (scratchpadSize > 0) { + hashAes1Rx4(scratchpad, scratchpadSize, smallState + 24); + } + else { + memset(smallState + 24, 0, 64); + } blake2b(out, ResultSize, smallState, sizeof(smallState), nullptr, 0); } } \ No newline at end of file diff --git 
a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp index 78f7cf6..fe48e13 100644 --- a/src/VirtualMachine.hpp +++ b/src/VirtualMachine.hpp @@ -28,10 +28,13 @@ namespace RandomX { VirtualMachine(); virtual ~VirtualMachine() {} virtual void setDataset(dataset_t ds) = 0; - virtual void initializeScratchpad(uint32_t index) = 0; + virtual void initializeScratchpad(uint8_t* scratchpad, int32_t index) = 0; + void setScratchpad(void* ptr) { + scratchpad = (convertible_t*)ptr; + } virtual void initializeProgram(const void* seed) = 0; virtual void execute() = 0; - void getResult(void*); + void getResult(void*, size_t, void*); const RegisterFile& getRegisterFile() { return reg; } @@ -39,6 +42,6 @@ namespace RandomX { DatasetReadFunc readDataset; alignas(16) RegisterFile reg; MemoryRegisters mem; - alignas(64) convertible_t scratchpad[ScratchpadLength]; + convertible_t* scratchpad; }; } \ No newline at end of file diff --git a/src/asm/program_epilogue_store.inc b/src/asm/program_epilogue_store.inc index 90b26ce..95a4752 100644 --- a/src/asm/program_epilogue_store.inc +++ b/src/asm/program_epilogue_store.inc @@ -12,12 +12,12 @@ mov qword ptr [rcx+40], r13 mov qword ptr [rcx+48], r14 mov qword ptr [rcx+56], r15 - movdqa xmmword ptr [rcx+64], xmm8 - movdqa xmmword ptr [rcx+80], xmm9 - movdqa xmmword ptr [rcx+96], xmm2 - movdqa xmmword ptr [rcx+112], xmm3 + movapd xmmword ptr [rcx+64], xmm8 + movapd xmmword ptr [rcx+80], xmm9 + movapd xmmword ptr [rcx+96], xmm2 + movapd xmmword ptr [rcx+112], xmm3 lea rcx, [rcx+64] - movdqa xmmword ptr [rcx+64], xmm4 - movdqa xmmword ptr [rcx+80], xmm5 - movdqa xmmword ptr [rcx+96], xmm6 - movdqa xmmword ptr [rcx+112], xmm7 \ No newline at end of file + movapd xmmword ptr [rcx+64], xmm4 + movapd xmmword ptr [rcx+80], xmm5 + movapd xmmword ptr [rcx+96], xmm6 + movapd xmmword ptr [rcx+112], xmm7 \ No newline at end of file diff --git a/src/asm/program_prologue_load.inc b/src/asm/program_prologue_load.inc index ef4f96e..9ceeed6 100644 --- 
a/src/asm/program_prologue_load.inc +++ b/src/asm/program_prologue_load.inc @@ -1,14 +1,10 @@ mov rdi, rsp ;# beginning of VM stack - mov ebx, 1048577 ;# number of VM instructions to execute + 1 + mov ebx, 262145 ;# number of VM instructions to execute + 1 xorps xmm10, xmm10 cmpeqpd xmm10, xmm10 psrlq xmm10, 1 ;# mask for absolute value = 0x7fffffffffffffff7fffffffffffffff - ;# reset rounding mode - mov dword ptr [rsp-8], 40896 - ldmxcsr dword ptr [rsp-8] - ;# load integer registers mov r8, qword ptr [rcx+0] mov r9, qword ptr [rcx+8] @@ -19,45 +15,13 @@ mov r14, qword ptr [rcx+48] mov r15, qword ptr [rcx+56] - ;# initialize floating point registers - xorps xmm8, xmm8 - cvtsi2sd xmm8, qword ptr [rcx+72] - pslldq xmm8, 8 - cvtsi2sd xmm8, qword ptr [rcx+64] - - xorps xmm9, xmm9 - cvtsi2sd xmm9, qword ptr [rcx+88] - pslldq xmm9, 8 - cvtsi2sd xmm9, qword ptr [rcx+80] - - xorps xmm2, xmm2 - cvtsi2sd xmm2, qword ptr [rcx+104] - pslldq xmm2, 8 - cvtsi2sd xmm2, qword ptr [rcx+96] - - xorps xmm3, xmm3 - cvtsi2sd xmm3, qword ptr [rcx+120] - pslldq xmm3, 8 - cvtsi2sd xmm3, qword ptr [rcx+112] - + ;# load floating point registers + movapd xmm8, xmmword ptr [rcx+64] + movapd xmm9, xmmword ptr [rcx+80] + movapd xmm2, xmmword ptr [rcx+96] + movapd xmm3, xmmword ptr [rcx+112] lea rcx, [rcx+64] - - xorps xmm4, xmm4 - cvtsi2sd xmm4, qword ptr [rcx+72] - pslldq xmm4, 8 - cvtsi2sd xmm4, qword ptr [rcx+64] - - xorps xmm5, xmm5 - cvtsi2sd xmm5, qword ptr [rcx+88] - pslldq xmm5, 8 - cvtsi2sd xmm5, qword ptr [rcx+80] - - xorps xmm6, xmm6 - cvtsi2sd xmm6, qword ptr [rcx+104] - pslldq xmm6, 8 - cvtsi2sd xmm6, qword ptr [rcx+96] - - xorps xmm7, xmm7 - cvtsi2sd xmm7, qword ptr [rcx+120] - pslldq xmm7, 8 - cvtsi2sd xmm7, qword ptr [rcx+112] \ No newline at end of file + movapd xmm4, xmmword ptr [rcx+64] + movapd xmm5, xmmword ptr [rcx+80] + movapd xmm6, xmmword ptr [rcx+96] + movapd xmm7, xmmword ptr [rcx+112] diff --git a/src/main.cpp b/src/main.cpp index 5edb0df..84c76c8 100644 --- 
a/src/main.cpp +++ b/src/main.cpp @@ -130,7 +130,7 @@ void generateAsm(int nonce) { asmX86.printCode(std::cout); } -void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash& result, int noncesCount, int thread) { +void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash& result, int noncesCount, int thread, uint8_t* scratchpad) { uint64_t hash[4]; unsigned char blockTemplate[] = { 0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14, @@ -146,11 +146,20 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash *noncePtr = nonce; blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8); - vm->initializeScratchpad(spIndex); + vm->initializeScratchpad(scratchpad, spIndex); vm->initializeProgram(hash); //dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt"); + vm->setScratchpad(scratchpad + 3 * RandomX::ScratchpadSize / 4); vm->execute(); - vm->getResult(hash); + vm->setScratchpad(scratchpad + 2 * RandomX::ScratchpadSize / 4); + vm->execute(); + vm->getResult(nullptr, 0, hash); + vm->initializeProgram(hash); + vm->setScratchpad(scratchpad + 1 * RandomX::ScratchpadSize / 4); + vm->execute(); + vm->setScratchpad(scratchpad + 0 * RandomX::ScratchpadSize / 4); + vm->execute(); + vm->getResult(scratchpad, RandomX::ScratchpadSize, hash); result.xorWith(hash); if (RandomX::trace) { std::cout << "Nonce: " << nonce << " "; @@ -274,18 +283,25 @@ int main(int argc, char** argv) { vm->setDataset(dataset); vms.push_back(vm); } + uint8_t* scratchpadMem; + if (largePages) { + scratchpadMem = (uint8_t*)allocLargePagesMemory(RandomX::ScratchpadSize * (threadCount + 1) / 2); + } + else { + scratchpadMem = (uint8_t*)_mm_malloc(threadCount * RandomX::ScratchpadSize, RandomX::CacheLineSize); + } std::cout << "Running benchmark (" 
<< programCount << " programs) ..." << std::endl; sw.restart(); if (threadCount > 1) { for (int i = 0; i < vms.size(); ++i) { - threads.push_back(std::thread(&mine, vms[i], std::ref(atomicNonce), std::ref(result), programCount, i)); + threads.push_back(std::thread(&mine, vms[i], std::ref(atomicNonce), std::ref(result), programCount, i, scratchpadMem + RandomX::ScratchpadSize * i)); } for (int i = 0; i < threads.size(); ++i) { threads[i].join(); } } else { - mine(vms[0], std::ref(atomicNonce), std::ref(result), programCount, 0); + mine(vms[0], std::ref(atomicNonce), std::ref(result), programCount, 0, scratchpadMem); if (compiled) std::cout << "Average program size: " << ((RandomX::CompiledVirtualMachine*)vms[0])->getTotalSize() / programCount << std::endl; } From d2cb08622105118e8567f838b80ed00fd85fc35c Mon Sep 17 00:00:00 2001 From: tevador Date: Thu, 24 Jan 2019 19:29:59 +0100 Subject: [PATCH 24/35] ASM code generator for "small" programs that fit into the uOP cache --- src/AssemblyGeneratorX86.cpp | 804 ++- src/AssemblyGeneratorX86.hpp | 66 +- src/CompiledVirtualMachine.cpp | 29 +- src/Instruction.cpp | 413 +- src/Instruction.hpp | 60 +- src/InterpretedVirtualMachine.cpp | 325 +- src/JitCompilerX86.cpp | 4 +- src/Program.cpp | 1 - src/common.hpp | 16 +- src/executeProgram-win64.asm | 245 +- src/instructionWeights.hpp | 88 +- src/main.cpp | 50 +- src/program.inc | 9610 +++-------------------------- 13 files changed, 1796 insertions(+), 9915 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 4a35dfb..f1c3de8 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -30,12 +30,20 @@ namespace RandomX { static const char* regR[8] = { "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" }; static const char* regR32[8] = { "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d" }; - static const char* regF[8] = { "xmm8", "xmm9", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" }; + static const char* 
regFE[8] = { "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" }; + static const char* regF[4] = { "xmm0", "xmm1", "xmm2", "xmm3" }; + static const char* regE[4] = { "xmm4", "xmm5", "xmm6", "xmm7" }; + static const char* regA[4] = { "xmm8", "xmm9", "xmm10", "xmm11" }; + static const char* regA4 = "xmm12"; + static const char* dblMin = "xmm13"; + static const char* absMask = "xmm14"; + static const char* signMask = "xmm15"; static const char* regMx = "rbp"; - static const char* regIc = "ebx"; + static const char* regIc = "rbx"; + static const char* regIc32 = "ebx"; static const char* regIc8 = "bl"; - static const char* regStackBeginAddr = "rdi"; + static const char* regDatasetAddr = "rdi"; static const char* regScratchpadAddr = "rsi"; void AssemblyGeneratorX86::generateProgram(const void* seed) { @@ -49,226 +57,217 @@ namespace RandomX { for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) { *(((uint32_t*)&instr) + j) = gen(); } + instr.src %= RegistersCount; + instr.dst %= RegistersCount; generateCode(instr, i); - asmCode << std::endl; + //asmCode << std::endl; } - if(ProgramLength > 0) - asmCode << "\tjmp rx_i_0" << std::endl; } void AssemblyGeneratorX86::generateCode(Instruction& instr, int i) { - asmCode << "rx_i_" << i << ": ;" << instr.getName() << std::endl; - asmCode << "\tdec " << regIc << std::endl; - asmCode << "\tjz rx_finish" << std::endl; + asmCode << "\t; " << instr; auto generator = engine[instr.opcode]; (this->*generator)(instr, i); } - void AssemblyGeneratorX86::gena(Instruction& instr, int i) { - asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; - asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl; - asmCode << "\ttest " << regIc8 << ", 63" << std::endl; - asmCode << "\tjnz short rx_body_" << i << std::endl; - asmCode << "\tcall rx_read" << std::endl; - asmCode << "rx_body_" << i << ":" << std::endl; - if 
((instr.loca & 192) == 0) - asmCode << "\txor " << regMx << ", rax" << std::endl; - if (instr.loca & 15) { - if (instr.loca & 3) { - asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; - } - else { - asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; - } + void AssemblyGeneratorX86::genAddressReg(Instruction& instr, const char* reg = "eax") { + asmCode << "\tmov " << reg << ", " << regR32[instr.src] << std::endl; + asmCode << "\tand " << reg << ", " << ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl; + } + + int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) { + return instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } + + //1 uOP + void AssemblyGeneratorX86::h_IADD_R(Instruction& instr, int i) { + if (instr.src != instr.dst) { + asmCode << "\tadd " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; } else { - asmCode << "\tand eax, " << (ScratchpadL3 - 1) << std::endl; + asmCode << "\tadd " << regR[instr.dst] << ", " << instr.imm32 << std::endl; } } - void AssemblyGeneratorX86::genar(Instruction& instr, int i) { - gena(instr, i); - asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rax*8]" << std::endl; - } - - - void AssemblyGeneratorX86::genaf(Instruction& instr, int i) { - gena(instr, i); - asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rax*8]" << std::endl; - } - - void AssemblyGeneratorX86::genbiashift(Instruction& instr, const char* instrx86) { - if (instr.locb & 1) { - asmCode << "\tmov rcx, " << regR[instr.regb % RegistersCount] << std::endl; - asmCode << "\t" << instrx86 << " rax, cl" << std::endl; - } else { - asmCode << "\t" << instrx86 << " rax, " << (instr.imm8 & 63) << std::endl;; - } - } - - void AssemblyGeneratorX86::genbia(Instruction& instr) { - if (instr.locb & 3) { - asmCode << regR[instr.regb % RegistersCount] << std::endl; - } else { - asmCode << instr.imm32 << std::endl;; - } - } - - void 
AssemblyGeneratorX86::genbia32(Instruction& instr) { - if (instr.locb & 3) { - asmCode << regR32[instr.regb % RegistersCount] << std::endl; + //2.75 uOP + void AssemblyGeneratorX86::h_IADD_M(Instruction& instr, int i) { + if (instr.src != instr.dst) { + genAddressReg(instr); + asmCode << "\tadd " << regR[instr.dst] << ", qword ptr [rsi+rax]" << std::endl; } else { - asmCode << instr.imm32 << std::endl;; + asmCode << "\tadd " << regR[instr.dst] << ", qword ptr [rsi+" << genAddressImm(instr) << "]" << std::endl; } } - void AssemblyGeneratorX86::genbf(Instruction& instr, const char* instrx86) { - asmCode << "\t" << instrx86 << " xmm0, " << regF[instr.regb % RegistersCount] << std::endl; + //1 uOP + void AssemblyGeneratorX86::h_IADD_RC(Instruction& instr, int i) { + asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << std::showpos << instr.imm32 << std::noshowpos << "]" << std::endl; } - void AssemblyGeneratorX86::gencr(Instruction& instr, bool rax = true) { - if (instr.locc & 16) { //write to register - asmCode << "\tmov " << regR[instr.regc % RegistersCount] << ", " << (rax ? "rax" : "rcx") << std::endl; - if (trace) { - asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], " << (rax ? 
"rax" : "rcx") << std::endl; - } + //1 uOP + void AssemblyGeneratorX86::h_ISUB_R(Instruction& instr, int i) { + if (instr.src != instr.dst) { + asmCode << "\tsub " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; } - else { //write to scratchpad - if (rax) - asmCode << "\tmov rcx, rax" << std::endl; - asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; - asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; - if (instr.locc & 15) { - if (instr.locc & 3) { - asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; - } - else { - asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; - } - } - else { - asmCode << "\tand eax, " << (ScratchpadL3 - 1) << std::endl; - } - asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + rax * 8], rcx" << std::endl; - if (trace) { - asmCode << "\tmov qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], rcx" << std::endl; - } + else { + asmCode << "\tsub " << regR[instr.dst] << ", " << instr.imm32 << std::endl; } } - void AssemblyGeneratorX86::gencf(Instruction& instr, bool move = true) { - if(move) - asmCode << "\tmovaps " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; - const char* store = (instr.locc & 128) ? 
"movhpd" : "movlpd"; - if (instr.locc & 16) { //write to scratchpad - asmCode << "\tmov eax, " << regR32[instr.regc % RegistersCount] << std::endl; - asmCode << "\txor eax, 0" << std::hex << instr.addrc << "h" << std::dec << std::endl; - if (instr.locc & 15) { - if (instr.locc & 3) { - asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; - } - else { - asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; - } - } - else { - asmCode << "\tand eax, " << (ScratchpadL3 - 1) << std::endl; - } - asmCode << "\t" << store << " qword ptr [" << regScratchpadAddr << " + rax * 8], " << regF[instr.regc % RegistersCount] << std::endl; + //2.75 uOP + void AssemblyGeneratorX86::h_ISUB_M(Instruction& instr, int i) { + if (instr.src != instr.dst) { + genAddressReg(instr); + asmCode << "\tsub " << regR[instr.dst] << ", qword ptr [rsi+rax]" << std::endl; } - if (trace) { - asmCode << "\t" << store << " qword ptr [" << regScratchpadAddr << " + " << regIc << " * 8 + 262136], " << regF[instr.regc % RegistersCount] << std::endl; + else { + asmCode << "\tsub " << regR[instr.dst] << ", qword ptr [rsi+" << genAddressImm(instr) << "]" << std::endl; } } - void AssemblyGeneratorX86::h_ADD_64(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\tadd rax, "; - genbia(instr); - gencr(instr); + //1 uOP + void AssemblyGeneratorX86::h_IMUL_9C(Instruction& instr, int i) { + asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.dst] << "*8" << std::showpos << instr.imm32 << std::noshowpos << "]" << std::endl; } - void AssemblyGeneratorX86::h_ADD_32(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\tadd eax, "; - genbia32(instr); - gencr(instr); - } - - void AssemblyGeneratorX86::h_SUB_64(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\tsub rax, "; - genbia(instr); - gencr(instr); - } - - void AssemblyGeneratorX86::h_SUB_32(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\tsub eax, "; - genbia32(instr); 
- gencr(instr); - } - - void AssemblyGeneratorX86::h_MUL_64(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\timul rax, "; - if ((instr.locb & 3) == 0) { - asmCode << "rax, "; + //1 uOP + void AssemblyGeneratorX86::h_IMUL_R(Instruction& instr, int i) { + if (instr.src != instr.dst) { + asmCode << "\timul " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + } + else { + asmCode << "\timul " << regR[instr.dst] << ", " << instr.imm32 << std::endl; } - genbia(instr); - gencr(instr); } - void AssemblyGeneratorX86::h_MULH_64(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\tmov rcx, "; - genbia(instr); - asmCode << "\tmul rcx" << std::endl; - asmCode << "\tmov rax, rdx" << std::endl; - gencr(instr); + //2.75 uOP + void AssemblyGeneratorX86::h_IMUL_M(Instruction& instr, int i) { + if (instr.src != instr.dst) { + genAddressReg(instr); + asmCode << "\timul " << regR[instr.dst] << ", qword ptr [rsi+rax]" << std::endl; + } + else { + asmCode << "\timul " << regR[instr.dst] << ", qword ptr [rsi+" << genAddressImm(instr) << "]" << std::endl; + } } - void AssemblyGeneratorX86::h_MUL_32(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\tmov ecx, eax" << std::endl; - asmCode << "\tmov eax, "; - genbia32(instr); - asmCode << "\timul rax, rcx" << std::endl; - gencr(instr); + //4 uOPs + void AssemblyGeneratorX86::h_IMULH_R(Instruction& instr, int i) { + if (instr.src != instr.dst) { + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\tmul " << regR[instr.src] << std::endl; + asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; + } + else { + asmCode << "\tmov eax, " << instr.imm32 << std::endl; + asmCode << "\tmul " << regR[instr.dst] << std::endl; + asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl; + } } - void AssemblyGeneratorX86::h_IMUL_32(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\tmovsxd rcx, eax" << std::endl; - if ((instr.locb & 3) == 0) { + //5.75 uOPs + 
void AssemblyGeneratorX86::h_IMULH_M(Instruction& instr, int i) { + if (instr.src != instr.dst) { + genAddressReg(instr, "ecx"); + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\tmul qword ptr [rsi+rcx]" << std::endl; + } + else { + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\tmul qword ptr [rsi+" << genAddressImm(instr) << "]" << std::endl; + } + asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; + } + + //4 uOPs + void AssemblyGeneratorX86::h_ISMULH_R(Instruction& instr, int i) { + if (instr.src != instr.dst) { + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\timul " << regR[instr.src] << std::endl; + asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; + } + else { asmCode << "\tmov rax, " << instr.imm32 << std::endl; + asmCode << "\timul " << regR[instr.dst] << std::endl; + asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl; + } + } + + //5.75 uOPs + void AssemblyGeneratorX86::h_ISMULH_M(Instruction& instr, int i) { + if (instr.src != instr.dst) { + genAddressReg(instr, "ecx"); + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\timul qword ptr [rsi+rcx]" << std::endl; } else { - asmCode << "\tmovsxd rax, " << regR32[instr.regb % RegistersCount] << std::endl; + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\timul qword ptr [rsi+" << genAddressImm(instr) << "]" << std::endl; } - asmCode << "\timul rax, rcx" << std::endl; - gencr(instr); + asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; } - void AssemblyGeneratorX86::h_IMULH_64(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\tmov rcx, "; - genbia(instr); - asmCode << "\timul rcx" << std::endl; - asmCode << "\tmov rax, rdx" << std::endl; - gencr(instr); + //1 uOP + void AssemblyGeneratorX86::h_INEG_R(Instruction& instr, int i) { + asmCode << "\tneg " << regR[instr.dst] << std::endl; } - void 
AssemblyGeneratorX86::h_DIV_64(Instruction& instr, int i) { - genar(instr, i); - if (instr.locb & 3) { -#ifdef MAGIC_DIVISION - if (instr.imm32 != 0) { - uint32_t divisor = instr.imm32; - asmCode << "\t; magic divide by " << divisor << std::endl; - if (divisor & (divisor - 1)) { - magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8); + //1 uOP + void AssemblyGeneratorX86::h_IXOR_R(Instruction& instr, int i) { + if (instr.src != instr.dst) { + asmCode << "\txor " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + } + else { + asmCode << "\txor " << regR[instr.dst] << ", " << instr.imm32 << std::endl; + } + } + + //2.75 uOP + void AssemblyGeneratorX86::h_IXOR_M(Instruction& instr, int i) { + if (instr.src != instr.dst) { + genAddressReg(instr); + asmCode << "\txor " << regR[instr.dst] << ", qword ptr [rsi+rax]" << std::endl; + } + else { + asmCode << "\txor " << regR[instr.dst] << ", qword ptr [rsi+" << genAddressImm(instr) << "]" << std::endl; + } + } + + //1.75 uOPs + void AssemblyGeneratorX86::h_IROR_R(Instruction& instr, int i) { + if (instr.src != instr.dst) { + asmCode << "\tmov ecx, " << regR32[instr.src] << std::endl; + asmCode << "\tror " << regR[instr.dst] << ", cl" << std::endl; + } + else { + asmCode << "\tror " << regR[instr.dst] << ", " << (instr.imm32 & 63) << std::endl; + } + } + + //1.75 uOPs + void AssemblyGeneratorX86::h_IROL_R(Instruction& instr, int i) { + if (instr.src != instr.dst) { + asmCode << "\tmov ecx, " << regR32[instr.src] << std::endl; + asmCode << "\trol " << regR[instr.dst] << ", cl" << std::endl; + } + else { + asmCode << "\trol " << regR[instr.dst] << ", " << (instr.imm32 & 63) << std::endl; + } + } + + //~6 uOPs + void AssemblyGeneratorX86::h_IDIV_C(Instruction& instr, int i) { + if (instr.imm32 != 0) { + uint32_t divisor = instr.imm32; + if (divisor & (divisor - 1)) { + magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8); + if (mi.pre_shift == 0 && !mi.increment) { + 
asmCode << "\tmov rax, " << mi.multiplier << std::endl; + asmCode << "\tmul " << regR[instr.dst] << std::endl; + } + else { + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; if (mi.pre_shift > 0) asmCode << "\tshr rax, " << mi.pre_shift << std::endl; if (mi.increment) { @@ -277,326 +276,249 @@ namespace RandomX { } asmCode << "\tmov rcx, " << mi.multiplier << std::endl; asmCode << "\tmul rcx" << std::endl; - asmCode << "\tmov rax, rdx" << std::endl; - if (mi.post_shift > 0) - asmCode << "\tshr rax, " << mi.post_shift << std::endl; - } - else { //divisor is a power of two - int shift = 0; - while (divisor >>= 1) - ++shift; - if(shift > 0) - asmCode << "\tshr rax, " << shift << std::endl; } + if (mi.post_shift > 0) + asmCode << "\tshr rdx, " << mi.post_shift << std::endl; + asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl; } -#else - if (instr.imm32 == 0) { - asmCode << "\tmov ecx, 1" << std::endl; - } - else { - asmCode << "\tmov ecx, " << instr.imm32 << std::endl; - } -#endif - } - else { - asmCode << "\tmov ecx, 1" << std::endl; - asmCode << "\tmov edx, " << regR32[instr.regb % RegistersCount] << std::endl; - asmCode << "\ttest edx, edx" << std::endl; - asmCode << "\tcmovne ecx, edx" << std::endl; -#ifdef MAGIC_DIVISION - asmCode << "\txor edx, edx" << std::endl; - asmCode << "\tdiv rcx" << std::endl; -#endif - } -#ifndef MAGIC_DIVISION - asmCode << "\txor edx, edx" << std::endl; - asmCode << "\tdiv rcx" << std::endl; -#endif - gencr(instr); - } - - void AssemblyGeneratorX86::h_IDIV_64(Instruction& instr, int i) { - genar(instr, i); - if (instr.locb & 3) { -#ifdef MAGIC_DIVISION - int64_t divisor = instr.imm32; - asmCode << "\t; magic divide by " << divisor << std::endl; - if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { - // +/- power of two - bool negative = divisor < 0; - if (negative) - divisor = -divisor; + else { //divisor is a power of two int shift = 0; - uint64_t unsignedDivisor = divisor; - while 
(unsignedDivisor >>= 1) + while (divisor >>= 1) ++shift; - if (shift > 0) { - asmCode << "\tmov rcx, rax" << std::endl; - asmCode << "\tsar rcx, 63" << std::endl; - uint32_t mask = (1ULL << shift) + 0xFFFFFFFF; - asmCode << "\tand ecx, 0" << std::hex << mask << std::dec << "h" << std::endl; - asmCode << "\tadd rax, rcx" << std::endl; - asmCode << "\tsar rax, " << shift << std::endl; - } - if (negative) - asmCode << "\tneg rax" << std::endl; + if(shift > 0) + asmCode << "\tshr " << regR[instr.dst] << ", " << shift << std::endl; } - else if (divisor != 0) { - magics_info mi = compute_signed_magic_info(divisor); - if ((divisor >= 0) != (mi.multiplier >= 0)) - asmCode << "\tmov rcx, rax" << std::endl; - asmCode << "\tmov rdx, " << mi.multiplier << std::endl; - asmCode << "\timul rdx" << std::endl; - asmCode << "\tmov rax, rdx" << std::endl; - asmCode << "\txor edx, edx" << std::endl; - bool haveSF = false; - if (divisor > 0 && mi.multiplier < 0) { - asmCode << "\tadd rax, rcx" << std::endl; - haveSF = true; - } - if (divisor < 0 && mi.multiplier > 0) { - asmCode << "\tsub rax, rcx" << std::endl; - haveSF = true; - } - if (mi.shift > 0) { - asmCode << "\tsar rax, " << mi.shift << std::endl; - haveSF = true; - } - if (!haveSF) - asmCode << "\ttest rax, rax" << std::endl; - asmCode << "\tsets dl" << std::endl; - asmCode << "\tadd rax, rdx" << std::endl; + } + } + + //~8.5 uOPs + void AssemblyGeneratorX86::h_ISDIV_C(Instruction& instr, int i) { + int64_t divisor = instr.imm32; + if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + // +/- power of two + bool negative = divisor < 0; + if (negative) + divisor = -divisor; + int shift = 0; + uint64_t unsignedDivisor = divisor; + while (unsignedDivisor >>= 1) + ++shift; + if (shift > 0) { + asmCode << "\tmov rcx, rax" << std::endl; + asmCode << "\tsar rcx, 63" << std::endl; + uint32_t mask = (1ULL << shift) + 0xFFFFFFFF; + asmCode << "\tand ecx, 
0" << std::hex << mask << std::dec << "h" << std::endl; + asmCode << "\tadd rax, rcx" << std::endl; + asmCode << "\tsar rax, " << shift << std::endl; } -#else - asmCode << "\tmov edx, " << instr.imm32 << std::endl; -#endif + if (negative) + asmCode << "\tneg rax" << std::endl; + asmCode << "\tadd " << regR[instr.dst] << ", rax" << std::endl; } - else { - asmCode << "\tmov edx, " << regR32[instr.regb % RegistersCount] << std::endl; -#ifndef MAGIC_DIVISION + else if (divisor != 0) { + magics_info mi = compute_signed_magic_info(divisor); + asmCode << "\tmov rax, " << mi.multiplier << std::endl; + asmCode << "\timul " << regR[instr.dst] << std::endl; + //asmCode << "\tmov rax, rdx" << std::endl; + asmCode << "\txor eax, eax" << std::endl; + bool haveSF = false; + if (divisor > 0 && mi.multiplier < 0) { + asmCode << "\tadd rdx, " << regR[instr.dst] << std::endl; + haveSF = true; + } + if (divisor < 0 && mi.multiplier > 0) { + asmCode << "\tsub rdx, " << regR[instr.dst] << std::endl; + haveSF = true; + } + if (mi.shift > 0) { + asmCode << "\tsar rdx, " << mi.shift << std::endl; + haveSF = true; + } + if (!haveSF) + asmCode << "\ttest rdx, rdx" << std::endl; + asmCode << "\tsets al" << std::endl; + asmCode << "\tadd rdx, rax" << std::endl; + asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl; } -#endif - asmCode << "\tcmp edx, -1" << std::endl; - asmCode << "\tjne short body_idiv_" << i << std::endl; - asmCode << "\tneg rax" << std::endl; - asmCode << "\tjmp short result_idiv_" << i << std::endl; - asmCode << "body_idiv_" << i << ":" << std::endl; - asmCode << "\tmov ecx, 1" << std::endl; - asmCode << "\ttest edx, edx" << std::endl; - asmCode << "\tcmovne ecx, edx" << std::endl; - asmCode << "\tmovsxd rcx, ecx" << std::endl; - asmCode << "\tcqo" << std::endl; - asmCode << "\tidiv rcx" << std::endl; - asmCode << "result_idiv_" << i << ":" << std::endl; -#ifdef MAGIC_DIVISION - } -#endif - gencr(instr); } - void AssemblyGeneratorX86::h_AND_64(Instruction& instr, 
int i) { - genar(instr, i); - asmCode << "\tand rax, "; - genbia(instr); - gencr(instr); + //1 uOPs + void AssemblyGeneratorX86::h_FPSWAP_R(Instruction& instr, int i) { + asmCode << "\tshufpd " << regFE[instr.dst] << ", " << regFE[instr.dst] << ", 1" << std::endl; } - void AssemblyGeneratorX86::h_AND_32(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\tand eax, "; - genbia32(instr); - gencr(instr); + //1 uOP + void AssemblyGeneratorX86::h_FPADD_R(Instruction& instr, int i) { + instr.dst %= 4; + instr.src %= 4; + asmCode << "\taddpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl; } - void AssemblyGeneratorX86::h_OR_64(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\tor rax, "; - genbia(instr); - gencr(instr); + //5 uOPs + void AssemblyGeneratorX86::h_FPADD_M(Instruction& instr, int i) { + instr.dst %= 4; + genAddressReg(instr); + asmCode << "\tcvtdq2pd xmm12, qword ptr [rsi+rax]" << std::endl; + asmCode << "\taddpd " << regF[instr.dst] << ", xmm12" << std::endl; } - void AssemblyGeneratorX86::h_OR_32(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\tor eax, "; - genbia32(instr); - gencr(instr); + //1 uOP + void AssemblyGeneratorX86::h_FPSUB_R(Instruction& instr, int i) { + instr.dst %= 4; + instr.src %= 4; + asmCode << "\tsubpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl; } - void AssemblyGeneratorX86::h_XOR_64(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\txor rax, "; - genbia(instr); - gencr(instr); + //5 uOPs + void AssemblyGeneratorX86::h_FPSUB_M(Instruction& instr, int i) { + instr.dst %= 4; + genAddressReg(instr); + asmCode << "\tcvtdq2pd xmm12, qword ptr [rsi+rax]" << std::endl; + asmCode << "\tsubpd " << regF[instr.dst] << ", xmm12" << std::endl; } - void AssemblyGeneratorX86::h_XOR_32(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\txor eax, "; - genbia32(instr); - gencr(instr); + //1 uOP + void AssemblyGeneratorX86::h_FPNEG_R(Instruction& instr, int i) 
{ + instr.dst %= 4; + asmCode << "\txorps " << regF[instr.dst] << ", " << signMask << std::endl; } - void AssemblyGeneratorX86::h_SHL_64(Instruction& instr, int i) { - genar(instr, i); - genbiashift(instr, "shl"); - gencr(instr); + //1 uOPs + void AssemblyGeneratorX86::h_FPMUL_R(Instruction& instr, int i) { + instr.dst %= 4; + instr.src %= 4; + asmCode << "\tmulpd " << regE[instr.dst] << ", " << regA[instr.src] << std::endl; } - void AssemblyGeneratorX86::h_SHR_64(Instruction& instr, int i) { - genar(instr, i); - genbiashift(instr, "shr"); - gencr(instr); + //6 uOPs + void AssemblyGeneratorX86::h_FPMUL_M(Instruction& instr, int i) { + instr.dst %= 4; + genAddressReg(instr); + asmCode << "\tcvtdq2pd xmm12, qword ptr [rsi+rax]" << std::endl; + asmCode << "\tmulpd " << regE[instr.dst] << ", xmm12" << std::endl; + asmCode << "\tmaxpd " << regE[instr.dst] << ", " << dblMin << std::endl; } - void AssemblyGeneratorX86::h_SAR_64(Instruction& instr, int i) { - genar(instr, i); - genbiashift(instr, "sar"); - gencr(instr); + //2 uOPs + void AssemblyGeneratorX86::h_FPDIV_R(Instruction& instr, int i) { + instr.dst %= 4; + instr.src %= 4; + asmCode << "\tdivpd " << regE[instr.dst] << ", " << regA[instr.src] << std::endl; + asmCode << "\tmaxpd " << regE[instr.dst] << ", " << dblMin << std::endl; } - void AssemblyGeneratorX86::h_ROL_64(Instruction& instr, int i) { - genar(instr, i); - genbiashift(instr, "rol"); - gencr(instr); + //6 uOPs + void AssemblyGeneratorX86::h_FPDIV_M(Instruction& instr, int i) { + instr.dst %= 4; + genAddressReg(instr); + asmCode << "\tcvtdq2pd xmm12, qword ptr [rsi+rax]" << std::endl; + asmCode << "\tdivpd " << regE[instr.dst] << ", xmm12" << std::endl; + asmCode << "\tmaxpd " << regE[instr.dst] << ", " << dblMin << std::endl; } - void AssemblyGeneratorX86::h_ROR_64(Instruction& instr, int i) { - genar(instr, i); - genbiashift(instr, "ror"); - gencr(instr); - } + //1 uOP + void AssemblyGeneratorX86::h_FPSQRT_R(Instruction& instr, int i) { + instr.dst %= 
4; + asmCode << "\tsqrtpd " << regE[instr.dst] << ", " << regE[instr.dst] << std::endl; + } - void AssemblyGeneratorX86::h_FPADD(Instruction& instr, int i) { - genaf(instr, i); - genbf(instr, "addpd"); - gencf(instr); - } - - void AssemblyGeneratorX86::h_FPSUB(Instruction& instr, int i) { - genaf(instr, i); - genbf(instr, "subpd"); - gencf(instr); - } - - void AssemblyGeneratorX86::h_FPMUL(Instruction& instr, int i) { - genaf(instr, i); - genbf(instr, "mulpd"); - asmCode << "\tmovaps xmm1, xmm0" << std::endl; - asmCode << "\tcmpeqpd xmm1, xmm1" << std::endl; - asmCode << "\tandps xmm0, xmm1" << std::endl; - gencf(instr); - } - - void AssemblyGeneratorX86::h_FPDIV(Instruction& instr, int i) { - genaf(instr, i); - genbf(instr, "divpd"); - asmCode << "\tmovaps xmm1, xmm0" << std::endl; - asmCode << "\tcmpeqpd xmm1, xmm1" << std::endl; - asmCode << "\tandps xmm0, xmm1" << std::endl; - gencf(instr); - } - - void AssemblyGeneratorX86::h_FPSQRT(Instruction& instr, int i) { - genaf(instr, i); - asmCode << "\tandps xmm0, xmm10" << std::endl; - asmCode << "\tsqrtpd " << regF[instr.regc % RegistersCount] << ", xmm0" << std::endl; - gencf(instr, false); - } - - void AssemblyGeneratorX86::h_FPROUND(Instruction& instr, int i) { - genar(instr, i); - asmCode << "\tmov rcx, rax" << std::endl; - int rotate = (13 - (instr.imm8 & 63)) & 63; + //6 uOPs + void AssemblyGeneratorX86::h_CFROUND(Instruction& instr, int i) { + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + int rotate = (13 - (instr.alt & 63)) & 63; if (rotate != 0) asmCode << "\trol rax, " << rotate << std::endl; asmCode << "\tand eax, 24576" << std::endl; asmCode << "\tor eax, 40896" << std::endl; - asmCode << "\tmov dword ptr [rsp - 8], eax" << std::endl; - asmCode << "\tldmxcsr dword ptr [rsp - 8]" << std::endl; - gencr(instr, false); + asmCode << "\tmov dword ptr [rsp-8], eax" << std::endl; + asmCode << "\tldmxcsr dword ptr [rsp-8]" << std::endl; } - static inline const char* jumpCondition(Instruction& 
instr, bool invert = false) { - switch ((instr.locb & 7) ^ invert) + static inline const char* condition(Instruction& instr, bool invert = false) { + switch (((instr.alt >> 2) & 7) ^ invert) { case 0: - return "jbe"; + return "be"; case 1: - return "ja"; + return "a"; case 2: - return "js"; + return "s"; case 3: - return "jns"; + return "ns"; case 4: - return "jo"; + return "o"; case 5: - return "jno"; + return "no"; case 6: - return "jl"; + return "l"; case 7: - return "jge"; + return "ge"; } } - void AssemblyGeneratorX86::h_JUMP(Instruction& instr, int i) { - genar(instr, i); - gencr(instr); - asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm32 << std::endl; - asmCode << "\t" << jumpCondition(instr); - asmCode << " rx_i_" << wrapInstr(i + (instr.imm8 & 127) + 2) << std::endl; + //4 uOPs + void AssemblyGeneratorX86::h_COND_R(Instruction& instr, int i) { + asmCode << "\txor ecx, ecx" << std::endl; + asmCode << "\tcmp " << regR32[instr.src] << ", " << instr.imm32 << std::endl; + asmCode << "\tset" << condition(instr) << " cl" << std::endl; + asmCode << "\tadd " << regR[instr.dst] << ", rcx" << std::endl; } - void AssemblyGeneratorX86::h_CALL(Instruction& instr, int i) { - genar(instr, i); - gencr(instr); - asmCode << "\tcmp " << regR32[instr.regb % RegistersCount] << ", " << instr.imm32 << std::endl; - asmCode << "\t" << jumpCondition(instr, true); - asmCode << " short rx_i_" << wrapInstr(i + 1) << std::endl; - asmCode << "\tcall rx_i_" << wrapInstr(i + (instr.imm8 & 127) + 2) << std::endl; - } - - void AssemblyGeneratorX86::h_RET(Instruction& instr, int i) { - genar(instr, i); - gencr(instr); - asmCode << "\tcmp rsp, " << regStackBeginAddr << std::endl; - asmCode << "\tje short rx_i_" << wrapInstr(i + 1) << std::endl; - asmCode << "\tret" << std::endl; + //6 uOPs + void AssemblyGeneratorX86::h_COND_M(Instruction& instr, int i) { + asmCode << "\txor ecx, ecx" << std::endl; + genAddressReg(instr); + asmCode << "\tcmp dword ptr [rsi+rax], 
" << instr.imm32 << std::endl; + asmCode << "\tset" << condition(instr) << " cl" << std::endl; + asmCode << "\tadd " << regR[instr.dst] << ", rcx" << std::endl; } #include "instructionWeights.hpp" #define INST_HANDLE(x) REPN(&AssemblyGeneratorX86::h_##x, WT(x)) InstructionGenerator AssemblyGeneratorX86::engine[256] = { - INST_HANDLE(ADD_64) - INST_HANDLE(ADD_32) - INST_HANDLE(SUB_64) - INST_HANDLE(SUB_32) - INST_HANDLE(MUL_64) - INST_HANDLE(MULH_64) - INST_HANDLE(MUL_32) - INST_HANDLE(IMUL_32) - INST_HANDLE(IMULH_64) - INST_HANDLE(DIV_64) - INST_HANDLE(IDIV_64) - INST_HANDLE(AND_64) - INST_HANDLE(AND_32) - INST_HANDLE(OR_64) - INST_HANDLE(OR_32) - INST_HANDLE(XOR_64) - INST_HANDLE(XOR_32) - INST_HANDLE(SHL_64) - INST_HANDLE(SHR_64) - INST_HANDLE(SAR_64) - INST_HANDLE(ROL_64) - INST_HANDLE(ROR_64) - INST_HANDLE(FPADD) - INST_HANDLE(FPSUB) - INST_HANDLE(FPMUL) - INST_HANDLE(FPDIV) - INST_HANDLE(FPSQRT) - INST_HANDLE(FPROUND) - INST_HANDLE(JUMP) - INST_HANDLE(CALL) - INST_HANDLE(RET) + //Integer + INST_HANDLE(IADD_R) + INST_HANDLE(IADD_M) + INST_HANDLE(IADD_RC) + INST_HANDLE(ISUB_R) + INST_HANDLE(ISUB_M) + INST_HANDLE(IMUL_9C) + INST_HANDLE(IMUL_R) + INST_HANDLE(IMUL_M) + INST_HANDLE(IMULH_R) + INST_HANDLE(IMULH_M) + INST_HANDLE(ISMULH_R) + INST_HANDLE(ISMULH_M) + INST_HANDLE(IDIV_C) + INST_HANDLE(ISDIV_C) + INST_HANDLE(INEG_R) + INST_HANDLE(IXOR_R) + INST_HANDLE(IXOR_M) + INST_HANDLE(IROR_R) + INST_HANDLE(IROL_R) + + //Common floating point + INST_HANDLE(FPSWAP_R) + + //Floating point group F + INST_HANDLE(FPADD_R) + INST_HANDLE(FPADD_M) + INST_HANDLE(FPSUB_R) + INST_HANDLE(FPSUB_M) + INST_HANDLE(FPNEG_R) + + //Floating point group E + INST_HANDLE(FPMUL_R) + INST_HANDLE(FPMUL_M) + INST_HANDLE(FPDIV_R) + INST_HANDLE(FPDIV_M) + INST_HANDLE(FPSQRT_R) + + //Control + INST_HANDLE(COND_R) + INST_HANDLE(COND_M) + INST_HANDLE(CFROUND) }; } \ No newline at end of file diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index d2e2eb0..2d3c9a6 100644 --- 
a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -47,39 +47,43 @@ namespace RandomX { void genbf(Instruction&, const char*); void gencr(Instruction&, bool); void gencf(Instruction&, bool); + void genAddressReg(Instruction&, const char*); + int32_t genAddressImm(Instruction&); void generateCode(Instruction&, int); - void h_ADD_64(Instruction&, int); - void h_ADD_32(Instruction&, int); - void h_SUB_64(Instruction&, int); - void h_SUB_32(Instruction&, int); - void h_MUL_64(Instruction&, int); - void h_MULH_64(Instruction&, int); - void h_MUL_32(Instruction&, int); - void h_IMUL_32(Instruction&, int); - void h_IMULH_64(Instruction&, int); - void h_DIV_64(Instruction&, int); - void h_IDIV_64(Instruction&, int); - void h_AND_64(Instruction&, int); - void h_AND_32(Instruction&, int); - void h_OR_64(Instruction&, int); - void h_OR_32(Instruction&, int); - void h_XOR_64(Instruction&, int); - void h_XOR_32(Instruction&, int); - void h_SHL_64(Instruction&, int); - void h_SHR_64(Instruction&, int); - void h_SAR_64(Instruction&, int); - void h_ROL_64(Instruction&, int); - void h_ROR_64(Instruction&, int); - void h_FPADD(Instruction&, int); - void h_FPSUB(Instruction&, int); - void h_FPMUL(Instruction&, int); - void h_FPDIV(Instruction&, int); - void h_FPSQRT(Instruction&, int); - void h_FPROUND(Instruction&, int); - void h_JUMP(Instruction&, int); - void h_CALL(Instruction&, int); - void h_RET(Instruction&, int); + void h_IADD_R(Instruction&, int); + void h_IADD_M(Instruction&, int); + void h_IADD_RC(Instruction&, int); + void h_ISUB_R(Instruction&, int); + void h_ISUB_M(Instruction&, int); + void h_IMUL_9C(Instruction&, int); + void h_IMUL_R(Instruction&, int); + void h_IMUL_M(Instruction&, int); + void h_IMULH_R(Instruction&, int); + void h_IMULH_M(Instruction&, int); + void h_ISMULH_R(Instruction&, int); + void h_ISMULH_M(Instruction&, int); + void h_IDIV_C(Instruction&, int); + void h_ISDIV_C(Instruction&, int); + void h_INEG_R(Instruction&, int); + void 
h_IXOR_R(Instruction&, int); + void h_IXOR_M(Instruction&, int); + void h_IROR_R(Instruction&, int); + void h_IROL_R(Instruction&, int); + void h_FPSWAP_R(Instruction&, int); + void h_FPADD_R(Instruction&, int); + void h_FPADD_M(Instruction&, int); + void h_FPSUB_R(Instruction&, int); + void h_FPSUB_M(Instruction&, int); + void h_FPNEG_R(Instruction&, int); + void h_FPMUL_R(Instruction&, int); + void h_FPMUL_M(Instruction&, int); + void h_FPDIV_R(Instruction&, int); + void h_FPDIV_M(Instruction&, int); + void h_FPSQRT_R(Instruction&, int); + void h_COND_R(Instruction&, int); + void h_COND_M(Instruction&, int); + void h_CFROUND(Instruction&, int); }; } \ No newline at end of file diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index 5e87b50..f0a63d1 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -25,6 +25,12 @@ along with RandomX. If not, see. namespace RandomX { + constexpr int mantissaSize = 52; + constexpr int exponentSize = 11; + constexpr uint64_t mantissaMask = (1ULL << mantissaSize) - 1; + constexpr uint64_t exponentMask = (1ULL << exponentSize) - 1; + constexpr int exponentBias = 1023; + CompiledVirtualMachine::CompiledVirtualMachine() { totalSize = 0; } @@ -37,25 +43,42 @@ namespace RandomX { memcpy(scratchpad, mem.ds.dataset + ScratchpadSize * index, ScratchpadSize); } + static uint64_t getSmallPositiveFloatBits(uint64_t entropy) { + auto exponent = entropy >> 60; //0..15 + auto mantissa = entropy & mantissaMask; + exponent += exponentBias; + exponent &= exponentMask; + exponent <<= mantissaSize; + return exponent | mantissa; + } + void CompiledVirtualMachine::initializeProgram(const void* seed) { Pcg32 gen(seed); for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) { *(((uint32_t*)®) + i) = gen(); } FPINIT(); - for (int i = 0; i < RegistersCount; ++i) { + /*for (int i = 0; i < RegistersCount / 2; ++i) { reg.f[i].lo.f64 = (double)reg.f[i].lo.i64; reg.f[i].hi.f64 = 
(double)reg.f[i].hi.i64; } + for (int i = 0; i < RegistersCount / 2; ++i) { + reg.g[i].lo.f64 = std::abs((double)reg.g[i].lo.i64); + reg.g[i].hi.f64 = std::abs((double)reg.g[i].hi.i64); + }*/ + for (int i = 0; i < RegistersCount / 2; ++i) { + reg.a[i].lo.u64 = getSmallPositiveFloatBits(reg.f[i].lo.u64); + reg.a[i].hi.u64 = getSmallPositiveFloatBits(reg.f[i].hi.u64); + } compiler.generateProgram(gen); mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7; mem.mx = *(((uint32_t*)seed) + 5); } void CompiledVirtualMachine::execute() { - //executeProgram(reg, mem, scratchpad, readDataset); + executeProgram(reg, mem, scratchpad, InstructionCount); totalSize += compiler.getCodeSize(); - compiler.getProgramFunc()(reg, mem, scratchpad); + //compiler.getProgramFunc()(reg, mem, scratchpad); #ifdef TRACEVM for (int32_t i = InstructionCount - 1; i >= 0; --i) { std::cout << std::hex << tracepad[i].u64 << std::endl; diff --git a/src/Instruction.cpp b/src/Instruction.cpp index b668a81..c766ffd 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -18,54 +18,391 @@ along with RandomX. If not, see. */ #include "Instruction.hpp" +#include "common.hpp" namespace RandomX { void Instruction::print(std::ostream& os) const { - os << " A: loc = " << std::dec << (loca & 7) << ", reg: " << (rega & 7) << std::endl; - os << " B: loc = " << (locb & 7) << ", reg: " << (regb & 7) << std::endl; - os << " C: loc = " << (locc & 7) << ", reg: " << (regc & 7) << std::endl; - os << " addra = " << std::hex << addra << std::endl; - os << " addrc = " << addrc << std::endl; - os << " imm8 = " << std::dec << (int)imm8 << std::endl; - os << " imm32 = " << imm32 << std::endl; + os << names[opcode] << " "; + auto handler = engine[opcode]; + (this->*handler)(os); + } + + void Instruction::genAddressReg(std::ostream& os) const { + os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)src << "]"; + } + + void Instruction::genAddressImm(std::ostream& os) const { + os << ((alt % 4) ? 
"L1" : "L2") << "[" << (imm32 & ((alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]"; + } + + void Instruction::h_IADD_R(std::ostream& os) const { + if (src != dst) { + os << "r" << (int)dst << ", r" << (int)src << std::endl; } + else { + os << "r" << (int)dst << ", " << imm32 << std::endl; + } + } + + void Instruction::h_IADD_M(std::ostream& os) const { + if (src != dst) { + os << "r" << (int)dst << ", "; + genAddressReg(os); + os << std::endl; + } + else { + os << "r" << (int)dst << ", "; + genAddressImm(os); + os << std::endl; + } + } + + void Instruction::h_IADD_RC(std::ostream& os) const { + os << "r" << (int)dst << ", r" << (int)src << ", " << imm32 << std::endl; + } + + //1 uOP + void Instruction::h_ISUB_R(std::ostream& os) const { + if (src != dst) { + os << "r" << (int)dst << ", r" << (int)src << std::endl; + } + else { + os << "r" << (int)dst << ", " << imm32 << std::endl; + } + } + + void Instruction::h_ISUB_M(std::ostream& os) const { + if (src != dst) { + os << "r" << (int)dst << ", "; + genAddressReg(os); + os << std::endl; + } + else { + os << "r" << (int)dst << ", "; + genAddressImm(os); + os << std::endl; + } + } + + void Instruction::h_IMUL_9C(std::ostream& os) const { + os << "r" << (int)dst << ", " << imm32 << std::endl; + } + + void Instruction::h_IMUL_R(std::ostream& os) const { + if (src != dst) { + os << "r" << (int)dst << ", r" << (int)src << std::endl; + } + else { + os << "r" << (int)dst << ", " << imm32 << std::endl; + } + } + + void Instruction::h_IMUL_M(std::ostream& os) const { + if (src != dst) { + os << "r" << (int)dst << ", "; + genAddressReg(os); + os << std::endl; + } + else { + os << "r" << (int)dst << ", "; + genAddressImm(os); + os << std::endl; + } + } + + void Instruction::h_IMULH_R(std::ostream& os) const { + if (src != dst) { + os << "r" << (int)dst << ", r" << (int)src << std::endl; + } + else { + os << "r" << (int)dst << ", " << imm32 << std::endl; + } + } + + void Instruction::h_IMULH_M(std::ostream& os) const { + 
if (src != dst) { + os << "r" << (int)dst << ", "; + genAddressReg(os); + os << std::endl; + } + else { + os << "r" << (int)dst << ", "; + genAddressImm(os); + os << std::endl; + } + } + + void Instruction::h_ISMULH_R(std::ostream& os) const { + if (src != dst) { + os << "r" << (int)dst << ", r" << (int)src << std::endl; + } + else { + os << "r" << (int)dst << ", " << imm32 << std::endl; + } + } + + void Instruction::h_ISMULH_M(std::ostream& os) const { + if (src != dst) { + os << "r" << (int)dst << ", "; + genAddressReg(os); + os << std::endl; + } + else { + os << "r" << (int)dst << ", "; + genAddressImm(os); + os << std::endl; + } + } + + void Instruction::h_INEG_R(std::ostream& os) const { + os << "r" << (int)dst << std::endl; + } + + void Instruction::h_IXOR_R(std::ostream& os) const { + if (src != dst) { + os << "r" << (int)dst << ", r" << (int)src << std::endl; + } + else { + os << "r" << (int)dst << ", " << imm32 << std::endl; + } + } + + void Instruction::h_IXOR_M(std::ostream& os) const { + if (src != dst) { + os << "r" << (int)dst << ", "; + genAddressReg(os); + os << std::endl; + } + else { + os << "r" << (int)dst << ", "; + genAddressImm(os); + os << std::endl; + } + } + + void Instruction::h_IROR_R(std::ostream& os) const { + if (src != dst) { + os << "r" << (int)dst << ", r" << (int)src << std::endl; + } + else { + os << "r" << (int)dst << ", " << (imm32 & 63) << std::endl; + } + } + + void Instruction::h_IROL_R(std::ostream& os) const { + if (src != dst) { + os << "r" << (int)dst << ", r" << (int)src << std::endl; + } + else { + os << "r" << (int)dst << ", " << (imm32 & 63) << std::endl; + } + } + + void Instruction::h_IDIV_C(std::ostream& os) const { + os << "r" << (int)dst << ", " << (uint32_t)imm32 << std::endl; + } + + void Instruction::h_ISDIV_C(std::ostream& os) const { + os << "r" << (int)dst << ", " << imm32 << std::endl; + } + + void Instruction::h_FPSWAP_R(std::ostream& os) const { + const char reg = (dst >= 4) ? 
'e' : 'f'; + auto dstIndex = dst % 4; + os << reg << dstIndex << std::endl; + } + + void Instruction::h_FPADD_R(std::ostream& os) const { + auto dstIndex = dst % 4; + auto srcIndex = src % 4; + os << "f" << dstIndex << ", a" << srcIndex << std::endl; + } + + void Instruction::h_FPADD_M(std::ostream& os) const { + auto dstIndex = dst % 4; + os << "f" << dstIndex << ", "; + genAddressReg(os); + os << std::endl; + } + + void Instruction::h_FPSUB_R(std::ostream& os) const { + auto dstIndex = dst % 4; + auto srcIndex = src % 4; + os << "f" << dstIndex << ", a" << srcIndex << std::endl; + } + + void Instruction::h_FPSUB_M(std::ostream& os) const { + auto dstIndex = dst % 4; + os << "f" << dstIndex << ", "; + genAddressReg(os); + os << std::endl; + } + + void Instruction::h_FPNEG_R(std::ostream& os) const { + auto dstIndex = dst % 4; + os << "f" << dstIndex << std::endl; + } + + void Instruction::h_FPMUL_R(std::ostream& os) const { + auto dstIndex = dst % 4; + auto srcIndex = src % 4; + os << "e" << dstIndex << ", a" << srcIndex << std::endl; + } + + void Instruction::h_FPMUL_M(std::ostream& os) const { + auto dstIndex = dst % 4; + os << "e" << dstIndex << ", "; + genAddressReg(os); + os << std::endl; + } + + void Instruction::h_FPDIV_R(std::ostream& os) const { + auto dstIndex = dst % 4; + auto srcIndex = src % 4; + os << "e" << dstIndex << ", a" << srcIndex << std::endl; + } + + void Instruction::h_FPDIV_M(std::ostream& os) const { + auto dstIndex = dst % 4; + os << "e" << dstIndex << ", "; + genAddressReg(os); + os << std::endl; + } + + void Instruction::h_FPSQRT_R(std::ostream& os) const { + auto dstIndex = dst % 4; + os << "e" << dstIndex << std::endl; + } + + void Instruction::h_CFROUND(std::ostream& os) const { + os << "r" << (int)dst << ", " << (alt & 63) << std::endl; + } + + static inline const char* condition(int index) { + switch (index) + { + case 0: + return "be"; + case 1: + return "ab"; + case 2: + return "sg"; + case 3: + return "ns"; + case 4: + return 
"of"; + case 5: + return "no"; + case 6: + return "lt"; + case 7: + return "ge"; + } + } + + void Instruction::h_COND_R(std::ostream& os) const { + os << "r" << (int)dst << ", " << condition((alt >> 2) & 7) << "(r" << (int)src << ", " << imm32 << ")" << std::endl; + } + + void Instruction::h_COND_M(std::ostream& os) const { + os << "r" << (int)dst << ", " << condition((alt >> 2) & 7) << "("; + genAddressReg(os); + os << ", " << imm32 << ")" << std::endl; + } #include "instructionWeights.hpp" #define INST_NAME(x) REPN(#x, WT(x)) +#define INST_HANDLE(x) REPN(&Instruction::h_##x, WT(x)) const char* Instruction::names[256] = { - INST_NAME(ADD_64) - INST_NAME(ADD_32) - INST_NAME(SUB_64) - INST_NAME(SUB_32) - INST_NAME(MUL_64) - INST_NAME(MULH_64) - INST_NAME(MUL_32) - INST_NAME(IMUL_32) - INST_NAME(IMULH_64) - INST_NAME(DIV_64) - INST_NAME(IDIV_64) - INST_NAME(AND_64) - INST_NAME(AND_32) - INST_NAME(OR_64) - INST_NAME(OR_32) - INST_NAME(XOR_64) - INST_NAME(XOR_32) - INST_NAME(SHL_64) - INST_NAME(SHR_64) - INST_NAME(SAR_64) - INST_NAME(ROL_64) - INST_NAME(ROR_64) - INST_NAME(FPADD) - INST_NAME(FPSUB) - INST_NAME(FPMUL) - INST_NAME(FPDIV) - INST_NAME(FPSQRT) - INST_NAME(FPROUND) - INST_NAME(JUMP) - INST_NAME(CALL) - INST_NAME(RET) + //Integer + INST_NAME(IADD_R) + INST_NAME(IADD_M) + INST_NAME(IADD_RC) + INST_NAME(ISUB_R) + INST_NAME(ISUB_M) + INST_NAME(IMUL_9C) + INST_NAME(IMUL_R) + INST_NAME(IMUL_M) + INST_NAME(IMULH_R) + INST_NAME(IMULH_M) + INST_NAME(ISMULH_R) + INST_NAME(ISMULH_M) + INST_NAME(IDIV_C) + INST_NAME(ISDIV_C) + INST_NAME(INEG_R) + INST_NAME(IXOR_R) + INST_NAME(IXOR_M) + INST_NAME(IROR_R) + INST_NAME(IROL_R) + + //Common floating point + INST_NAME(FPSWAP_R) + + //Floating point group F + INST_NAME(FPADD_R) + INST_NAME(FPADD_M) + INST_NAME(FPSUB_R) + INST_NAME(FPSUB_M) + INST_NAME(FPNEG_R) + + //Floating point group E + INST_NAME(FPMUL_R) + INST_NAME(FPMUL_M) + INST_NAME(FPDIV_R) + INST_NAME(FPDIV_M) + INST_NAME(FPSQRT_R) + + //Control + INST_NAME(COND_R) + 
INST_NAME(COND_M) + INST_NAME(CFROUND) + }; + + InstructionVisualizer Instruction::engine[256] = { + //Integer + INST_HANDLE(IADD_R) + INST_HANDLE(IADD_M) + INST_HANDLE(IADD_RC) + INST_HANDLE(ISUB_R) + INST_HANDLE(ISUB_M) + INST_HANDLE(IMUL_9C) + INST_HANDLE(IMUL_R) + INST_HANDLE(IMUL_M) + INST_HANDLE(IMULH_R) + INST_HANDLE(IMULH_M) + INST_HANDLE(ISMULH_R) + INST_HANDLE(ISMULH_M) + INST_HANDLE(IDIV_C) + INST_HANDLE(ISDIV_C) + INST_HANDLE(INEG_R) + INST_HANDLE(IXOR_R) + INST_HANDLE(IXOR_M) + INST_HANDLE(IROR_R) + INST_HANDLE(IROL_R) + + //Common floating point + INST_HANDLE(FPSWAP_R) + + //Floating point group F + INST_HANDLE(FPADD_R) + INST_HANDLE(FPADD_M) + INST_HANDLE(FPSUB_R) + INST_HANDLE(FPSUB_M) + INST_HANDLE(FPNEG_R) + + //Floating point group E + INST_HANDLE(FPMUL_R) + INST_HANDLE(FPMUL_M) + INST_HANDLE(FPDIV_R) + INST_HANDLE(FPDIV_M) + INST_HANDLE(FPSQRT_R) + + //Control + INST_HANDLE(COND_R) + INST_HANDLE(COND_M) + INST_HANDLE(CFROUND) }; } \ No newline at end of file diff --git a/src/Instruction.hpp b/src/Instruction.hpp index 33c2059..becb983 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -24,21 +24,17 @@ along with RandomX. If not, see. 
namespace RandomX { + class Instruction; + + typedef void(Instruction::*InstructionVisualizer)(std::ostream&) const; + class Instruction { public: uint8_t opcode; - uint8_t loca; - uint8_t rega; - uint8_t locb; - uint8_t regb; - uint8_t locc; - uint8_t regc; - uint8_t imm8; - int32_t addra; - union { - uint32_t addrc; - int32_t imm32; - }; + uint8_t dst; + uint8_t src; + uint8_t alt; + int32_t imm32; const char* getName() const { return names[opcode]; } @@ -49,8 +45,46 @@ namespace RandomX { private: void print(std::ostream&) const; static const char* names[256]; + static InstructionVisualizer engine[256]; + + void genAddressReg(std::ostream& os) const; + void genAddressImm(std::ostream& os) const; + + void h_IADD_R(std::ostream&) const; + void h_IADD_M(std::ostream&) const; + void h_IADD_RC(std::ostream&) const; + void h_ISUB_R(std::ostream&) const; + void h_ISUB_M(std::ostream&) const; + void h_IMUL_9C(std::ostream&) const; + void h_IMUL_R(std::ostream&) const; + void h_IMUL_M(std::ostream&) const; + void h_IMULH_R(std::ostream&) const; + void h_IMULH_M(std::ostream&) const; + void h_ISMULH_R(std::ostream&) const; + void h_ISMULH_M(std::ostream&) const; + void h_IDIV_C(std::ostream&) const; + void h_ISDIV_C(std::ostream&) const; + void h_INEG_R(std::ostream&) const; + void h_IXOR_R(std::ostream&) const; + void h_IXOR_M(std::ostream&) const; + void h_IROR_R(std::ostream&) const; + void h_IROL_R(std::ostream&) const; + void h_FPSWAP_R(std::ostream&) const; + void h_FPADD_R(std::ostream&) const; + void h_FPADD_M(std::ostream&) const; + void h_FPSUB_R(std::ostream&) const; + void h_FPSUB_M(std::ostream&) const; + void h_FPNEG_R(std::ostream&) const; + void h_FPMUL_R(std::ostream&) const; + void h_FPMUL_M(std::ostream&) const; + void h_FPDIV_R(std::ostream&) const; + void h_FPDIV_M(std::ostream&) const; + void h_FPSQRT_R(std::ostream&) const; + void h_COND_R(std::ostream&) const; + void h_COND_M(std::ostream&) const; + void h_CFROUND(std::ostream&) const; }; - 
static_assert(sizeof(Instruction) == 16, "Invalid alignment of struct Instruction"); + static_assert(sizeof(Instruction) == 8, "Invalid alignment of struct Instruction"); } \ No newline at end of file diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index d7e4fc4..d145e78 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -130,333 +130,10 @@ namespace RandomX { #endif } - convertible_t InterpretedVirtualMachine::loada(Instruction& instr) { - convertible_t& rega = reg.r[instr.rega % RegistersCount]; - rega.i64 ^= instr.addra; //sign-extend addra - addr_t addr = rega.u32; - - if ((ic % 64) == 0) { - addr = currentTransform->apply(addr); -#ifdef STATS - datasetAccess[mem.ma / (DatasetBlockCount / 256) / CacheLineSize]++; -#endif - readDataset(addr, mem, reg); - } - - if ((instr.loca & 192) == 0) { - mem.mx ^= addr; - } - - if (instr.loca & 3) { - return scratchpad[addr % ScratchpadL1]; - } - else { - return scratchpad[addr % ScratchpadL2]; - } - } - - convertible_t InterpretedVirtualMachine::loadbia(Instruction& instr) { - if (instr.locb & 3) { - return reg.r[instr.regb % RegistersCount]; - } - else { - convertible_t temp; - temp.i64 = instr.imm32; //sign-extend imm32 - return temp; - } - } - - convertible_t InterpretedVirtualMachine::loadbiashift(Instruction& instr) { - if (instr.locb & 1) { - return reg.r[instr.regb % RegistersCount]; - } - else { - convertible_t temp; - temp.u64 = instr.imm8; - return temp; - } - } - - convertible_t InterpretedVirtualMachine::loadbiadiv(Instruction& instr) { - if (instr.locb & 3) { - convertible_t temp; - temp.u64 = instr.imm32; - return temp; - } - else { - return reg.r[instr.regb % RegistersCount]; - } - } - - convertible_t& InterpretedVirtualMachine::getcr(Instruction& inst) { - addr_t addr; - switch (inst.locc & 7) - { - case 0: - addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc; - return scratchpad[addr % ScratchpadL2]; - - case 1: - case 2: - 
case 3: - addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc; - return scratchpad[addr % ScratchpadL1]; - - case 4: - case 5: - case 6: - case 7: - return reg.r[inst.regc % RegistersCount]; - } - } - - void InterpretedVirtualMachine::writecf(Instruction& inst, fpu_reg_t& regc) { - addr_t addr; - switch (inst.locc & 7) - { - case 4: - addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc; - scratchpad[addr % ScratchpadL2] = (inst.locc & 8) ? regc.hi : regc.lo; - break; - - case 5: - case 6: - case 7: - addr = reg.r[inst.regc % RegistersCount].u32 ^ inst.addrc; - scratchpad[addr % ScratchpadL1] = (inst.locc & 8) ? regc.hi : regc.lo; - - default: - break; - } - } - -#define ALU_RETIRE(x) x(a, b, c); \ - if(trace) std::cout << std::hex << /*a.u64 << " " << b.u64 << " " <<*/ c.u64 << std::endl; - -#define CHECK_NOP_FPDIV(b, c) -#ifndef STATS -#define CHECK_NOP_FPADD(b, c) -#define CHECK_NOP_FPSUB(b, c) -#define CHECK_NOP_FPMUL(b, c) -#else -#define CHECK_NOP_FPADD(b, c) bool loeq = (b.lo.u64 == c.lo.u64); bool hieq = (b.hi.u64 == c.hi.u64); count_FPADD_nop += loeq + hieq; if(loeq && hieq) count_FPADD_nop2++; -#define CHECK_NOP_FPSUB(b, c) bool loeq = ((b.lo.u64 & INT64_MAX) == (c.lo.u64 & INT64_MAX)); bool hieq = ((b.hi.u64 & INT64_MAX) == (c.hi.u64 & INT64_MAX)); count_FPSUB_nop += loeq + hieq; if(loeq && hieq) count_FPSUB_nop2++; -#define CHECK_NOP_FPMUL(b, c) bool loeq = (b.lo.u64 == c.lo.u64); bool hieq = (b.hi.u64 == c.hi.u64); count_FPMUL_nop += loeq + hieq; if(loeq && hieq) count_FPMUL_nop2++; -#endif - -#define FPU_RETIRE(x) x(a, b, c); \ - writecf(inst, c); \ - if(trace) { \ - std::cout << std::hex << ((inst.locc & 8) ? 
c.hi.u64 : c.lo.u64) << std::endl; \ - } \ - if(fpuCheck) { \ - if(c.hi.f64 != c.hi.f64 || c.lo.f64 != c.lo.f64) { \ - std::stringstream ss; \ - ss << "NaN result of " << #x << "(" << std::hex << a.u64 << ", " << b.hi.u64 << " " << b.lo.u64 << ") = " << c.hi.u64 << " " << c.lo.u64 << std::endl; \ - throw std::runtime_error(ss.str()); \ - } else if (std::fpclassify(c.hi.f64) == FP_SUBNORMAL || std::fpclassify(c.lo.f64) == FP_SUBNORMAL) {\ - std::stringstream ss; \ - ss << "Denormal result of " << #x << "(" << std::hex << a.u64 << ", " << b.hi.u64 << " " << b.lo.u64 << ") = " << c.hi.u64 << " " << c.lo.u64 << std::endl; \ - throw std::runtime_error(ss.str()); \ - } \ - } - -#ifdef STATS -#define INC_COUNT(x) count_##x++; -#else -#define INC_COUNT(x) -#endif - -#define FPU_RETIRE_FPSQRT(x) FPSQRT(a, b, c); \ - writecf(inst, c); \ - if(trace) std::cout << std::hex << ((inst.locc & 8) ? c.hi.u64 : c.lo.u64) << std::endl; - -#define FPU_RETIRE_FPROUND(x) FPROUND(a, b, c); \ - writecflo(inst, c); \ - if(trace) std::cout << std::hex << c.lo.u64 << std::endl; - -#define ALU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ - INC_COUNT(x) \ - convertible_t a = loada(inst); \ - convertible_t b = loadbia(inst); \ - convertible_t& c = getcr(inst); \ - ALU_RETIRE(x) \ - } - -#define ALU_INST_SR(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ - INC_COUNT(x) \ - convertible_t a = loada(inst); \ - convertible_t b = loadbiashift(inst); \ - convertible_t& c = getcr(inst); \ - ALU_RETIRE(x) \ - } - -#define ALU_INST_DIV(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ - INC_COUNT(x) \ - convertible_t a = loada(inst); \ - convertible_t b = loadbiadiv(inst); \ - convertible_t& c = getcr(inst); \ - ALU_RETIRE(x) \ - } - -#define FPU_INST(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ - INC_COUNT(x) \ - convertible_t a = loada(inst); \ - fpu_reg_t& b = reg.f[inst.regb % RegistersCount]; \ - fpu_reg_t btemp = b; \ - 
fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \ - FPU_RETIRE(x) \ - CHECK_NOP_##x(btemp, c) \ - } - -#define FPU_INST_NB(x) void InterpretedVirtualMachine::h_##x(Instruction& inst) { \ - INC_COUNT(x) \ - convertible_t a = loada(inst); \ - fpu_reg_t b; \ - fpu_reg_t& c = reg.f[inst.regc % RegistersCount]; \ - FPU_RETIRE_##x(x) \ - } - - ALU_INST(ADD_64) - ALU_INST(ADD_32) - ALU_INST(SUB_64) - ALU_INST(SUB_32) - ALU_INST(MUL_64) - ALU_INST(MULH_64) - ALU_INST(MUL_32) - ALU_INST(IMUL_32) - ALU_INST(IMULH_64) - ALU_INST_DIV(DIV_64) - ALU_INST_DIV(IDIV_64) - ALU_INST(AND_64) - ALU_INST(AND_32) - ALU_INST(OR_64) - ALU_INST(OR_32) - ALU_INST(XOR_64) - ALU_INST(XOR_32) - - ALU_INST_SR(SHL_64) - ALU_INST_SR(SHR_64) - ALU_INST_SR(SAR_64) - ALU_INST_SR(ROL_64) - ALU_INST_SR(ROR_64) - - FPU_INST(FPADD) - FPU_INST(FPSUB) - FPU_INST(FPMUL) - FPU_INST(FPDIV) - FPU_INST_NB(FPSQRT) - - void InterpretedVirtualMachine::h_FPROUND(Instruction& inst) { - convertible_t a = loada(inst); - convertible_t& c = getcr(inst); - c.u64 = a.u64; - if (trace) std::cout << std::hex << a.u64 << std::endl; - FPROUND(a, inst.imm8); - } - - void InterpretedVirtualMachine::h_JUMP(Instruction& inst) { - convertible_t a = loada(inst); - convertible_t& c = getcr(inst); - c.u64 = a.u64; - if (trace) std::cout << std::hex << a.u64 << std::endl; - if (JMP_COND(inst.locb, reg.r[inst.regb % RegistersCount], inst.imm32)) { -#ifdef STATS - count_JUMP_taken++; - count_jump_taken[inst.locb & 7]++; -#endif - pc += (inst.imm8 & 127) + 1; - pc = pc % ProgramLength; - } -#ifdef STATS - else { - count_JUMP_not_taken++; - count_jump_not_taken[inst.locb & 7]++; - } -#endif - } - - void InterpretedVirtualMachine::h_CALL(Instruction& inst) { - convertible_t a = loada(inst); - convertible_t& c = getcr(inst); - c.u64 = a.u64; - if (trace) std::cout << std::hex << a.u64 << std::endl; - if (JMP_COND(inst.locb, reg.r[inst.regb % RegistersCount], inst.imm32)) { -#ifdef STATS - count_CALL_taken++; - count_jump_taken[inst.locb & 
7]++; - count_retdepth = std::max(0, count_retdepth - 1); -#endif - stackPush(pc); -#ifdef STATS - count_max_stack = std::max(count_max_stack, (int)stack.size()); -#endif - pc += (inst.imm8 & 127) + 1; - pc = pc % ProgramLength; - } -#ifdef STATS - else { - count_CALL_not_taken++; - count_jump_not_taken[inst.locb & 7]++; - } -#endif - } - - void InterpretedVirtualMachine::h_RET(Instruction& inst) { - convertible_t a = loada(inst); - convertible_t& c = getcr(inst); - c.u64 = a.u64; - if (trace) std::cout << std::hex << a.u64 << std::endl; - if (stack.size() > 0) { -#ifdef STATS - count_RET_taken++; - count_retdepth++; - count_retdepth_max = std::max(count_retdepth_max, count_retdepth); -#endif - auto raddr = stackPopAddress(); - pc = raddr; - } -#ifdef STATS - else { - count_RET_stack_empty++; - } -#endif - } - #include "instructionWeights.hpp" #define INST_HANDLE(x) REPN(&InterpretedVirtualMachine::h_##x, WT(x)) InstructionHandler InterpretedVirtualMachine::engine[256] = { - INST_HANDLE(ADD_64) - INST_HANDLE(ADD_32) - INST_HANDLE(SUB_64) - INST_HANDLE(SUB_32) - INST_HANDLE(MUL_64) - INST_HANDLE(MULH_64) - INST_HANDLE(MUL_32) - INST_HANDLE(IMUL_32) - INST_HANDLE(IMULH_64) - INST_HANDLE(DIV_64) - INST_HANDLE(IDIV_64) - INST_HANDLE(AND_64) - INST_HANDLE(AND_32) - INST_HANDLE(OR_64) - INST_HANDLE(OR_32) - INST_HANDLE(XOR_64) - INST_HANDLE(XOR_32) - INST_HANDLE(SHL_64) - INST_HANDLE(SHR_64) - INST_HANDLE(SAR_64) - INST_HANDLE(ROL_64) - INST_HANDLE(ROR_64) - INST_HANDLE(FPADD) - INST_HANDLE(FPSUB) - INST_HANDLE(FPMUL) - INST_HANDLE(FPDIV) - INST_HANDLE(FPSQRT) - INST_HANDLE(FPROUND) - INST_HANDLE(JUMP) - INST_HANDLE(CALL) - INST_HANDLE(RET) + }; } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index ee91fc3..8776d61 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -38,9 +38,9 @@ along with RandomX. If not, see. 
namespace RandomX { -#if !defined(_M_X64) && !defined(__x86_64__) +#if true || !defined(_M_X64) && !defined(__x86_64__) JitCompilerX86::JitCompilerX86() { - throw std::runtime_error("JIT compiler only supports x86-64 CPUs"); + //throw std::runtime_error("JIT compiler only supports x86-64 CPUs"); } void JitCompilerX86::generateProgram(Pcg32& gen) { diff --git a/src/Program.cpp b/src/Program.cpp index 6e94fca..b78a5ee 100644 --- a/src/Program.cpp +++ b/src/Program.cpp @@ -30,7 +30,6 @@ namespace RandomX { void Program::print(std::ostream& os) const { for (int i = 0; i < RandomX::ProgramLength; ++i) { auto instr = programBuffer[i]; - os << std::dec << instr.getName() << " (" << i << "):" << std::endl; os << instr; } } diff --git a/src/common.hpp b/src/common.hpp index cffa53c..bf235ec 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -46,7 +46,7 @@ namespace RandomX { constexpr int CacheBlockCount = CacheSize / CacheLineSize; constexpr int BlockExpansionRatio = DatasetSize / CacheSize; constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount; - constexpr int DatasetIterations = 10; + constexpr int DatasetIterations = 3; #ifdef TRACE @@ -72,13 +72,15 @@ namespace RandomX { convertible_t hi; }; - constexpr int ProgramLength = 512; - constexpr uint32_t InstructionCount = 1024 * 1024; + constexpr int ProgramLength = 256; + constexpr uint32_t InstructionCount = 1024; constexpr uint32_t ScratchpadSize = 1024 * 1024; constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t); constexpr uint32_t ScratchpadL1 = ScratchpadSize / 64 / sizeof(convertible_t); constexpr uint32_t ScratchpadL2 = ScratchpadSize / 4 / sizeof(convertible_t); constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(convertible_t); + constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8; + constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8; constexpr uint32_t TransformationCount = 90; constexpr int RegistersCount = 8; @@ -118,17 +120,19 @@ namespace RandomX { 
struct RegisterFile { convertible_t r[RegistersCount]; - fpu_reg_t f[RegistersCount]; + fpu_reg_t f[RegistersCount / 2]; + fpu_reg_t g[RegistersCount / 2]; + fpu_reg_t a[RegistersCount / 2]; }; - static_assert(sizeof(RegisterFile) == 3 * RegistersCount * sizeof(convertible_t), "Invalid alignment of struct RandomX::RegisterFile"); + static_assert(sizeof(RegisterFile) == 256, "Invalid alignment of struct RandomX::RegisterFile"); typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, RegisterFile&); typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*); extern "C" { - void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, DatasetReadFunc); + void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t); } } diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 2da88b5..17e593d 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -21,16 +21,24 @@ _RANDOMX_EXECUTE_PROGRAM SEGMENT PAGE READ EXECUTE PUBLIC executeProgram +ALIGN 16 +minDbl: +db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0 +absMask: +db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127 +signMask: +db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128 + executeProgram PROC ; REGISTER ALLOCATION: ; rax -> temporary ; rbx -> "ic" ; rcx -> temporary ; rdx -> temporary - ; rsi -> convertible_t& scratchpad - ; rdi -> beginning of VM stack + ; rsi -> scratchpad pointer + ; rdi -> dataset pointer ; rbp -> "ma", "mx" - ; rsp -> end of VM stack + ; rsp -> stack pointer ; r8 -> "r0" ; r9 -> "r1" ; r10 -> "r2" @@ -39,32 +47,22 @@ executeProgram PROC ; r13 -> "r5" ; r14 -> "r6" ; r15 -> "r7" - ; xmm0 -> temporary - ; xmm1 -> temporary + ; xmm0 -> "f0" + ; xmm1 -> "f1" ; xmm2 -> "f2" ; xmm3 -> "f3" - ; xmm4 -> "f4" - ; xmm5 -> "f5" - ; xmm6 -> "f6" - ; xmm7 -> "f7" - ; xmm8 -> "f0" - ; xmm9 -> "f1" - ; xmm10 -> absolute value mask - - ; STACK STRUCTURE: - ; | - ; | - ; | saved 
registers - ; | - ; v - ; [rbx+8] RegisterFile& registerFile - ; [rbx+0] uint8_t* dataset - ; | - ; | - ; | VM stack - ; | - ; v - ; [rsp] last element of VM stack + ; xmm4 -> "e0" + ; xmm5 -> "e1" + ; xmm6 -> "e2" + ; xmm7 -> "e3" + ; xmm8 -> "a0" + ; xmm9 -> "a1" + ; xmm10 -> "a2" + ; xmm11 -> "a3" + ; xmm12 -> temporary + ; xmm13 -> DBL_MIN + ; xmm14 -> absolute value mask + ; xmm15 -> sign mask ; store callee-saved registers push rbx @@ -81,100 +79,117 @@ executeProgram PROC movdqu xmmword ptr [rsp+32], xmm8 movdqu xmmword ptr [rsp+16], xmm9 movdqu xmmword ptr [rsp+0], xmm10 + sub rsp, 80 + movdqu xmmword ptr [rsp+64], xmm11 + movdqu xmmword ptr [rsp+48], xmm12 + movdqu xmmword ptr [rsp+32], xmm13 + movdqu xmmword ptr [rsp+16], xmm14 + movdqu xmmword ptr [rsp+0], xmm15 ; function arguments push rcx ; RegisterFile& registerFile mov rbp, qword ptr [rdx] ; "mx", "ma" - mov rax, qword ptr [rdx+8] ; uint8_t* dataset - push rax + mov eax, ebp ; "mx" + mov rdi, qword ptr [rdx+8] ; uint8_t* dataset mov rsi, r8 ; convertible_t* scratchpad + mov rbx, r9 ; loop counter + + ;# zero integer registers + xor r8, r8 + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + + ;# load constant registers + lea rcx, [rcx+120] + movapd xmm8, xmmword ptr [rcx+72] + movapd xmm9, xmmword ptr [rcx+88] + movapd xmm10, xmmword ptr [rcx+104] + movapd xmm11, xmmword ptr [rcx+120] + movapd xmm13, xmmword ptr [minDbl] + movapd xmm14, xmmword ptr [absMask] + movapd xmm15, xmmword ptr [signMask] - mov rdi, rsp ; beginning of VM stack - mov ebx, 1048577 ; number of VM instructions to execute + 1 - - xorps xmm10, xmm10 - cmpeqpd xmm10, xmm10 - psrlq xmm10, 1 ; mask for absolute value = 0x7fffffffffffffff7fffffffffffffff - - ; reset rounding mode - mov dword ptr [rsp-8], 40896 - ldmxcsr dword ptr [rsp-8] - - ; load integer registers - mov r8, qword ptr [rcx+0] - mov r9, qword ptr [rcx+8] - mov r10, qword ptr [rcx+16] - mov r11, qword ptr [rcx+24] - 
mov r12, qword ptr [rcx+32] - mov r13, qword ptr [rcx+40] - mov r14, qword ptr [rcx+48] - mov r15, qword ptr [rcx+56] - - ; load register f0 hi, lo - xorps xmm8, xmm8 - cvtsi2sd xmm8, qword ptr [rcx+72] - pslldq xmm8, 8 - cvtsi2sd xmm8, qword ptr [rcx+64] - - ; load register f1 hi, lo - xorps xmm9, xmm9 - cvtsi2sd xmm9, qword ptr [rcx+88] - pslldq xmm9, 8 - cvtsi2sd xmm9, qword ptr [rcx+80] - - ; load register f2 hi, lo - xorps xmm2, xmm2 - cvtsi2sd xmm2, qword ptr [rcx+104] - pslldq xmm2, 8 - cvtsi2sd xmm2, qword ptr [rcx+96] - - ; load register f3 hi, lo - xorps xmm3, xmm3 - cvtsi2sd xmm3, qword ptr [rcx+120] - pslldq xmm3, 8 - cvtsi2sd xmm3, qword ptr [rcx+112] - - lea rcx, [rcx+64] - - ; load register f4 hi, lo - xorps xmm4, xmm4 - cvtsi2sd xmm4, qword ptr [rcx+72] - pslldq xmm4, 8 - cvtsi2sd xmm4, qword ptr [rcx+64] - - ; load register f5 hi, lo - xorps xmm5, xmm5 - cvtsi2sd xmm5, qword ptr [rcx+88] - pslldq xmm5, 8 - cvtsi2sd xmm5, qword ptr [rcx+80] - - ; load register f6 hi, lo - xorps xmm6, xmm6 - cvtsi2sd xmm6, qword ptr [rcx+104] - pslldq xmm6, 8 - cvtsi2sd xmm6, qword ptr [rcx+96] - - ; load register f7 hi, lo - xorps xmm7, xmm7 - cvtsi2sd xmm7, qword ptr [rcx+120] - pslldq xmm7, 8 - cvtsi2sd xmm7, qword ptr [rcx+112] - - jmp program_begin - - ; program body -ALIGN 64 program_begin: + xor eax, r8d ;# read address register 1 + and eax, 262080 + lea rcx, [rsi+rax] + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] + xor eax, r9d ;# read address register 2 + and eax, 262080 + lea rcx, [rsi+rax] + cvtdq2pd xmm0, qword ptr [rcx+0] + cvtdq2pd xmm1, qword ptr [rcx+8] + cvtdq2pd xmm2, qword ptr [rcx+16] + cvtdq2pd xmm3, qword ptr [rcx+24] + cvtdq2pd xmm4, qword ptr [rcx+32] + cvtdq2pd xmm5, qword ptr [rcx+40] + cvtdq2pd xmm6, qword ptr [rcx+48] + cvtdq2pd xmm7, qword ptr [rcx+56] 
+ andps xmm4, xmm14 + andps xmm5, xmm14 + andps xmm6, xmm14 + andps xmm7, xmm14 + + ;# 256 instructions include program.inc - -ALIGN 64 + + mov eax, r8d ;# read address register 1 + xor eax, r9d ;# read address register 2 + xor rbp, rax ;# modify "mx" + and rbp, -64 ;# align "mx" to the start of a cache line + mov edx, ebp ;# edx = mx + prefetchnta byte ptr [rdi+rdx] + ror rbp, 32 ;# swap "ma" and "mx" + mov edx, ebp ;# edx = ma + lea rcx, [rdi+rdx] ;# dataset cache line + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] + mov eax, r12d ;# write address register 1 + and eax, 262080 + lea rcx, [rsi+rax] + mov qword ptr [rcx+0], r8 + mov qword ptr [rcx+8], r9 + mov qword ptr [rcx+16], r10 + mov qword ptr [rcx+24], r11 + mov qword ptr [rcx+32], r12 + mov qword ptr [rcx+40], r13 + mov qword ptr [rcx+48], r14 + mov qword ptr [rcx+56], r15 + xor eax, r13d ;# write address register 2 + and eax, 262080 + lea rcx, [rsi+rax] + mulpd xmm0, xmm4 + mulpd xmm1, xmm5 + mulpd xmm2, xmm6 + mulpd xmm3, xmm7 + movapd xmmword ptr [rcx+0], xmm0 + movapd xmmword ptr [rcx+16], xmm1 + movapd xmmword ptr [rcx+32], xmm2 + movapd xmmword ptr [rcx+48], xmm3 + dec ebx + jnz program_begin + rx_finish: - ; unroll the stack - mov rsp, rdi - ; save VM register values pop rcx - pop rcx mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 mov qword ptr [rcx+16], r10 @@ -183,8 +198,8 @@ rx_finish: mov qword ptr [rcx+40], r13 mov qword ptr [rcx+48], r14 mov qword ptr [rcx+56], r15 - movdqa xmmword ptr [rcx+64], xmm8 - movdqa xmmword ptr [rcx+80], xmm9 + movdqa xmmword ptr [rcx+64], xmm0 + movdqa xmmword ptr [rcx+80], xmm1 movdqa xmmword ptr [rcx+96], xmm2 movdqa xmmword ptr [rcx+112], xmm3 lea rcx, [rcx+64] @@ -194,6 +209,12 @@ rx_finish: movdqa xmmword ptr [rcx+112], xmm7 ; load callee-saved registers + movdqu xmm15, 
xmmword ptr [rsp] + movdqu xmm14, xmmword ptr [rsp+16] + movdqu xmm13, xmmword ptr [rsp+32] + movdqu xmm12, xmmword ptr [rsp+48] + movdqu xmm11, xmmword ptr [rsp+64] + add rsp, 80 movdqu xmm10, xmmword ptr [rsp] movdqu xmm9, xmmword ptr [rsp+16] movdqu xmm8, xmmword ptr [rsp+32] diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index de027b7..242b5bd 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -19,46 +19,58 @@ along with RandomX. If not, see. #pragma once -#define WT_ADD_64 12 -#define WT_ADD_32 2 -#define WT_SUB_64 12 -#define WT_SUB_32 2 -#define WT_MUL_64 23 -#define WT_MULH_64 5 -#define WT_MUL_32 15 -#define WT_IMUL_32 15 -#define WT_IMULH_64 3 -#define WT_DIV_64 8 -#define WT_IDIV_64 8 -#define WT_AND_64 4 -#define WT_AND_32 2 -#define WT_OR_64 4 -#define WT_OR_32 2 -#define WT_XOR_64 4 -#define WT_XOR_32 2 -#define WT_SHL_64 3 -#define WT_SHR_64 3 -#define WT_SAR_64 3 -#define WT_ROL_64 6 -#define WT_ROR_64 6 -#define WT_FPADD 20 -#define WT_FPSUB 20 -#define WT_FPMUL 22 -#define WT_FPDIV 8 -#define WT_FPSQRT 6 -#define WT_FPROUND 2 -#define WT_JUMP 11 -#define WT_CALL 11 -#define WT_RET 12 +//Integer +#define WT_IADD_R 10 +#define WT_IADD_M 3 +#define WT_IADD_RC 12 +#define WT_ISUB_R 10 +#define WT_ISUB_M 3 +#define WT_IMUL_9C 12 +#define WT_IMUL_R 24 +#define WT_IMUL_M 8 +#define WT_IMULH_R 6 +#define WT_IMULH_M 2 +#define WT_ISMULH_R 6 +#define WT_ISMULH_M 2 +#define WT_IDIV_C 4 +#define WT_ISDIV_C 2 +#define WT_INEG_R 4 +#define WT_IXOR_R 15 +#define WT_IXOR_M 5 +#define WT_IROR_R 10 +#define WT_IROL_R 10 + +//Common floating point +#define WT_FPSWAP_R 6 + +//Floating point group F +#define WT_FPADD_R 18 +#define WT_FPADD_M 3 +#define WT_FPSUB_R 18 +#define WT_FPSUB_M 3 +#define WT_FPNEG_R 5 + +//Floating point group E +#define WT_FPMUL_R 18 +#define WT_FPMUL_M 3 +#define WT_FPDIV_R 6 +#define WT_FPDIV_M 1 +#define WT_FPSQRT_R 6 + +//Control +#define WT_COND_R 15 +#define WT_COND_M 5 +#define WT_CFROUND 
1 + #define WT_NOP 0 - -constexpr int wtSum = WT_ADD_64 + WT_ADD_32 + WT_SUB_64 + WT_SUB_32 + \ -WT_MUL_64 + WT_MULH_64 + WT_MUL_32 + WT_IMUL_32 + WT_IMULH_64 + \ -WT_DIV_64 + WT_IDIV_64 + WT_AND_64 + WT_AND_32 + WT_OR_64 + \ -WT_OR_32 + WT_XOR_64 + WT_XOR_32 + WT_SHL_64 + WT_SHR_64 + \ -WT_SAR_64 + WT_ROL_64 + WT_ROR_64 + WT_FPADD + WT_FPSUB + WT_FPMUL \ -+ WT_FPDIV + WT_FPSQRT + WT_FPROUND + WT_JUMP + WT_CALL + WT_RET + WT_NOP; +constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \ +WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \ +WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \ +WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \ +WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \ +WT_FPNEG_R + WT_FPMUL_R + WT_FPMUL_M + WT_FPDIV_R + WT_FPDIV_M + \ +WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_NOP; static_assert(wtSum == 256, "Sum of instruction weights must be 256"); diff --git a/src/main.cpp b/src/main.cpp index 84c76c8..0b09a74 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -34,6 +34,7 @@ along with RandomX. If not, see. 
#include #include "dataset.hpp" #include "Cache.hpp" +#include "Pcg32.hpp" const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; @@ -130,6 +131,27 @@ void generateAsm(int nonce) { asmX86.printCode(std::cout); } +void generateNative(int nonce) { + uint64_t hash[4]; + unsigned char blockTemplate[] = { + 0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14, + 0x5a, 0xc5, 0xfa, 0xd3, 0xaa, 0x3a, 0xf6, 0xea, 0x44, 0xc1, 0x18, 0x69, 0xdc, 0x4f, 0x85, 0x3f, 0x00, 0x2b, 0x2e, + 0xea, 0x00, 0x00, 0x00, 0x00, 0x77, 0xb2, 0x06, 0xa0, 0x2c, 0xa5, 0xb1, 0xd4, 0xce, 0x6b, 0xbf, 0xdf, 0x0a, 0xca, + 0xc3, 0x8b, 0xde, 0xd3, 0x4d, 0x2d, 0xcd, 0xee, 0xf9, 0x5c, 0xd2, 0x0c, 0xef, 0xc1, 0x2f, 0x61, 0xd5, 0x61, 0x09 + }; + int* noncePtr = (int*)(blockTemplate + 39); + *noncePtr = nonce; + blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); + RandomX::Program prog; + Pcg32 gen(hash); + prog.initialize(gen); + for (int i = 0; i < RandomX::ProgramLength; ++i) { + prog(i).dst %= 8; + prog(i).src %= 8; + } + std::cout << prog << std::endl; +} + void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash& result, int noncesCount, int thread, uint8_t* scratchpad) { uint64_t hash[4]; unsigned char blockTemplate[] = { @@ -147,18 +169,16 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8); vm->initializeScratchpad(scratchpad, spIndex); - vm->initializeProgram(hash); + //vm->initializeProgram(hash); //dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt"); - vm->setScratchpad(scratchpad + 3 * RandomX::ScratchpadSize / 4); - vm->execute(); - 
vm->setScratchpad(scratchpad + 2 * RandomX::ScratchpadSize / 4); - vm->execute(); - vm->getResult(nullptr, 0, hash); - vm->initializeProgram(hash); - vm->setScratchpad(scratchpad + 1 * RandomX::ScratchpadSize / 4); - vm->execute(); - vm->setScratchpad(scratchpad + 0 * RandomX::ScratchpadSize / 4); - vm->execute(); + for (int chain = 0; chain < 16; ++chain) { + vm->initializeProgram(hash); + int segment = hash[3] & 3; + vm->setScratchpad(scratchpad);// +segment * RandomX::ScratchpadSize / 4); + vm->execute(); + vm->getResult(nullptr, 0, hash); + } + //vm->initializeProgram(hash); vm->getResult(scratchpad, RandomX::ScratchpadSize, hash); result.xorWith(hash); if (RandomX::trace) { @@ -171,7 +191,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash } int main(int argc, char** argv) { - bool softAes, lightClient, genAsm, compiled, help, largePages, async, aesBench; + bool softAes, lightClient, genAsm, compiled, help, largePages, async, aesBench, genNative; int programCount, threadCount; readOption("--help", argc, argv, help); @@ -189,12 +209,18 @@ int main(int argc, char** argv) { readOption("--largePages", argc, argv, largePages); readOption("--async", argc, argv, async); readOption("--aesBench", argc, argv, aesBench); + readOption("--genNative", argc, argv, genNative); if (genAsm) { generateAsm(programCount); return 0; } + if (genNative) { + generateNative(programCount); + return 0; + } + if (softAes) std::cout << "Using software AES." 
<< std::endl; diff --git a/src/program.inc b/src/program.inc index f06ca58..a91240e 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,8923 +1,745 @@ -rx_i_0: ;CALL - dec ebx - jz rx_finish - xor r9, 0ca9788ah - mov eax, r9d - test bl, 63 - jnz short rx_body_0 - call rx_read -rx_body_0: - xor rbp, rax - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov r12, rax - cmp r11d, 445530481 - ja short rx_i_1 - call rx_i_30 - -rx_i_1: ;IDIV_64 - dec ebx - jz rx_finish - xor r15, 06afc2fa4h - mov eax, r15d - test bl, 63 - jnz short rx_body_1 - call rx_read -rx_body_1: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov edx, r10d - cmp edx, -1 - jne short body_idiv_1 - neg rax - jmp short result_idiv_1 -body_idiv_1: - mov ecx, 1 - test edx, edx - cmovne ecx, edx - movsxd rcx, ecx - cqo - idiv rcx -result_idiv_1: - mov r12, rax - -rx_i_2: ;JUMP - dec ebx - jz rx_finish - xor r15, 097210f7bh - mov eax, r15d - test bl, 63 - jnz short rx_body_2 - call rx_read -rx_body_2: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r9d - xor eax, 05060ccf7h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r11d, 1348521207 - jno rx_i_47 - -rx_i_3: ;FPDIV - dec ebx - jz rx_finish - xor r13, 082c73195h + ; ISUB_R r0, r4 + sub r8, r12 + ; IROR_R r5, 15 + ror r13, 15 + ; ISUB_M r6, L1[r5] mov eax, r13d - test bl, 63 - jnz short rx_body_3 - call rx_read -rx_body_3: - xor rbp, rax - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm9 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm8, xmm0 - -rx_i_4: ;MUL_32 - dec ebx - jz rx_finish - xor r14, 077daefb4h + and eax, 16376 + sub r14, qword ptr [rsi+rax] + ; IMUL_R r7, r6 + imul r15, r14 + ; FPADD_R f3, a1 + addpd xmm3, xmm9 + ; FPMUL_R e1, a3 + mulpd xmm5, xmm11 + ; IMUL_R r2, r4 + imul r10, r12 + ; IADD_RC r4, r5, 1789610138 + lea r12, [r12+r13+1789610138] + ; IADD_R r1, r4 + add r9, r12 + ; IADD_R r6, r0 + add r14, r8 + ; IXOR_R r7, r2 + xor 
r15, r10 + ; ISMULH_M r6, L1[6816] + mov rax, r14 + imul qword ptr [rsi+6816] + mov r14, rdx + ; ISUB_R r0, r4 + sub r8, r12 + ; IXOR_R r7, r2 + xor r15, r10 + ; INEG_R r4 + neg r12 + ; IROL_R r3, r0 + mov ecx, r8d + rol r11, cl + ; IADD_RC r2, r5, -1667142135 + lea r10, [r10+r13-1667142135] + ; ISUB_R r6, r2 + sub r14, r10 + ; IDIV_C r3, 2650709570 + mov rax, 3736177069856446853 + mul r11 + shr rdx, 29 + add r11, rdx + ; IMULH_R r3, r0 + mov rax, r11 + mul r8 + mov r11, rdx + ; FPSUB_R f0, a2 + subpd xmm0, xmm10 + ; FPADD_M f3, L2[r4] + mov eax, r12d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm3, xmm12 + ; FPMUL_M e1, L1[r5] + mov eax, r13d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm5, xmm12 + maxpd xmm5, xmm13 + ; IMUL_9C r7, -778247271 + lea r15, [r15+r15*8-778247271] + ; IXOR_R r4, 1846379510 + xor r12, 1846379510 + ; COND_M r6, of(L1[r1], -397786451) + xor ecx, ecx + mov eax, r9d + and eax, 16376 + cmp dword ptr [rsi+rax], -397786451 + seto cl + add r14, rcx + ; COND_R r6, of(r3, -1033710571) + xor ecx, ecx + cmp r11d, -1033710571 + seto cl + add r14, rcx + ; COND_M r6, sg(L1[r6], 1413230028) + xor ecx, ecx mov eax, r14d - test bl, 63 - jnz short rx_body_4 - call rx_read -rx_body_4: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax + and eax, 16376 + cmp dword ptr [rsi+rax], 1413230028 + sets cl + add r14, rcx + ; IDIV_C r0, 2791108943 + mov rax, 1774119268816201525 + mul r8 + shr rdx, 28 + add r8, rdx + ; FPSUB_M f1, L1[r6] mov eax, r14d - imul rax, rcx - mov r9, rax - -rx_i_5: ;IMUL_32 - dec ebx - jz rx_finish - xor r15, 0379f9ee0h - mov eax, r15d - test bl, 63 - jnz short rx_body_5 - call rx_read -rx_body_5: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - mov rax, 1037420699 - imul rax, rcx - mov rcx, rax - mov eax, r12d - xor eax, 03dd5c89bh - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_6: ;MUL_64 - dec ebx - jz rx_finish - xor r8, 03bae7272h - 
mov eax, r8d - test bl, 63 - jnz short rx_body_6 - call rx_read -rx_body_6: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r15 - mov rcx, rax - mov eax, r9d - xor eax, 098a649d1h - and eax, 131071 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_7: ;FPADD - dec ebx - jz rx_finish - xor r10, 0e264ed81h + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm1, xmm12 + ; FPSWAP_R f0 + shufpd xmm0, xmm0, 1 + ; IADD_RC r6, r5, -640194892 + lea r14, [r14+r13-640194892] + ; FPADD_M f0, L1[r2] mov eax, r10d - test bl, 63 - jnz short rx_body_7 - call rx_read -rx_body_7: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm6 - movaps xmm6, xmm0 - -rx_i_8: ;XOR_64 - dec ebx - jz rx_finish - xor r13, 068c1e5d2h - mov eax, r13d - test bl, 63 - jnz short rx_body_8 - call rx_read -rx_body_8: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - xor rax, r11 - mov rcx, rax - mov eax, r12d - xor eax, 050267ebdh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_9: ;IDIV_64 - dec ebx - jz rx_finish - xor r14, 085121c54h - mov eax, r14d - test bl, 63 - jnz short rx_body_9 - call rx_read -rx_body_9: - xor rbp, rax - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 565870810 - mov rdx, 8750690209911200579 - imul rdx - mov rax, rdx - xor edx, edx - sar rax, 28 - sets dl - add rax, rdx - mov r10, rax - -rx_i_10: ;AND_64 - dec ebx - jz rx_finish - xor r8, 052efde3eh - mov eax, r8d - test bl, 63 - jnz short rx_body_10 - call rx_read -rx_body_10: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - and rax, r10 - mov rcx, rax - mov eax, r13d - xor eax, 0d49dbd9fh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_11: ;FPADD - dec ebx - jz rx_finish - xor r10, 0a9bf8aa1h - mov eax, r10d - test bl, 63 - jnz short rx_body_11 - call rx_read -rx_body_11: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm5 - movaps xmm4, xmm0 - mov eax, r12d - xor eax, 0852d40d8h - and eax, 2047 - movlpd qword ptr 
[rsi + rax * 8], xmm4 - -rx_i_12: ;FPSQRT - dec ebx - jz rx_finish - xor r10, 0db2691ch - mov eax, r10d - test bl, 63 - jnz short rx_body_12 - call rx_read -rx_body_12: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - andps xmm0, xmm10 - sqrtpd xmm8, xmm0 - -rx_i_13: ;FPADD - dec ebx - jz rx_finish - xor r12, 061c0d34dh - mov eax, r12d - test bl, 63 - jnz short rx_body_13 - call rx_read -rx_body_13: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm3 - movaps xmm9, xmm0 - mov eax, r9d - xor eax, 04f2f223ch - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm9 - -rx_i_14: ;XOR_64 - dec ebx - jz rx_finish - xor r10, 0e761d1beh - mov eax, r10d - test bl, 63 - jnz short rx_body_14 - call rx_read -rx_body_14: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - xor rax, r9 - mov r10, rax - -rx_i_15: ;RET - dec ebx - jz rx_finish - xor r11, 074ddb688h - mov eax, r11d - test bl, 63 - jnz short rx_body_15 - call rx_read -rx_body_15: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r14, rax - cmp rsp, rdi - je short rx_i_16 - ret - -rx_i_16: ;ADD_64 - dec ebx - jz rx_finish - xor r14, 06be90627h - mov eax, r14d - test bl, 63 - jnz short rx_body_16 - call rx_read -rx_body_16: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r10 - mov rcx, rax - mov eax, r9d - xor eax, 0d7e75aeh - and eax, 131071 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_17: ;FPMUL - dec ebx - jz rx_finish - xor r11, 0fbc6fc35h - mov eax, r11d - test bl, 63 - jnz short rx_body_17 - call rx_read -rx_body_17: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm4 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm4, xmm0 - -rx_i_18: ;FPSUB - dec ebx - jz rx_finish - xor r14, 0c28ca080h - mov eax, r14d - test bl, 63 - jnz short rx_body_18 - call rx_read -rx_body_18: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm4 - movaps xmm3, xmm0 - mov eax, r11d - xor eax, 
0869baa81h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm3 - -rx_i_19: ;FPSUB - dec ebx - jz rx_finish - xor r13, 0ac009c30h - mov eax, r13d - test bl, 63 - jnz short rx_body_19 - call rx_read -rx_body_19: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm8 - movaps xmm7, xmm0 - -rx_i_20: ;FPSUB - dec ebx - jz rx_finish - xor r13, 0ecca967dh - mov eax, r13d - test bl, 63 - jnz short rx_body_20 - call rx_read -rx_body_20: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm2 - movaps xmm7, xmm0 - -rx_i_21: ;ROR_64 - dec ebx - jz rx_finish - xor r8, 0977f0284h - mov eax, r8d - test bl, 63 - jnz short rx_body_21 - call rx_read -rx_body_21: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r9 - ror rax, cl - mov r15, rax - -rx_i_22: ;ADD_64 - dec ebx - jz rx_finish - xor r13, 080bdfefah - mov eax, r13d - test bl, 63 - jnz short rx_body_22 - call rx_read -rx_body_22: - xor rbp, rax - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - add rax, r8 - mov r10, rax - -rx_i_23: ;MUL_64 - dec ebx - jz rx_finish - xor r15, 0e1e0d3c4h - mov eax, r15d - test bl, 63 - jnz short rx_body_23 - call rx_read -rx_body_23: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, rax, 1283724485 - mov rcx, rax - mov eax, r8d - xor eax, 04c8414c5h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_24: ;DIV_64 - dec ebx - jz rx_finish - xor r8, 070d3b8c7h - mov eax, r8d - test bl, 63 - jnz short rx_body_24 - call rx_read -rx_body_24: - xor rbp, rax - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov ecx, 1 - mov edx, r15d - test edx, edx - cmovne ecx, edx - xor edx, edx - div rcx - mov rcx, rax - mov eax, r15d - xor eax, 099b77a68h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_25: ;FPMUL - dec ebx - jz rx_finish - xor r12, 01cf77a04h - mov eax, r12d - test bl, 63 - jnz short rx_body_25 - call rx_read -rx_body_25: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword 
ptr [rsi+rax*8] - mulpd xmm0, xmm9 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm6, xmm0 - mov eax, r14d - xor eax, 0baf5c2d4h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm6 - -rx_i_26: ;IMULH_64 - dec ebx - jz rx_finish - xor r11, 0e311468ch - mov eax, r11d - test bl, 63 - jnz short rx_body_26 - call rx_read -rx_body_26: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, 812644844 - imul rcx - mov rax, rdx - mov r9, rax - -rx_i_27: ;FPMUL - dec ebx - jz rx_finish - xor r12, 01fd9911ah - mov eax, r12d - test bl, 63 - jnz short rx_body_27 - call rx_read -rx_body_27: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm6, xmm0 - -rx_i_28: ;AND_32 - dec ebx - jz rx_finish - xor r13, 067df757eh - mov eax, r13d - test bl, 63 - jnz short rx_body_28 - call rx_read -rx_body_28: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - and eax, 565865719 - mov rcx, rax - mov eax, r14d - xor eax, 021ba6cf7h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_29: ;SUB_64 - dec ebx - jz rx_finish - xor r12, 0be2e7c42h - mov eax, r12d - test bl, 63 - jnz short rx_body_29 - call rx_read -rx_body_29: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, r13 - mov rcx, rax - mov eax, r14d - xor eax, 073e1a073h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_30: ;FPADD - dec ebx - jz rx_finish - xor r11, 084d067f7h - mov eax, r11d - test bl, 63 - jnz short rx_body_30 - call rx_read -rx_body_30: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm3 - movaps xmm7, xmm0 - -rx_i_31: ;ROR_64 - dec ebx - jz rx_finish - xor r14, 0d352ce37h - mov eax, r14d - test bl, 63 - jnz short rx_body_31 - call rx_read -rx_body_31: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ror rax, 55 - mov rcx, rax - mov eax, r14d - xor eax, 01e2da792h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_32: 
;AND_32 - dec ebx - jz rx_finish - xor r12, 0a1f248dah - mov eax, r12d - test bl, 63 - jnz short rx_body_32 - call rx_read -rx_body_32: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - and eax, r14d - mov r9, rax - -rx_i_33: ;MUL_64 - dec ebx - jz rx_finish - xor r9, 0554720fch - mov eax, r9d - test bl, 63 - jnz short rx_body_33 - call rx_read -rx_body_33: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r15 - mov r12, rax - -rx_i_34: ;CALL - dec ebx - jz rx_finish - xor r13, 0665e91f1h - mov eax, r13d - test bl, 63 - jnz short rx_body_34 - call rx_read -rx_body_34: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r15d - xor eax, 0e9563b32h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r14d, -380224718 - jns short rx_i_35 - call rx_i_108 - -rx_i_35: ;CALL - dec ebx - jz rx_finish - xor r15, 05ef1be79h - mov eax, r15d - test bl, 63 - jnz short rx_body_35 - call rx_read -rx_body_35: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r8d - xor eax, 0865c0f66h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r9d, -2040787098 - jns short rx_i_36 - call rx_i_58 - -rx_i_36: ;FPMUL - dec ebx - jz rx_finish - xor r8, 012ec7e3ah - mov eax, r8d - test bl, 63 - jnz short rx_body_36 - call rx_read -rx_body_36: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm6 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm7, xmm0 - -rx_i_37: ;FPSUB - dec ebx - jz rx_finish - xor r12, 0d0706601h - mov eax, r12d - test bl, 63 - jnz short rx_body_37 - call rx_read -rx_body_37: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm2 - movaps xmm9, xmm0 - -rx_i_38: ;SUB_64 - dec ebx - jz rx_finish - xor r9, 064056913h - mov eax, r9d - test bl, 63 - jnz short rx_body_38 - call rx_read -rx_body_38: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, r14 - mov rcx, rax - mov eax, r10d - xor eax, 087c32de2h - and eax, 2047 - 
mov qword ptr [rsi + rax * 8], rcx - -rx_i_39: ;ADD_64 - dec ebx - jz rx_finish - xor r14, 02c1f1eb0h - mov eax, r14d - test bl, 63 - jnz short rx_body_39 - call rx_read -rx_body_39: - xor rbp, rax - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - add rax, r14 - mov rcx, rax - mov eax, r14d - xor eax, 0f4101ad9h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_40: ;CALL - dec ebx - jz rx_finish - xor r10, 068fd9009h - mov eax, r10d - test bl, 63 - jnz short rx_body_40 - call rx_read -rx_body_40: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r9d - xor eax, 0b2a27eceh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r12d, -1297973554 - jns short rx_i_41 - call rx_i_90 - -rx_i_41: ;JUMP - dec ebx - jz rx_finish - xor r9, 037a30933h - mov eax, r9d - test bl, 63 - jnz short rx_body_41 - call rx_read -rx_body_41: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov r9, rax - cmp r14d, -1070581824 - jo rx_i_127 - -rx_i_42: ;FPADD - dec ebx - jz rx_finish - xor r15, 0bc1de9f6h - mov eax, r15d - test bl, 63 - jnz short rx_body_42 - call rx_read -rx_body_42: - xor rbp, rax - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm6 - movaps xmm6, xmm0 - -rx_i_43: ;SUB_64 - dec ebx - jz rx_finish - xor r12, 02b2a2eech - mov eax, r12d - test bl, 63 - jnz short rx_body_43 - call rx_read -rx_body_43: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - sub rax, r8 - mov r11, rax - -rx_i_44: ;SAR_64 - dec ebx - jz rx_finish - xor r11, 0685817abh - mov eax, r11d - test bl, 63 - jnz short rx_body_44 - call rx_read -rx_body_44: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r9 - sar rax, cl - mov rcx, rax - mov eax, r15d - xor eax, 0372116f6h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_45: ;FPSUB - dec ebx - jz rx_finish - xor r12, 08cd244ebh - mov eax, r12d - test bl, 63 - jnz short rx_body_45 - call rx_read -rx_body_45: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr 
[rsi+rax*8] - subpd xmm0, xmm2 - movaps xmm5, xmm0 - mov eax, r13d - xor eax, 0977132cdh - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm5 - -rx_i_46: ;ADD_64 - dec ebx - jz rx_finish - xor r8, 06d8f4254h - mov eax, r8d - test bl, 63 - jnz short rx_body_46 - call rx_read -rx_body_46: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r9 - mov r8, rax - -rx_i_47: ;JUMP - dec ebx - jz rx_finish - xor r12, 05ba232c6h - mov eax, r12d - test bl, 63 - jnz short rx_body_47 - call rx_read -rx_body_47: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r13d - xor eax, 071ba231h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - cmp r10d, 119251505 - jbe rx_i_131 - -rx_i_48: ;FPDIV - dec ebx - jz rx_finish - xor r8, 0aaed618fh - mov eax, r8d - test bl, 63 - jnz short rx_body_48 - call rx_read -rx_body_48: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm9, xmm0 - -rx_i_49: ;FPSUB - dec ebx - jz rx_finish - xor r8, 0f96c6a45h - mov eax, r8d - test bl, 63 - jnz short rx_body_49 - call rx_read -rx_body_49: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm3 - movaps xmm5, xmm0 - -rx_i_50: ;AND_64 - dec ebx - jz rx_finish - xor r9, 0da3e4842h - mov eax, r9d - test bl, 63 - jnz short rx_body_50 - call rx_read -rx_body_50: - xor rbp, rax - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - and rax, r10 - mov r15, rax - -rx_i_51: ;SUB_64 - dec ebx - jz rx_finish - xor r10, 0302b676ah - mov eax, r10d - test bl, 63 - jnz short rx_body_51 - call rx_read -rx_body_51: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, r15 - mov rcx, rax - mov eax, r15d - xor eax, 018fd1fbfh - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_52: ;FPSQRT - dec ebx - jz rx_finish - xor r11, 0fa88f48bh - mov eax, r11d - test bl, 63 - jnz short rx_body_52 - call rx_read -rx_body_52: - and eax, 2047 - cvtdq2pd 
xmm0, qword ptr [rsi+rax*8] - andps xmm0, xmm10 - sqrtpd xmm7, xmm0 - -rx_i_53: ;RET - dec ebx - jz rx_finish - xor r13, 03dff9b9eh - mov eax, r13d - test bl, 63 - jnz short rx_body_53 - call rx_read -rx_body_53: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r13d - xor eax, 078ed00edh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp rsp, rdi - je short rx_i_54 - ret - -rx_i_54: ;DIV_64 - dec ebx - jz rx_finish - xor r11, 060638de0h - mov eax, r11d - test bl, 63 - jnz short rx_body_54 - call rx_read -rx_body_54: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 282209221 - mov rcx, 1096650948274100047 - mul rcx - mov rax, rdx - shr rax, 24 - mov rcx, rax - mov eax, r12d - xor eax, 010d22bc5h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_55: ;FPMUL - dec ebx - jz rx_finish - xor r10, 0dda983d4h - mov eax, r10d - test bl, 63 - jnz short rx_body_55 - call rx_read -rx_body_55: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm5 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm3, xmm0 - -rx_i_56: ;IDIV_64 - dec ebx - jz rx_finish - xor r14, 0f1456b8eh - mov eax, r14d - test bl, 63 - jnz short rx_body_56 - call rx_read -rx_body_56: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by -50768751 - mov rcx, rax - mov rdx, 6254795139557318139 - imul rdx - mov rax, rdx - xor edx, edx - sub rax, rcx - sar rax, 25 - sets dl - add rax, rdx - mov r8, rax - -rx_i_57: ;MUL_64 - dec ebx - jz rx_finish - xor r9, 010dc4571h - mov eax, r9d - test bl, 63 - jnz short rx_body_57 - call rx_read -rx_body_57: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, rax, 172123015 - mov rcx, rax - mov eax, r15d - xor eax, 0a426387h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_58: ;DIV_64 - dec ebx - jz rx_finish - xor r14, 0bcec0ebah - mov eax, r14d - test bl, 63 - jnz short rx_body_58 - call rx_read -rx_body_58: - and eax, 2047 - mov rax, 
qword ptr [rsi+rax*8] - ; magic divide by 1506547423 - mov rcx, 6573653217342526495 - mul rcx - mov rax, rdx - shr rax, 29 - mov r8, rax - -rx_i_59: ;FPSUB - dec ebx - jz rx_finish - xor r11, 0980dd402h - mov eax, r11d - test bl, 63 - jnz short rx_body_59 - call rx_read -rx_body_59: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm8 - movaps xmm7, xmm0 - -rx_i_60: ;CALL - dec ebx - jz rx_finish - xor r15, 03de14d1eh - mov eax, r15d - test bl, 63 - jnz short rx_body_60 - call rx_read -rx_body_60: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r11d - xor eax, 07bb60f45h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r11d, 2075529029 - jo short rx_i_61 - call rx_i_116 - -rx_i_61: ;JUMP - dec ebx - jz rx_finish - xor r13, 05058ce64h - mov eax, r13d - test bl, 63 - jnz short rx_body_61 - call rx_read -rx_body_61: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r11, rax - cmp r15d, 1933164545 - jns rx_i_120 - -rx_i_62: ;FPSUB - dec ebx - jz rx_finish - xor r15, 0c3089414h - mov eax, r15d - test bl, 63 - jnz short rx_body_62 - call rx_read -rx_body_62: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm8 - movaps xmm2, xmm0 - mov eax, r10d - xor eax, 05c4789e3h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm2 - -rx_i_63: ;FPSUB - dec ebx - jz rx_finish - xor r9, 065cf272eh - mov eax, r9d - test bl, 63 - jnz short rx_body_63 - call rx_read -rx_body_63: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm7 - movaps xmm8, xmm0 - mov eax, r8d - xor eax, 0be13d69eh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm8 - -rx_i_64: ;SUB_64 - dec ebx - jz rx_finish - xor r13, 0ae54dfbfh - mov eax, r13d - test bl, 63 - jnz short rx_body_64 - call rx_read -rx_body_64: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - sub rax, r15 - mov r9, rax - -rx_i_65: ;JUMP - dec ebx - jz rx_finish - xor r13, 07b366ce6h - mov eax, r13d - test bl, 63 - jnz short 
rx_body_65 - call rx_read -rx_body_65: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r11d - xor eax, 0594a879fh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r8d, 1498056607 - js rx_i_129 - -rx_i_66: ;FPDIV - dec ebx - jz rx_finish - xor r15, 015a1b689h - mov eax, r15d - test bl, 63 - jnz short rx_body_66 - call rx_read -rx_body_66: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm9, xmm0 - mov eax, r9d - xor eax, 07305e78h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm9 - -rx_i_67: ;JUMP - dec ebx - jz rx_finish - xor r14, 088393ba0h - mov eax, r14d - test bl, 63 - jnz short rx_body_67 - call rx_read -rx_body_67: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r9d - xor eax, 07916db59h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - cmp r13d, 2031541081 - jns rx_i_79 - -rx_i_68: ;FPADD - dec ebx - jz rx_finish - xor r13, 03aa5c3a4h - mov eax, r13d - test bl, 63 - jnz short rx_body_68 - call rx_read -rx_body_68: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm2 - movaps xmm4, xmm0 - mov eax, r12d - xor eax, 03c51ef39h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm4 - -rx_i_69: ;FPADD - dec ebx - jz rx_finish - xor r15, 0376c9c27h - mov eax, r15d - test bl, 63 - jnz short rx_body_69 - call rx_read -rx_body_69: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm5 - movaps xmm8, xmm0 - -rx_i_70: ;MULH_64 - dec ebx - jz rx_finish - xor r8, 0bbbec3fah - mov eax, r8d - test bl, 63 - jnz short rx_body_70 - call rx_read -rx_body_70: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r9 - mul rcx - mov rax, rdx - mov r13, rax - -rx_i_71: ;FPMUL - dec ebx - jz rx_finish - xor r14, 0e9efb350h - mov eax, r14d - test bl, 63 - jnz short rx_body_71 - call rx_read -rx_body_71: - and eax, 2047 
- cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm5 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm7, xmm0 - mov eax, r15d - xor eax, 056660eedh - and eax, 131071 - movlpd qword ptr [rsi + rax * 8], xmm7 - -rx_i_72: ;JUMP - dec ebx - jz rx_finish - xor r13, 0f4e51e28h - mov eax, r13d - test bl, 63 - jnz short rx_body_72 - call rx_read -rx_body_72: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r11, rax - cmp r9d, -631091751 - jno rx_i_191 - -rx_i_73: ;FPDIV - dec ebx - jz rx_finish - xor r12, 0c24ddbd4h - mov eax, r12d - test bl, 63 - jnz short rx_body_73 - call rx_read -rx_body_73: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm2, xmm0 - -rx_i_74: ;MUL_64 - dec ebx - jz rx_finish - xor r8, 04c4b0c7fh - mov eax, r8d - test bl, 63 - jnz short rx_body_74 - call rx_read -rx_body_74: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r13 - mov rcx, rax - mov eax, r9d - xor eax, 0aaaacb32h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_75: ;CALL - dec ebx - jz rx_finish - xor r14, 03bcc02e3h - mov eax, r14d - test bl, 63 - jnz short rx_body_75 - call rx_read -rx_body_75: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r13, rax - cmp r11d, -1160798683 - jno short rx_i_76 - call rx_i_202 - -rx_i_76: ;FPADD - dec ebx - jz rx_finish - xor r11, 04b0ff63eh - mov eax, r11d - test bl, 63 - jnz short rx_body_76 - call rx_read -rx_body_76: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm3 - movaps xmm7, xmm0 - mov eax, r15d - xor eax, 083bc0396h - and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm7 - -rx_i_77: ;RET - dec ebx - jz rx_finish - xor r14, 0b956b3e8h - mov eax, r14d - test bl, 63 - jnz short rx_body_77 - call rx_read -rx_body_77: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r11d - xor eax, 03a92bc7ah - and eax, 2047 - mov qword 
ptr [rsi + rax * 8], rcx - cmp rsp, rdi - je short rx_i_78 - ret - -rx_i_78: ;MUL_32 - dec ebx - jz rx_finish - xor r9, 0edeca680h - mov eax, r9d - test bl, 63 - jnz short rx_body_78 - call rx_read -rx_body_78: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r8d - imul rax, rcx - mov rcx, rax - mov eax, r15d - xor eax, 0697e6195h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_79: ;CALL - dec ebx - jz rx_finish - xor r11, 0fbdddcb5h - mov eax, r11d - test bl, 63 - jnz short rx_body_79 - call rx_read -rx_body_79: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov r11, rax - cmp r13d, 1800043331 - ja short rx_i_80 - call rx_i_93 - -rx_i_80: ;ROR_64 - dec ebx - jz rx_finish - xor r13, 09cec97a1h - mov eax, r13d - test bl, 63 - jnz short rx_body_80 - call rx_read -rx_body_80: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r11 - ror rax, cl - mov rcx, rax - mov eax, r11d - xor eax, 01a681d13h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_81: ;AND_64 - dec ebx - jz rx_finish - xor r15, 078228167h - mov eax, r15d - test bl, 63 - jnz short rx_body_81 - call rx_read -rx_body_81: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - and rax, 338325607 - mov rcx, rax - mov eax, r8d - xor eax, 0142a7067h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_82: ;JUMP - dec ebx - jz rx_finish - xor r11, 078cae1ffh - mov eax, r11d - test bl, 63 - jnz short rx_body_82 - call rx_read -rx_body_82: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r10, rax - cmp r12d, -68969733 - jo rx_i_145 - -rx_i_83: ;IDIV_64 - dec ebx - jz rx_finish - xor r10, 0d9b6a533h - mov eax, r10d - test bl, 63 - jnz short rx_body_83 - call rx_read -rx_body_83: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 91850728 - mov rdx, 842358619687110887 - imul rdx - mov rax, rdx - xor edx, edx - sar rax, 22 - sets dl - add rax, rdx - mov r12, rax - -rx_i_84: ;SAR_64 - dec ebx - jz rx_finish - xor r15, 
0e9e75336h - mov eax, r15d - test bl, 63 - jnz short rx_body_84 - call rx_read -rx_body_84: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sar rax, 45 - mov rcx, rax - mov eax, r13d - xor eax, 0ec5c52e6h - and eax, 131071 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_85: ;MUL_64 - dec ebx - jz rx_finish - xor r13, 04c0d378ah - mov eax, r13d - test bl, 63 - jnz short rx_body_85 - call rx_read -rx_body_85: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, rax, 20014507 - mov rcx, rax - mov eax, r10d - xor eax, 013165abh - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_86: ;AND_64 - dec ebx - jz rx_finish - xor r11, 04386e368h - mov eax, r11d - test bl, 63 - jnz short rx_body_86 - call rx_read -rx_body_86: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - and rax, r8 - mov r12, rax - -rx_i_87: ;SUB_64 - dec ebx - jz rx_finish - xor r9, 0d75a0ecfh - mov eax, r9d - test bl, 63 - jnz short rx_body_87 - call rx_read -rx_body_87: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, r12 - mov r8, rax - -rx_i_88: ;ROR_64 - dec ebx - jz rx_finish - xor r9, 031bb7f7ah - mov eax, r9d - test bl, 63 - jnz short rx_body_88 - call rx_read -rx_body_88: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r14 - ror rax, cl - mov r9, rax - -rx_i_89: ;MUL_64 - dec ebx - jz rx_finish - xor r9, 03b45ecebh - mov eax, r9d - test bl, 63 - jnz short rx_body_89 - call rx_read -rx_body_89: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r8 - mov r10, rax - -rx_i_90: ;FPADD - dec ebx - jz rx_finish - xor r12, 0ee08e76bh - mov eax, r12d - test bl, 63 - jnz short rx_body_90 - call rx_read -rx_body_90: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm3 - movaps xmm6, xmm0 - -rx_i_91: ;FPMUL - dec ebx - jz rx_finish - xor r9, 042e28e94h - mov eax, r9d - test bl, 63 - jnz short rx_body_91 - call rx_read -rx_body_91: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd 
xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm4, xmm0 - -rx_i_92: ;JUMP - dec ebx - jz rx_finish - xor r8, 0729260e1h - mov eax, r8d - test bl, 63 - jnz short rx_body_92 - call rx_read -rx_body_92: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r12, rax - cmp r14d, 1288893603 - jge rx_i_170 - -rx_i_93: ;FPADD - dec ebx - jz rx_finish - xor r8, 0bfcebaf4h - mov eax, r8d - test bl, 63 - jnz short rx_body_93 - call rx_read -rx_body_93: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm2 - movaps xmm2, xmm0 - mov eax, r10d - xor eax, 07e48a0d8h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm2 - -rx_i_94: ;CALL - dec ebx - jz rx_finish - xor r13, 0ea326630h - mov eax, r13d - test bl, 63 - jnz short rx_body_94 - call rx_read -rx_body_94: - xor rbp, rax - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r8d - xor eax, 0eb8c5be0h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r13d, -343122976 - jns short rx_i_95 - call rx_i_157 - -rx_i_95: ;MUL_64 - dec ebx - jz rx_finish - xor r13, 0b5451a2dh - mov eax, r13d - test bl, 63 - jnz short rx_body_95 - call rx_read -rx_body_95: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r10 - mov r15, rax - -rx_i_96: ;IMUL_32 - dec ebx - jz rx_finish - xor r11, 04f912ef8h - mov eax, r11d - test bl, 63 - jnz short rx_body_96 - call rx_read -rx_body_96: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r11d - imul rax, rcx - mov r11, rax - -rx_i_97: ;FPDIV - dec ebx - jz rx_finish - xor r15, 0acc45b3bh - mov eax, r15d - test bl, 63 - jnz short rx_body_97 - call rx_read -rx_body_97: - and eax, 131071 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm9 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm5, xmm0 - -rx_i_98: ;SUB_64 - dec ebx - jz rx_finish - xor r14, 09900a4e8h - mov eax, r14d - test bl, 63 - jnz short rx_body_98 - call 
rx_read -rx_body_98: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - sub rax, r15 - mov rcx, rax - mov eax, r14d - xor eax, 0d067d49ah - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_99: ;FPMUL - dec ebx - jz rx_finish - xor r9, 0841b2984h - mov eax, r9d - test bl, 63 - jnz short rx_body_99 - call rx_read -rx_body_99: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm6 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm4, xmm0 - mov eax, r12d - xor eax, 04c21df83h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm4 - -rx_i_100: ;ADD_64 - dec ebx - jz rx_finish - xor r15, 07ebea48fh - mov eax, r15d - test bl, 63 - jnz short rx_body_100 - call rx_read -rx_body_100: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r9 - mov r14, rax - -rx_i_101: ;SUB_64 - dec ebx - jz rx_finish - xor r10, 0631209d3h - mov eax, r10d - test bl, 63 - jnz short rx_body_101 - call rx_read -rx_body_101: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, 1732300336 - mov r11, rax - -rx_i_102: ;FPMUL - dec ebx - jz rx_finish - xor r10, 0e50bf07ah - mov eax, r10d - test bl, 63 - jnz short rx_body_102 - call rx_read -rx_body_102: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm7, xmm0 - -rx_i_103: ;MUL_64 - dec ebx - jz rx_finish - xor r10, 02b7096f1h - mov eax, r10d - test bl, 63 - jnz short rx_body_103 - call rx_read -rx_body_103: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - imul rax, r13 - mov rcx, rax - mov eax, r15d - xor eax, 0e4dd92b6h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_104: ;DIV_64 - dec ebx - jz rx_finish - xor r11, 075deaf71h - mov eax, r11d - test bl, 63 - jnz short rx_body_104 - call rx_read -rx_body_104: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 2381897207 - mov rcx, 16631314374404138087 - mul rcx - mov rax, rdx - shr rax, 31 - mov r15, rax - 
-rx_i_105: ;MUL_32 - dec ebx - jz rx_finish - xor r13, 036a51f72h - mov eax, r13d - test bl, 63 - jnz short rx_body_105 - call rx_read -rx_body_105: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r15d - imul rax, rcx - mov r14, rax - -rx_i_106: ;FPMUL - dec ebx - jz rx_finish - xor r11, 07b512986h - mov eax, r11d - test bl, 63 - jnz short rx_body_106 - call rx_read -rx_body_106: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm4, xmm0 - mov eax, r12d - xor eax, 03cb2505h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm4 - -rx_i_107: ;JUMP - dec ebx - jz rx_finish - xor r12, 0f1d2e50h - mov eax, r12d - test bl, 63 - jnz short rx_body_107 - call rx_read -rx_body_107: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r14d - xor eax, 07243ab81h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r11d, 1917037441 - jl rx_i_143 - -rx_i_108: ;FPMUL - dec ebx - jz rx_finish - xor r9, 07327ba60h - mov eax, r9d - test bl, 63 - jnz short rx_body_108 - call rx_read -rx_body_108: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm5 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm9, xmm0 - -rx_i_109: ;ROR_64 - dec ebx - jz rx_finish - xor r15, 0594e37deh - mov eax, r15d - test bl, 63 - jnz short rx_body_109 - call rx_read -rx_body_109: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r10 - ror rax, cl - mov rcx, rax - mov eax, r11d - xor eax, 094ab5a5ch - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_110: ;SHR_64 - dec ebx - jz rx_finish - xor r9, 04cdf5ebah - mov eax, r9d - test bl, 63 - jnz short rx_body_110 - call rx_read -rx_body_110: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r9 - shr rax, cl - mov rcx, rax - mov eax, r14d - xor eax, 0ec68532fh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx 
- -rx_i_111: ;CALL - dec ebx - jz rx_finish - xor r8, 02e16c97ch - mov eax, r8d - test bl, 63 - jnz short rx_body_111 - call rx_read -rx_body_111: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r12d - xor eax, 05d237d0bh - and eax, 131071 - mov qword ptr [rsi + rax * 8], rcx - cmp r14d, 1562606859 - jl short rx_i_112 - call rx_i_212 - -rx_i_112: ;SUB_64 - dec ebx - jz rx_finish - xor r12, 0d42ddbd4h - mov eax, r12d - test bl, 63 - jnz short rx_body_112 - call rx_read -rx_body_112: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, -1025977295 - mov r14, rax - -rx_i_113: ;MULH_64 - dec ebx - jz rx_finish - xor r10, 07a4f8cbbh - mov eax, r10d - test bl, 63 - jnz short rx_body_113 - call rx_read -rx_body_113: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r9 - mul rcx - mov rax, rdx - mov rcx, rax - mov eax, r13d - xor eax, 0dea3f7e3h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_114: ;DIV_64 - dec ebx - jz rx_finish - xor r13, 06e83e2cdh - mov eax, r13d - test bl, 63 - jnz short rx_body_114 - call rx_read -rx_body_114: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 770835683 - mov rcx, 12847770974664443757 - mul rcx - mov rax, rdx - shr rax, 29 - mov r14, rax - -rx_i_115: ;IDIV_64 - dec ebx - jz rx_finish - xor r14, 0336c980eh - mov eax, r14d - test bl, 63 - jnz short rx_body_115 - call rx_read -rx_body_115: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 587029837 - mov rdx, 527204905636414983 - imul rdx - mov rax, rdx - xor edx, edx - sar rax, 24 - sets dl - add rax, rdx - mov r14, rax - -rx_i_116: ;DIV_64 - dec ebx - jz rx_finish - xor r10, 0d122702eh - mov eax, r10d - test bl, 63 - jnz short rx_body_116 - call rx_read -rx_body_116: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 2444190605 - mov rcx, 16207443550472271289 - mul rcx - mov rax, rdx - shr rax, 31 - mov rcx, rax - mov eax, r8d - xor eax, 091af638dh - and eax, 131071 - 
mov qword ptr [rsi + rax * 8], rcx - -rx_i_117: ;IDIV_64 - dec ebx - jz rx_finish - xor r11, 015f2012bh - mov eax, r11d - test bl, 63 - jnz short rx_body_117 - call rx_read -rx_body_117: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by -1205826972 - mov rdx, -8213052572424165513 - imul rdx - mov rax, rdx - xor edx, edx - sar rax, 29 - sets dl - add rax, rdx - mov r15, rax - -rx_i_118: ;FPSUB - dec ebx - jz rx_finish - xor r9, 037ddf43dh - mov eax, r9d - test bl, 63 - jnz short rx_body_118 - call rx_read -rx_body_118: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm5 - movaps xmm6, xmm0 - -rx_i_119: ;FPSUB - dec ebx - jz rx_finish - xor r9, 0bba475f3h - mov eax, r9d - test bl, 63 - jnz short rx_body_119 - call rx_read -rx_body_119: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm3 - movaps xmm5, xmm0 - mov eax, r13d - xor eax, 02401488h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm5 - -rx_i_120: ;FPADD - dec ebx - jz rx_finish - xor r12, 0e5561e3eh - mov eax, r12d - test bl, 63 - jnz short rx_body_120 - call rx_read -rx_body_120: - xor rbp, rax - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm4 - movaps xmm8, xmm0 - -rx_i_121: ;FPSUB - dec ebx - jz rx_finish - xor r9, 03ab8f73h - mov eax, r9d - test bl, 63 - jnz short rx_body_121 - call rx_read -rx_body_121: - and eax, 131071 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm5 - movaps xmm8, xmm0 - -rx_i_122: ;CALL - dec ebx - jz rx_finish - xor r10, 04e0dbd40h - mov eax, r10d - test bl, 63 - jnz short rx_body_122 - call rx_read -rx_body_122: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r14d - xor eax, 078f6ec29h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r11d, 2029448233 - jno short rx_i_123 - call rx_i_192 - -rx_i_123: ;ADD_32 - dec ebx - jz rx_finish - xor r13, 073e9f58ah - mov eax, r13d - test bl, 63 - jnz short rx_body_123 - call rx_read -rx_body_123: - and 
eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add eax, 1530846772 - mov r13, rax - -rx_i_124: ;JUMP - dec ebx - jz rx_finish - xor r12, 0e3fa3670h - mov eax, r12d - test bl, 63 - jnz short rx_body_124 - call rx_read -rx_body_124: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r11, rax - cmp r11d, 1719505436 - jns rx_i_237 - -rx_i_125: ;IMUL_32 - dec ebx - jz rx_finish - xor r8, 0ebec27cdh - mov eax, r8d - test bl, 63 - jnz short rx_body_125 - call rx_read -rx_body_125: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - mov rax, 1774711622 - imul rax, rcx - mov rcx, rax - mov eax, r14d - xor eax, 069c7f346h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_126: ;FPMUL - dec ebx - jz rx_finish - xor r8, 01feb5264h - mov eax, r8d - test bl, 63 - jnz short rx_body_126 - call rx_read -rx_body_126: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm6 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm2, xmm0 - -rx_i_127: ;IMUL_32 - dec ebx - jz rx_finish - xor r9, 0405f500fh - mov eax, r9d - test bl, 63 - jnz short rx_body_127 - call rx_read -rx_body_127: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - mov rax, -1027270754 - imul rax, rcx - mov r8, rax - -rx_i_128: ;MUL_64 - dec ebx - jz rx_finish - xor r13, 0459f1154h - mov eax, r13d - test bl, 63 - jnz short rx_body_128 - call rx_read -rx_body_128: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r9 - mov rcx, rax - mov eax, r9d - xor eax, 0cb2ee635h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_129: ;JUMP - dec ebx - jz rx_finish - xor r9, 081918b4ch - mov eax, r9d - test bl, 63 - jnz short rx_body_129 - call rx_read -rx_body_129: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r9, rax - cmp r13d, -590624856 - jge rx_i_154 - -rx_i_130: ;IDIV_64 - dec ebx - jz rx_finish - xor r9, 077c3b332h - mov eax, r9d - test bl, 63 - jnz short rx_body_130 - call rx_read 
-rx_body_130: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by -281794782 - mov rdx, -8786110448882479839 - imul rdx - mov rax, rdx - xor edx, edx - sar rax, 27 - sets dl - add rax, rdx - mov rcx, rax - mov eax, r11d - xor eax, 0ef342722h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_131: ;RET - dec ebx - jz rx_finish - xor r12, 05792310bh - mov eax, r12d - test bl, 63 - jnz short rx_body_131 - call rx_read -rx_body_131: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r15, rax - cmp rsp, rdi - je short rx_i_132 - ret - -rx_i_132: ;FPADD - dec ebx - jz rx_finish - xor r10, 0ebc6e10h - mov eax, r10d - test bl, 63 - jnz short rx_body_132 - call rx_read -rx_body_132: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm6 - movaps xmm7, xmm0 - mov eax, r15d - xor eax, 0b0c38959h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm7 - -rx_i_133: ;OR_64 - dec ebx - jz rx_finish - xor r14, 0822f8b60h - mov eax, r14d - test bl, 63 - jnz short rx_body_133 - call rx_read -rx_body_133: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - or rax, r13 - mov r15, rax - -rx_i_134: ;ADD_64 - dec ebx - jz rx_finish - xor r10, 0d0f18593h - mov eax, r10d - test bl, 63 - jnz short rx_body_134 - call rx_read -rx_body_134: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r8 - mov rcx, rax - mov eax, r13d - xor eax, 05a5de2cbh - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_135: ;FPMUL - dec ebx - jz rx_finish - xor r11, 088212ef9h - mov eax, r11d - test bl, 63 - jnz short rx_body_135 - call rx_read -rx_body_135: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm8, xmm0 - mov eax, r8d - xor eax, 0b29f3d2ah - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm8 - -rx_i_136: ;FPDIV - dec ebx - jz rx_finish - xor r8, 01ae56e03h - mov eax, r8d - test bl, 63 - jnz short rx_body_136 - call rx_read -rx_body_136: - 
and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm8 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm5, xmm0 - -rx_i_137: ;SHR_64 - dec ebx - jz rx_finish - xor r11, 015a24231h - mov eax, r11d - test bl, 63 - jnz short rx_body_137 - call rx_read -rx_body_137: - xor rbp, rax - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r9 - shr rax, cl - mov r11, rax - -rx_i_138: ;RET - dec ebx - jz rx_finish - xor r13, 02fd380c5h - mov eax, r13d - test bl, 63 - jnz short rx_body_138 - call rx_read -rx_body_138: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r10d - xor eax, 08e1fd158h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp rsp, rdi - je short rx_i_139 - ret - -rx_i_139: ;ADD_64 - dec ebx - jz rx_finish - xor r9, 093172470h - mov eax, r9d - test bl, 63 - jnz short rx_body_139 - call rx_read -rx_body_139: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r8 - mov r11, rax - -rx_i_140: ;IMUL_32 - dec ebx - jz rx_finish - xor r14, 052543553h - mov eax, r14d - test bl, 63 - jnz short rx_body_140 - call rx_read -rx_body_140: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - mov rax, -140239781 - imul rax, rcx - mov rcx, rax - mov eax, r14d - xor eax, 0f7a41c5bh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_141: ;FPADD - dec ebx - jz rx_finish - xor r8, 02f636da1h - mov eax, r8d - test bl, 63 - jnz short rx_body_141 - call rx_read -rx_body_141: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm2 - movaps xmm9, xmm0 - mov eax, r9d - xor eax, 099ff9ffdh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm9 - -rx_i_142: ;JUMP - dec ebx - jz rx_finish - xor r11, 0b11a4f2ch - mov eax, r11d - test bl, 63 - jnz short rx_body_142 - call rx_read -rx_body_142: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r10d - xor eax, 0516a9452h - and eax, 131071 - mov qword ptr 
[rsi + rax * 8], rcx - cmp r12d, 1365939282 - js rx_i_257 - -rx_i_143: ;IMUL_32 - dec ebx - jz rx_finish - xor r15, 037f4b5d0h - mov eax, r15d - test bl, 63 - jnz short rx_body_143 - call rx_read -rx_body_143: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r11d - imul rax, rcx - mov r9, rax - -rx_i_144: ;DIV_64 - dec ebx - jz rx_finish - xor r10, 02e59e00ah - mov eax, r10d - test bl, 63 - jnz short rx_body_144 - call rx_read -rx_body_144: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, 1 - mov edx, r11d - test edx, edx - cmovne ecx, edx - xor edx, edx - div rcx - mov r15, rax - -rx_i_145: ;DIV_64 - dec ebx - jz rx_finish - xor r13, 08d5c798h - mov eax, r13d - test bl, 63 - jnz short rx_body_145 - call rx_read -rx_body_145: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 3712555397 - mov rcx, 10670300378317066981 - mul rcx - mov rax, rdx - shr rax, 31 - mov rcx, rax - mov eax, r10d - xor eax, 0dd491985h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_146: ;IMULH_64 - dec ebx - jz rx_finish - xor r13, 02327e6e2h - mov eax, r13d - test bl, 63 - jnz short rx_body_146 - call rx_read -rx_body_146: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r12 - imul rcx - mov rax, rdx - mov r10, rax - -rx_i_147: ;MUL_64 - dec ebx - jz rx_finish - xor r13, 03a7df043h - mov eax, r13d - test bl, 63 - jnz short rx_body_147 - call rx_read -rx_body_147: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r11 - mov r12, rax - -rx_i_148: ;SUB_64 - dec ebx - jz rx_finish - xor r10, 0783e5c4eh - mov eax, r10d - test bl, 63 - jnz short rx_body_148 - call rx_read -rx_body_148: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, r14 - mov rcx, rax - mov eax, r10d - xor eax, 08c783d2ch - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_149: ;MUL_32 - dec ebx - jz rx_finish - xor r12, 0aa0f5b2fh - mov eax, r12d - test bl, 63 - jnz short rx_body_149 
- call rx_read -rx_body_149: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r14d - imul rax, rcx - mov r8, rax - -rx_i_150: ;DIV_64 - dec ebx - jz rx_finish - xor r9, 01504ca7ah - mov eax, r9d - test bl, 63 - jnz short rx_body_150 - call rx_read -rx_body_150: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, 1 - mov edx, r8d - test edx, edx - cmovne ecx, edx - xor edx, edx - div rcx - mov rcx, rax - mov eax, r9d - xor eax, 0c854a524h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_151: ;AND_64 - dec ebx - jz rx_finish - xor r9, 0ea72a7cfh - mov eax, r9d - test bl, 63 - jnz short rx_body_151 - call rx_read -rx_body_151: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - and rax, -2018584590 - mov r11, rax - -rx_i_152: ;SAR_64 - dec ebx - jz rx_finish - xor r13, 0ad0e7a88h - mov eax, r13d - test bl, 63 - jnz short rx_body_152 - call rx_read -rx_body_152: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r10 - sar rax, cl - mov r10, rax - -rx_i_153: ;FPMUL - dec ebx - jz rx_finish - xor r15, 0fd95ab87h - mov eax, r15d - test bl, 63 - jnz short rx_body_153 - call rx_read -rx_body_153: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm8, xmm0 - -rx_i_154: ;MUL_32 - dec ebx - jz rx_finish - xor r10, 0256697b0h - mov eax, r10d - test bl, 63 - jnz short rx_body_154 - call rx_read -rx_body_154: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, -820047839 - imul rax, rcx - mov rcx, rax - mov eax, r10d - xor eax, 0cf1f1021h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_155: ;ROL_64 - dec ebx - jz rx_finish - xor r11, 0d23f3b78h - mov eax, r11d - test bl, 63 - jnz short rx_body_155 - call rx_read -rx_body_155: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r10 - rol rax, cl - mov r13, rax - -rx_i_156: ;IMUL_32 - dec ebx - jz rx_finish - xor r10, 098917533h - mov eax, 
r10d - test bl, 63 - jnz short rx_body_156 - call rx_read -rx_body_156: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r15d - imul rax, rcx - mov rcx, rax - mov eax, r15d - xor eax, 0b803e8a9h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_157: ;ADD_64 - dec ebx - jz rx_finish - xor r10, 0dfac3efch - mov eax, r10d - test bl, 63 - jnz short rx_body_157 - call rx_read -rx_body_157: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r12 - mov r14, rax - -rx_i_158: ;ADD_64 - dec ebx - jz rx_finish - xor r15, 0a64de090h - mov eax, r15d - test bl, 63 - jnz short rx_body_158 - call rx_read -rx_body_158: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r13 - mov rcx, rax - mov eax, r10d - xor eax, 04984392fh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_159: ;CALL - dec ebx - jz rx_finish - xor r13, 0952a3abbh - mov eax, r13d - test bl, 63 - jnz short rx_body_159 - call rx_read -rx_body_159: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov r13, rax - cmp r15d, -8571241 - ja short rx_i_160 - call rx_i_181 - -rx_i_160: ;SUB_64 - dec ebx - jz rx_finish - xor r14, 0b1685b90h - mov eax, r14d - test bl, 63 - jnz short rx_body_160 - call rx_read -rx_body_160: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, r14 - mov rcx, rax - mov eax, r10d - xor eax, 05a86b929h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_161: ;IDIV_64 - dec ebx - jz rx_finish - xor r15, 0ea992531h - mov eax, r15d - test bl, 63 - jnz short rx_body_161 - call rx_read -rx_body_161: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov edx, r14d - cmp edx, -1 - jne short body_idiv_161 - neg rax - jmp short result_idiv_161 -body_idiv_161: - mov ecx, 1 - test edx, edx - cmovne ecx, edx - movsxd rcx, ecx - cqo - idiv rcx -result_idiv_161: - mov rcx, rax - mov eax, r8d - xor eax, 0db9043dah - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_162: ;SHL_64 - dec ebx - jz 
rx_finish - xor r9, 01fd57a4ah - mov eax, r9d - test bl, 63 - jnz short rx_body_162 - call rx_read -rx_body_162: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - shl rax, 7 - mov rcx, rax - mov eax, r13d - xor eax, 0170a46d8h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_163: ;SUB_64 - dec ebx - jz rx_finish - xor r12, 0e3486c0ah - mov eax, r12d - test bl, 63 - jnz short rx_body_163 - call rx_read -rx_body_163: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, r8 - mov r14, rax - -rx_i_164: ;MUL_32 - dec ebx - jz rx_finish - xor r12, 01f0c2737h - mov eax, r12d - test bl, 63 - jnz short rx_body_164 - call rx_read -rx_body_164: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r9d - imul rax, rcx - mov r13, rax - -rx_i_165: ;RET - dec ebx - jz rx_finish - xor r12, 0debb493eh - mov eax, r12d - test bl, 63 - jnz short rx_body_165 - call rx_read -rx_body_165: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r9, rax - cmp rsp, rdi - je short rx_i_166 - ret - -rx_i_166: ;SHR_64 - dec ebx - jz rx_finish - xor r9, 0fe684081h - mov eax, r9d - test bl, 63 - jnz short rx_body_166 - call rx_read -rx_body_166: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - shr rax, 62 - mov rcx, rax - mov eax, r13d - xor eax, 0bb67f8abh - and eax, 131071 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_167: ;FPMUL - dec ebx - jz rx_finish - xor r11, 0d10371ch - mov eax, r11d - test bl, 63 - jnz short rx_body_167 - call rx_read -rx_body_167: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm4 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm2, xmm0 - -rx_i_168: ;FPDIV - dec ebx - jz rx_finish - xor r12, 071b15effh - mov eax, r12d - test bl, 63 - jnz short rx_body_168 - call rx_read -rx_body_168: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm5 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm7, xmm0 - mov eax, r15d - 
xor eax, 08d1a76f8h - and eax, 131071 - movhpd qword ptr [rsi + rax * 8], xmm7 - -rx_i_169: ;CALL - dec ebx - jz rx_finish - xor r11, 072790347h - mov eax, r11d - test bl, 63 - jnz short rx_body_169 - call rx_read -rx_body_169: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r14d - xor eax, 0b353bf8dh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r10d, -1286357107 - ja short rx_i_170 - call rx_i_197 - -rx_i_170: ;FPSQRT - dec ebx - jz rx_finish - xor r8, 04ae8a020h - mov eax, r8d - test bl, 63 - jnz short rx_body_170 - call rx_read -rx_body_170: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - andps xmm0, xmm10 - sqrtpd xmm6, xmm0 - -rx_i_171: ;DIV_64 - dec ebx - jz rx_finish - xor r15, 09901e05bh - mov eax, r15d - test bl, 63 - jnz short rx_body_171 - call rx_read -rx_body_171: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 2064150457 - add rax, 1 - sbb rax, 0 - mov rcx, 4797867461985617359 - mul rcx - mov rax, rdx - shr rax, 29 - mov rcx, rax - mov eax, r12d - xor eax, 07b086fb9h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_172: ;SUB_64 - dec ebx - jz rx_finish - xor r13, 050e8c510h - mov eax, r13d - test bl, 63 - jnz short rx_body_172 - call rx_read -rx_body_172: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, -478081934 - mov r12, rax - -rx_i_173: ;MUL_64 - dec ebx - jz rx_finish - xor r14, 05422cf8fh - mov eax, r14d - test bl, 63 - jnz short rx_body_173 - call rx_read -rx_body_173: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, rax, -1386172772 - mov r12, rax - -rx_i_174: ;FPDIV - dec ebx - jz rx_finish - xor r12, 0a025c3dbh - mov eax, r12d - test bl, 63 - jnz short rx_body_174 - call rx_read -rx_body_174: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm9 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm6, xmm0 - mov eax, r14d - xor eax, 02be6989fh - and eax, 32767 - movlpd qword ptr [rsi 
+ rax * 8], xmm6 - -rx_i_175: ;XOR_32 - dec ebx - jz rx_finish - xor r13, 08f74c11h - mov eax, r13d - test bl, 63 - jnz short rx_body_175 - call rx_read -rx_body_175: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - xor eax, r8d - mov r8, rax - -rx_i_176: ;SUB_64 - dec ebx - jz rx_finish - xor r9, 01f2ed5f1h - mov eax, r9d - test bl, 63 - jnz short rx_body_176 - call rx_read -rx_body_176: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, -2101315181 - mov r10, rax - -rx_i_177: ;ADD_64 - dec ebx - jz rx_finish - xor r10, 0d2072c79h - mov eax, r10d - test bl, 63 - jnz short rx_body_177 - call rx_read -rx_body_177: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, 794235831 - mov r13, rax - -rx_i_178: ;RET - dec ebx - jz rx_finish - xor r15, 0a8e51933h - mov eax, r15d - test bl, 63 - jnz short rx_body_178 - call rx_read -rx_body_178: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r12d - xor eax, 0c366b275h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp rsp, rdi - je short rx_i_179 - ret - -rx_i_179: ;FPADD - dec ebx - jz rx_finish - xor r12, 0934ad492h - mov eax, r12d - test bl, 63 - jnz short rx_body_179 - call rx_read -rx_body_179: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm2 - movaps xmm8, xmm0 - -rx_i_180: ;AND_32 - dec ebx - jz rx_finish - xor r15, 01cb3ce1fh - mov eax, r15d - test bl, 63 - jnz short rx_body_180 - call rx_read -rx_body_180: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - and eax, r9d - mov rcx, rax - mov eax, r9d - xor eax, 076edfe13h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_181: ;CALL - dec ebx - jz rx_finish - xor r10, 023c7845fh - mov eax, r10d - test bl, 63 - jnz short rx_body_181 - call rx_read -rx_body_181: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r10, rax - cmp r12d, -1612576918 - jbe short rx_i_182 - call rx_i_211 - -rx_i_182: ;FPSUB - dec ebx - jz rx_finish - xor r8, 0f8884327h - mov eax, r8d - 
test bl, 63 - jnz short rx_body_182 - call rx_read -rx_body_182: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm7 - movaps xmm6, xmm0 - mov eax, r14d - xor eax, 07c8d12a5h - and eax, 131071 - movhpd qword ptr [rsi + rax * 8], xmm6 - -rx_i_183: ;ADD_64 - dec ebx - jz rx_finish - xor r13, 013070461h - mov eax, r13d - test bl, 63 - jnz short rx_body_183 - call rx_read -rx_body_183: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r11 - mov r10, rax - -rx_i_184: ;XOR_32 - dec ebx - jz rx_finish - xor r12, 04764cdf7h - mov eax, r12d - test bl, 63 - jnz short rx_body_184 - call rx_read -rx_body_184: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - xor eax, r13d - mov rcx, rax - mov eax, r12d - xor eax, 02f185447h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_185: ;JUMP - dec ebx - jz rx_finish - xor r10, 03c41026fh - mov eax, r10d - test bl, 63 - jnz short rx_body_185 - call rx_read -rx_body_185: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r9d - xor eax, 0a5fae4a3h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r15d, -1510284125 - jbe rx_i_246 - -rx_i_186: ;OR_64 - dec ebx - jz rx_finish - xor r9, 0cded414bh - mov eax, r9d - test bl, 63 - jnz short rx_body_186 - call rx_read -rx_body_186: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - or rax, -1252263008 - mov rcx, rax - mov eax, r10d - xor eax, 0b55bfba0h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_187: ;FPMUL - dec ebx - jz rx_finish - xor r13, 05c6d64a8h - mov eax, r13d - test bl, 63 - jnz short rx_body_187 - call rx_read -rx_body_187: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm6 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm5, xmm0 - -rx_i_188: ;FPSUB - dec ebx - jz rx_finish - xor r9, 04659becbh - mov eax, r9d - test bl, 63 - jnz short rx_body_188 - call rx_read -rx_body_188: - xor rbp, rax - and eax, 
32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm3 - movaps xmm4, xmm0 - -rx_i_189: ;FPDIV - dec ebx - jz rx_finish - xor r11, 0c52741d5h - mov eax, r11d - test bl, 63 - jnz short rx_body_189 - call rx_read -rx_body_189: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm7 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm5, xmm0 - -rx_i_190: ;RET - dec ebx - jz rx_finish - xor r12, 0217bf5f3h - mov eax, r12d - test bl, 63 - jnz short rx_body_190 - call rx_read -rx_body_190: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r13, rax - cmp rsp, rdi - je short rx_i_191 - ret - -rx_i_191: ;FPSQRT - dec ebx - jz rx_finish - xor r15, 0884f3526h - mov eax, r15d - test bl, 63 - jnz short rx_body_191 - call rx_read -rx_body_191: - and eax, 131071 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - andps xmm0, xmm10 - sqrtpd xmm6, xmm0 - -rx_i_192: ;FPSQRT - dec ebx - jz rx_finish - xor r8, 0d76edad3h - mov eax, r8d - test bl, 63 - jnz short rx_body_192 - call rx_read -rx_body_192: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - andps xmm0, xmm10 - sqrtpd xmm8, xmm0 - -rx_i_193: ;MUL_32 - dec ebx - jz rx_finish - xor r12, 0e9939ach - mov eax, r12d - test bl, 63 - jnz short rx_body_193 - call rx_read -rx_body_193: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r12d - imul rax, rcx - mov rcx, rax - mov eax, r15d - xor eax, 074e097dch - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_194: ;FPMUL - dec ebx - jz rx_finish - xor r12, 0f21ca520h - mov eax, r12d - test bl, 63 - jnz short rx_body_194 - call rx_read -rx_body_194: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm5, xmm0 - -rx_i_195: ;SHL_64 - dec ebx - jz rx_finish - xor r10, 09405152ch - mov eax, r10d - test bl, 63 - jnz short rx_body_195 - call rx_read -rx_body_195: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - shl rax, 
27 - mov r9, rax - -rx_i_196: ;SUB_64 - dec ebx - jz rx_finish - xor r8, 0c2a9f41bh - mov eax, r8d - test bl, 63 - jnz short rx_body_196 - call rx_read -rx_body_196: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, r8 - mov rcx, rax - mov eax, r13d - xor eax, 08e47b269h - and eax, 131071 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_197: ;MUL_64 - dec ebx - jz rx_finish - xor r12, 0229208efh - mov eax, r12d - test bl, 63 - jnz short rx_body_197 - call rx_read -rx_body_197: - xor rbp, rax - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - imul rax, r15 - mov rcx, rax - mov eax, r11d - xor eax, 0b1d1e60dh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_198: ;MULH_64 - dec ebx - jz rx_finish - xor r14, 0c8d95bbbh - mov eax, r14d - test bl, 63 - jnz short rx_body_198 - call rx_read -rx_body_198: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r14 - mul rcx - mov rax, rdx - mov r8, rax - -rx_i_199: ;MULH_64 - dec ebx - jz rx_finish - xor r13, 050049e2eh - mov eax, r13d - test bl, 63 - jnz short rx_body_199 - call rx_read -rx_body_199: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r10 - mul rcx - mov rax, rdx - mov r10, rax - -rx_i_200: ;FPSUB - dec ebx - jz rx_finish - xor r10, 0c63b99e8h - mov eax, r10d - test bl, 63 - jnz short rx_body_200 - call rx_read -rx_body_200: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm2 - movaps xmm4, xmm0 - -rx_i_201: ;FPADD - dec ebx - jz rx_finish - xor r8, 0cdda801dh - mov eax, r8d - test bl, 63 - jnz short rx_body_201 - call rx_read -rx_body_201: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm9 - movaps xmm4, xmm0 - mov eax, r12d - xor eax, 040cfe68eh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm4 - -rx_i_202: ;FPADD - dec ebx - jz rx_finish - xor r13, 0fa44b04ah - mov eax, r13d - test bl, 63 - jnz short rx_body_202 - call rx_read -rx_body_202: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd 
xmm0, xmm9 - movaps xmm5, xmm0 - -rx_i_203: ;FPSUB - dec ebx - jz rx_finish - xor r10, 0d73e472ch - mov eax, r10d - test bl, 63 - jnz short rx_body_203 - call rx_read -rx_body_203: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm2 - movaps xmm7, xmm0 - mov eax, r15d - xor eax, 09bdff355h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm7 - -rx_i_204: ;MUL_64 - dec ebx - jz rx_finish - xor r9, 01af8ab1dh - mov eax, r9d - test bl, 63 - jnz short rx_body_204 - call rx_read -rx_body_204: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - imul rax, r15 - mov r8, rax - -rx_i_205: ;FPMUL - dec ebx - jz rx_finish - xor r14, 094e997c5h - mov eax, r14d - test bl, 63 - jnz short rx_body_205 - call rx_read -rx_body_205: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm8 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm5, xmm0 - -rx_i_206: ;FPSUB - dec ebx - jz rx_finish - xor r11, 0e836a177h - mov eax, r11d - test bl, 63 - jnz short rx_body_206 - call rx_read -rx_body_206: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm7 - movaps xmm4, xmm0 - mov eax, r12d - xor eax, 0d01fb731h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm4 - -rx_i_207: ;IDIV_64 - dec ebx - jz rx_finish - xor r9, 039ccdd30h - mov eax, r9d - test bl, 63 - jnz short rx_body_207 - call rx_read -rx_body_207: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 314297476 - mov rdx, 1969376361274661135 - imul rdx - mov rax, rdx - xor edx, edx - sar rax, 25 - sets dl - add rax, rdx - mov rcx, rax - mov eax, r9d - xor eax, 012bbcc84h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_208: ;MUL_64 - dec ebx - jz rx_finish - xor r9, 0f4f126c5h - mov eax, r9d - test bl, 63 - jnz short rx_body_208 - call rx_read -rx_body_208: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, rax, -486588965 - mov rcx, rax - mov eax, r10d - xor 
eax, 0e2ff3ddbh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_209: ;XOR_64 - dec ebx - jz rx_finish - xor r8, 0b84811f1h - mov eax, r8d - test bl, 63 - jnz short rx_body_209 - call rx_read -rx_body_209: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - xor rax, r15 - mov rcx, rax - mov eax, r12d - xor eax, 0c36b836ah - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_210: ;MUL_32 - dec ebx - jz rx_finish - xor r12, 0c5efc90ah - mov eax, r12d - test bl, 63 - jnz short rx_body_210 - call rx_read -rx_body_210: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r12d - imul rax, rcx - mov rcx, rax - mov eax, r15d - xor eax, 0c2c6bee0h - and eax, 131071 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_211: ;ROR_64 - dec ebx - jz rx_finish - xor r12, 0ce533072h - mov eax, r12d - test bl, 63 - jnz short rx_body_211 - call rx_read -rx_body_211: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r9 - ror rax, cl - mov rcx, rax - mov eax, r11d - xor eax, 0212e615h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_212: ;MUL_64 - dec ebx - jz rx_finish - xor r13, 06b465fdbh - mov eax, r13d - test bl, 63 - jnz short rx_body_212 - call rx_read -rx_body_212: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r13 - mov r15, rax - -rx_i_213: ;IMUL_32 - dec ebx - jz rx_finish - xor r13, 02dd1d503h - mov eax, r13d - test bl, 63 - jnz short rx_body_213 - call rx_read -rx_body_213: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r14d - imul rax, rcx - mov rcx, rax - mov eax, r14d - xor eax, 07bf8b75h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_214: ;SHL_64 - dec ebx - jz rx_finish - xor r9, 0a159f313h - mov eax, r9d - test bl, 63 - jnz short rx_body_214 - call rx_read -rx_body_214: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r14 - shl rax, cl - mov rcx, rax - mov eax, r14d - xor eax, 
0936ebe0bh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_215: ;ADD_32 - dec ebx - jz rx_finish - xor r15, 08359265eh - mov eax, r15d - test bl, 63 - jnz short rx_body_215 - call rx_read -rx_body_215: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add eax, r12d - mov rcx, rax - mov eax, r10d - xor eax, 01194f02bh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_216: ;MUL_64 - dec ebx - jz rx_finish - xor r12, 080696de3h - mov eax, r12d - test bl, 63 - jnz short rx_body_216 - call rx_read -rx_body_216: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - imul rax, r13 - mov rcx, rax - mov eax, r15d - xor eax, 03b609d2bh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_217: ;IMUL_32 - dec ebx - jz rx_finish - xor r8, 040d5b526h - mov eax, r8d - test bl, 63 - jnz short rx_body_217 - call rx_read -rx_body_217: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r9d - imul rax, rcx - mov r10, rax - -rx_i_218: ;FPSQRT - dec ebx - jz rx_finish - xor r11, 083c0bd93h - mov eax, r11d - test bl, 63 - jnz short rx_body_218 - call rx_read -rx_body_218: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - andps xmm0, xmm10 - sqrtpd xmm3, xmm0 - -rx_i_219: ;OR_64 - dec ebx - jz rx_finish - xor r8, 0ca37f668h - mov eax, r8d - test bl, 63 - jnz short rx_body_219 - call rx_read -rx_body_219: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - or rax, r10 - mov rcx, rax - mov eax, r15d - xor eax, 0d3d68798h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_220: ;IMUL_32 - dec ebx - jz rx_finish - xor r9, 0bb44c384h - mov eax, r9d - test bl, 63 - jnz short rx_body_220 - call rx_read -rx_body_220: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r11d - imul rax, rcx - mov r11, rax - -rx_i_221: ;DIV_64 - dec ebx - jz rx_finish - xor r9, 0a3deb512h - mov eax, r9d - test bl, 63 - jnz short rx_body_221 - call rx_read -rx_body_221: - and eax, 2047 - mov rax, 
qword ptr [rsi+rax*8] - mov ecx, 1 - mov edx, r15d - test edx, edx - cmovne ecx, edx - xor edx, edx - div rcx - mov r11, rax - -rx_i_222: ;FPMUL - dec ebx - jz rx_finish - xor r9, 084a02d64h - mov eax, r9d - test bl, 63 - jnz short rx_body_222 - call rx_read -rx_body_222: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm5 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm7, xmm0 - -rx_i_223: ;FPSUB - dec ebx - jz rx_finish - xor r8, 01e5cc085h - mov eax, r8d - test bl, 63 - jnz short rx_body_223 - call rx_read -rx_body_223: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm3 - movaps xmm2, xmm0 - mov eax, r10d - xor eax, 07fca59eeh - and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm2 - -rx_i_224: ;XOR_32 - dec ebx - jz rx_finish - xor r12, 053982440h - mov eax, r12d - test bl, 63 - jnz short rx_body_224 - call rx_read -rx_body_224: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - xor eax, -452933987 - mov rcx, rax - mov eax, r11d - xor eax, 0e500c69dh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_225: ;DIV_64 - dec ebx - jz rx_finish - xor r13, 0c558367eh - mov eax, r13d - test bl, 63 - jnz short rx_body_225 - call rx_read -rx_body_225: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 4264577610 - shr rax, 1 - mov rcx, 9289098447696480965 - mul rcx - mov rax, rdx - shr rax, 30 - mov rcx, rax - mov eax, r12d - xor eax, 0fe304a4ah - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_226: ;JUMP - dec ebx - jz rx_finish - xor r10, 040139b65h - mov eax, r10d - test bl, 63 - jnz short rx_body_226 - call rx_read -rx_body_226: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r8, rax - cmp r8d, -1752488808 - jno rx_i_328 - -rx_i_227: ;FPMUL - dec ebx - jz rx_finish - xor r11, 0fa312dbdh - mov eax, r11d - test bl, 63 - jnz short rx_body_227 - call rx_read -rx_body_227: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, 
xmm7 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm3, xmm0 - mov eax, r11d - xor eax, 0aabe2a0ah - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm3 - -rx_i_228: ;FPSQRT - dec ebx - jz rx_finish - xor r11, 0b64246c0h - mov eax, r11d - test bl, 63 - jnz short rx_body_228 - call rx_read -rx_body_228: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - andps xmm0, xmm10 - sqrtpd xmm7, xmm0 - mov eax, r15d - xor eax, 0ffdff798h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm7 - -rx_i_229: ;IMULH_64 - dec ebx - jz rx_finish - xor r11, 05c535836h - mov eax, r11d - test bl, 63 - jnz short rx_body_229 - call rx_read -rx_body_229: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, 334017248 - imul rcx - mov rax, rdx - mov r13, rax - -rx_i_230: ;FPMUL - dec ebx - jz rx_finish - xor r15, 0f394972eh - mov eax, r15d - test bl, 63 - jnz short rx_body_230 - call rx_read -rx_body_230: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm6 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm5, xmm0 - mov eax, r13d - xor eax, 01dc2b4f6h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm5 - -rx_i_231: ;RET - dec ebx - jz rx_finish - xor r9, 0bb56428dh - mov eax, r9d - test bl, 63 - jnz short rx_body_231 - call rx_read -rx_body_231: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r9, rax - cmp rsp, rdi - je short rx_i_232 - ret - -rx_i_232: ;FPMUL - dec ebx - jz rx_finish - xor r15, 09ab46ab3h - mov eax, r15d - test bl, 63 - jnz short rx_body_232 - call rx_read -rx_body_232: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm7, xmm0 - mov eax, r15d - xor eax, 07e732935h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm7 - -rx_i_233: ;JUMP - dec ebx - jz rx_finish - xor r13, 08eb2cd76h - mov eax, r13d - test bl, 63 - jnz short rx_body_233 - call rx_read -rx_body_233: - 
xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r14, rax - cmp r12d, 392389867 - jo rx_i_268 - -rx_i_234: ;FPDIV - dec ebx - jz rx_finish - xor r15, 0ba687578h - mov eax, r15d - test bl, 63 - jnz short rx_body_234 - call rx_read -rx_body_234: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm4 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm4, xmm0 - -rx_i_235: ;IMUL_32 - dec ebx - jz rx_finish - xor r13, 0b6cb9ff2h - mov eax, r13d - test bl, 63 - jnz short rx_body_235 - call rx_read -rx_body_235: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - mov rax, 212286089 - imul rax, rcx - mov r15, rax - -rx_i_236: ;FPADD - dec ebx - jz rx_finish - xor r15, 03ad196ach - mov eax, r15d - test bl, 63 - jnz short rx_body_236 - call rx_read -rx_body_236: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm4 - movaps xmm3, xmm0 - -rx_i_237: ;JUMP - dec ebx - jz rx_finish - xor r15, 0fab4600h - mov eax, r15d - test bl, 63 - jnz short rx_body_237 - call rx_read -rx_body_237: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r11, rax - cmp r12d, -121899164 - jge rx_i_295 - -rx_i_238: ;FPADD - dec ebx - jz rx_finish - xor r8, 0158f119fh - mov eax, r8d - test bl, 63 - jnz short rx_body_238 - call rx_read -rx_body_238: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm6 - movaps xmm7, xmm0 - -rx_i_239: ;ADD_64 - dec ebx - jz rx_finish - xor r13, 044f30b3fh - mov eax, r13d - test bl, 63 - jnz short rx_body_239 - call rx_read -rx_body_239: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r10 - mov rcx, rax - mov eax, r10d - xor eax, 0e42cdf41h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_240: ;IMULH_64 - dec ebx - jz rx_finish - xor r9, 0d65d29f9h - mov eax, r9d - test bl, 63 - jnz short rx_body_240 - call rx_read -rx_body_240: - xor rbp, rax - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r14 - imul 
rcx - mov rax, rdx - mov rcx, rax - mov eax, r8d - xor eax, 0e6bcdcfbh - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_241: ;FPADD - dec ebx - jz rx_finish - xor r11, 0ce5260adh - mov eax, r11d - test bl, 63 - jnz short rx_body_241 - call rx_read -rx_body_241: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm3 - movaps xmm7, xmm0 - -rx_i_242: ;MUL_32 - dec ebx - jz rx_finish - xor r12, 01119b0f9h - mov eax, r12d - test bl, 63 - jnz short rx_body_242 - call rx_read -rx_body_242: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r12d - imul rax, rcx - mov r10, rax - -rx_i_243: ;OR_64 - dec ebx - jz rx_finish - xor r12, 0d6c2ce3dh - mov eax, r12d - test bl, 63 - jnz short rx_body_243 - call rx_read -rx_body_243: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - or rax, r9 - mov r14, rax - -rx_i_244: ;ROR_64 - dec ebx - jz rx_finish - xor r11, 0c6a6248h - mov eax, r11d - test bl, 63 - jnz short rx_body_244 - call rx_read -rx_body_244: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r14 - ror rax, cl - mov rcx, rax - mov eax, r9d - xor eax, 0b4a1fad6h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_245: ;AND_32 - dec ebx - jz rx_finish - xor r13, 084505739h - mov eax, r13d - test bl, 63 - jnz short rx_body_245 - call rx_read -rx_body_245: - xor rbp, rax - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - and eax, r10d - mov rcx, rax - mov eax, r12d - xor eax, 0a3d1ad8bh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_246: ;IDIV_64 - dec ebx - jz rx_finish - xor r15, 027eeaa2eh - mov eax, r15d - test bl, 63 - jnz short rx_body_246 - call rx_read -rx_body_246: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by -156808488 - mov rdx, -3947299202596036367 - imul rdx - mov rax, rdx - xor edx, edx - sar rax, 25 - sets dl - add rax, rdx - mov r12, rax - -rx_i_247: ;IMUL_32 - dec ebx - jz rx_finish - xor r10, 0c4de0296h - mov eax, r10d - test bl, 
63 - jnz short rx_body_247 - call rx_read -rx_body_247: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r14d - imul rax, rcx - mov rcx, rax - mov eax, r9d - xor eax, 03814cf80h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_248: ;MUL_32 - dec ebx - jz rx_finish - xor r8, 0649df46fh - mov eax, r8d - test bl, 63 - jnz short rx_body_248 - call rx_read -rx_body_248: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r15d - imul rax, rcx - mov rcx, rax - mov eax, r9d - xor eax, 07b10fc32h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_249: ;IMULH_64 - dec ebx - jz rx_finish - xor r15, 0499552cch - mov eax, r15d - test bl, 63 - jnz short rx_body_249 - call rx_read -rx_body_249: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, -508571655 - imul rcx - mov rax, rdx - mov r13, rax - -rx_i_250: ;MUL_64 - dec ebx - jz rx_finish - xor r13, 083eafe6fh - mov eax, r13d - test bl, 63 - jnz short rx_body_250 - call rx_read -rx_body_250: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r8 - mov r14, rax - -rx_i_251: ;FPMUL - dec ebx - jz rx_finish - xor r13, 0a25a4d8ah - mov eax, r13d - test bl, 63 - jnz short rx_body_251 - call rx_read -rx_body_251: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm4, xmm0 - -rx_i_252: ;SHL_64 - dec ebx - jz rx_finish - xor r14, 08a75ad41h - mov eax, r14d - test bl, 63 - jnz short rx_body_252 - call rx_read -rx_body_252: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - shl rax, 53 - mov rcx, rax - mov eax, r14d - xor eax, 0b178001h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_253: ;CALL - dec ebx - jz rx_finish - xor r14, 057f3f596h - mov eax, r14d - test bl, 63 - jnz short rx_body_253 - call rx_read -rx_body_253: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r13d - xor 
eax, 0654b460bh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r15d, 1699431947 - js short rx_i_254 - call rx_i_367 - -rx_i_254: ;FPADD - dec ebx - jz rx_finish - xor r14, 04cfb709eh - mov eax, r14d - test bl, 63 - jnz short rx_body_254 - call rx_read -rx_body_254: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm4 - movaps xmm8, xmm0 - -rx_i_255: ;FPADD - dec ebx - jz rx_finish - xor r9, 0b96ec9ech - mov eax, r9d - test bl, 63 - jnz short rx_body_255 - call rx_read -rx_body_255: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm5 - movaps xmm6, xmm0 - mov eax, r14d - xor eax, 0ae781d10h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm6 - -rx_i_256: ;MULH_64 - dec ebx - jz rx_finish - xor r8, 08375472ch - mov eax, r8d - test bl, 63 - jnz short rx_body_256 - call rx_read -rx_body_256: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r15 - mul rcx - mov rax, rdx - mov r9, rax - -rx_i_257: ;FPADD - dec ebx - jz rx_finish - xor r12, 0d75a8c3fh - mov eax, r12d - test bl, 63 - jnz short rx_body_257 - call rx_read -rx_body_257: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm5 - movaps xmm3, xmm0 - -rx_i_258: ;MUL_32 - dec ebx - jz rx_finish - xor r11, 064fdbda0h - mov eax, r11d - test bl, 63 - jnz short rx_body_258 - call rx_read -rx_body_258: - xor rbp, rax - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r14d - imul rax, rcx - mov rcx, rax - mov eax, r9d - xor eax, 01c58ef2dh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_259: ;FPADD - dec ebx - jz rx_finish - xor r11, 02e36a073h - mov eax, r11d - test bl, 63 - jnz short rx_body_259 - call rx_read -rx_body_259: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm9 - movaps xmm3, xmm0 - mov eax, r11d - xor eax, 06c1856f0h - and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm3 - -rx_i_260: ;FPSUB - dec ebx - jz rx_finish - xor r13, 0f94e9fa9h - 
mov eax, r13d - test bl, 63 - jnz short rx_body_260 - call rx_read -rx_body_260: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm5 - movaps xmm9, xmm0 - -rx_i_261: ;FPDIV - dec ebx - jz rx_finish - xor r14, 02346171ch - mov eax, r14d - test bl, 63 - jnz short rx_body_261 - call rx_read -rx_body_261: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm3, xmm0 - mov eax, r11d - xor eax, 0745a48e9h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm3 - -rx_i_262: ;AND_64 - dec ebx - jz rx_finish - xor r10, 01c42baa6h - mov eax, r10d - test bl, 63 - jnz short rx_body_262 - call rx_read -rx_body_262: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - and rax, -1569587450 - mov rcx, rax - mov eax, r11d - xor eax, 0a271ff06h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_263: ;FPMUL - dec ebx - jz rx_finish - xor r11, 0b39b140h - mov eax, r11d - test bl, 63 - jnz short rx_body_263 - call rx_read -rx_body_263: - xor rbp, rax - and eax, 131071 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm8 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm6, xmm0 - -rx_i_264: ;FPMUL - dec ebx - jz rx_finish - xor r11, 01a07d201h - mov eax, r11d - test bl, 63 - jnz short rx_body_264 - call rx_read -rx_body_264: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm7, xmm0 - -rx_i_265: ;FPADD - dec ebx - jz rx_finish - xor r13, 07a3eb340h - mov eax, r13d - test bl, 63 - jnz short rx_body_265 - call rx_read -rx_body_265: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm0, xmm12 + ; IMUL_R r6, r5 + imul r14, r13 + ; IROL_R r4, r1 + mov ecx, r9d + rol r12, cl + ; FPDIV_R e2, a0 + divpd xmm6, xmm8 + maxpd xmm6, 
xmm13 + ; IADD_RC r0, r2, -487084195 + lea r8, [r8+r10-487084195] + ; FPADD_R f0, a0 addpd xmm0, xmm8 - movaps xmm2, xmm0 - mov eax, r10d - xor eax, 04c559414h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm2 - -rx_i_266: ;CALL - dec ebx - jz rx_finish - xor r13, 03d0a3a89h - mov eax, r13d - test bl, 63 - jnz short rx_body_266 - call rx_read -rx_body_266: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov r10, rax - cmp r12d, 136160027 - jbe short rx_i_267 - call rx_i_295 - -rx_i_267: ;ROL_64 - dec ebx - jz rx_finish - xor r8, 0c6c7b37h - mov eax, r8d - test bl, 63 - jnz short rx_body_267 - call rx_read -rx_body_267: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r10 - rol rax, cl - mov r11, rax - -rx_i_268: ;JUMP - dec ebx - jz rx_finish - xor r12, 0c2510cebh + ; IXOR_R r5, r3 + xor r13, r11 + ; IMUL_R r2, r4 + imul r10, r12 + ; FPMUL_R e0, a0 + mulpd xmm4, xmm8 + ; FPSUB_R f3, a3 + subpd xmm3, xmm11 + ; IMUL_M r4, L1[4856] + imul r12, qword ptr [rsi+4856] + ; IMUL_9C r2, 7951348 + lea r10, [r10+r10*8+7951348] + ; COND_R r3, ab(r7, 984532162) + xor ecx, ecx + cmp r15d, 984532162 + seta cl + add r11, rcx + ; IXOR_M r7, L1[r4] mov eax, r12d - test bl, 63 - jnz short rx_body_268 - call rx_read -rx_body_268: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax + and eax, 16376 + xor r15, qword ptr [rsi+rax] + ; IMUL_R r4, 248971329 + imul r12, 248971329 + ; IXOR_R r3, r1 + xor r11, r9 + ; IMUL_R r3, 2098482639 + imul r11, 2098482639 + ; IXOR_R r6, r3 + xor r14, r11 + ; IXOR_R r5, r4 + xor r13, r12 + ; IADD_R r5, r4 + add r13, r12 + ; IMUL_9C r7, 66530302 + lea r15, [r15+r15*8+66530302] + ; IMULH_R r0, r5 + mov rax, r8 + mul r13 + mov r8, rdx + ; IMUL_R r2, r7 + imul r10, r15 + ; IMUL_R r1, 770985098 + imul r9, 770985098 + ; COND_R r7, be(r5, 58538265) + xor ecx, ecx + cmp r13d, 58538265 + setbe cl + add r15, rcx + ; IMUL_9C r3, 245704334 + lea r11, [r11+r11*8+245704334] + ; ISMULH_R r2, r4 + mov rax, r10 + imul r12 + 
mov r10, rdx + ; FPDIV_R e3, a3 + divpd xmm7, xmm11 + maxpd xmm7, xmm13 + ; IMULH_R r5, r2 + mov rax, r13 + mul r10 + mov r13, rdx + ; ISUB_M r7, L1[r5] mov eax, r13d - xor eax, 0850bf8dah - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r15d, -2062812966 - jl rx_i_381 - -rx_i_269: ;ROL_64 - dec ebx - jz rx_finish - xor r11, 0c80cc899h + and eax, 16376 + sub r15, qword ptr [rsi+rax] + ; FPMUL_R e3, a3 + mulpd xmm7, xmm11 + ; IMUL_R r3, r4 + imul r11, r12 + ; FPSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; IMULH_R r1, 633797287 + mov eax, 633797287 + mul r9 + add r9, rdx + ; IADD_R r4, r3 + add r12, r11 + ; IROR_R r2, r7 + mov ecx, r15d + ror r10, cl + ; FPSUB_R f0, a2 + subpd xmm0, xmm10 + ; FPSUB_R f2, a2 + subpd xmm2, xmm10 + ; FPMUL_R e0, a2 + mulpd xmm4, xmm10 + ; IMUL_M r4, L1[r3] mov eax, r11d - test bl, 63 - jnz short rx_body_269 - call rx_read -rx_body_269: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - rol rax, 50 - mov r10, rax - -rx_i_270: ;FPMUL - dec ebx - jz rx_finish - xor r11, 0eb355caah - mov eax, r11d - test bl, 63 - jnz short rx_body_270 - call rx_read -rx_body_270: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm9 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm7, xmm0 + and eax, 16376 + imul r12, qword ptr [rsi+rax] + ; IMUL_9C r1, -1901091890 + lea r9, [r9+r9*8-1901091890] + ; IROR_R r2, r6 + mov ecx, r14d + ror r10, cl + ; IMULH_R r5, r3 + mov rax, r13 + mul r11 + mov r13, rdx + ; FPSUB_M f1, L1[r7] mov eax, r15d - xor eax, 03981662bh - and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm7 - -rx_i_271: ;MUL_32 - dec ebx - jz rx_finish - xor r13, 0c6f12299h - mov eax, r13d - test bl, 63 - jnz short rx_body_271 - call rx_read -rx_body_271: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r10d - imul rax, rcx - mov rcx, rax + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm1, xmm12 + ; IMUL_M r2, L1[r1] mov eax, r9d - xor eax, 086ddd754h - and eax, 
2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_272: ;AND_64 - dec ebx - jz rx_finish - xor r12, 0695a5dd2h - mov eax, r12d - test bl, 63 - jnz short rx_body_272 - call rx_read -rx_body_272: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - and rax, r12 - mov rcx, rax - mov eax, r13d - xor eax, 0d45957b7h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_273: ;JUMP - dec ebx - jz rx_finish - xor r9, 0d315e4dch - mov eax, r9d - test bl, 63 - jnz short rx_body_273 - call rx_read -rx_body_273: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r13, rax - cmp r12d, 1670848568 - jl rx_i_372 - -rx_i_274: ;FPADD - dec ebx - jz rx_finish - xor r15, 0b66ca7e0h - mov eax, r15d - test bl, 63 - jnz short rx_body_274 - call rx_read -rx_body_274: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm4 - movaps xmm6, xmm0 - -rx_i_275: ;IDIV_64 - dec ebx - jz rx_finish - xor r10, 0788eceb7h - mov eax, r10d - test bl, 63 - jnz short rx_body_275 - call rx_read -rx_body_275: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by -333089764 - mov rdx, -7433071640624659213 - imul rdx - mov rax, rdx - xor edx, edx - sar rax, 27 - sets dl - add rax, rdx - mov r13, rax - -rx_i_276: ;JUMP - dec ebx - jz rx_finish - xor r9, 0c6ac5edah - mov eax, r9d - test bl, 63 - jnz short rx_body_276 - call rx_read -rx_body_276: - xor rbp, rax - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov r12, rax - cmp r11d, -1236180570 - jns rx_i_404 - -rx_i_277: ;IMUL_32 - dec ebx - jz rx_finish - xor r11, 0c9549789h - mov eax, r11d - test bl, 63 - jnz short rx_body_277 - call rx_read -rx_body_277: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r10d - imul rax, rcx - mov r9, rax - -rx_i_278: ;FPSUB - dec ebx - jz rx_finish - xor r9, 0a2bc66c9h - mov eax, r9d - test bl, 63 - jnz short rx_body_278 - call rx_read -rx_body_278: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, 
xmm7 - movaps xmm4, xmm0 - -rx_i_279: ;FPADD - dec ebx - jz rx_finish - xor r15, 0f1a91458h - mov eax, r15d - test bl, 63 - jnz short rx_body_279 - call rx_read -rx_body_279: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm5 - movaps xmm9, xmm0 - mov eax, r9d - xor eax, 0475ade01h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm9 - -rx_i_280: ;IDIV_64 - dec ebx - jz rx_finish - xor r12, 066246b43h - mov eax, r12d - test bl, 63 - jnz short rx_body_280 - call rx_read -rx_body_280: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 555412224 - mov rdx, 2228867111296024113 - imul rdx - mov rax, rdx - xor edx, edx - sar rax, 26 - sets dl - add rax, rdx - mov rcx, rax - mov eax, r13d - xor eax, 0211aeb00h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_281: ;SUB_64 - dec ebx - jz rx_finish - xor r10, 05a762727h - mov eax, r10d - test bl, 63 - jnz short rx_body_281 - call rx_read -rx_body_281: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, -202979002 - mov rcx, rax - mov eax, r11d - xor eax, 0f3e6c946h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_282: ;SUB_64 - dec ebx - jz rx_finish - xor r15, 0de1ab603h - mov eax, r15d - test bl, 63 - jnz short rx_body_282 - call rx_read -rx_body_282: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, r12 - mov r11, rax - -rx_i_283: ;ADD_64 - dec ebx - jz rx_finish - xor r9, 0df4d084fh - mov eax, r9d - test bl, 63 - jnz short rx_body_283 - call rx_read -rx_body_283: - xor rbp, rax - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - add rax, r12 - mov rcx, rax - mov eax, r12d - xor eax, 0bb0da7d0h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_284: ;FPADD - dec ebx - jz rx_finish - xor r15, 0e68f36ach - mov eax, r15d - test bl, 63 - jnz short rx_body_284 - call rx_read -rx_body_284: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm6 - movaps xmm9, xmm0 - mov eax, r9d - xor eax, 
0936f2960h - and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm9 - -rx_i_285: ;IMUL_32 - dec ebx - jz rx_finish - xor r8, 09adb333bh - mov eax, r8d - test bl, 63 - jnz short rx_body_285 - call rx_read -rx_body_285: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r8d - imul rax, rcx - mov rcx, rax - mov eax, r14d - xor eax, 09308cd6dh - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_286: ;ROL_64 - dec ebx - jz rx_finish - xor r14, 082f5e36ch - mov eax, r14d - test bl, 63 - jnz short rx_body_286 - call rx_read -rx_body_286: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r9 - rol rax, cl - mov r15, rax - -rx_i_287: ;IDIV_64 - dec ebx - jz rx_finish - xor r11, 049547c9ch - mov eax, r11d - test bl, 63 - jnz short rx_body_287 - call rx_read -rx_body_287: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 1227278330 - mov rdx, 8069498232143512385 - imul rdx - mov rax, rdx - xor edx, edx - sar rax, 29 - sets dl - add rax, rdx - mov r8, rax - -rx_i_288: ;MUL_64 - dec ebx - jz rx_finish - xor r10, 08716ac8bh - mov eax, r10d - test bl, 63 - jnz short rx_body_288 - call rx_read -rx_body_288: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r8 - mov rcx, rax - mov eax, r9d - xor eax, 062eafa1bh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_289: ;FPMUL - dec ebx - jz rx_finish - xor r14, 0efef52b5h - mov eax, r14d - test bl, 63 - jnz short rx_body_289 - call rx_read -rx_body_289: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm9 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm8, xmm0 - -rx_i_290: ;FPSUB - dec ebx - jz rx_finish - xor r15, 060665748h - mov eax, r15d - test bl, 63 - jnz short rx_body_290 - call rx_read -rx_body_290: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm8 - movaps xmm9, xmm0 - mov eax, r9d - xor eax, 02f4d18d7h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], 
xmm9 - -rx_i_291: ;RET - dec ebx - jz rx_finish - xor r13, 0ddf4bd1ah - mov eax, r13d - test bl, 63 - jnz short rx_body_291 - call rx_read -rx_body_291: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r14d - xor eax, 0768a9d75h - and eax, 131071 - mov qword ptr [rsi + rax * 8], rcx - cmp rsp, rdi - je short rx_i_292 - ret - -rx_i_292: ;ROL_64 - dec ebx - jz rx_finish - xor r13, 05a87cc3dh - mov eax, r13d - test bl, 63 - jnz short rx_body_292 - call rx_read -rx_body_292: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r8 - rol rax, cl - mov rcx, rax - mov eax, r10d - xor eax, 035600fe9h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_293: ;FPADD - dec ebx - jz rx_finish - xor r9, 0c61f4279h - mov eax, r9d - test bl, 63 - jnz short rx_body_293 - call rx_read -rx_body_293: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm5 - movaps xmm8, xmm0 - mov eax, r8d - xor eax, 014844990h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm8 - -rx_i_294: ;RET - dec ebx - jz rx_finish - xor r14, 0f3b9d85h - mov eax, r14d - test bl, 63 - jnz short rx_body_294 - call rx_read -rx_body_294: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r8, rax - cmp rsp, rdi - je short rx_i_295 - ret - -rx_i_295: ;FPSUB - dec ebx - jz rx_finish - xor r9, 0f42798fdh - mov eax, r9d - test bl, 63 - jnz short rx_body_295 - call rx_read -rx_body_295: - xor rbp, rax - and eax, 131071 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm8 - movaps xmm7, xmm0 - mov eax, r15d - xor eax, 08a66e69fh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm7 - -rx_i_296: ;FPSQRT - dec ebx - jz rx_finish - xor r14, 018738758h - mov eax, r14d - test bl, 63 - jnz short rx_body_296 - call rx_read -rx_body_296: - and eax, 131071 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - andps xmm0, xmm10 - sqrtpd xmm8, xmm0 - mov eax, r8d - xor eax, 0f3a594cah - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm8 - -rx_i_297: ;ADD_64 - dec ebx 
- jz rx_finish - xor r15, 0de3b9d9bh - mov eax, r15d - test bl, 63 - jnz short rx_body_297 - call rx_read -rx_body_297: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r10 - mov r14, rax - -rx_i_298: ;FPSUB - dec ebx - jz rx_finish - xor r14, 084f53637h - mov eax, r14d - test bl, 63 - jnz short rx_body_298 - call rx_read -rx_body_298: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm7 - movaps xmm6, xmm0 - mov eax, r14d - xor eax, 0d10f7c42h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm6 - -rx_i_299: ;ADD_64 - dec ebx - jz rx_finish - xor r12, 042f4897h - mov eax, r12d - test bl, 63 - jnz short rx_body_299 - call rx_read -rx_body_299: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r10 - mov r12, rax - -rx_i_300: ;FPSUB - dec ebx - jz rx_finish - xor r12, 095765693h - mov eax, r12d - test bl, 63 - jnz short rx_body_300 - call rx_read -rx_body_300: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm8 - movaps xmm2, xmm0 - -rx_i_301: ;FPMUL - dec ebx - jz rx_finish - xor r8, 0a0ec5eech - mov eax, r8d - test bl, 63 - jnz short rx_body_301 - call rx_read -rx_body_301: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm5 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm7, xmm0 - -rx_i_302: ;ADD_64 - dec ebx - jz rx_finish - xor r15, 0f6f8c345h - mov eax, r15d - test bl, 63 - jnz short rx_body_302 - call rx_read -rx_body_302: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r10 - mov rcx, rax - mov eax, r11d - xor eax, 0afbbe406h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_303: ;FPADD - dec ebx - jz rx_finish - xor r14, 082a3e965h - mov eax, r14d - test bl, 63 - jnz short rx_body_303 - call rx_read -rx_body_303: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm3 - movaps xmm9, xmm0 - -rx_i_304: ;MUL_64 - dec ebx - jz rx_finish - xor r12, 04940c652h - mov 
eax, r12d - test bl, 63 - jnz short rx_body_304 - call rx_read -rx_body_304: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, rax, 2007686513 - mov r13, rax - -rx_i_305: ;MUL_64 - dec ebx - jz rx_finish - xor r11, 03c6c62b8h - mov eax, r11d - test bl, 63 - jnz short rx_body_305 - call rx_read -rx_body_305: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r15 - mov rcx, rax - mov eax, r10d - xor eax, 0fc12db20h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_306: ;ADD_64 - dec ebx - jz rx_finish - xor r15, 08b34cdfch - mov eax, r15d - test bl, 63 - jnz short rx_body_306 - call rx_read -rx_body_306: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, 400578979 - mov rcx, rax - mov eax, r13d - xor eax, 017e059a3h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_307: ;SHL_64 - dec ebx - jz rx_finish - xor r15, 04c36adb1h - mov eax, r15d - test bl, 63 - jnz short rx_body_307 - call rx_read -rx_body_307: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - shl rax, 33 - mov r10, rax - -rx_i_308: ;MUL_64 - dec ebx - jz rx_finish - xor r11, 0a4213b21h - mov eax, r11d - test bl, 63 - jnz short rx_body_308 - call rx_read -rx_body_308: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r13 - mov rcx, rax - mov eax, r15d - xor eax, 0c2d34e82h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_309: ;DIV_64 - dec ebx - jz rx_finish - xor r9, 090c42304h - mov eax, r9d - test bl, 63 - jnz short rx_body_309 - call rx_read -rx_body_309: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 2642117268 - mov rcx, 14993309243657753043 - mul rcx - mov rax, rdx - shr rax, 31 - mov r9, rax - -rx_i_310: ;FPMUL - dec ebx - jz rx_finish - xor r9, 0f78e1c8ch - mov eax, r9d - test bl, 63 - jnz short rx_body_310 - call rx_read -rx_body_310: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm6 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm7, xmm0 - -rx_i_311: 
;FPMUL - dec ebx - jz rx_finish - xor r8, 0ff8848cfh - mov eax, r8d - test bl, 63 - jnz short rx_body_311 - call rx_read -rx_body_311: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm4 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm4, xmm0 - -rx_i_312: ;MUL_32 - dec ebx - jz rx_finish - xor r13, 0b18904cdh - mov eax, r13d - test bl, 63 - jnz short rx_body_312 - call rx_read -rx_body_312: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r14d - imul rax, rcx - mov rcx, rax - mov eax, r10d - xor eax, 0bb93ffb8h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_313: ;ROR_64 - dec ebx - jz rx_finish - xor r8, 0a0d0befh - mov eax, r8d - test bl, 63 - jnz short rx_body_313 - call rx_read -rx_body_313: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ror rax, 62 - mov r14, rax - -rx_i_314: ;IMUL_32 - dec ebx - jz rx_finish - xor r15, 01e3c65f7h - mov eax, r15d - test bl, 63 - jnz short rx_body_314 - call rx_read -rx_body_314: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - mov rax, 2143811925 - imul rax, rcx - mov rcx, rax - mov eax, r9d - xor eax, 07fc7f955h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_315: ;XOR_64 - dec ebx - jz rx_finish - xor r9, 02e36ddafh - mov eax, r9d - test bl, 63 - jnz short rx_body_315 - call rx_read -rx_body_315: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - xor rax, r15 - mov r9, rax - -rx_i_316: ;RET - dec ebx - jz rx_finish - xor r14, 05b0cb5bbh - mov eax, r14d - test bl, 63 - jnz short rx_body_316 - call rx_read -rx_body_316: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r8d - xor eax, 03602c513h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp rsp, rdi - je short rx_i_317 - ret - -rx_i_317: ;FPADD - dec ebx - jz rx_finish - xor r9, 0c74e7415h - mov eax, r9d - test bl, 63 - jnz short rx_body_317 - call rx_read -rx_body_317: - and eax, 2047 - cvtdq2pd xmm0, 
qword ptr [rsi+rax*8] - addpd xmm0, xmm7 - movaps xmm5, xmm0 - mov eax, r13d - xor eax, 0b5bc8h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm5 - -rx_i_318: ;ROR_64 - dec ebx - jz rx_finish - xor r9, 057621d9ah - mov eax, r9d - test bl, 63 - jnz short rx_body_318 - call rx_read -rx_body_318: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r11 - ror rax, cl - mov rcx, rax - mov eax, r15d - xor eax, 061cb9db8h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_319: ;SHR_64 - dec ebx - jz rx_finish - xor r13, 08ee02d99h - mov eax, r13d - test bl, 63 - jnz short rx_body_319 - call rx_read -rx_body_319: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - shr rax, 46 - mov r11, rax - -rx_i_320: ;FPADD - dec ebx - jz rx_finish - xor r15, 013461188h - mov eax, r15d - test bl, 63 - jnz short rx_body_320 - call rx_read -rx_body_320: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm4 - movaps xmm2, xmm0 - -rx_i_321: ;IMUL_32 - dec ebx - jz rx_finish - xor r11, 0a7bae383h - mov eax, r11d - test bl, 63 - jnz short rx_body_321 - call rx_read -rx_body_321: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r9d - imul rax, rcx - mov rcx, rax - mov eax, r12d - xor eax, 0f213dach - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_322: ;CALL - dec ebx - jz rx_finish - xor r14, 08215399bh - mov eax, r14d - test bl, 63 - jnz short rx_body_322 - call rx_read -rx_body_322: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r11, rax - cmp r11d, 1411981860 - jno short rx_i_323 - call rx_i_343 - -rx_i_323: ;MULH_64 - dec ebx - jz rx_finish - xor r14, 07b07664bh - mov eax, r14d - test bl, 63 - jnz short rx_body_323 - call rx_read -rx_body_323: - xor rbp, rax - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r14 - mul rcx - mov rax, rdx - mov r14, rax - -rx_i_324: ;FPDIV - dec ebx - jz rx_finish - xor r9, 0f956baffh - mov eax, r9d - test bl, 63 - jnz short rx_body_324 - call rx_read 
-rx_body_324: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm9, xmm0 - -rx_i_325: ;OR_32 - dec ebx - jz rx_finish - xor r11, 0708ab9d1h - mov eax, r11d - test bl, 63 - jnz short rx_body_325 - call rx_read -rx_body_325: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - or eax, r8d - mov rcx, rax - mov eax, r13d - xor eax, 0ef376c54h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_326: ;MULH_64 - dec ebx - jz rx_finish - xor r11, 0d1b27540h - mov eax, r11d - test bl, 63 - jnz short rx_body_326 - call rx_read -rx_body_326: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, -1233771581 - mul rcx - mov rax, rdx - mov r9, rax - -rx_i_327: ;IDIV_64 - dec ebx - jz rx_finish - xor r9, 09665f98dh - mov eax, r9d - test bl, 63 - jnz short rx_body_327 - call rx_read -rx_body_327: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 1572662125 - mov rcx, rax - mov rdx, -5852150286715358951 - imul rdx - mov rax, rdx - xor edx, edx - add rax, rcx - sar rax, 30 - sets dl - add rax, rdx - mov r12, rax - -rx_i_328: ;SHR_64 - dec ebx - jz rx_finish - xor r12, 0fb9c32adh - mov eax, r12d - test bl, 63 - jnz short rx_body_328 - call rx_read -rx_body_328: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - shr rax, 18 - mov rcx, rax - mov eax, r9d - xor eax, 04d159415h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_329: ;RET - dec ebx - jz rx_finish - xor r11, 0e1110623h - mov eax, r11d - test bl, 63 - jnz short rx_body_329 - call rx_read -rx_body_329: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov r11, rax - cmp rsp, rdi - je short rx_i_330 - ret - -rx_i_330: ;IMUL_32 - dec ebx - jz rx_finish - xor r9, 0f6a93f19h - mov eax, r9d - test bl, 63 - jnz short rx_body_330 - call rx_read -rx_body_330: - xor rbp, rax - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - mov rax, -1349816041 - 
imul rax, rcx - mov r11, rax - -rx_i_331: ;FPADD - dec ebx - jz rx_finish - xor r9, 0bc9bbe4ah - mov eax, r9d - test bl, 63 - jnz short rx_body_331 - call rx_read -rx_body_331: - xor rbp, rax - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm3 - movaps xmm9, xmm0 - -rx_i_332: ;FPADD - dec ebx - jz rx_finish - xor r12, 0f253cd4eh - mov eax, r12d - test bl, 63 - jnz short rx_body_332 - call rx_read -rx_body_332: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm6 - movaps xmm3, xmm0 - -rx_i_333: ;OR_64 - dec ebx - jz rx_finish - xor r14, 0f009758bh - mov eax, r14d - test bl, 63 - jnz short rx_body_333 - call rx_read -rx_body_333: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - or rax, r12 - mov rcx, rax - mov eax, r11d - xor eax, 0f58fcaa8h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_334: ;ADD_64 - dec ebx - jz rx_finish - xor r8, 0dda04168h - mov eax, r8d - test bl, 63 - jnz short rx_body_334 - call rx_read -rx_body_334: - xor rbp, rax - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - add rax, r13 - mov r8, rax - -rx_i_335: ;SUB_64 - dec ebx - jz rx_finish - xor r15, 03e6cfb73h - mov eax, r15d - test bl, 63 - jnz short rx_body_335 - call rx_read -rx_body_335: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - sub rax, r8 - mov r12, rax - -rx_i_336: ;ROR_64 - dec ebx - jz rx_finish - xor r15, 0aea0a435h - mov eax, r15d - test bl, 63 - jnz short rx_body_336 - call rx_read -rx_body_336: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ror rax, 42 - mov rcx, rax - mov eax, r11d - xor eax, 02644c5ah - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_337: ;ADD_64 - dec ebx - jz rx_finish - xor r8, 03d6c4ab2h - mov eax, r8d - test bl, 63 - jnz short rx_body_337 - call rx_read -rx_body_337: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r12 - mov rcx, rax - mov eax, r13d - xor eax, 0dab07c39h - and eax, 2047 - mov qword ptr [rsi + rax * 
8], rcx - -rx_i_338: ;MUL_64 - dec ebx - jz rx_finish - xor r12, 0d428a742h - mov eax, r12d - test bl, 63 - jnz short rx_body_338 - call rx_read -rx_body_338: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r12 - mov rcx, rax - mov eax, r11d - xor eax, 0184d2abbh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_339: ;FPADD - dec ebx - jz rx_finish - xor r9, 04596ef73h - mov eax, r9d - test bl, 63 - jnz short rx_body_339 - call rx_read -rx_body_339: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm6 - movaps xmm2, xmm0 - -rx_i_340: ;FPADD - dec ebx - jz rx_finish - xor r15, 0e51629cch - mov eax, r15d - test bl, 63 - jnz short rx_body_340 - call rx_read -rx_body_340: - and eax, 131071 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm5 - movaps xmm5, xmm0 - mov eax, r13d - xor eax, 038b653beh - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm5 - -rx_i_341: ;MUL_32 - dec ebx - jz rx_finish - xor r12, 019eb9ea5h - mov eax, r12d - test bl, 63 - jnz short rx_body_341 - call rx_read -rx_body_341: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r15d - imul rax, rcx - mov r8, rax - -rx_i_342: ;FPSUB - dec ebx - jz rx_finish - xor r9, 09ccc7abah - mov eax, r9d - test bl, 63 - jnz short rx_body_342 - call rx_read -rx_body_342: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm2 - movaps xmm3, xmm0 - -rx_i_343: ;XOR_64 - dec ebx - jz rx_finish - xor r14, 056f6cf0bh - mov eax, r14d - test bl, 63 - jnz short rx_body_343 - call rx_read -rx_body_343: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - xor rax, r13 - mov rcx, rax - mov eax, r15d - xor eax, 0d9a469a9h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_344: ;FPSUB - dec ebx - jz rx_finish - xor r10, 03ef9bcc4h - mov eax, r10d - test bl, 63 - jnz short rx_body_344 - call rx_read -rx_body_344: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm6 - 
movaps xmm5, xmm0 - mov eax, r13d - xor eax, 0627d9feah - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm5 - -rx_i_345: ;MULH_64 - dec ebx - jz rx_finish - xor r12, 0bbbcdbach - mov eax, r12d - test bl, 63 - jnz short rx_body_345 - call rx_read -rx_body_345: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r13 - mul rcx - mov rax, rdx - mov rcx, rax - mov eax, r9d - xor eax, 0ef03b0ddh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_346: ;AND_32 - dec ebx - jz rx_finish - xor r12, 0ae9d1e96h - mov eax, r12d - test bl, 63 - jnz short rx_body_346 - call rx_read -rx_body_346: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - and eax, r15d - mov r13, rax - -rx_i_347: ;ADD_64 - dec ebx - jz rx_finish - xor r14, 070c34d69h - mov eax, r14d - test bl, 63 - jnz short rx_body_347 - call rx_read -rx_body_347: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r10 - mov rcx, rax - mov eax, r13d - xor eax, 0d529429ah - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_348: ;FPSUB - dec ebx - jz rx_finish - xor r13, 0523ff904h - mov eax, r13d - test bl, 63 - jnz short rx_body_348 - call rx_read -rx_body_348: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm3 - movaps xmm9, xmm0 - mov eax, r9d - xor eax, 039c35461h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm9 - -rx_i_349: ;OR_64 - dec ebx - jz rx_finish - xor r8, 018e0e5ddh - mov eax, r8d - test bl, 63 - jnz short rx_body_349 - call rx_read -rx_body_349: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - or rax, r15 - mov rcx, rax - mov eax, r13d - xor eax, 05c449453h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_350: ;CALL - dec ebx - jz rx_finish - xor r9, 09bd050f0h - mov eax, r9d - test bl, 63 - jnz short rx_body_350 - call rx_read -rx_body_350: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov r12, rax - cmp r9d, -980411581 - ja short rx_i_351 - call rx_i_352 - -rx_i_351: ;MUL_64 - dec ebx - jz rx_finish - xor r11, 
0a3a5906fh - mov eax, r11d - test bl, 63 - jnz short rx_body_351 - call rx_read -rx_body_351: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r10 - mov rcx, rax - mov eax, r13d - xor eax, 0985ba4h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_352: ;FPADD - dec ebx - jz rx_finish - xor r10, 0afc9af2bh - mov eax, r10d - test bl, 63 - jnz short rx_body_352 - call rx_read -rx_body_352: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm6 - movaps xmm2, xmm0 - mov eax, r10d - xor eax, 03bf686f2h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm2 - -rx_i_353: ;FPSUB - dec ebx - jz rx_finish - xor r13, 02e65278bh - mov eax, r13d - test bl, 63 - jnz short rx_body_353 - call rx_read -rx_body_353: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm2 - movaps xmm7, xmm0 - -rx_i_354: ;MUL_32 - dec ebx - jz rx_finish - xor r13, 02412fc10h - mov eax, r13d - test bl, 63 - jnz short rx_body_354 - call rx_read -rx_body_354: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r13d - imul rax, rcx - mov rcx, rax - mov eax, r13d - xor eax, 049cc2e0ch - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_355: ;MUL_64 - dec ebx - jz rx_finish - xor r10, 06bd6e65fh - mov eax, r10d - test bl, 63 - jnz short rx_body_355 - call rx_read -rx_body_355: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - imul rax, r14 - mov r8, rax - -rx_i_356: ;MUL_64 - dec ebx - jz rx_finish - xor r10, 01cd85d80h - mov eax, r10d - test bl, 63 - jnz short rx_body_356 - call rx_read -rx_body_356: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r10 - mov r11, rax - -rx_i_357: ;ADD_64 - dec ebx - jz rx_finish - xor r10, 0f7daed36h - mov eax, r10d - test bl, 63 - jnz short rx_body_357 - call rx_read -rx_body_357: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r11 - mov r11, rax - -rx_i_358: ;DIV_64 - dec ebx - jz rx_finish - xor r13, 088fa6e5ah - mov eax, 
r13d - test bl, 63 - jnz short rx_body_358 - call rx_read -rx_body_358: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 3667831238 - shr rax, 1 - mov rcx, 2700102505175032865 - mul rcx - mov rax, rdx - shr rax, 28 - mov r9, rax - -rx_i_359: ;FPSUB - dec ebx - jz rx_finish - xor r10, 0714fc2cdh - mov eax, r10d - test bl, 63 - jnz short rx_body_359 - call rx_read -rx_body_359: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm9 - movaps xmm4, xmm0 - mov eax, r12d - xor eax, 0f16b9be3h - and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm4 - -rx_i_360: ;FPMUL - dec ebx - jz rx_finish - xor r10, 0c2d110b5h - mov eax, r10d - test bl, 63 - jnz short rx_body_360 - call rx_read -rx_body_360: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm8 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm8, xmm0 - mov eax, r8d - xor eax, 0c41a4103h - and eax, 131071 - movlpd qword ptr [rsi + rax * 8], xmm8 - -rx_i_361: ;FPDIV - dec ebx - jz rx_finish - xor r15, 01d125a7fh - mov eax, r15d - test bl, 63 - jnz short rx_body_361 - call rx_read -rx_body_361: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm6 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm6, xmm0 - -rx_i_362: ;SUB_64 - dec ebx - jz rx_finish - xor r9, 0ed8954bdh - mov eax, r9d - test bl, 63 - jnz short rx_body_362 - call rx_read -rx_body_362: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, r9 - mov rcx, rax - mov eax, r15d - xor eax, 04080bf8dh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_363: ;FPMUL - dec ebx - jz rx_finish - xor r12, 09f75887bh - mov eax, r12d - test bl, 63 - jnz short rx_body_363 - call rx_read -rx_body_363: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm6 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm3, xmm0 - -rx_i_364: ;MUL_32 - dec ebx - jz rx_finish - xor r11, 0badaf867h 
- mov eax, r11d - test bl, 63 - jnz short rx_body_364 - call rx_read -rx_body_364: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r8d - imul rax, rcx - mov rcx, rax - mov eax, r8d - xor eax, 0bb8ee9ch - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_365: ;IMUL_32 - dec ebx - jz rx_finish - xor r15, 02db4444ah - mov eax, r15d - test bl, 63 - jnz short rx_body_365 - call rx_read -rx_body_365: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r9d - imul rax, rcx - mov r12, rax - -rx_i_366: ;IMUL_32 - dec ebx - jz rx_finish - xor r12, 0bff7218fh - mov eax, r12d - test bl, 63 - jnz short rx_body_366 - call rx_read -rx_body_366: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r8d - imul rax, rcx - mov r15, rax - -rx_i_367: ;ROR_64 - dec ebx - jz rx_finish - xor r9, 04d14cb3ah - mov eax, r9d - test bl, 63 - jnz short rx_body_367 - call rx_read -rx_body_367: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ror rax, 18 - mov rcx, rax - mov eax, r12d - xor eax, 0ad9b92e8h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_368: ;SUB_32 - dec ebx - jz rx_finish - xor r10, 0a14836bah - mov eax, r10d - test bl, 63 - jnz short rx_body_368 - call rx_read -rx_body_368: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - sub eax, r10d - mov r8, rax - -rx_i_369: ;IDIV_64 - dec ebx - jz rx_finish - xor r9, 053fe22e2h - mov eax, r9d - test bl, 63 - jnz short rx_body_369 - call rx_read -rx_body_369: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 470792991 - mov rdx, 1314739240972876203 - imul rdx - mov rax, rdx - xor edx, edx - sar rax, 25 - sets dl - add rax, rdx - mov r9, rax - -rx_i_370: ;FPSUB - dec ebx - jz rx_finish - xor r15, 010e1fb24h - mov eax, r15d - test bl, 63 - jnz short rx_body_370 - call rx_read -rx_body_370: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm6 - movaps xmm6, xmm0 - -rx_i_371: 
;FPADD - dec ebx - jz rx_finish - xor r8, 0ebbd5cc9h - mov eax, r8d - test bl, 63 - jnz short rx_body_371 - call rx_read -rx_body_371: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] + and eax, 16376 + imul r10, qword ptr [rsi+rax] + ; IMUL_R r6, r0 + imul r14, r8 + ; IADD_R r7, r6 + add r15, r14 + ; FPSUB_R f2, a3 + subpd xmm2, xmm11 + ; COND_R r5, no(r2, -1589295370) + xor ecx, ecx + cmp r10d, -1589295370 + setno cl + add r13, rcx + ; IMUL_9C r7, 420978486 + lea r15, [r15+r15*8+420978486] + ; IROL_R r4, r2 + mov ecx, r10d + rol r12, cl + ; IMUL_9C r0, -1084530831 + lea r8, [r8+r8*8-1084530831] + ; FPNEG_R f3 + xorps xmm3, xmm15 + ; IROR_R r6, r4 + mov ecx, r12d + ror r14, cl + ; IROL_R r4, r5 + mov ecx, r13d + rol r12, cl + ; FPSUB_R f2, a3 + subpd xmm2, xmm11 + ; FPMUL_R e2, a2 + mulpd xmm6, xmm10 + ; ISMULH_M r6, L2[98600] + mov rax, r14 + imul qword ptr [rsi+98600] + mov r14, rdx + ; IXOR_R r0, r6 + xor r8, r14 + ; FPSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; FPADD_R f0, a1 addpd xmm0, xmm9 - movaps xmm5, xmm0 - mov eax, r13d - xor eax, 0c40fe413h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm5 - -rx_i_372: ;SHL_64 - dec ebx - jz rx_finish - xor r10, 098ab79d7h - mov eax, r10d - test bl, 63 - jnz short rx_body_372 - call rx_read -rx_body_372: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r13 - shl rax, cl - mov r9, rax - -rx_i_373: ;FPMUL - dec ebx - jz rx_finish - xor r15, 056438b3h - mov eax, r15d - test bl, 63 - jnz short rx_body_373 - call rx_read -rx_body_373: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm8 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm4, xmm0 - -rx_i_374: ;FPMUL - dec ebx - jz rx_finish - xor r11, 0dbcce604h - mov eax, r11d - test bl, 63 - jnz short rx_body_374 - call rx_read -rx_body_374: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm2, xmm0 - mov 
eax, r10d - xor eax, 03507e810h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm2 - -rx_i_375: ;ADD_64 - dec ebx - jz rx_finish - xor r9, 0edea6200h + ; COND_R r1, ab(r3, -991705199) + xor ecx, ecx + cmp r11d, -991705199 + seta cl + add r9, rcx + ; IMULH_M r4, L2[r2] + mov ecx, r10d + and ecx, 262136 + mov rax, r12 + mul qword ptr [rsi+rcx] + mov r12, rdx + ; IROR_R r2, r6 + mov ecx, r14d + ror r10, cl + ; FPDIV_R e0, a1 + divpd xmm4, xmm9 + maxpd xmm4, xmm13 + ; IMUL_R r1, r7 + imul r9, r15 + ; COND_R r6, ns(r2, 939392855) + xor ecx, ecx + cmp r10d, 939392855 + setns cl + add r14, rcx + ; FPMUL_R e3, a1 + mulpd xmm7, xmm9 + ; COND_R r2, ab(r2, -499266314) + xor ecx, ecx + cmp r10d, -499266314 + seta cl + add r10, rcx + ; COND_M r7, lt(L1[r1], -1624420482) + xor ecx, ecx mov eax, r9d - test bl, 63 - jnz short rx_body_375 - call rx_read -rx_body_375: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - add rax, -332030999 - mov r12, rax - -rx_i_376: ;ADD_64 - dec ebx - jz rx_finish - xor r14, 05e61b279h - mov eax, r14d - test bl, 63 - jnz short rx_body_376 - call rx_read -rx_body_376: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r9 - mov rcx, rax - mov eax, r8d - xor eax, 01c614282h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_377: ;FPSUB - dec ebx - jz rx_finish - xor r14, 0fc1fb433h - mov eax, r14d - test bl, 63 - jnz short rx_body_377 - call rx_read -rx_body_377: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm3 - movaps xmm7, xmm0 - -rx_i_378: ;MUL_32 - dec ebx - jz rx_finish - xor r12, 082aa21ach + and eax, 16376 + cmp dword ptr [rsi+rax], -1624420482 + setl cl + add r15, rcx + ; COND_R r1, lt(r1, 1525413977) + xor ecx, ecx + cmp r9d, 1525413977 + setl cl + add r9, rcx + ; IMUL_R r4, r5 + imul r12, r13 + ; IMUL_R r4, r2 + imul r12, r10 + ; FPSQRT_R e1 + sqrtpd xmm5, xmm5 + ; ISUB_R r2, r6 + sub r10, r14 + ; FPDIV_R e1, a0 + divpd xmm5, xmm8 + maxpd xmm5, xmm13 + ; FPMUL_R e2, a3 + mulpd xmm6, xmm11 
+ ; IADD_R r6, 671627590 + add r14, 671627590 + ; COND_M r6, sg(L1[r4], -780452820) + xor ecx, ecx mov eax, r12d - test bl, 63 - jnz short rx_body_378 - call rx_read -rx_body_378: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r14d - imul rax, rcx - mov r15, rax - -rx_i_379: ;ROR_64 - dec ebx - jz rx_finish - xor r10, 05dba41fbh - mov eax, r10d - test bl, 63 - jnz short rx_body_379 - call rx_read -rx_body_379: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r9 - ror rax, cl - mov rcx, rax - mov eax, r13d - xor eax, 03a2dc429h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_380: ;MUL_64 - dec ebx - jz rx_finish - xor r11, 0229e3d6eh - mov eax, r11d - test bl, 63 - jnz short rx_body_380 - call rx_read -rx_body_380: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - imul rax, r10 - mov rcx, rax - mov eax, r13d - xor eax, 0a9fd85e0h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_381: ;XOR_32 - dec ebx - jz rx_finish - xor r8, 019816ff9h - mov eax, r8d - test bl, 63 - jnz short rx_body_381 - call rx_read -rx_body_381: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - xor eax, r14d - mov rcx, rax - mov eax, r9d - xor eax, 032349ff8h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_382: ;ROL_64 - dec ebx - jz rx_finish - xor r14, 036b5b81fh - mov eax, r14d - test bl, 63 - jnz short rx_body_382 - call rx_read -rx_body_382: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - rol rax, 55 - mov rcx, rax - mov eax, r11d - xor eax, 0a6a2e0b1h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_383: ;FPSUB - dec ebx - jz rx_finish - xor r15, 05f798ec3h - mov eax, r15d - test bl, 63 - jnz short rx_body_383 - call rx_read -rx_body_383: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm4 - movaps xmm5, xmm0 - -rx_i_384: ;XOR_64 - dec ebx - jz rx_finish - xor r10, 05b459fd7h - mov eax, r10d - test bl, 63 - jnz short rx_body_384 - call rx_read 
-rx_body_384: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - xor rax, 1413715044 - mov r9, rax - -rx_i_385: ;MUL_64 - dec ebx - jz rx_finish - xor r15, 0c91749bbh - mov eax, r15d - test bl, 63 - jnz short rx_body_385 - call rx_read -rx_body_385: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r12 - mov rcx, rax - mov eax, r13d - xor eax, 0fb9b50b9h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_386: ;FPADD - dec ebx - jz rx_finish - xor r9, 0575b4bdch - mov eax, r9d - test bl, 63 - jnz short rx_body_386 - call rx_read -rx_body_386: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] + and eax, 16376 + cmp dword ptr [rsi+rax], -780452820 + sets cl + add r14, rcx + ; IMULH_R r4, r7 + mov rax, r12 + mul r15 + mov r12, rdx + ; FPMUL_R e3, a1 + mulpd xmm7, xmm9 + ; FPADD_R f0, a0 addpd xmm0, xmm8 - movaps xmm9, xmm0 - -rx_i_387: ;SUB_32 - dec ebx - jz rx_finish - xor r9, 0d4f7bc6ah - mov eax, r9d - test bl, 63 - jnz short rx_body_387 - call rx_read -rx_body_387: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub eax, r15d - mov rcx, rax - mov eax, r9d - xor eax, 028cbb7adh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_388: ;RET - dec ebx - jz rx_finish - xor r8, 08a949356h - mov eax, r8d - test bl, 63 - jnz short rx_body_388 - call rx_read -rx_body_388: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r11, rax - cmp rsp, rdi - je short rx_i_389 - ret - -rx_i_389: ;JUMP - dec ebx - jz rx_finish - xor r11, 06531ad2eh - mov eax, r11d - test bl, 63 - jnz short rx_body_389 - call rx_read -rx_body_389: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r14d - xor eax, 0eb1a1f50h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r9d, -350609584 - jge rx_i_421 - -rx_i_390: ;FPADD - dec ebx - jz rx_finish - xor r15, 02914abeah - mov eax, r15d - test bl, 63 - jnz short rx_body_390 - call rx_read -rx_body_390: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd 
xmm0, xmm4 - movaps xmm3, xmm0 - mov eax, r11d - xor eax, 0e5c5acbbh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm3 - -rx_i_391: ;FPADD - dec ebx - jz rx_finish - xor r8, 0473a41f0h - mov eax, r8d - test bl, 63 - jnz short rx_body_391 - call rx_read -rx_body_391: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm3 - movaps xmm6, xmm0 - -rx_i_392: ;SAR_64 - dec ebx - jz rx_finish - xor r14, 01ebc1f0dh - mov eax, r14d - test bl, 63 - jnz short rx_body_392 - call rx_read -rx_body_392: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r9 - sar rax, cl - mov r13, rax - -rx_i_393: ;AND_64 - dec ebx - jz rx_finish - xor r14, 0742e95b1h - mov eax, r14d - test bl, 63 - jnz short rx_body_393 - call rx_read -rx_body_393: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - and rax, r12 - mov r13, rax - -rx_i_394: ;FPADD - dec ebx - jz rx_finish - xor r12, 0db885c2ch - mov eax, r12d - test bl, 63 - jnz short rx_body_394 - call rx_read -rx_body_394: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm9 - movaps xmm6, xmm0 - -rx_i_395: ;DIV_64 - dec ebx - jz rx_finish - xor r8, 04ae4fe8ch - mov eax, r8d - test bl, 63 - jnz short rx_body_395 - call rx_read -rx_body_395: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 939698704 - mov rcx, 5269518980991934091 - mul rcx - mov rax, rdx - shr rax, 28 - mov rcx, rax - mov eax, r8d - xor eax, 03802aa10h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_396: ;ROR_64 - dec ebx - jz rx_finish - xor r10, 07b41862bh + ; FPMUL_R e0, a1 + mulpd xmm4, xmm9 + ; IMUL_R r7, r3 + imul r15, r11 + ; IROL_R r0, r7 + mov ecx, r15d + rol r8, cl + ; IMUL_R r1, r7 + imul r9, r15 + ; COND_R r0, no(r7, 449007464) + xor ecx, ecx + cmp r15d, 449007464 + setno cl + add r8, rcx + ; ISMULH_M r6, L2[134288] + mov rax, r14 + imul qword ptr [rsi+134288] + mov r14, rdx + ; IMULH_R r5, r2 + mov rax, r13 + mul r10 + mov r13, rdx + ; IMULH_R r7, r4 + mov rax, r15 
+ mul r12 + mov r15, rdx + ; FPDIV_R e3, a0 + divpd xmm7, xmm8 + maxpd xmm7, xmm13 + ; IXOR_R r3, r4 + xor r11, r12 + ; IDIV_C r1, 72349044 + mov rax, 8555331009525020641 + mul r9 + shr rdx, 25 + add r9, rdx + ; IADD_R r5, r4 + add r13, r12 + ; IROR_R r2, r4 + mov ecx, r12d + ror r10, cl + ; FPSUB_M f1, L1[r2] mov eax, r10d - test bl, 63 - jnz short rx_body_396 - call rx_read -rx_body_396: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - ror rax, 62 - mov rcx, rax + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm1, xmm12 + ; FPMUL_R e2, a3 + mulpd xmm6, xmm11 + ; IADD_R r5, r6 + add r13, r14 + ; IXOR_M r1, L1[r4] mov eax, r12d - xor eax, 01ee1c837h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_397: ;SUB_32 - dec ebx - jz rx_finish - xor r8, 0916f3819h + and eax, 16376 + xor r9, qword ptr [rsi+rax] + ; ISUB_R r2, -1544880589 + sub r10, -1544880589 + ; FPNEG_R f0 + xorps xmm0, xmm15 + ; IROR_R r1, r6 + mov ecx, r14d + ror r9, cl + ; IMUL_R r6, r4 + imul r14, r12 + ; IMULH_M r4, L2[r1] + mov ecx, r9d + and ecx, 262136 + mov rax, r12 + mul qword ptr [rsi+rcx] + mov r12, rdx + ; IXOR_R r3, r0 + xor r11, r8 + ; FPSWAP_R f0 + shufpd xmm0, xmm0, 1 + ; FPSWAP_R f0 + shufpd xmm0, xmm0, 1 + ; COND_R r0, ns(r2, -308295242) + xor ecx, ecx + cmp r10d, -308295242 + setns cl + add r8, rcx + ; IMUL_9C r1, 591587965 + lea r9, [r9+r9*8+591587965] + ; FPADD_R f3, a1 + addpd xmm3, xmm9 + ; IMUL_R r5, r4 + imul r13, r12 + ; IMUL_M r7, L1[r0] mov eax, r8d - test bl, 63 - jnz short rx_body_397 - call rx_read -rx_body_397: - xor rbp, rax - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - sub eax, r12d - mov r10, rax - -rx_i_398: ;SHR_64 - dec ebx - jz rx_finish - xor r8, 04eb6fd2ah - mov eax, r8d - test bl, 63 - jnz short rx_body_398 - call rx_read -rx_body_398: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r8 - shr rax, cl - mov rcx, rax - mov eax, r11d - xor eax, 0724e7136h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - 
-rx_i_399: ;FPMUL - dec ebx - jz rx_finish - xor r11, 0899a98cfh - mov eax, r11d - test bl, 63 - jnz short rx_body_399 - call rx_read -rx_body_399: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm6, xmm0 - -rx_i_400: ;AND_64 - dec ebx - jz rx_finish - xor r13, 0aae75db6h - mov eax, r13d - test bl, 63 - jnz short rx_body_400 - call rx_read -rx_body_400: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - and rax, -1800645748 - mov r14, rax - -rx_i_401: ;FPSUB - dec ebx - jz rx_finish - xor r13, 032e81f25h - mov eax, r13d - test bl, 63 - jnz short rx_body_401 - call rx_read -rx_body_401: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm4 - movaps xmm6, xmm0 - mov eax, r14d - xor eax, 03ea60344h - and eax, 32767 - movlpd qword ptr [rsi + rax * 8], xmm6 - -rx_i_402: ;RET - dec ebx - jz rx_finish - xor r9, 0fa1a07ffh + and eax, 16376 + imul r15, qword ptr [rsi+rax] + ; COND_R r6, sg(r5, -1119525789) + xor ecx, ecx + cmp r13d, -1119525789 + sets cl + add r14, rcx + ; IMUL_M r0, L1[r1] mov eax, r9d - test bl, 63 - jnz short rx_body_402 - call rx_read -rx_body_402: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r14, rax - cmp rsp, rdi - je short rx_i_403 - ret - -rx_i_403: ;DIV_64 - dec ebx - jz rx_finish - xor r9, 0e59500f7h - mov eax, r9d - test bl, 63 - jnz short rx_body_403 - call rx_read -rx_body_403: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 536056992 - mov rcx, 4618688153536407095 - mul rcx - mov rax, rdx - shr rax, 27 - mov rcx, rax - mov eax, r11d - xor eax, 01ff394a0h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_404: ;MUL_32 - dec ebx - jz rx_finish - xor r15, 05b8ceb2fh + and eax, 16376 + imul r8, qword ptr [rsi+rax] + ; IADD_M r3, L2[r7] mov eax, r15d - test bl, 63 - jnz short rx_body_404 - call rx_read -rx_body_404: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov 
ecx, eax - mov eax, r8d - imul rax, rcx - mov rcx, rax + and eax, 262136 + add r11, qword ptr [rsi+rax] + ; IADD_R r0, r1 + add r8, r9 + ; FPSUB_R f2, a1 + subpd xmm2, xmm9 + ; IXOR_M r0, L2[r7] mov eax, r15d - xor eax, 08f83c4f1h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_405: ;CALL - dec ebx - jz rx_finish - xor r8, 0f61082a3h - mov eax, r8d - test bl, 63 - jnz short rx_body_405 - call rx_read -rx_body_405: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov r12, rax - cmp r10d, 1795880641 - ja short rx_i_406 - call rx_i_494 - -rx_i_406: ;FPDIV - dec ebx - jz rx_finish - xor r9, 0af6886b7h - mov eax, r9d - test bl, 63 - jnz short rx_body_406 - call rx_read -rx_body_406: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm7 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm9, xmm0 - -rx_i_407: ;FPSUB - dec ebx - jz rx_finish - xor r14, 09699566fh - mov eax, r14d - test bl, 63 - jnz short rx_body_407 - call rx_read -rx_body_407: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] + and eax, 262136 + xor r8, qword ptr [rsi+rax] + ; COND_R r6, be(r6, 1481939391) + xor ecx, ecx + cmp r14d, 1481939391 + setbe cl + add r14, rcx + ; FPADD_R f0, a1 + addpd xmm0, xmm9 + ; IXOR_R r3, r2 + xor r11, r10 + ; FPSUB_R f0, a1 subpd xmm0, xmm9 - movaps xmm8, xmm0 - -rx_i_408: ;MUL_64 - dec ebx - jz rx_finish - xor r15, 066e79fa6h + ; IXOR_R r7, r3 + xor r15, r11 + ; IXOR_M r6, L1[r4] + mov eax, r12d + and eax, 16376 + xor r14, qword ptr [rsi+rax] + ; IMULH_R r2, r7 + mov rax, r10 + mul r15 + mov r10, rdx + ; ISUB_R r5, r1 + sub r13, r9 + ; FPMUL_R e1, a3 + mulpd xmm5, xmm11 + ; FPADD_R f3, a2 + addpd xmm3, xmm10 + ; FPSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; FPSUB_R f1, a3 + subpd xmm1, xmm11 + ; FPSUB_M f0, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm0, xmm12 + ; FPMUL_R e1, a2 + mulpd xmm5, xmm10 + ; FPADD_R f3, a0 + addpd xmm3, xmm8 + ; IROL_R r2, r4 + mov 
ecx, r12d + rol r10, cl + ; COND_M r7, ab(L2[r7], -2012390318) + xor ecx, ecx mov eax, r15d - test bl, 63 - jnz short rx_body_408 - call rx_read -rx_body_408: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - imul rax, rax, 693109961 - mov r10, rax - -rx_i_409: ;MUL_64 - dec ebx - jz rx_finish - xor r11, 04b6caa9ah - mov eax, r11d - test bl, 63 - jnz short rx_body_409 - call rx_read -rx_body_409: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r15 - mov rcx, rax + and eax, 262136 + cmp dword ptr [rsi+rax], -2012390318 + seta cl + add r15, rcx + ; IMUL_9C r4, -38079585 + lea r12, [r12+r12*8-38079585] + ; IXOR_R r0, r1 + xor r8, r9 + ; FPMUL_R e1, a3 + mulpd xmm5, xmm11 + ; FPMUL_R e1, a1 + mulpd xmm5, xmm9 + ; FPSUB_R f1, a2 + subpd xmm1, xmm10 + ; IMUL_9C r4, -847745598 + lea r12, [r12+r12*8-847745598] + ; FPSQRT_R e1 + sqrtpd xmm5, xmm5 + ; IADD_R r7, r6 + add r15, r14 + ; FPSUB_R f3, a0 + subpd xmm3, xmm8 + ; FPSUB_R f1, a1 + subpd xmm1, xmm9 + ; IADD_R r7, r6 + add r15, r14 + ; IROL_R r2, r5 + mov ecx, r13d + rol r10, cl + ; IADD_RC r4, r2, 1338806320 + lea r12, [r12+r10+1338806320] + ; FPSQRT_R e3 + sqrtpd xmm7, xmm7 + ; IMUL_R r5, r0 + imul r13, r8 + ; FPADD_R f2, a1 + addpd xmm2, xmm9 + ; INEG_R r6 + neg r14 + ; IXOR_M r6, L1[r2] + mov eax, r10d + and eax, 16376 + xor r14, qword ptr [rsi+rax] + ; FPSUB_R f2, a2 + subpd xmm2, xmm10 + ; FPADD_R f2, a2 + addpd xmm2, xmm10 + ; FPADD_R f1, a2 + addpd xmm1, xmm10 + ; COND_R r3, be(r4, 174667458) + xor ecx, ecx + cmp r12d, 174667458 + setbe cl + add r11, rcx + ; INEG_R r6 + neg r14 + ; IXOR_R r6, r3 + xor r14, r11 + ; COND_M r5, sg(L1[r0], -864345921) + xor ecx, ecx mov eax, r8d - xor eax, 05a68b80fh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_410: ;RET - dec ebx - jz rx_finish - xor r15, 0d17f245eh + and eax, 16376 + cmp dword ptr [rsi+rax], -864345921 + sets cl + add r13, rcx + ; IROL_R r7, r3 + mov ecx, r11d + rol r15, cl + ; FPSUB_R f1, a2 + subpd xmm1, xmm10 + ; 
IADD_M r1, L1[r0] + mov eax, r8d + and eax, 16376 + add r9, qword ptr [rsi+rax] + ; IMULH_R r1, r3 + mov rax, r9 + mul r11 + mov r9, rdx + ; IMUL_R r0, -1489192296 + imul r8, -1489192296 + ; FPMUL_R e0, a2 + mulpd xmm4, xmm10 + ; COND_R r1, ge(r1, -1358904097) + xor ecx, ecx + cmp r9d, -1358904097 + setge cl + add r9, rcx + ; FPSUB_R f1, a1 + subpd xmm1, xmm9 + ; FPADD_R f2, a3 + addpd xmm2, xmm11 + ; IROR_R r4, r7 + mov ecx, r15d + ror r12, cl + ; ISDIV_C r1, -1368098113 + mov rax, -7238896260565957085 + imul r9 + xor eax, eax + sar rdx, 29 + sets al + add rdx, rax + add r9, rdx + ; IADD_M r4, L1[r1] + mov eax, r9d + and eax, 16376 + add r12, qword ptr [rsi+rax] + ; IMUL_R r0, -1011605520 + imul r8, -1011605520 + ; FPSUB_R f3, a1 + subpd xmm3, xmm9 + ; IADD_RC r1, r4, 272540736 + lea r9, [r9+r12+272540736] + ; FPSWAP_R f2 + shufpd xmm2, xmm2, 1 + ; IROR_R r3, r2 + mov ecx, r10d + ror r11, cl + ; IMUL_R r3, 2085105439 + imul r11, 2085105439 + ; FPMUL_R e0, a0 + mulpd xmm4, xmm8 + ; IMUL_9C r6, -483723153 + lea r14, [r14+r14*8-483723153] + ; FPSUB_M f3, L1[r7] mov eax, r15d - test bl, 63 - jnz short rx_body_410 - call rx_read -rx_body_410: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov r8, rax - cmp rsp, rdi - je short rx_i_411 - ret - -rx_i_411: ;RET - dec ebx - jz rx_finish - xor r12, 0364f10e7h + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm3, xmm12 + ; IMUL_R r3, r2 + imul r11, r10 + ; ISMULH_R r7, r1 + mov rax, r15 + imul r9 + mov r15, rdx + ; COND_R r1, of(r7, 778804236) + xor ecx, ecx + cmp r15d, 778804236 + seto cl + add r9, rcx + ; FPSUB_R f3, a2 + subpd xmm3, xmm10 + ; IROL_R r5, r7 + mov ecx, r15d + rol r13, cl + ; FPADD_R f1, a0 + addpd xmm1, xmm8 + ; FPADD_R f2, a3 + addpd xmm2, xmm11 + ; IMUL_R r6, r0 + imul r14, r8 + ; ISUB_M r2, L2[r4] mov eax, r12d - test bl, 63 - jnz short rx_body_411 - call rx_read -rx_body_411: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r12d - xor eax, 0b492f6bah - and 
eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp rsp, rdi - je short rx_i_412 - ret - -rx_i_412: ;FPDIV - dec ebx - jz rx_finish - xor r10, 0ac90e7ah - mov eax, r10d - test bl, 63 - jnz short rx_body_412 - call rx_read -rx_body_412: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm4 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm3, xmm0 - -rx_i_413: ;FPMUL - dec ebx - jz rx_finish - xor r11, 04b6037abh - mov eax, r11d - test bl, 63 - jnz short rx_body_413 - call rx_read -rx_body_413: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm4, xmm0 - mov eax, r12d - xor eax, 043989376h - and eax, 131071 - movlpd qword ptr [rsi + rax * 8], xmm4 - -rx_i_414: ;AND_64 - dec ebx - jz rx_finish - xor r14, 06c01554dh + and eax, 262136 + sub r10, qword ptr [rsi+rax] + ; IXOR_R r0, r6 + xor r8, r14 + ; INEG_R r6 + neg r14 + ; FPMUL_R e2, a3 + mulpd xmm6, xmm11 + ; IADD_RC r4, r6, -1312075035 + lea r12, [r12+r14-1312075035] + ; IMUL_R r1, r5 + imul r9, r13 + ; IXOR_M r7, L2[r6] mov eax, r14d - test bl, 63 - jnz short rx_body_414 - call rx_read -rx_body_414: - xor rbp, rax - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - and rax, -378293327 - mov r10, rax - -rx_i_415: ;DIV_64 - dec ebx - jz rx_finish - xor r8, 08c3e59a1h - mov eax, r8d - test bl, 63 - jnz short rx_body_415 - call rx_read -rx_body_415: - xor rbp, rax - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 3756873911 - add rax, 1 - sbb rax, 0 - mov rcx, 10544426615208851175 - mul rcx - mov rax, rdx - shr rax, 31 - mov r9, rax - -rx_i_416: ;FPADD - dec ebx - jz rx_finish - xor r12, 0f3fafde9h + and eax, 262136 + xor r15, qword ptr [rsi+rax] + ; IROR_R r2, 23 + ror r10, 23 + ; FPMUL_R e0, a2 + mulpd xmm4, xmm10 + ; ISMULH_M r5, L1[r2] + mov ecx, r10d + and ecx, 16376 + mov rax, r13 + imul qword ptr [rsi+rcx] + mov r13, rdx + ; ISUB_M r7, L1[r4] mov eax, r12d 
- test bl, 63 - jnz short rx_body_416 - call rx_read -rx_body_416: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm3 - movaps xmm5, xmm0 - mov eax, r13d - xor eax, 0f84b5382h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm5 - -rx_i_417: ;SUB_64 - dec ebx - jz rx_finish - xor r10, 03c6481fah + and eax, 16376 + sub r15, qword ptr [rsi+rax] + ; COND_R r0, sg(r2, 1538841628) + xor ecx, ecx + cmp r10d, 1538841628 + sets cl + add r8, rcx + ; IMUL_R r6, r2 + imul r14, r10 + ; ISUB_R r0, r1 + sub r8, r9 + ; IMUL_R r5, r7 + imul r13, r15 + ; IADD_RC r1, r0, 516706834 + lea r9, [r9+r8+516706834] + ; INEG_R r5 + neg r13 + ; FPSQRT_R e3 + sqrtpd xmm7, xmm7 + ; IADD_RC r5, r4, -1679394922 + lea r13, [r13+r12-1679394922] + ; FPSUB_R f1, a1 + subpd xmm1, xmm9 + ; IMUL_R r0, r2 + imul r8, r10 + ; ISUB_R r3, r2 + sub r11, r10 + ; FPDIV_R e0, a3 + divpd xmm4, xmm11 + maxpd xmm4, xmm13 + ; ISUB_R r1, r5 + sub r9, r13 + ; COND_M r2, be(L2[r2], 1840094725) + xor ecx, ecx mov eax, r10d - test bl, 63 - jnz short rx_body_417 - call rx_read -rx_body_417: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, r12 - mov rcx, rax - mov eax, r10d - xor eax, 0dfa7569ch - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_418: ;MULH_64 - dec ebx - jz rx_finish - xor r10, 02bd61c5fh - mov eax, r10d - test bl, 63 - jnz short rx_body_418 - call rx_read -rx_body_418: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r11 - mul rcx - mov rax, rdx - mov r10, rax - -rx_i_419: ;OR_64 - dec ebx - jz rx_finish - xor r9, 0b6ab9d32h - mov eax, r9d - test bl, 63 - jnz short rx_body_419 - call rx_read -rx_body_419: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - or rax, r14 - mov rcx, rax - mov eax, r14d - xor eax, 0beeca8dbh - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_420: ;ROR_64 - dec ebx - jz rx_finish - xor r9, 0f9690ceah - mov eax, r9d - test bl, 63 - jnz short rx_body_420 - call rx_read -rx_body_420: - and 
eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ror rax, 38 - mov rcx, rax - mov eax, r9d - xor eax, 08f7bb3ech - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_421: ;CALL - dec ebx - jz rx_finish - xor r12, 01ada0f39h - mov eax, r12d - test bl, 63 - jnz short rx_body_421 - call rx_read -rx_body_421: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r10, rax - cmp r8d, -1600409762 - jo short rx_i_422 - call rx_i_31 - -rx_i_422: ;IMUL_32 - dec ebx - jz rx_finish - xor r11, 04dd16ca4h - mov eax, r11d - test bl, 63 - jnz short rx_body_422 - call rx_read -rx_body_422: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r10d - imul rax, rcx - mov rcx, rax - mov eax, r13d - xor eax, 07c614e2h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_423: ;MUL_64 - dec ebx - jz rx_finish - xor r12, 04df5ce05h - mov eax, r12d - test bl, 63 - jnz short rx_body_423 - call rx_read -rx_body_423: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r10 - mov r15, rax - -rx_i_424: ;FPADD - dec ebx - jz rx_finish - xor r13, 01ad12ce2h - mov eax, r13d - test bl, 63 - jnz short rx_body_424 - call rx_read -rx_body_424: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm7 - movaps xmm9, xmm0 - -rx_i_425: ;IMUL_32 - dec ebx - jz rx_finish - xor r8, 0a3c5391dh - mov eax, r8d - test bl, 63 - jnz short rx_body_425 - call rx_read -rx_body_425: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - mov rax, 1776029069 - imul rax, rcx - mov rcx, rax - mov eax, r14d - xor eax, 069dc0d8dh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_426: ;IDIV_64 - dec ebx - jz rx_finish - xor r12, 09dd55ba0h - mov eax, r12d - test bl, 63 - jnz short rx_body_426 - call rx_read -rx_body_426: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by -590728721 - mov rdx, -4191230239118101979 - imul rdx - mov rax, rdx - xor edx, edx - sar rax, 27 - sets dl - add rax, 
rdx - mov r14, rax - -rx_i_427: ;MUL_32 - dec ebx - jz rx_finish - xor r11, 0d6cae9aeh - mov eax, r11d - test bl, 63 - jnz short rx_body_427 - call rx_read -rx_body_427: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, -2146332428 - imul rax, rcx - mov r9, rax - -rx_i_428: ;RET - dec ebx - jz rx_finish - xor r11, 0f807a961h - mov eax, r11d - test bl, 63 - jnz short rx_body_428 - call rx_read -rx_body_428: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r8, rax - cmp rsp, rdi - je short rx_i_429 - ret - -rx_i_429: ;MUL_64 - dec ebx - jz rx_finish - xor r12, 0650a4102h - mov eax, r12d - test bl, 63 - jnz short rx_body_429 - call rx_read -rx_body_429: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r9 - mov rcx, rax + and eax, 262136 + cmp dword ptr [rsi+rax], 1840094725 + setbe cl + add r10, rcx + ; IMUL_M r6, L1[r7] mov eax, r15d - xor eax, 076a3ad84h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_430: ;FPADD - dec ebx - jz rx_finish - xor r14, 019cc0e5h - mov eax, r14d - test bl, 63 - jnz short rx_body_430 - call rx_read -rx_body_430: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm8 - movaps xmm5, xmm0 - mov eax, r13d - xor eax, 058891433h - and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm5 - -rx_i_431: ;ROR_64 - dec ebx - jz rx_finish - xor r12, 0ed17ab58h - mov eax, r12d - test bl, 63 - jnz short rx_body_431 - call rx_read -rx_body_431: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r13 - ror rax, cl - mov r13, rax - -rx_i_432: ;SUB_64 - dec ebx - jz rx_finish - xor r10, 01c3b321fh - mov eax, r10d - test bl, 63 - jnz short rx_body_432 - call rx_read -rx_body_432: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, 876274173 - mov r8, rax - -rx_i_433: ;ADD_32 - dec ebx - jz rx_finish - xor r13, 0bbb88499h - mov eax, r13d - test bl, 63 - jnz short rx_body_433 - call rx_read -rx_body_433: - and eax, 2047 - mov rax, qword ptr 
[rsi+rax*8] - add eax, 1193456495 - mov rcx, rax - mov eax, r12d - xor eax, 04722b36fh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_434: ;FPDIV - dec ebx - jz rx_finish - xor r13, 0167edabdh - mov eax, r13d - test bl, 63 - jnz short rx_body_434 - call rx_read -rx_body_434: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - divpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm9, xmm0 - -rx_i_435: ;MUL_64 - dec ebx - jz rx_finish - xor r15, 0b940480ah - mov eax, r15d - test bl, 63 - jnz short rx_body_435 - call rx_read -rx_body_435: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, rax, 1971717631 - mov rcx, rax - mov eax, r9d - xor eax, 0758605ffh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_436: ;FPADD - dec ebx - jz rx_finish - xor r15, 0bfc3ca8bh - mov eax, r15d - test bl, 63 - jnz short rx_body_436 - call rx_read -rx_body_436: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm2 - movaps xmm7, xmm0 - -rx_i_437: ;FPMUL - dec ebx - jz rx_finish - xor r8, 098a6bcf7h - mov eax, r8d - test bl, 63 - jnz short rx_body_437 - call rx_read -rx_body_437: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm8, xmm0 - -rx_i_438: ;FPMUL - dec ebx - jz rx_finish - xor r10, 0325b38ebh - mov eax, r10d - test bl, 63 - jnz short rx_body_438 - call rx_read -rx_body_438: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm9 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm4, xmm0 - mov eax, r12d - xor eax, 0b7c490eeh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm4 - -rx_i_439: ;OR_64 - dec ebx - jz rx_finish - xor r13, 05e807e81h - mov eax, r13d - test bl, 63 - jnz short rx_body_439 - call rx_read -rx_body_439: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - or rax, -1299288575 - mov r10, rax - 
-rx_i_440: ;CALL - dec ebx - jz rx_finish - xor r10, 062f83728h - mov eax, r10d - test bl, 63 - jnz short rx_body_440 - call rx_read -rx_body_440: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r9d - xor eax, 07ed31f7ah - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - cmp r12d, 2127765370 - jns short rx_i_441 - call rx_i_41 - -rx_i_441: ;ADD_64 - dec ebx - jz rx_finish - xor r14, 0d18ec075h - mov eax, r14d - test bl, 63 - jnz short rx_body_441 - call rx_read -rx_body_441: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r14 - mov rcx, rax - mov eax, r9d - xor eax, 01f93242ch - and eax, 131071 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_442: ;FPSQRT - dec ebx - jz rx_finish - xor r14, 0a53dd1bh - mov eax, r14d - test bl, 63 - jnz short rx_body_442 - call rx_read -rx_body_442: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - andps xmm0, xmm10 - sqrtpd xmm3, xmm0 - -rx_i_443: ;RET - dec ebx - jz rx_finish - xor r14, 0232d1285h - mov eax, r14d - test bl, 63 - jnz short rx_body_443 - call rx_read -rx_body_443: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r9, rax - cmp rsp, rdi - je short rx_i_444 - ret - -rx_i_444: ;FPSUB - dec ebx - jz rx_finish - xor r8, 042455dd8h - mov eax, r8d - test bl, 63 - jnz short rx_body_444 - call rx_read -rx_body_444: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm7 - movaps xmm5, xmm0 - -rx_i_445: ;ADD_64 - dec ebx - jz rx_finish - xor r13, 09ae009b2h - mov eax, r13d - test bl, 63 - jnz short rx_body_445 - call rx_read -rx_body_445: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r11 - mov r9, rax - -rx_i_446: ;MUL_32 - dec ebx - jz rx_finish - xor r12, 01734708eh - mov eax, r12d - test bl, 63 - jnz short rx_body_446 - call rx_read -rx_body_446: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r15d - imul rax, rcx - mov rcx, rax - mov eax, r13d - xor eax, 03166163h - and 
eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_447: ;FPADD - dec ebx - jz rx_finish - xor r8, 01596d0e8h - mov eax, r8d - test bl, 63 - jnz short rx_body_447 - call rx_read -rx_body_447: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm7 - movaps xmm5, xmm0 - mov eax, r13d - xor eax, 0b384d4afh - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm5 - -rx_i_448: ;FPSUB - dec ebx - jz rx_finish - xor r9, 0390cfdb0h - mov eax, r9d - test bl, 63 - jnz short rx_body_448 - call rx_read -rx_body_448: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm3 - movaps xmm9, xmm0 - -rx_i_449: ;ROL_64 - dec ebx - jz rx_finish - xor r8, 04f27744bh - mov eax, r8d - test bl, 63 - jnz short rx_body_449 - call rx_read -rx_body_449: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - rol rax, 28 - mov rcx, rax - mov eax, r8d - xor eax, 089e19790h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_450: ;SAR_64 - dec ebx - jz rx_finish - xor r8, 04e2c76ffh - mov eax, r8d - test bl, 63 - jnz short rx_body_450 - call rx_read -rx_body_450: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r12 - sar rax, cl - mov r8, rax - -rx_i_451: ;ADD_64 - dec ebx - jz rx_finish - xor r8, 0c4d99ac9h - mov eax, r8d - test bl, 63 - jnz short rx_body_451 - call rx_read -rx_body_451: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r10 - mov rcx, rax - mov eax, r8d - xor eax, 0eedd10b3h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_452: ;RET - dec ebx - jz rx_finish - xor r13, 040130b88h - mov eax, r13d - test bl, 63 - jnz short rx_body_452 - call rx_read -rx_body_452: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - mov r11, rax - cmp rsp, rdi - je short rx_i_453 - ret - -rx_i_453: ;DIV_64 - dec ebx - jz rx_finish - xor r11, 0a2096aa4h - mov eax, r11d - test bl, 63 - jnz short rx_body_453 - call rx_read -rx_body_453: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - ; magic 
divide by 380157076 - shr rax, 2 - mov rcx, 3256390890604862173 - mul rcx - mov rax, rdx - shr rax, 24 - mov rcx, rax - mov eax, r8d - xor eax, 016a8bc94h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_454: ;FPADD - dec ebx - jz rx_finish - xor r13, 081314291h - mov eax, r13d - test bl, 63 - jnz short rx_body_454 - call rx_read -rx_body_454: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm9 - movaps xmm4, xmm0 - -rx_i_455: ;OR_64 - dec ebx - jz rx_finish - xor r8, 059263cdbh - mov eax, r8d - test bl, 63 - jnz short rx_body_455 - call rx_read -rx_body_455: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - or rax, r9 - mov r8, rax - -rx_i_456: ;AND_64 - dec ebx - jz rx_finish - xor r9, 010e8fe6h - mov eax, r9d - test bl, 63 - jnz short rx_body_456 - call rx_read -rx_body_456: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - and rax, 401943615 - mov r9, rax - -rx_i_457: ;SUB_64 - dec ebx - jz rx_finish - xor r9, 09de1a3efh - mov eax, r9d - test bl, 63 - jnz short rx_body_457 - call rx_read -rx_body_457: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, 1482178870 - mov rcx, rax - mov eax, r10d - xor eax, 058584136h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_458: ;SAR_64 - dec ebx - jz rx_finish - xor r11, 05c79df6eh - mov eax, r11d - test bl, 63 - jnz short rx_body_458 - call rx_read -rx_body_458: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r8 - sar rax, cl - mov rcx, rax - mov eax, r14d - xor eax, 028f0a8ch - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_459: ;MUL_64 - dec ebx - jz rx_finish - xor r9, 0346f46adh - mov eax, r9d - test bl, 63 - jnz short rx_body_459 - call rx_read -rx_body_459: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r9 - mov rcx, rax - mov eax, r13d - xor eax, 016bb0164h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_460: ;ADD_32 - dec ebx - jz rx_finish - xor r11, 098ab71fch - mov eax, 
r11d - test bl, 63 - jnz short rx_body_460 - call rx_read -rx_body_460: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add eax, -347784553 - mov r12, rax - -rx_i_461: ;XOR_64 - dec ebx - jz rx_finish - xor r11, 0c814e926h - mov eax, r11d - test bl, 63 - jnz short rx_body_461 - call rx_read -rx_body_461: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - xor rax, 1659853721 - mov rcx, rax - mov eax, r12d - xor eax, 062ef5b99h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_462: ;ADD_64 - dec ebx - jz rx_finish - xor r10, 0c64b4a9eh - mov eax, r10d - test bl, 63 - jnz short rx_body_462 - call rx_read -rx_body_462: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r8 - mov rcx, rax - mov eax, r15d - xor eax, 098a05350h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_463: ;ADD_32 - dec ebx - jz rx_finish - xor r9, 08c29341h - mov eax, r9d - test bl, 63 - jnz short rx_body_463 - call rx_read -rx_body_463: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add eax, r15d - mov rcx, rax - mov eax, r10d - xor eax, 0c8204c90h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_464: ;MUL_64 - dec ebx - jz rx_finish - xor r12, 06ff587fdh - mov eax, r12d - test bl, 63 - jnz short rx_body_464 - call rx_read -rx_body_464: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r15 - mov r13, rax - -rx_i_465: ;FPADD - dec ebx - jz rx_finish - xor r12, 0b62c0003h - mov eax, r12d - test bl, 63 - jnz short rx_body_465 - call rx_read -rx_body_465: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm5 - movaps xmm2, xmm0 - mov eax, r10d - xor eax, 0d11c1242h - and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm2 - -rx_i_466: ;IMUL_32 - dec ebx - jz rx_finish - xor r13, 05c541c42h - mov eax, r13d - test bl, 63 - jnz short rx_body_466 - call rx_read -rx_body_466: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r13d - imul rax, rcx - mov r9, rax - -rx_i_467: ;FPADD 
- dec ebx - jz rx_finish - xor r8, 0cbb33f81h - mov eax, r8d - test bl, 63 - jnz short rx_body_467 - call rx_read -rx_body_467: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm9 - movaps xmm8, xmm0 - -rx_i_468: ;DIV_64 - dec ebx - jz rx_finish - xor r8, 091044dc3h - mov eax, r8d - test bl, 63 - jnz short rx_body_468 - call rx_read -rx_body_468: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 4281572471 - add rax, 1 - sbb rax, 0 - mov rcx, 9252227195836753313 - mul rcx - mov rax, rdx - shr rax, 31 - mov r8, rax - -rx_i_469: ;IMUL_32 - dec ebx - jz rx_finish - xor r9, 0c0186beh - mov eax, r9d - test bl, 63 - jnz short rx_body_469 - call rx_read -rx_body_469: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r9d - imul rax, rcx - mov rcx, rax - mov eax, r9d - xor eax, 01186619dh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_470: ;OR_64 - dec ebx - jz rx_finish - xor r14, 090849e3eh - mov eax, r14d - test bl, 63 - jnz short rx_body_470 - call rx_read -rx_body_470: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - or rax, r11 - mov r14, rax - -rx_i_471: ;IMUL_32 - dec ebx - jz rx_finish - xor r14, 0cedba9b6h - mov eax, r14d - test bl, 63 - jnz short rx_body_471 - call rx_read -rx_body_471: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - mov rax, 1914863189 - imul rax, rcx - mov r14, rax - -rx_i_472: ;JUMP - dec ebx - jz rx_finish - xor r9, 038f4b9d6h - mov eax, r9d - test bl, 63 - jnz short rx_body_472 - call rx_read -rx_body_472: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r10, rax - cmp r10d, 1738497427 - jl rx_i_8 - -rx_i_473: ;MUL_64 - dec ebx - jz rx_finish - xor r14, 01fb7637dh - mov eax, r14d - test bl, 63 - jnz short rx_body_473 - call rx_read -rx_body_473: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r11 - mov r12, rax - -rx_i_474: ;JUMP - dec ebx - jz rx_finish - xor r9, 
0b5c0b4d4h - mov eax, r9d - test bl, 63 - jnz short rx_body_474 - call rx_read -rx_body_474: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r15, rax - cmp r15d, -233120543 - jo rx_i_69 - -rx_i_475: ;FPSUB - dec ebx - jz rx_finish - xor r10, 0910dcdeeh - mov eax, r10d - test bl, 63 - jnz short rx_body_475 - call rx_read -rx_body_475: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm9 - movaps xmm7, xmm0 - -rx_i_476: ;FPADD - dec ebx - jz rx_finish - xor r8, 07ab3b5a4h - mov eax, r8d - test bl, 63 - jnz short rx_body_476 - call rx_read -rx_body_476: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm2 - movaps xmm9, xmm0 - mov eax, r9d - xor eax, 0b01bb14ch - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm9 - -rx_i_477: ;FPADD - dec ebx - jz rx_finish - xor r12, 07a29ec63h - mov eax, r12d - test bl, 63 - jnz short rx_body_477 - call rx_read -rx_body_477: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm9 - movaps xmm6, xmm0 - -rx_i_478: ;MUL_64 - dec ebx - jz rx_finish - xor r14, 02d3d7e7fh - mov eax, r14d - test bl, 63 - jnz short rx_body_478 - call rx_read -rx_body_478: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - imul rax, r10 - mov r12, rax - -rx_i_479: ;MUL_64 - dec ebx - jz rx_finish - xor r12, 09b49c793h - mov eax, r12d - test bl, 63 - jnz short rx_body_479 - call rx_read -rx_body_479: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - imul rax, r14 - mov r13, rax - -rx_i_480: ;FPADD - dec ebx - jz rx_finish - xor r9, 0a9cc4f01h - mov eax, r9d - test bl, 63 - jnz short rx_body_480 - call rx_read -rx_body_480: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm4 - movaps xmm6, xmm0 - -rx_i_481: ;DIV_64 - dec ebx - jz rx_finish - xor r14, 0225ba1f9h - mov eax, r14d - test bl, 63 - jnz short rx_body_481 - call rx_read -rx_body_481: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 2101516912 - shr rax, 4 - mov rcx, 
147267437180322377 - mul rcx - mov rax, rdx - shr rax, 20 - mov r12, rax - -rx_i_482: ;AND_32 - dec ebx - jz rx_finish - xor r14, 044a0f592h - mov eax, r14d - test bl, 63 - jnz short rx_body_482 - call rx_read -rx_body_482: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - and eax, 1304556205 - mov rcx, rax - mov eax, r11d - xor eax, 04dc1f2adh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_483: ;FPADD - dec ebx - jz rx_finish - xor r11, 07f71f219h - mov eax, r11d - test bl, 63 - jnz short rx_body_483 - call rx_read -rx_body_483: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm6 - movaps xmm6, xmm0 - mov eax, r14d - xor eax, 0545908cah - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm6 - -rx_i_484: ;SHR_64 - dec ebx - jz rx_finish - xor r12, 07027bacdh - mov eax, r12d - test bl, 63 - jnz short rx_body_484 - call rx_read -rx_body_484: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - shr rax, 37 - mov rcx, rax - mov eax, r11d - xor eax, 074a50ee0h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_485: ;JUMP - dec ebx - jz rx_finish - xor r13, 03a04647h - mov eax, r13d - test bl, 63 - jnz short rx_body_485 - call rx_read -rx_body_485: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - mov eax, r15d - xor eax, 02112cbaeh - and eax, 131071 - mov qword ptr [rsi + rax * 8], rcx - cmp r8d, 554879918 - jno rx_i_58 - -rx_i_486: ;ADD_64 - dec ebx - jz rx_finish - xor r15, 0ad072937h - mov eax, r15d - test bl, 63 - jnz short rx_body_486 - call rx_read -rx_body_486: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - add rax, r8 - mov rcx, rax - mov eax, r8d - xor eax, 03832b3b2h - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_487: ;SUB_64 - dec ebx - jz rx_finish - xor r11, 07f78ad34h - mov eax, r11d - test bl, 63 - jnz short rx_body_487 - call rx_read -rx_body_487: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - sub rax, r9 - mov rcx, rax - mov eax, r11d - 
xor eax, 0ec228e26h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_488: ;DIV_64 - dec ebx - jz rx_finish - xor r12, 0d8b1788eh - mov eax, r12d - test bl, 63 - jnz short rx_body_488 - call rx_read -rx_body_488: - and eax, 32767 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by 297357073 - mov rcx, 16652572300311555393 - mul rcx - mov rax, rdx - shr rax, 28 - mov r12, rax - -rx_i_489: ;JUMP - dec ebx - jz rx_finish - xor r10, 0b2ec9f3ah - mov eax, r10d - test bl, 63 - jnz short rx_body_489 - call rx_read -rx_body_489: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r8, rax - cmp r15d, -1127175870 - jge rx_i_75 - -rx_i_490: ;ROR_64 - dec ebx - jz rx_finish - xor r11, 015c7f598h - mov eax, r11d - test bl, 63 - jnz short rx_body_490 - call rx_read -rx_body_490: - xor rbp, rax - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, r9 - ror rax, cl - mov r15, rax - -rx_i_491: ;FPADD - dec ebx - jz rx_finish - xor r8, 0902da6bdh - mov eax, r8d - test bl, 63 - jnz short rx_body_491 - call rx_read -rx_body_491: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm9 - movaps xmm7, xmm0 - mov eax, r15d - xor eax, 0b0f0fca4h - and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm7 - -rx_i_492: ;IDIV_64 - dec ebx - jz rx_finish - xor r9, 0491090d9h - mov eax, r9d - test bl, 63 - jnz short rx_body_492 - call rx_read -rx_body_492: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by -1779388031 - mov rcx, rax - mov rdx, 7315366159790064091 - imul rdx - mov rax, rdx - xor edx, edx - sub rax, rcx - sar rax, 30 - sets dl - add rax, rdx - mov rcx, rax - mov eax, r12d - xor eax, 095f0b181h - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_493: ;FPSUB - dec ebx - jz rx_finish - xor r8, 09de81282h - mov eax, r8d - test bl, 63 - jnz short rx_body_493 - call rx_read -rx_body_493: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm9 - movaps xmm4, xmm0 - mov eax, r12d - xor eax, 
02feb2fd7h - and eax, 2047 - movhpd qword ptr [rsi + rax * 8], xmm4 - -rx_i_494: ;MUL_32 - dec ebx - jz rx_finish - xor r10, 0b0d50e46h - mov eax, r10d - test bl, 63 - jnz short rx_body_494 - call rx_read -rx_body_494: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov ecx, eax - mov eax, r11d - imul rax, rcx - mov r14, rax - -rx_i_495: ;FPMUL - dec ebx - jz rx_finish - xor r11, 0e276cad1h - mov eax, r11d - test bl, 63 - jnz short rx_body_495 - call rx_read -rx_body_495: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm2 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm8, xmm0 - mov eax, r8d - xor eax, 02d12bd27h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm8 - -rx_i_496: ;IDIV_64 - dec ebx - jz rx_finish - xor r14, 0fe757b73h - mov eax, r14d - test bl, 63 - jnz short rx_body_496 - call rx_read -rx_body_496: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - ; magic divide by -359802064 - mov rdx, -860153514353783887 - imul rdx - mov rax, rdx - xor edx, edx - sar rax, 24 - sets dl - add rax, rdx - mov r9, rax - -rx_i_497: ;FPMUL - dec ebx - jz rx_finish - xor r8, 08d25742eh - mov eax, r8d - test bl, 63 - jnz short rx_body_497 - call rx_read -rx_body_497: - xor rbp, rax - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm3 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm8, xmm0 - -rx_i_498: ;FPMUL - dec ebx - jz rx_finish - xor r15, 0e066fd15h - mov eax, r15d - test bl, 63 - jnz short rx_body_498 - call rx_read -rx_body_498: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - mulpd xmm0, xmm9 - movaps xmm1, xmm0 - cmpeqpd xmm1, xmm1 - andps xmm0, xmm1 - movaps xmm8, xmm0 - -rx_i_499: ;IMUL_32 - dec ebx - jz rx_finish - xor r12, 08925556bh - mov eax, r12d - test bl, 63 - jnz short rx_body_499 - call rx_read -rx_body_499: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - movsxd rcx, eax - movsxd rax, r13d - imul rax, rcx - mov r8, rax - -rx_i_500: ;FPSQRT - 
dec ebx - jz rx_finish - xor r10, 04bc870ebh - mov eax, r10d - test bl, 63 - jnz short rx_body_500 - call rx_read -rx_body_500: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - andps xmm0, xmm10 - sqrtpd xmm2, xmm0 - mov eax, r10d - xor eax, 04a250342h - and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm2 - -rx_i_501: ;XOR_64 - dec ebx - jz rx_finish - xor r8, 07d46c503h - mov eax, r8d - test bl, 63 - jnz short rx_body_501 - call rx_read -rx_body_501: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - xor rax, r10 - mov rcx, rax - mov eax, r12d - xor eax, 03e22874bh - and eax, 2047 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_502: ;RET - dec ebx - jz rx_finish - xor r10, 09e70b20ch - mov eax, r10d - test bl, 63 - jnz short rx_body_502 - call rx_read -rx_body_502: - xor rbp, rax - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - mov r9, rax - cmp rsp, rdi - je short rx_i_503 - ret - -rx_i_503: ;FPSUB - dec ebx - jz rx_finish - xor r13, 0442e4850h - mov eax, r13d - test bl, 63 - jnz short rx_body_503 - call rx_read -rx_body_503: - and eax, 32767 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm2 - movaps xmm9, xmm0 - mov eax, r9d - xor eax, 080465282h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm9 - -rx_i_504: ;FPADD - dec ebx - jz rx_finish - xor r13, 099d48347h - mov eax, r13d - test bl, 63 - jnz short rx_body_504 - call rx_read -rx_body_504: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm9 - movaps xmm4, xmm0 - -rx_i_505: ;FPSUB - dec ebx - jz rx_finish - xor r12, 032c0a28ah - mov eax, r12d - test bl, 63 - jnz short rx_body_505 - call rx_read -rx_body_505: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm4 - movaps xmm8, xmm0 - mov eax, r8d - xor eax, 021b54eaeh - and eax, 32767 - movhpd qword ptr [rsi + rax * 8], xmm8 - -rx_i_506: ;FPSUB - dec ebx - jz rx_finish - xor r9, 0a973d58ch - mov eax, r9d - test bl, 63 - jnz short rx_body_506 - call rx_read -rx_body_506: - and eax, 32767 - 
cvtdq2pd xmm0, qword ptr [rsi+rax*8] - subpd xmm0, xmm9 - movaps xmm3, xmm0 - mov eax, r11d - xor eax, 05e890759h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm3 - -rx_i_507: ;RET - dec ebx - jz rx_finish - xor r10, 0d3b7165ch - mov eax, r10d - test bl, 63 - jnz short rx_body_507 - call rx_read -rx_body_507: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r14, rax - cmp rsp, rdi - je short rx_i_508 - ret - -rx_i_508: ;RET - dec ebx - jz rx_finish - xor r13, 0da34d818h - mov eax, r13d - test bl, 63 - jnz short rx_body_508 - call rx_read -rx_body_508: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov r8, rax - cmp rsp, rdi - je short rx_i_509 - ret - -rx_i_509: ;FPROUND - dec ebx - jz rx_finish - xor r11, 01b2873f2h - mov eax, r11d - test bl, 63 - jnz short rx_body_509 - call rx_read -rx_body_509: - and eax, 2047 - mov rax, qword ptr [rsi+rax*8] - mov rcx, rax - rol rax, 34 - and eax, 24576 - or eax, 40896 - mov dword ptr [rsp - 8], eax - ldmxcsr dword ptr [rsp - 8] - mov eax, r10d - xor eax, 06cd84each - and eax, 32767 - mov qword ptr [rsi + rax * 8], rcx - -rx_i_510: ;FPADD - dec ebx - jz rx_finish - xor r8, 0db65513ch - mov eax, r8d - test bl, 63 - jnz short rx_body_510 - call rx_read -rx_body_510: - and eax, 2047 - cvtdq2pd xmm0, qword ptr [rsi+rax*8] - addpd xmm0, xmm2 - movaps xmm9, xmm0 - mov eax, r9d - xor eax, 097614097h - and eax, 2047 - movlpd qword ptr [rsi + rax * 8], xmm9 - -rx_i_511: ;SHR_64 - dec ebx - jz rx_finish - xor r11, 02bd79286h - mov eax, r11d - test bl, 63 - jnz short rx_body_511 - call rx_read -rx_body_511: - and eax, 131071 - mov rax, qword ptr [rsi+rax*8] - shr rax, 56 - mov r11, rax - - jmp rx_i_0 + and eax, 16376 + imul r14, qword ptr [rsi+rax] + ; IMULH_M r6, L1[r5] + mov ecx, r13d + and ecx, 16376 + mov rax, r14 + mul qword ptr [rsi+rcx] + mov r14, rdx + ; IMUL_9C r7, -1048659408 + lea r15, [r15+r15*8-1048659408] + ; IMUL_R r6, r3 + imul r14, r11 + ; FPADD_R f3, a0 + addpd xmm3, xmm8 + ; IMULH_R r0, r3 + mov rax, 
r8 + mul r11 + mov r8, rdx + ; FPSWAP_R f0 + shufpd xmm0, xmm0, 1 + ; FPSQRT_R e3 + sqrtpd xmm7, xmm7 + ; IMULH_R r2, r0 + mov rax, r10 + mul r8 + mov r10, rdx + ; FPDIV_R e1, a1 + divpd xmm5, xmm9 + maxpd xmm5, xmm13 From 005c67f64c5670f6b87c3bca36f497622ead63cf Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 27 Jan 2019 10:52:30 +0100 Subject: [PATCH 25/35] Added explicit STORE instructions JIT compiler --- src/AssemblyGeneratorX86.cpp | 22 +- src/AssemblyGeneratorX86.hpp | 12 +- src/CompiledVirtualMachine.cpp | 6 +- src/Instruction.cpp | 24 +- src/Instruction.hpp | 3 + src/JitCompilerX86-static.S | 45 +- src/JitCompilerX86-static.asm | 57 +- src/JitCompilerX86-static.hpp | 17 +- src/JitCompilerX86.cpp | 1217 ++++++++++++----------- src/JitCompilerX86.hpp | 126 +-- src/asm/program_epilogue_store.inc | 20 +- src/asm/program_epilogue_win64.inc | 8 +- src/asm/program_load_flt.inc | 14 + src/asm/program_load_int.inc | 10 + src/asm/program_prologue_linux.inc | 11 +- src/asm/program_prologue_load.inc | 43 +- src/asm/program_prologue_win64.inc | 21 +- src/asm/program_read.inc | 20 - src/asm/program_read_dataset.inc | 16 + src/asm/program_store_flt.inc | 11 + src/asm/program_store_int.inc | 10 + src/asm/program_xmm_constants.inc | 6 + src/common.hpp | 4 +- src/executeProgram-win64.asm | 21 +- src/instructionWeights.hpp | 63 +- src/main.cpp | 2 +- src/program.inc | 1460 ++++++++++++++-------------- 27 files changed, 1751 insertions(+), 1518 deletions(-) create mode 100644 src/asm/program_load_flt.inc create mode 100644 src/asm/program_load_int.inc delete mode 100644 src/asm/program_read.inc create mode 100644 src/asm/program_read_dataset.inc create mode 100644 src/asm/program_store_flt.inc create mode 100644 src/asm/program_store_int.inc create mode 100644 src/asm/program_xmm_constants.inc diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index f1c3de8..11bb3f0 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -75,6 
+75,11 @@ namespace RandomX { asmCode << "\tand " << reg << ", " << ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl; } + void AssemblyGeneratorX86::genAddressRegDst(Instruction& instr, int maskAlign = 8) { + asmCode << "\tmov eax" << ", " << regR32[instr.dst] << std::endl; + asmCode << "\tand eax" << ", " << ((instr.alt % 4) ? (ScratchpadL1Mask & (-maskAlign)) : (ScratchpadL2Mask & (-maskAlign))) << std::endl; + } + int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) { return instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); } @@ -425,7 +430,7 @@ namespace RandomX { //6 uOPs void AssemblyGeneratorX86::h_CFROUND(Instruction& instr, int i) { - asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\tmov rax, " << regR[instr.src] << std::endl; int rotate = (13 - (instr.alt & 63)) & 63; if (rotate != 0) asmCode << "\trol rax, " << rotate << std::endl; @@ -474,6 +479,18 @@ namespace RandomX { asmCode << "\tadd " << regR[instr.dst] << ", rcx" << std::endl; } + //3 uOPs + void AssemblyGeneratorX86::h_ISTORE(Instruction& instr, int i) { + genAddressRegDst(instr); + asmCode << "\tmov qword ptr [rsi+rax], " << regR[instr.src] << std::endl; + } + + //3 uOPs + void AssemblyGeneratorX86::h_FSTORE(Instruction& instr, int i) { + genAddressRegDst(instr, 16); + asmCode << "\tmovapd xmmword ptr [rsi+rax], " << regFE[instr.src] << std::endl; + } + #include "instructionWeights.hpp" #define INST_HANDLE(x) REPN(&AssemblyGeneratorX86::h_##x, WT(x)) @@ -520,5 +537,8 @@ namespace RandomX { INST_HANDLE(COND_R) INST_HANDLE(COND_M) INST_HANDLE(CFROUND) + + INST_HANDLE(ISTORE) + INST_HANDLE(FSTORE) }; } \ No newline at end of file diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 2d3c9a6..5c22142 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -38,16 +38,8 @@ namespace RandomX { static InstructionGenerator engine[256]; std::stringstream asmCode; - void 
gena(Instruction&, int); - void genar(Instruction&, int); - void genaf(Instruction&, int); - void genbiashift(Instruction&, const char*); - void genbia(Instruction&); - void genbia32(Instruction&); - void genbf(Instruction&, const char*); - void gencr(Instruction&, bool); - void gencf(Instruction&, bool); void genAddressReg(Instruction&, const char*); + void genAddressRegDst(Instruction&, int); int32_t genAddressImm(Instruction&); void generateCode(Instruction&, int); @@ -85,5 +77,7 @@ namespace RandomX { void h_COND_R(Instruction&, int); void h_COND_M(Instruction&, int); void h_CFROUND(Instruction&, int); + void h_ISTORE(Instruction&, int); + void h_FSTORE(Instruction&, int); }; } \ No newline at end of file diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index f0a63d1..f5d33d0 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -71,14 +71,14 @@ namespace RandomX { reg.a[i].hi.u64 = getSmallPositiveFloatBits(reg.f[i].hi.u64); } compiler.generateProgram(gen); - mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7; + mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & -64; mem.mx = *(((uint32_t*)seed) + 5); } void CompiledVirtualMachine::execute() { - executeProgram(reg, mem, scratchpad, InstructionCount); + //executeProgram(reg, mem, scratchpad, InstructionCount); totalSize += compiler.getCodeSize(); - //compiler.getProgramFunc()(reg, mem, scratchpad); + compiler.getProgramFunc()(reg, mem, scratchpad, InstructionCount); #ifdef TRACEVM for (int32_t i = InstructionCount - 1; i >= 0; --i) { std::cout << std::hex << tracepad[i].u64 << std::endl; diff --git a/src/Instruction.cpp b/src/Instruction.cpp index c766ffd..13cfc1d 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -32,6 +32,10 @@ namespace RandomX { os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)src << "]"; } + void Instruction::genAddressRegDst(std::ostream& os) const { + os << ((alt % 4) ? 
"L1" : "L2") << "[r" << (int)dst << "]"; + } + void Instruction::genAddressImm(std::ostream& os) const { os << ((alt % 4) ? "L1" : "L2") << "[" << (imm32 & ((alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]"; } @@ -276,7 +280,7 @@ namespace RandomX { } void Instruction::h_CFROUND(std::ostream& os) const { - os << "r" << (int)dst << ", " << (alt & 63) << std::endl; + os << "r" << (int)src << ", " << (alt & 63) << std::endl; } static inline const char* condition(int index) { @@ -311,6 +315,18 @@ namespace RandomX { os << ", " << imm32 << ")" << std::endl; } + void Instruction::h_ISTORE(std::ostream& os) const { + genAddressRegDst(os); + os << ", r" << (int)src << std::endl; + } + + void Instruction::h_FSTORE(std::ostream& os) const { + const char reg = (src >= 4) ? 'e' : 'f'; + genAddressRegDst(os); + auto srcIndex = src % 4; + os << ", " << reg << srcIndex << std::endl; + } + #include "instructionWeights.hpp" #define INST_NAME(x) REPN(#x, WT(x)) #define INST_HANDLE(x) REPN(&Instruction::h_##x, WT(x)) @@ -358,6 +374,9 @@ namespace RandomX { INST_NAME(COND_R) INST_NAME(COND_M) INST_NAME(CFROUND) + + INST_NAME(ISTORE) + INST_NAME(FSTORE) }; InstructionVisualizer Instruction::engine[256] = { @@ -403,6 +422,9 @@ namespace RandomX { INST_HANDLE(COND_R) INST_HANDLE(COND_M) INST_HANDLE(CFROUND) + + INST_HANDLE(ISTORE) + INST_HANDLE(FSTORE) }; } \ No newline at end of file diff --git a/src/Instruction.hpp b/src/Instruction.hpp index becb983..017d92f 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -49,6 +49,7 @@ namespace RandomX { void genAddressReg(std::ostream& os) const; void genAddressImm(std::ostream& os) const; + void genAddressRegDst(std::ostream&) const; void h_IADD_R(std::ostream&) const; void h_IADD_M(std::ostream&) const; @@ -83,6 +84,8 @@ namespace RandomX { void h_COND_R(std::ostream&) const; void h_COND_M(std::ostream&) const; void h_CFROUND(std::ostream&) const; + void h_ISTORE(std::ostream&) const; + void h_FSTORE(std::ostream&) const; }; 
static_assert(sizeof(Instruction) == 8, "Invalid alignment of struct Instruction"); diff --git a/src/JitCompilerX86-static.S b/src/JitCompilerX86-static.S index e0e8f62..a799e11 100644 --- a/src/JitCompilerX86-static.S +++ b/src/JitCompilerX86-static.S @@ -27,11 +27,16 @@ #define DECL(x) x #endif .global DECL(randomx_program_prologue) -.global DECL(randomx_program_begin) +.global DECL(randomx_loop_begin) +.global DECL(randomx_program_load_int) +.global DECL(randomx_program_load_flt) +.global DECL(randomx_program_start) +.global DECL(randomx_program_read_dataset) +.global DECL(randomx_program_store_int) +.global DECL(randomx_program_store_flt) +.global DECL(randomx_program_loop_end) .global DECL(randomx_program_epilogue) -.global DECL(randomx_program_read) .global DECL(randomx_program_end) -.global DECL(randomx_program_transform) #define db .byte @@ -40,21 +45,37 @@ DECL(randomx_program_prologue): #include "asm/program_prologue_linux.inc" .align 64 -DECL(randomx_program_begin): + #include "asm/program_xmm_constants.inc" + +.align 64 +DECL(randomx_loop_begin): + nop + +DECL(randomx_program_load_int): + #include "asm/program_load_int.inc" + +DECL(randomx_program_load_flt): + #include "asm/program_load_flt.inc" + +DECL(randomx_program_start): + nop + +DECL(randomx_program_read_dataset): + #include "asm/program_read_dataset.inc" + +DECL(randomx_program_store_int): + #include "asm/program_store_int.inc" + +DECL(randomx_program_store_flt): + #include "asm/program_store_flt.inc" + +DECL(randomx_program_loop_end): nop .align 64 DECL(randomx_program_epilogue): #include "asm/program_epilogue_linux.inc" -.align 64 -DECL(randomx_program_read): - #include "asm/program_read.inc" - .align 64 DECL(randomx_program_end): nop - -.align 8 -DECL(randomx_program_transform): - #include "asm/program_transform_address.inc" diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index 031c2e4..8d5a4fe 100644 --- a/src/JitCompilerX86-static.asm +++ 
b/src/JitCompilerX86-static.asm @@ -20,12 +20,16 @@ IFDEF RAX _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE PUBLIC randomx_program_prologue -PUBLIC randomx_program_begin +PUBLIC randomx_loop_begin +PUBLIC randomx_program_load_int +PUBLIC randomx_program_load_flt +PUBLIC randomx_program_start +PUBLIC randomx_program_read_dataset +PUBLIC randomx_program_store_int +PUBLIC randomx_program_store_flt +PUBLIC randomx_program_loop_end PUBLIC randomx_program_epilogue -PUBLIC randomx_program_read PUBLIC randomx_program_end -PUBLIC randomx_program_transform - ALIGN 64 randomx_program_prologue PROC @@ -33,30 +37,51 @@ randomx_program_prologue PROC randomx_program_prologue ENDP ALIGN 64 -randomx_program_begin PROC + include asm/program_xmm_constants.inc + +ALIGN 64 +randomx_loop_begin PROC nop -randomx_program_begin ENDP +randomx_loop_begin ENDP + +randomx_program_load_int PROC + include asm/program_load_int.inc +randomx_program_load_int ENDP + +randomx_program_load_flt PROC + include asm/program_load_flt.inc +randomx_program_load_flt ENDP + +randomx_program_start PROC + nop +randomx_program_start ENDP + +randomx_program_read_dataset PROC + include asm/program_read_dataset.inc +randomx_program_read_dataset ENDP + +randomx_program_store_int PROC + include asm/program_store_int.inc +randomx_program_store_int ENDP + +randomx_program_store_flt PROC + include asm/program_store_flt.inc +randomx_program_store_flt ENDP + +randomx_program_loop_end PROC + nop +randomx_program_loop_end ENDP ALIGN 64 randomx_program_epilogue PROC include asm/program_epilogue_win64.inc randomx_program_epilogue ENDP -ALIGN 64 -randomx_program_read PROC - include asm/program_read.inc -randomx_program_read ENDP - ALIGN 64 randomx_program_end PROC nop randomx_program_end ENDP -ALIGN 8 -randomx_program_transform PROC - include asm/program_transform_address.inc -randomx_program_transform ENDP - _RANDOMX_JITX86_STATIC ENDS ENDIF diff --git a/src/JitCompilerX86-static.hpp b/src/JitCompilerX86-static.hpp index 
e72244a..df5cd28 100644 --- a/src/JitCompilerX86-static.hpp +++ b/src/JitCompilerX86-static.hpp @@ -18,10 +18,15 @@ along with RandomX. If not, see. */ extern "C" { - void randomx_program_prologue(); - void randomx_program_begin(); - void randomx_program_epilogue(); - void randomx_program_transform(); - void randomx_program_read(); - void randomx_program_end(); + void randomx_program_prologue(); + void randomx_loop_begin(); + void randomx_program_load_int(); + void randomx_program_load_flt(); + void randomx_program_start(); + void randomx_program_read_dataset(); + void randomx_program_store_int(); + void randomx_program_store_flt(); + void randomx_program_loop_end(); + void randomx_program_epilogue(); + void randomx_program_end(); } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 8776d61..e001464 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -38,7 +38,7 @@ along with RandomX. If not, see. namespace RandomX { -#if true || !defined(_M_X64) && !defined(__x86_64__) +#if !defined(_M_X64) && !defined(__x86_64__) JitCompilerX86::JitCompilerX86() { //throw std::runtime_error("JIT compiler only supports x86-64 CPUs"); } @@ -53,69 +53,132 @@ namespace RandomX { #else /* - REGISTER ALLOCATION: - rax -> temporary - rbx -> "ic" - rcx -> temporary - rdx -> temporary - rsi -> convertible_t* scratchpad - rdi -> beginning of VM stack - rbp -> "ma", "mx" - rsp -> end of VM stack - r8 -> "r0" - r9 -> "r1" - r10 -> "r2" - r11 -> "r3" - r12 -> "r4" - r13 -> "r5" - r14 -> "r6" - r15 -> "r7" - xmm0 -> temporary - xmm1 -> temporary - xmm2 -> "f2" - xmm3 -> "f3" - xmm4 -> "f4" - xmm5 -> "f5" - xmm6 -> "f6" - xmm7 -> "f7" - xmm8 -> "f0" - xmm9 -> "f1" - xmm10 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff + REGISTER ALLOCATION: - STACK STRUCTURE: - - | - | - | saved registers - | - v - [rdi+8] RegisterFile& registerFile - [rdi] uint8_t* dataset - | - | - | VM stack - | - v - [rsp] last element of VM stack + ; 
rax -> temporary + ; rbx -> loop counter "lc" + ; rcx -> temporary + ; rdx -> temporary + ; rsi -> scratchpad pointer + ; rdi -> dataset pointer + ; rbp -> memory registers "ma" (high 32 bits), "mx" (low 32 bits) + ; rsp -> stack pointer + ; r8 -> "r0" + ; r9 -> "r1" + ; r10 -> "r2" + ; r11 -> "r3" + ; r12 -> "r4" + ; r13 -> "r5" + ; r14 -> "r6" + ; r15 -> "r7" + ; xmm0 -> "f0" + ; xmm1 -> "f1" + ; xmm2 -> "f2" + ; xmm3 -> "f3" + ; xmm4 -> "e0" + ; xmm5 -> "e1" + ; xmm6 -> "e2" + ; xmm7 -> "e3" + ; xmm8 -> "a0" + ; xmm9 -> "a1" + ; xmm10 -> "a2" + ; xmm11 -> "a3" + ; xmm12 -> temporary + ; xmm13 -> DBL_MIN + ; xmm14 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff + ; xmm15 -> sign mask 0x80000000000000008000000000000000 */ #include "JitCompilerX86-static.hpp" const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; - const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin; + const uint8_t* codeLoopBegin = (uint8_t*)&randomx_loop_begin; + const uint8_t* codeLoadInt = (uint8_t*)&randomx_program_load_int; + const uint8_t* codeLoadFlt = (uint8_t*)&randomx_program_load_flt; + const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; + const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset; + const uint8_t* codeStoreInt = (uint8_t*)&randomx_program_store_int; + const uint8_t* codeStoreFlt = (uint8_t*)&randomx_program_store_flt; + const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end; const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue; - const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read; const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; - const uint32_t* addressTransformations = (uint32_t*)&randomx_program_transform; - const int32_t prologueSize = codeProgramBegin - codePrologue; - const int32_t epilogueSize = codeReadDataset - codeEpilogue; - const int32_t readDatasetSize = codeProgramEnd - codeReadDataset; + const int32_t prologueSize = codeLoopBegin - 
codePrologue; + const int32_t epilogueSize = codeProgramEnd - codeEpilogue; - const int32_t readDatasetOffset = CodeSize - readDatasetSize; - const int32_t epilogueOffset = readDatasetOffset - epilogueSize; + const int32_t loadIntSize = codeLoadFlt - codeLoadInt; + const int32_t loadFltSize = codeProgamStart - codeLoadFlt; + const int32_t readDatasetSize = codeStoreInt - codeReadDataset; + const int32_t storeIntSize = codeStoreFlt - codeStoreInt; + const int32_t storeFltSize = codeLoopEnd - codeStoreFlt; + + const int32_t epilogueOffset = CodeSize - epilogueSize; + + static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 }; + static const uint8_t REX_ADD_RM[] = { 0x4c, 0x03 }; + static const uint8_t REX_SUB_RR[] = { 0x4d, 0x2b }; + static const uint8_t REX_SUB_RM[] = { 0x4c, 0x2b }; + static const uint8_t REX_MOV_RR[] = { 0x41, 0x8b }; + static const uint8_t REX_MOV_RR64[] = { 0x49, 0x8b }; + static const uint8_t REX_MOV_R64R[] = { 0x4c, 0x8b }; + static const uint8_t REX_IMUL_RR[] = { 0x4d, 0x0f, 0xaf }; + static const uint8_t REX_IMUL_RRI[] = { 0x4d, 0x69 }; + static const uint8_t REX_IMUL_RM[] = { 0x4c, 0x0f, 0xaf }; + static const uint8_t REX_MUL_R[] = { 0x49, 0xf7 }; + static const uint8_t REX_MUL_M[] = { 0x48, 0xf7 }; + static const uint8_t REX_81[] = { 0x49, 0x81 }; + static const uint8_t AND_EAX_I = 0x25; + static const uint8_t MOV_EAX_I = 0xb8; + static const uint8_t MOV_RAX_I[] = { 0x48, 0xb8 }; + static const uint8_t MOV_RCX_I[] = { 0x48, 0xb9 }; + static const uint8_t REX_LEA[] = { 0x4f, 0x8d }; + static const uint8_t REX_MUL_MEM[] = { 0x48, 0xf7, 0x24, 0x0e }; + static const uint8_t REX_IMUL_MEM[] = { 0x48, 0xf7, 0x2c, 0x0e }; + static const uint8_t REX_SHR_RAX[] = { 0x48, 0xc1, 0xe8 }; + static const uint8_t RAX_ADD_SBB_1[] = { 0x48, 0x83, 0xC0, 0x01, 0x48, 0x83, 0xD8, 0x00 }; + static const uint8_t MUL_RCX[] = { 0x48, 0xf7, 0xe1 }; + static const uint8_t REX_SHR_RDX[] = { 0x48, 0xc1, 0xea }; + static const uint8_t REX_SH[] = { 0x49, 0xc1 }; + static const 
uint8_t MOV_RCX_RAX_SAR_RCX_63[] = { 0x48, 0x89, 0xc1, 0x48, 0xc1, 0xf9, 0x3f }; + static const uint8_t AND_ECX_I[] = { 0x81, 0xe1 }; + static const uint8_t ADD_RAX_RCX[] = { 0x48, 0x01, 0xC8 }; + static const uint8_t SAR_RAX_I8[] = { 0x48, 0xC1, 0xF8 }; + static const uint8_t NEG_RAX[] = { 0x48, 0xF7, 0xD8 }; + static const uint8_t ADD_R_RAX[] = { 0x49, 0x01 }; + static const uint8_t XOR_EAX_EAX[] = { 0x31, 0xC0 }; + static const uint8_t ADD_RDX_R[] = { 0x4c, 0x01 }; + static const uint8_t SUB_RDX_R[] = { 0x4c, 0x29 }; + static const uint8_t SAR_RDX_I8[] = { 0x48, 0xC1, 0xFA }; + static const uint8_t TEST_RDX_RDX[] = { 0x48, 0x85, 0xD2 }; + static const uint8_t SETS_AL_ADD_RDX_RAX[] = { 0x0F, 0x98, 0xC0, 0x48, 0x01, 0xC2 }; + static const uint8_t REX_NEG[] = { 0x49, 0xF7 }; + static const uint8_t REX_XOR_RR[] = { 0x4D, 0x33 }; + static const uint8_t REX_XOR_RI[] = { 0x49, 0x81 }; + static const uint8_t REX_XOR_RM[] = { 0x4c, 0x33 }; + static const uint8_t REX_ROT_CL[] = { 0x49, 0xd3 }; + static const uint8_t REX_ROT_I8[] = { 0x49, 0xc1 }; + static const uint8_t SHUFPD[] = { 0x66, 0x0f, 0xc6 }; + static const uint8_t REX_ADDPD[] = { 0x66, 0x41, 0x0f, 0x58 }; + static const uint8_t REX_CVTDQ2PD_XMM12[] = { 0xf3, 0x44, 0x0f, 0xe6, 0x24, 0x06 }; + static const uint8_t REX_SUBPD[] = { 0x66, 0x41, 0x0f, 0x5c }; + static const uint8_t REX_XORPS[] = { 0x41, 0x0f, 0x57 }; + static const uint8_t REX_MULPD[] = { 0x66, 0x41, 0x0f, 0x59 }; + static const uint8_t REX_MAXPD[] = { 0x66, 0x41, 0x0f, 0x5f }; + static const uint8_t REX_DIVPD[] = { 0x66, 0x41, 0x0f, 0x5e }; + static const uint8_t SQRTPD[] = { 0x66, 0x0f, 0x51 }; + static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x89, 0x44, 0x24, 0xF8, 0x0F, 0xAE, 0x54, 0x24, 0xF8 }; + static const uint8_t ROL_RAX[] = { 0x48, 0xc1, 0xc0 }; + static const uint8_t XOR_ECX_ECX[] = { 0x33, 0xC9 }; + static const uint8_t REX_CMP_R32I[] = { 0x41, 0x81 }; + static const uint8_t 
REX_CMP_M32I[] = { 0x81, 0x3c, 0x06 }; + static const uint8_t MOVAPD[] = { 0x66, 0x0f, 0x29 }; + static const uint8_t REX_MOV_MR[] = { 0x4c, 0x89 }; + static const uint8_t REX_XOR_EAX[] = { 0x41, 0x33 }; + static const uint8_t SUB_EBX[] = { 0x83, 0xEB, 0x01 }; + static const uint8_t JNZ[] = { 0x0f, 0x85 }; + static const uint8_t JMP = 0xe9; size_t JitCompilerX86::getCodeSize() { return codePos - prologueSize + readDatasetSize; @@ -132,687 +195,613 @@ namespace RandomX { throw std::runtime_error("mmap failed"); #endif memcpy(code, codePrologue, prologueSize); - memcpy(code + CodeSize - epilogueSize - readDatasetSize, codeEpilogue, epilogueSize); - memcpy(code + CodeSize - readDatasetSize, codeReadDataset, readDatasetSize); + memcpy(code + CodeSize - epilogueSize, codeEpilogue, epilogueSize); } void JitCompilerX86::generateProgram(Pcg32& gen) { - instructionOffsets.clear(); - callOffsets.clear(); + auto addressRegisters = gen(); + int readReg1 = addressRegisters & 1; + addressRegisters >>= 1; + int readReg2 = 2 + (addressRegisters & 1); + addressRegisters >>= 1; + int writeReg1 = 4 + (addressRegisters & 1); + addressRegisters >>= 1; + int writeReg2 = 6 + (addressRegisters & 1); codePos = prologueSize; + emit(REX_XOR_EAX); + emitByte(0xc0 + readReg1); + memcpy(code + codePos, codeLoadInt, loadIntSize); + codePos += loadIntSize; + emit(REX_XOR_EAX); + emitByte(0xc0 + readReg2); + memcpy(code + codePos, codeLoadFlt, loadFltSize); + codePos += loadFltSize; Instruction instr; for (unsigned i = 0; i < ProgramLength; ++i) { for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) { *(((uint32_t*)&instr) + j) = gen(); } - generateCode(instr, i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + generateCode(instr); } - emitByte(0xe9); - emit(instructionOffsets[0] - (codePos + 4)); - fixCallOffsets(); - uint32_t transform = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; - *reinterpret_cast(code + readDatasetOffset) = 
transform; + emit(REX_MOV_RR); + emitByte(0xc0 + readReg1); + emit(REX_XOR_EAX); + emitByte(0xc0 + readReg2); + memcpy(code + codePos, codeReadDataset, readDatasetSize); + codePos += readDatasetSize; + emit(REX_MOV_RR); + emitByte(0xc0 + writeReg1); + memcpy(code + codePos, codeStoreInt, storeIntSize); + codePos += storeIntSize; + emit(REX_XOR_EAX); + emitByte(0xc0 + writeReg2); + memcpy(code + codePos, codeStoreFlt, storeFltSize); + codePos += storeFltSize; + emit(SUB_EBX); + emit(JNZ); + emit32(prologueSize - codePos - 4); + emitByte(JMP); + emit32(epilogueOffset - codePos - 4); + emitByte(0x90); } - void JitCompilerX86::generateCode(Instruction& instr, int i) { - instructionOffsets.push_back(codePos); - emit(0x840fcbff); //dec ebx; jz - emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative) + void JitCompilerX86::generateCode(Instruction& instr) { auto generator = engine[instr.opcode]; - (this->*generator)(instr, i); + (this->*generator)(instr); } - void JitCompilerX86::fixCallOffsets() { - for (CallOffset& co : callOffsets) { - *reinterpret_cast(code + co.pos) = instructionOffsets[co.index] - (co.pos + 4); - } + void JitCompilerX86::genAddressReg(Instruction& instr, bool rax = true) { + emit(REX_MOV_RR); + emitByte((rax ? 0xc0 : 0xc8) + instr.src); + if (rax) + emitByte(AND_EAX_I); + else + emit(AND_ECX_I); + emit32((instr.alt % 4) ? 
ScratchpadL1Mask : ScratchpadL2Mask); } - void JitCompilerX86::gena(Instruction& instr) { - emit(uint16_t(0x8149)); //xor - emitByte(0xf0 + (instr.rega % RegistersCount)); - emit(instr.addra); - emit(uint16_t(0x8b41)); //mov - emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega - emit(0x753fc3f6); //test bl,0x3f; jne - emit(uint16_t(0xe805)); - emit(readDatasetOffset - (codePos + 4)); - if ((instr.loca & 192) == 0) { //A.LOC.X - emit(uint16_t(0x3348)); - emitByte(0xe8); //xor rbp, rax - } - emitByte(0x25); //and eax, - //if (instr.loca & 15) { - if (instr.loca & 3) { - emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad - } - else { - emit(ScratchpadL2 - 1); //first 256 KiB of scratchpad - } - /*} - else { - emit(ScratchpadL3 - 1); //whole scratchpad - }*/ + void JitCompilerX86::genAddressRegDst(Instruction& instr, bool align16 = false) { + emit(REX_MOV_RR); + emitByte(0xc0 + instr.dst); + emitByte(AND_EAX_I); + int32_t maskL1 = align16 ? ScratchpadL1Mask16 : ScratchpadL1Mask; + int32_t maskL2 = align16 ? ScratchpadL2Mask16 : ScratchpadL2Mask; + emit32((instr.alt % 4) ? maskL1 : maskL2); } - void JitCompilerX86::genar(Instruction& instr) { - gena(instr); - emit(0xc6048b48); //mov rax,QWORD PTR [rsi+rax*8] + void JitCompilerX86::genAddressImm(Instruction& instr) { + emit32(instr.imm32 & ((instr.alt % 4) ? 
ScratchpadL1Mask : ScratchpadL2Mask)); } - void JitCompilerX86::genaf(Instruction& instr) { - gena(instr); - emitByte(0xf3); - emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8] - } - - void JitCompilerX86::genbiashift(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { - if (instr.locb & 1) { - emit(uint16_t(0x8b49)); //mov - emitByte(0xc8 + (instr.regb % RegistersCount)); //rcx, regb - emitByte(0x48); //REX.W - emit(opcodeReg); //xxx rax, cl + void JitCompilerX86::h_IADD_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_ADD_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); } else { - emitByte(0x48); //REX.W - emit(opcodeImm); //xxx rax, imm8 - emitByte((instr.imm8 & 63)); + emit(REX_81); + emitByte(0xc0 + instr.dst); + emit32(instr.imm32); } } - void JitCompilerX86::genbia(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) { - if (instr.locb & 3) { - emit(opcodeReg); // xxx rax, r64 - emitByte(0xc0 + (instr.regb % RegistersCount)); + void JitCompilerX86::h_IADD_M(Instruction& instr) { + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_ADD_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); } else { - emit(opcodeImm); // xxx rax, imm32 - emit(instr.imm32); + emit(REX_ADD_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); } } - void JitCompilerX86::genbia32(Instruction& instr, uint16_t opcodeReg, uint8_t opcodeImm) { - if (instr.locb & 3) { - emit(opcodeReg); // xxx eax, r32 - emitByte(0xc0 + (instr.regb % RegistersCount)); + void JitCompilerX86::genSIB(int scale, int index, int base) { + emitByte((scale << 5) | (index << 3) | base); + } + + void JitCompilerX86::h_IADD_RC(Instruction& instr) { + emit(REX_LEA); + emitByte(0x84 + 8 * instr.dst); + genSIB(0, instr.src, instr.dst); + emit32(instr.imm32); + } + + void JitCompilerX86::h_ISUB_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_SUB_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); } else { - emitByte(opcodeImm); // 
xxx eax, imm32 - emit(instr.imm32); + emit(REX_81); + emitByte(0xe8 + instr.dst); + genAddressImm(instr); } } - void JitCompilerX86::genbf(Instruction& instr, uint8_t opcode) { - int regb = (instr.regb % RegistersCount); - emitByte(0x66); //xxxpd xmm0,regb - if (regb <= 1) { - emitByte(0x41); //REX - } - emitByte(0x0f); - emitByte(opcode); - emitByte(0xc0 + regb); - } - - - void JitCompilerX86::scratchpadStoreR(Instruction& instr, uint32_t scratchpadSize, bool rax) { - if (rax) { - emit(0x41c88b48); //mov rcx, rax; REX + void JitCompilerX86::h_ISUB_M(Instruction& instr) { + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_SUB_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); } else { - emitByte(0x41); + emit(REX_SUB_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); } - emitByte(0x8b); // mov - emitByte(0xc0 + (instr.regc % RegistersCount)); //eax, regc - emitByte(0x35); // xor eax - emit(instr.addrc); - emitByte(0x25); //and - emit(scratchpadSize - 1); - emit(0xc60c8948); // mov QWORD PTR [rsi+rax*8],rcx } - void JitCompilerX86::gencr(Instruction& instr, bool rax = true) { - if (instr.locc & 8) { //write to register - emit(uint16_t(0x8b4c)); //mov - if (rax) { - emitByte(0xc0 + 8 * (instr.regc % RegistersCount)); //regc, rax - } - else { - emitByte(0xc1 + 8 * (instr.regc % RegistersCount)); //regc, rcx - } + void JitCompilerX86::h_IMUL_9C(Instruction& instr) { + emit(REX_LEA); + emitByte(0x84 + 8 * instr.dst); + genSIB(3, instr.src, instr.dst); + emit32(instr.imm32); + } + + void JitCompilerX86::h_IMUL_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_IMUL_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); } else { - //if (instr.locc & 7) { - if (instr.locc & 1) { - scratchpadStoreR(instr, ScratchpadL1, rax); + emit(REX_IMUL_RRI); + emitByte(0xc0 + 9 * instr.dst); + genAddressImm(instr); + } + } + + void JitCompilerX86::h_IMUL_M(Instruction& instr) { + if (instr.src != instr.dst) { + genAddressReg(instr); + 
emit(REX_IMUL_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); + } + else { + emit(REX_IMUL_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); + } + } + + void JitCompilerX86::h_IMULH_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe0 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + else { + emitByte(MOV_EAX_I); + emit32(instr.imm32); + emit(REX_MUL_R); + emitByte(0xe0 + instr.dst); + emit(REX_ADD_RM); + emitByte(0xc2 + 8 * instr.dst); + } + } + + void JitCompilerX86::h_IMULH_M(Instruction& instr) { + if (instr.src != instr.dst) { + genAddressReg(instr, false); + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_MEM); + } + else { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_M); + emitByte(0xa6); + genAddressImm(instr); + } + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + + void JitCompilerX86::h_ISMULH_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe8 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + else { + emitByte(MOV_EAX_I); + emit32(instr.imm32); + emit(REX_MUL_R); + emitByte(0xe8 + instr.dst); + emit(REX_ADD_RM); + emitByte(0xc2 + 8 * instr.dst); + } + } + + void JitCompilerX86::h_ISMULH_M(Instruction& instr) { + if (instr.src != instr.dst) { + genAddressReg(instr, false); + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_IMUL_MEM); + } + else { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_M); + emitByte(0xae); + genAddressImm(instr); + } + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + + void JitCompilerX86::h_IDIV_C(Instruction& instr) { + if (instr.imm32 != 0) { + uint32_t divisor = instr.imm32; + if (divisor & (divisor - 1)) { + magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8); 
+ if (mi.pre_shift == 0 && !mi.increment) { + emit(MOV_RAX_I); + emit64(mi.multiplier); + emit(REX_MUL_R); + emitByte(0xe0 + instr.dst); } else { - scratchpadStoreR(instr, ScratchpadL2, rax); - } - /*} - else { - scratchpadStoreR(instr, ScratchpadL3, rax); - }*/ - } - } - - void JitCompilerX86::scratchpadStoreF(Instruction& instr, int regc, uint32_t scratchpadSize, bool storeHigh) { - emit(uint16_t(0x8b41)); //mov - emitByte(0xc0 + regc); //eax, regc - emitByte(0x35); // xor eax - emit(instr.addrc); - emitByte(0x25); //and - emit(scratchpadSize - 1); - emitByte(0x66); //movhpd/movlpd QWORD PTR [rsi+rax*8], regc - if (regc <= 1) { - emitByte(0x44); //REX - } - emitByte(0x0f); - emitByte(storeHigh ? 0x17 : 0x13); - emitByte(4 + 8 * regc); - emitByte(0xc6); - } - - void JitCompilerX86::gencf(Instruction& instr) { - int regc = (instr.regc % RegistersCount); - if (regc <= 1) { - emitByte(0x44); //REX - } - emit(uint16_t(0x280f)); //movaps - emitByte(0xc0 + 8 * regc); // regc, xmm0 - if (instr.locc & 8) { //write to scratchpad - //if (instr.locc & 7) { - if (instr.locc & 1) { //C.LOC.W - scratchpadStoreF(instr, regc, ScratchpadL1, (instr.locc & 128)); //first 16 KiB of scratchpad - } - else { - scratchpadStoreF(instr, regc, ScratchpadL2, (instr.locc & 128)); //first 256 KiB of scratchpad - } - //} - /*else { - scratchpadStoreF(instr, regc, ScratchpadL3, (instr.locc & 128)); //whole scratchpad - }*/ - } - } - - void JitCompilerX86::h_ADD_64(Instruction& instr, int i) { - genar(instr); - genbia(instr, 0x0349, 0x0548); - gencr(instr); - } - - void JitCompilerX86::h_ADD_32(Instruction& instr, int i) { - genar(instr); - genbia32(instr, 0x0341, 0x05); - gencr(instr); - } - - void JitCompilerX86::h_SUB_64(Instruction& instr, int i) { - genar(instr); - genbia(instr, 0x2b49, 0x2d48); - gencr(instr); - } - - void JitCompilerX86::h_SUB_32(Instruction& instr, int i) { - genar(instr); - genbia32(instr, 0x2b41, 0x2d); - gencr(instr); - } - - void JitCompilerX86::h_MUL_64(Instruction& 
instr, int i) { - genar(instr); - if ((instr.locb & 7) <= 5) { - emitByte(0x49); //REX - emit(uint16_t(0xaf0f)); // imul rax, r64 - emitByte(0xc0 + (instr.regb % RegistersCount)); - } - else { - emitByte(0x48); //REX - emit(uint16_t(0xc069)); // imul rax, rax, imm32 - emit(instr.imm32); - } - gencr(instr); - } - - void JitCompilerX86::h_MULH_64(Instruction& instr, int i) { - genar(instr); - if ((instr.locb & 7) <= 5) { - emit(uint16_t(0x8b49)); //mov rcx, r64 - emitByte(0xc8 + (instr.regb % RegistersCount)); - } - else { - emitByte(0x48); - emit(uint16_t(0xc1c7)); // mov rcx, imm32 - emit(instr.imm32); - } - emitByte(0x48); - emit(uint16_t(0xe1f7)); // mul rcx - emitByte(0x48); - emit(uint16_t(0xc28b)); // mov rax,rdx - gencr(instr); - } - - void JitCompilerX86::h_MUL_32(Instruction& instr, int i) { - genar(instr); - emit(uint16_t(0xc88b)); //mov ecx, eax - if ((instr.locb & 7) <= 5) { - emit(uint16_t(0x8b41)); // mov eax, r32 - emitByte(0xc0 + (instr.regb % RegistersCount)); - } - else { - emitByte(0xb8); // mov eax, imm32 - emit(instr.imm32); - } - emit(0xc1af0f48); //imul rax,rcx - gencr(instr); - } - - void JitCompilerX86::h_IMUL_32(Instruction& instr, int i) { - genar(instr); - emitByte(0x48); - emit(uint16_t(0xc863)); //movsxd rcx,eax - if ((instr.locb & 7) <= 5) { - emit(uint16_t(0x6349)); //movsxd rax,r32 - emitByte(0xc0 + (instr.regb % RegistersCount)); - } - else { - emitByte(0x48); - emit(uint16_t(0xc0c7)); // mov rax, imm32 - emit(instr.imm32); - } - emit(0xc1af0f48); //imul rax,rcx - gencr(instr); - } - - void JitCompilerX86::h_IMULH_64(Instruction& instr, int i) { - genar(instr); - if ((instr.locb & 7) <= 5) { - emit(uint16_t(0x8b49)); //mov rcx, r64 - emitByte(0xc8 + (instr.regb % RegistersCount)); - } - else { - emitByte(0x48); - emit(uint16_t(0xc1c7)); // mov rcx, imm32 - emit(instr.imm32); - } - emitByte(0x48); - emit(uint16_t(0xe9f7)); // imul rcx - emitByte(0x48); - emit(uint16_t(0xc28b)); // mov rax,rdx - gencr(instr); - } - - void 
JitCompilerX86::h_DIV_64(Instruction& instr, int i) { - genar(instr); - if (instr.locb & 7) { -#ifdef MAGIC_DIVISION - if (instr.imm32 != 0) { - uint32_t divisor = instr.imm32; - if (divisor & (divisor - 1)) { - magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8); + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); if (mi.pre_shift > 0) { - if (mi.pre_shift == 1) { - emitByte(0x48); - emit(uint16_t(0xe8d1)); //shr rax,1 - } - else { - emit(0x00e8c148 | (mi.pre_shift << 24)); //shr rax, pre_shift - } + emit(REX_SHR_RAX); + emitByte(mi.pre_shift); } if (mi.increment) { - emit(0x00d8834801c08348); //add rax,1; sbb rax,0 + emit(RAX_ADD_SBB_1); } - emit(uint16_t(0xb948)); //movabs rcx, multiplier - emit(mi.multiplier); - emit(0x48e1f748); //mul rcx; REX - emit(uint16_t(0xc28b)); //mov rax,rdx - if (mi.post_shift > 0) - emit(0x00e8c148 | (mi.post_shift << 24)); //shr rax, post_shift - } - else { //divisor is a power of two - int shift = 0; - while (divisor >>= 1) - ++shift; - if (shift > 0) - emit(0x00e8c148 | (shift << 24)); //shr rax, shift + emit(MOV_RCX_I); + emit64(mi.multiplier); + emit(MUL_RCX); } - } -#else - emitByte(0xb9); //mov ecx, imm32 - emit(instr.imm32 != 0 ? 
instr.imm32 : 1); -#endif - } - else { - emitByte(0xb9); //mov ecx, 1 - emit(1); - emit(uint16_t(0x8b41)); //mov edx, r32 - emitByte(0xd0 + (instr.regb % RegistersCount)); - emit(0x450fd285); //test edx, edx; cmovne ecx,edx - emitByte(0xca); -#ifdef MAGIC_DIVISION - emit(0xf748d233); //xor edx,edx; div rcx - emitByte(0xf1); -#endif - } -#ifndef MAGIC_DIVISION - emit(0xf748d233); //xor edx,edx; div rcx - emitByte(0xf1); -#endif - gencr(instr); - } - - void JitCompilerX86::h_IDIV_64(Instruction& instr, int i) { - genar(instr); - if (instr.locb & 7) { -#ifdef MAGIC_DIVISION - int64_t divisor = instr.imm32; - if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { - // +/- power of two - bool negative = divisor < 0; - if (negative) - divisor = -divisor; + if (mi.post_shift > 0) { + emit(REX_SHR_RDX); + emitByte(mi.post_shift); + } + emit(REX_ADD_RR); + emitByte(0xc2 + 8 * instr.dst); + } + else { //divisor is a power of two int shift = 0; - uint64_t unsignedDivisor = divisor; - while (unsignedDivisor >>= 1) + while (divisor >>= 1) ++shift; if (shift > 0) { - emitByte(0x48); - emit(uint16_t(0xc88b)); //mov rcx, rax - emit(0x3ff9c148); //sar rcx, 63 - uint32_t mask = (1ULL << shift) - 1; - emit(uint16_t(0xe181)); //and ecx, mask - emit(mask); - emitByte(0x48); - emit(uint16_t(0xc103)); //add rax, rcx - emit(0x00f8c148 | (shift << 24)); //sar rax, shift - } - if (negative) { - emitByte(0x48); - emit(uint16_t(0xd8f7)); //neg rax + emit(REX_SH); + emitByte(0xe8 + instr.dst); } } - else if (divisor != 0) { - magics_info mi = compute_signed_magic_info(divisor); - if ((divisor >= 0) != (mi.multiplier >= 0)) { - emitByte(0x48); - emit(uint16_t(0xc88b)); //mov rcx, rax - } - emit(uint16_t(0xba48)); //movabs rdx, multiplier - emit(mi.multiplier); - emit(0xd233c28b48eaf748); //imul rdx; mov rax,rdx; xor edx,edx - bool haveSF = false; - if (divisor > 0 && mi.multiplier < 0) { - emitByte(0x48); - emit(uint16_t(0xc103)); //add rax, rcx - haveSF = true; - } - if 
(divisor < 0 && mi.multiplier > 0) { - emitByte(0x48); - emit(uint16_t(0xc12b)); //sub rax, rcx - haveSF = true; - } - if (mi.shift > 0) { - emit(0x00f8c148 | (mi.shift << 24)); //sar rax, shift - haveSF = true; - } - if (!haveSF) { - emitByte(0x48); - emit(uint16_t(0x85c0)); - } - emit(0x48c2980f); //sets dl; add rax, rdx - emit(uint16_t(0xc203)); + } + } + + void JitCompilerX86::h_ISDIV_C(Instruction& instr) { + int64_t divisor = instr.imm32; + if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + // +/- power of two + bool negative = divisor < 0; + if (negative) + divisor = -divisor; + int shift = 0; + uint64_t unsignedDivisor = divisor; + while (unsignedDivisor >>= 1) + ++shift; + if (shift > 0) { + emit(MOV_RCX_RAX_SAR_RCX_63); + uint32_t mask = (1ULL << shift) - 1; + emit(AND_ECX_I); + emit32(mask); + emit(ADD_RAX_RCX); + emit(SAR_RAX_I8); + emitByte(shift); } -#else - emitByte(0xba); // mov edx, imm32 - emit(instr.imm32); -#endif + if (negative) + emit(NEG_RAX); + emit(ADD_R_RAX); + emitByte(0xc0 + instr.dst); + } + else if (divisor != 0) { + magics_info mi = compute_signed_magic_info(divisor); + emit(MOV_RAX_I); + emit64(mi.multiplier); + emit(REX_MUL_R); + emitByte(0xe8 + instr.dst); + emit(XOR_EAX_EAX); + bool haveSF = false; + if (divisor > 0 && mi.multiplier < 0) { + emit(ADD_RDX_R); + emitByte(0xc2 + 8 * instr.dst); + haveSF = true; + } + if (divisor < 0 && mi.multiplier > 0) { + emit(SUB_RDX_R); + emitByte(0xc2 + 8 * instr.dst); + haveSF = true; + } + if (mi.shift > 0) { + emit(SAR_RDX_I8); + emitByte(mi.shift); + haveSF = true; + } + if (!haveSF) + emit(TEST_RDX_RDX); + emit(SETS_AL_ADD_RDX_RAX); + emit(ADD_R_RAX); + emitByte(0xd0 + instr.dst); + } + } + + void JitCompilerX86::h_INEG_R(Instruction& instr) { + emit(REX_NEG); + emitByte(0xd8 + instr.dst); + } + + void JitCompilerX86::h_IXOR_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_XOR_RR); + 
emitByte(0xc0 + 8 * instr.dst + instr.src); } else { - emit(uint16_t(0x8b41)); //mov edx, r32 - emitByte(0xd0 + (instr.regb % RegistersCount)); -#ifndef MAGIC_DIVISION + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.imm32); } -#endif - emit(0xd8f7480575fffa83); //cmp edx,-1 - emit(uint16_t(0x12eb)); //jmp result - emit(0x0fd28500000001b9); - emit(0x489948c96348ca45); - emit(uint16_t(0xf9f7)); //idiv rcx -#ifdef MAGIC_DIVISION - } -#endif - gencr(instr); } - void JitCompilerX86::h_AND_64(Instruction& instr, int i) { - genar(instr); - genbia(instr, 0x2349, 0x2548); - gencr(instr); + void JitCompilerX86::h_IXOR_M(Instruction& instr) { + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_XOR_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); + } + else { + emit(REX_XOR_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); + } } - void JitCompilerX86::h_AND_32(Instruction& instr, int i) { - genar(instr); - genbia32(instr, 0x2341, 0x25); - gencr(instr); + void JitCompilerX86::h_IROR_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_MOV_RR); + emitByte(0xc8 + instr.src); + emit(REX_ROT_CL); + emitByte(0xc8 + instr.dst); + } + else { + emit(REX_ROT_I8); + emitByte(0xc8 + instr.dst); + emitByte(instr.imm32 & 63); + } } - void JitCompilerX86::h_OR_64(Instruction& instr, int i) { - genar(instr); - genbia(instr, 0x0b49, 0x0d48); - gencr(instr); + void JitCompilerX86::h_IROL_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_MOV_RR); + emitByte(0xc8 + instr.src); + emit(REX_ROT_CL); + emitByte(0xc0 + instr.dst); + } + else { + emit(REX_ROT_I8); + emitByte(0xc0 + instr.dst); + emitByte(instr.imm32 & 63); + } } - void JitCompilerX86::h_OR_32(Instruction& instr, int i) { - genar(instr); - genbia32(instr, 0x0b41, 0x0d); - gencr(instr); + void JitCompilerX86::h_FPSWAP_R(Instruction& instr) { + emit(SHUFPD); + emitByte(0xc0 + 9 * instr.dst); + emitByte(1); } - void JitCompilerX86::h_XOR_64(Instruction& 
instr, int i) { - genar(instr); - genbia(instr, 0x3349, 0x3548); - gencr(instr); + void JitCompilerX86::h_FPADD_R(Instruction& instr) { + instr.dst %= 4; + instr.src %= 4; + emit(REX_ADDPD); + emitByte(0xc0 + instr.src + 8 * instr.dst); } - void JitCompilerX86::h_XOR_32(Instruction& instr, int i) { - genar(instr); - genbia32(instr, 0x3341, 0x35); - gencr(instr); + void JitCompilerX86::h_FPADD_M(Instruction& instr) { + instr.dst %= 4; + genAddressReg(instr); + emit(REX_CVTDQ2PD_XMM12); + emit(REX_ADDPD); + emitByte(0xc4 + 8 * instr.dst); } - void JitCompilerX86::h_SHL_64(Instruction& instr, int i) { - genar(instr); - genbiashift(instr, 0xe0d3, 0xe0c1); - gencr(instr); + void JitCompilerX86::h_FPSUB_R(Instruction& instr) { + instr.dst %= 4; + instr.src %= 4; + emit(REX_SUBPD); + emitByte(0xc0 + instr.src + 8 * instr.dst); } - void JitCompilerX86::h_SHR_64(Instruction& instr, int i) { - genar(instr); - genbiashift(instr, 0xe8d3, 0xe8c1); - gencr(instr); + void JitCompilerX86::h_FPSUB_M(Instruction& instr) { + instr.dst %= 4; + genAddressReg(instr); + emit(REX_CVTDQ2PD_XMM12); + emit(REX_SUBPD); + emitByte(0xc4 + 8 * instr.dst); } - void JitCompilerX86::h_SAR_64(Instruction& instr, int i) { - genar(instr); - genbiashift(instr, 0xf8d3, 0xf8c1); - gencr(instr); + void JitCompilerX86::h_FPNEG_R(Instruction& instr) { + instr.dst %= 4; + emit(REX_XORPS); + emitByte(0xc7 + 8 * instr.dst); } - void JitCompilerX86::h_ROL_64(Instruction& instr, int i) { - genar(instr); - genbiashift(instr, 0xc0d3, 0xc0c1); - gencr(instr); + void JitCompilerX86::h_FPMUL_R(Instruction& instr) { + instr.dst %= 4; + instr.src %= 4; + emit(REX_MULPD); + emitByte(0xe0 + instr.src + 8 * instr.dst); } - void JitCompilerX86::h_ROR_64(Instruction& instr, int i) { - genar(instr); - genbiashift(instr, 0xc8d3, 0xc8c1); - gencr(instr); + void JitCompilerX86::h_FPMUL_M(Instruction& instr) { + instr.dst %= 4; + genAddressReg(instr); + emit(REX_CVTDQ2PD_XMM12); + emit(REX_MULPD); + emitByte(0xe4 + 8 * 
instr.dst); + emit(REX_MAXPD); + emitByte(0xe5 + 8 * instr.dst); } - void JitCompilerX86::h_FPADD(Instruction& instr, int i) { - genaf(instr); - genbf(instr, 0x58); - gencf(instr); + void JitCompilerX86::h_FPDIV_R(Instruction& instr) { + instr.dst %= 4; + instr.src %= 4; + emit(REX_DIVPD); + emitByte(0xe0 + instr.src + 8 * instr.dst); + emit(REX_MAXPD); + emitByte(0xe5 + 8 * instr.dst); } - void JitCompilerX86::h_FPSUB(Instruction& instr, int i) { - genaf(instr); - genbf(instr, 0x5c); - gencf(instr); + void JitCompilerX86::h_FPDIV_M(Instruction& instr) { + instr.dst %= 4; + genAddressReg(instr); + emit(REX_CVTDQ2PD_XMM12); + emit(REX_DIVPD); + emitByte(0xe4 + 8 * instr.dst); + emit(REX_MAXPD); + emitByte(0xe5 + 8 * instr.dst); } - void JitCompilerX86::h_FPMUL(Instruction& instr, int i) { - genaf(instr); - genbf(instr, 0x59); - emit(0x00c9c20f66c8280f); //movaps xmm1,xmm0; cmpeqpd xmm1,xmm1 - emit(uint16_t(0x540f)); //andps xmm0,xmm1 - emitByte(0xc1); - gencf(instr); + void JitCompilerX86::h_FPSQRT_R(Instruction& instr) { + instr.dst %= 4; + emit(SQRTPD); + emitByte(0xe4 + 9 * instr.dst); } - void JitCompilerX86::h_FPDIV(Instruction& instr, int i) { - genaf(instr); - genbf(instr, 0x5e); - emit(0x00c9c20f66c8280f); //movaps xmm1,xmm0; cmpeqpd xmm1,xmm1 - emit(uint16_t(0x540f)); //andps xmm0,xmm1 - emitByte(0xc1); - gencf(instr); - } - - void JitCompilerX86::h_FPSQRT(Instruction& instr, int i) { - genaf(instr); - emit(0xc0510f66c2540f41); //andps xmm0,xmm10; sqrtpd xmm0,xmm0 - gencf(instr); - } - - void JitCompilerX86::h_FPROUND(Instruction& instr, int i) { - genar(instr); - emitByte(0x48); - emit(uint16_t(0xc88b)); //mov rcx,rax - int rotate = (13 - (instr.imm8 & 63)) & 63; + void JitCompilerX86::h_CFROUND(Instruction& instr) { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.src); + int rotate = (13 - (instr.alt & 63)) & 63; if (rotate != 0) { - emitByte(0x48); - emit(uint16_t(0xc0c1)); //rol rax + emit(ROL_RAX); emitByte(rotate); } - emit(uint16_t(0x0025)); - 
emit(0x00009fc00d000060); //and eax,0x6000; or eax,0x9fc0 - emit(0x2454ae0ff8244489); //ldmxcsr DWORD PTR [rsp-0x8] - emitByte(0xf8); - gencr(instr, false); //result in rcx + emit(AND_OR_MOV_LDMXCSR); } - static inline uint8_t jumpCondition(Instruction& instr, bool invert = false) { - switch ((instr.locb & 7) ^ invert) + static inline uint8_t condition(Instruction& instr, bool invert = false) { + switch ((instr.alt & 7) ^ invert) { case 0: - return 0x76; //jbe + return 0x96; //setbe case 1: - return 0x77; //ja + return 0x97; //seta case 2: - return 0x78; //js + return 0x98; //sets case 3: - return 0x79; //jns + return 0x99; //setns case 4: - return 0x70; //jo + return 0x90; //seto case 5: - return 0x71; //jno + return 0x91; //setno case 6: - return 0x7c; //jl + return 0x9c; //setl case 7: - return 0x7d; //jge + return 0x9d; //setge } } - void JitCompilerX86::h_JUMP(Instruction& instr, int i) { - genar(instr); - gencr(instr); - emit(uint16_t(0x8141)); //cmp regb, imm32 - emitByte(0xf8 + (instr.regb % RegistersCount)); - emit(instr.imm32); - emitByte(0x0f); //near jump - emitByte(jumpCondition(instr) + 0x10); - i = wrapInstr(i + (instr.imm8 & 127) + 2); - if (i < instructionOffsets.size()) { - emit(instructionOffsets[i] - (codePos + 4)); - } - else { - callOffsets.push_back(CallOffset(codePos, i)); - codePos += 4; - } + void JitCompilerX86::h_COND_R(Instruction& instr) { + emit(XOR_ECX_ECX); + emit(REX_CMP_R32I); + emitByte(0xf8 + instr.src); + emit32(instr.imm32); + emitByte(0x0f); + emitByte(condition(instr)); + emitByte(0xc1); + emit(REX_ADD_RM); + emitByte(0xc1 + 8 * instr.dst); } - void JitCompilerX86::h_CALL(Instruction& instr, int i) { - genar(instr); - gencr(instr); - emit(uint16_t(0x8141)); //cmp regb, imm32 - emitByte(0xf8 + (instr.regb % RegistersCount)); - emit(instr.imm32); - emitByte(jumpCondition(instr, true)); - emitByte(0x05); - emitByte(0xe8); //call - i = wrapInstr(i + (instr.imm8 & 127) + 2); - if (i < instructionOffsets.size()) { - 
emit(instructionOffsets[i] - (codePos + 4)); - } - else { - callOffsets.push_back(CallOffset(codePos, i)); - codePos += 4; - } + void JitCompilerX86::h_COND_M(Instruction& instr) { + emit(XOR_ECX_ECX); + genAddressReg(instr); + emit(REX_CMP_M32I); + emit32(instr.imm32); + emitByte(0x0f); + emitByte(condition(instr)); + emitByte(0xc1); + emit(REX_ADD_RM); + emitByte(0xc1 + 8 * instr.dst); } - void JitCompilerX86::h_RET(Instruction& instr, int i) { - genar(instr); - int crlen = 0; - if ((instr.locc & 7) <= 3) { - crlen = 17; - } - emit(0x74e73b48); //cmp rsp, rdi; je - emitByte(0x01); - emitByte(0xc3); //ret + void JitCompilerX86::h_ISTORE(Instruction& instr) { + genAddressRegDst(instr); + emit(REX_MOV_MR); + emitByte(0x04 + 8 * instr.src); + emitByte(0x06); } - void JitCompilerX86::h_NOP(Instruction& instr, int i) { - genar(instr); + void JitCompilerX86::h_FSTORE(Instruction& instr) { + genAddressRegDst(instr, true); + emit(MOVAPD); + emitByte(0x04 + 8 * instr.src); + emitByte(0x06); } #include "instructionWeights.hpp" #define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x)) InstructionGeneratorX86 JitCompilerX86::engine[256] = { - INST_HANDLE(ADD_64) - INST_HANDLE(ADD_32) - INST_HANDLE(SUB_64) - INST_HANDLE(SUB_32) - INST_HANDLE(MUL_64) - INST_HANDLE(MULH_64) - INST_HANDLE(MUL_32) - INST_HANDLE(IMUL_32) - INST_HANDLE(IMULH_64) - INST_HANDLE(DIV_64) - INST_HANDLE(IDIV_64) - INST_HANDLE(AND_64) - INST_HANDLE(AND_32) - INST_HANDLE(OR_64) - INST_HANDLE(OR_32) - INST_HANDLE(XOR_64) - INST_HANDLE(XOR_32) - INST_HANDLE(SHL_64) - INST_HANDLE(SHR_64) - INST_HANDLE(SAR_64) - INST_HANDLE(ROL_64) - INST_HANDLE(ROR_64) - INST_HANDLE(FPADD) - INST_HANDLE(FPSUB) - INST_HANDLE(FPMUL) - INST_HANDLE(FPDIV) - INST_HANDLE(FPSQRT) - INST_HANDLE(FPROUND) - INST_HANDLE(JUMP) - INST_HANDLE(CALL) - INST_HANDLE(RET) - INST_HANDLE(NOP) + INST_HANDLE(IADD_R) + INST_HANDLE(IADD_M) + INST_HANDLE(IADD_RC) + INST_HANDLE(ISUB_R) + INST_HANDLE(ISUB_M) + INST_HANDLE(IMUL_9C) + INST_HANDLE(IMUL_R) 
+ INST_HANDLE(IMUL_M) + INST_HANDLE(IMULH_R) + INST_HANDLE(IMULH_M) + INST_HANDLE(ISMULH_R) + INST_HANDLE(ISMULH_M) + INST_HANDLE(IDIV_C) + INST_HANDLE(ISDIV_C) + INST_HANDLE(INEG_R) + INST_HANDLE(IXOR_R) + INST_HANDLE(IXOR_M) + INST_HANDLE(IROR_R) + INST_HANDLE(IROL_R) + INST_HANDLE(FPSWAP_R) + INST_HANDLE(FPADD_R) + INST_HANDLE(FPADD_M) + INST_HANDLE(FPSUB_R) + INST_HANDLE(FPSUB_M) + INST_HANDLE(FPNEG_R) + INST_HANDLE(FPMUL_R) + INST_HANDLE(FPMUL_M) + INST_HANDLE(FPDIV_R) + INST_HANDLE(FPDIV_M) + INST_HANDLE(FPSQRT_R) + INST_HANDLE(COND_R) + INST_HANDLE(COND_M) + INST_HANDLE(CFROUND) + INST_HANDLE(ISTORE) + INST_HANDLE(FSTORE) }; + #endif } \ No newline at end of file diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index e6a7e6d..fa5aa93 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -30,16 +30,10 @@ namespace RandomX { class JitCompilerX86; - typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int); + typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&); constexpr uint32_t CodeSize = 64 * 1024; - struct CallOffset { - CallOffset(int32_t p, int32_t i) : pos(p), index(i) {} - int32_t pos; - int32_t index; - }; - class JitCompilerX86 { public: JitCompilerX86(); @@ -55,66 +49,82 @@ namespace RandomX { static InstructionGeneratorX86 engine[256]; uint8_t* code; int32_t codePos; - std::vector instructionOffsets; - std::vector callOffsets; - void gena(Instruction&); - void genar(Instruction&); - void genaf(Instruction&); - void genbiashift(Instruction&, uint16_t, uint16_t); - void genbia(Instruction&, uint16_t, uint16_t); - void genbia32(Instruction&, uint16_t, uint8_t); - void genbf(Instruction&, uint8_t); - void scratchpadStoreR(Instruction&, uint32_t, bool); - void scratchpadStoreF(Instruction&, int, uint32_t, bool); - void gencr(Instruction&, bool); - void gencf(Instruction&); - void generateCode(Instruction&, int); - void fixCallOffsets(); + void genAddressReg(Instruction&, bool); + void 
genAddressRegDst(Instruction&, bool); + void genAddressImm(Instruction&); + void genSIB(int scale, int index, int base); + + void generateCode(Instruction&); void emitByte(uint8_t val) { code[codePos] = val; codePos++; } - template - void emit(T val) { - *reinterpret_cast(code + codePos) = val; - codePos += sizeof(T); + void emit32(uint32_t val) { + code[codePos + 0] = val; + code[codePos + 1] = val >> 8; + code[codePos + 2] = val >> 16; + code[codePos + 3] = val >> 24; + codePos += 4; } - void h_ADD_64(Instruction&, int); - void h_ADD_32(Instruction&, int); - void h_SUB_64(Instruction&, int); - void h_SUB_32(Instruction&, int); - void h_MUL_64(Instruction&, int); - void h_MULH_64(Instruction&, int); - void h_MUL_32(Instruction&, int); - void h_IMUL_32(Instruction&, int); - void h_IMULH_64(Instruction&, int); - void h_DIV_64(Instruction&, int); - void h_IDIV_64(Instruction&, int); - void h_AND_64(Instruction&, int); - void h_AND_32(Instruction&, int); - void h_OR_64(Instruction&, int); - void h_OR_32(Instruction&, int); - void h_XOR_64(Instruction&, int); - void h_XOR_32(Instruction&, int); - void h_SHL_64(Instruction&, int); - void h_SHR_64(Instruction&, int); - void h_SAR_64(Instruction&, int); - void h_ROL_64(Instruction&, int); - void h_ROR_64(Instruction&, int); - void h_FPADD(Instruction&, int); - void h_FPSUB(Instruction&, int); - void h_FPMUL(Instruction&, int); - void h_FPDIV(Instruction&, int); - void h_FPSQRT(Instruction&, int); - void h_FPROUND(Instruction&, int); - void h_JUMP(Instruction&, int); - void h_CALL(Instruction&, int); - void h_RET(Instruction&, int); - void h_NOP(Instruction&, int); + void emit64(uint64_t val) { + code[codePos + 0] = val; + code[codePos + 1] = val >> 8; + code[codePos + 2] = val >> 16; + code[codePos + 3] = val >> 24; + code[codePos + 4] = val >> 32; + code[codePos + 5] = val >> 40; + code[codePos + 6] = val >> 48; + code[codePos + 7] = val >> 56; + codePos += 8; + } + + template + void emit(const uint8_t (&src)[N]) { + for 
(int i = 0; i < N; ++i) { + code[codePos + i] = src[i]; + } + codePos += N; + } + + void h_IADD_R(Instruction&); + void h_IADD_M(Instruction&); + void h_IADD_RC(Instruction&); + void h_ISUB_R(Instruction&); + void h_ISUB_M(Instruction&); + void h_IMUL_9C(Instruction&); + void h_IMUL_R(Instruction&); + void h_IMUL_M(Instruction&); + void h_IMULH_R(Instruction&); + void h_IMULH_M(Instruction&); + void h_ISMULH_R(Instruction&); + void h_ISMULH_M(Instruction&); + void h_IDIV_C(Instruction&); + void h_ISDIV_C(Instruction&); + void h_INEG_R(Instruction&); + void h_IXOR_R(Instruction&); + void h_IXOR_M(Instruction&); + void h_IROR_R(Instruction&); + void h_IROL_R(Instruction&); + void h_FPSWAP_R(Instruction&); + void h_FPADD_R(Instruction&); + void h_FPADD_M(Instruction&); + void h_FPSUB_R(Instruction&); + void h_FPSUB_M(Instruction&); + void h_FPNEG_R(Instruction&); + void h_FPMUL_R(Instruction&); + void h_FPMUL_M(Instruction&); + void h_FPDIV_R(Instruction&); + void h_FPDIV_M(Instruction&); + void h_FPSQRT_R(Instruction&); + void h_COND_R(Instruction&); + void h_COND_M(Instruction&); + void h_CFROUND(Instruction&); + void h_ISTORE(Instruction&); + void h_FSTORE(Instruction&); }; } \ No newline at end of file diff --git a/src/asm/program_epilogue_store.inc b/src/asm/program_epilogue_store.inc index 95a4752..b94fa4d 100644 --- a/src/asm/program_epilogue_store.inc +++ b/src/asm/program_epilogue_store.inc @@ -1,9 +1,5 @@ - ;# unroll VM stack - mov rsp, rdi - ;# save VM register values pop rcx - pop rcx mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 mov qword ptr [rcx+16], r10 @@ -12,12 +8,12 @@ mov qword ptr [rcx+40], r13 mov qword ptr [rcx+48], r14 mov qword ptr [rcx+56], r15 - movapd xmmword ptr [rcx+64], xmm8 - movapd xmmword ptr [rcx+80], xmm9 - movapd xmmword ptr [rcx+96], xmm2 - movapd xmmword ptr [rcx+112], xmm3 + movdqa xmmword ptr [rcx+64], xmm0 + movdqa xmmword ptr [rcx+80], xmm1 + movdqa xmmword ptr [rcx+96], xmm2 + movdqa xmmword ptr [rcx+112], xmm3 lea 
rcx, [rcx+64] - movapd xmmword ptr [rcx+64], xmm4 - movapd xmmword ptr [rcx+80], xmm5 - movapd xmmword ptr [rcx+96], xmm6 - movapd xmmword ptr [rcx+112], xmm7 \ No newline at end of file + movdqa xmmword ptr [rcx+64], xmm4 + movdqa xmmword ptr [rcx+80], xmm5 + movdqa xmmword ptr [rcx+96], xmm6 + movdqa xmmword ptr [rcx+112], xmm7 \ No newline at end of file diff --git a/src/asm/program_epilogue_win64.inc b/src/asm/program_epilogue_win64.inc index 220bed8..f2e4b44 100644 --- a/src/asm/program_epilogue_win64.inc +++ b/src/asm/program_epilogue_win64.inc @@ -1,6 +1,12 @@ include program_epilogue_store.inc ;# restore callee-saved registers - Microsoft x64 calling convention + movdqu xmm15, xmmword ptr [rsp] + movdqu xmm14, xmmword ptr [rsp+16] + movdqu xmm13, xmmword ptr [rsp+32] + movdqu xmm12, xmmword ptr [rsp+48] + movdqu xmm11, xmmword ptr [rsp+64] + add rsp, 80 movdqu xmm10, xmmword ptr [rsp] movdqu xmm9, xmmword ptr [rsp+16] movdqu xmm8, xmmword ptr [rsp+32] @@ -17,4 +23,4 @@ pop rbx ;# program finished - ret 0 \ No newline at end of file + ret diff --git a/src/asm/program_load_flt.inc b/src/asm/program_load_flt.inc new file mode 100644 index 0000000..af6f1b7 --- /dev/null +++ b/src/asm/program_load_flt.inc @@ -0,0 +1,14 @@ + and eax, 262080 + lea rcx, [rsi+rax] + cvtdq2pd xmm0, qword ptr [rcx+0] + cvtdq2pd xmm1, qword ptr [rcx+8] + cvtdq2pd xmm2, qword ptr [rcx+16] + cvtdq2pd xmm3, qword ptr [rcx+24] + cvtdq2pd xmm4, qword ptr [rcx+32] + cvtdq2pd xmm5, qword ptr [rcx+40] + cvtdq2pd xmm6, qword ptr [rcx+48] + cvtdq2pd xmm7, qword ptr [rcx+56] + andps xmm4, xmm14 + andps xmm5, xmm14 + andps xmm6, xmm14 + andps xmm7, xmm14 diff --git a/src/asm/program_load_int.inc b/src/asm/program_load_int.inc new file mode 100644 index 0000000..d139549 --- /dev/null +++ b/src/asm/program_load_int.inc @@ -0,0 +1,10 @@ + and eax, 262080 + lea rcx, [rsi+rax] + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, 
qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] diff --git a/src/asm/program_prologue_linux.inc b/src/asm/program_prologue_linux.inc index 6bc3bd2..67a967d 100644 --- a/src/asm/program_prologue_linux.inc +++ b/src/asm/program_prologue_linux.inc @@ -7,13 +7,14 @@ push r15 ;# function arguments + mov rbx, rcx ;# loop counter push rdi ;# RegisterFile& registerFile - mov rbp, qword ptr [rsi] ;# "mx", "ma" - mov rax, qword ptr [rsi+8] ;# uint8_t* dataset - push rax - mov rsi, rdx ;# convertible_t* scratchpad mov rcx, rdi + mov rbp, qword ptr [rsi] ;# "mx", "ma" + mov eax, ebp ;# "mx" + mov rdi, qword ptr [rsi+8] ;# uint8_t* dataset + mov rsi, rdx ;# convertible_t* scratchpad #include "program_prologue_load.inc" - jmp randomx_program_begin \ No newline at end of file + jmp DECL(randomx_loop_begin) \ No newline at end of file diff --git a/src/asm/program_prologue_load.inc b/src/asm/program_prologue_load.inc index 9ceeed6..ecdd4f9 100644 --- a/src/asm/program_prologue_load.inc +++ b/src/asm/program_prologue_load.inc @@ -1,27 +1,20 @@ - mov rdi, rsp ;# beginning of VM stack - mov ebx, 262145 ;# number of VM instructions to execute + 1 + ;# zero integer registers + xor r8, r8 + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 - xorps xmm10, xmm10 - cmpeqpd xmm10, xmm10 - psrlq xmm10, 1 ;# mask for absolute value = 0x7fffffffffffffff7fffffffffffffff + ;# load constant registers + lea rcx, [rcx+120] + movapd xmm8, xmmword ptr [rcx+72] + movapd xmm9, xmmword ptr [rcx+88] + movapd xmm10, xmmword ptr [rcx+104] + movapd xmm11, xmmword ptr [rcx+120] + movapd xmm13, xmmword ptr [minDbl] + movapd xmm14, xmmword ptr [absMask] + movapd xmm15, xmmword ptr [signMask] - ;# load integer registers - mov r8, qword ptr [rcx+0] - mov r9, qword ptr [rcx+8] - mov r10, qword ptr [rcx+16] - mov r11, qword ptr [rcx+24] - mov r12, qword ptr [rcx+32] - mov r13, qword ptr [rcx+40] - mov 
r14, qword ptr [rcx+48] - mov r15, qword ptr [rcx+56] - - ;# load floating point registers - movapd xmm8, xmmword ptr [rcx+64] - movapd xmm9, xmmword ptr [rcx+80] - movapd xmm2, xmmword ptr [rcx+96] - movapd xmm3, xmmword ptr [rcx+112] - lea rcx, [rcx+64] - movapd xmm4, xmmword ptr [rcx+64] - movapd xmm5, xmmword ptr [rcx+80] - movapd xmm6, xmmword ptr [rcx+96] - movapd xmm7, xmmword ptr [rcx+112] diff --git a/src/asm/program_prologue_win64.inc b/src/asm/program_prologue_win64.inc index bbf7851..83ae2a5 100644 --- a/src/asm/program_prologue_win64.inc +++ b/src/asm/program_prologue_win64.inc @@ -13,14 +13,21 @@ movdqu xmmword ptr [rsp+32], xmm8 movdqu xmmword ptr [rsp+16], xmm9 movdqu xmmword ptr [rsp+0], xmm10 + sub rsp, 80 + movdqu xmmword ptr [rsp+64], xmm11 + movdqu xmmword ptr [rsp+48], xmm12 + movdqu xmmword ptr [rsp+32], xmm13 + movdqu xmmword ptr [rsp+16], xmm14 + movdqu xmmword ptr [rsp+0], xmm15 - ;# function arguments - push rcx ;# RegisterFile& registerFile - mov rbp, qword ptr [rdx] ;# "mx", "ma" - mov rax, qword ptr [rdx+8] ;# uint8_t* dataset - push rax - mov rsi, r8 ;# convertible_t* scratchpad + ; function arguments + push rcx ; RegisterFile& registerFile + mov rbp, qword ptr [rdx] ; "mx", "ma" + mov eax, ebp ; "mx" + mov rdi, qword ptr [rdx+8] ; uint8_t* dataset + mov rsi, r8 ; convertible_t* scratchpad + mov rbx, r9 ; loop counter include program_prologue_load.inc - jmp randomx_program_begin \ No newline at end of file + jmp randomx_loop_begin \ No newline at end of file diff --git a/src/asm/program_read.inc b/src/asm/program_read.inc deleted file mode 100644 index c7650ea..0000000 --- a/src/asm/program_read.inc +++ /dev/null @@ -1,20 +0,0 @@ - db 0, 0, 0, 0 ;# TransformAddress placeholder - mov rcx, qword ptr [rdi] ;# load the dataset address - xor rbp, rax ;# modify "mx" - ;# prefetch cacheline "mx" - and rbp, -64 ;# align "mx" to the start of a cache line - mov edx, ebp ;# edx = mx - prefetchnta byte ptr [rcx+rdx] - ;# read cacheline "ma" - ror 
rbp, 32 ;# swap "ma" and "mx" - mov edx, ebp ;# edx = ma - lea rcx, [rcx+rdx] ;# dataset cache line - xor r8, qword ptr [rcx+0] - xor r9, qword ptr [rcx+8] - xor r10, qword ptr [rcx+16] - xor r11, qword ptr [rcx+24] - xor r12, qword ptr [rcx+32] - xor r13, qword ptr [rcx+40] - xor r14, qword ptr [rcx+48] - xor r15, qword ptr [rcx+56] - ret \ No newline at end of file diff --git a/src/asm/program_read_dataset.inc b/src/asm/program_read_dataset.inc new file mode 100644 index 0000000..bae4817 --- /dev/null +++ b/src/asm/program_read_dataset.inc @@ -0,0 +1,16 @@ + xor rbp, rax ;# modify "mx" + and rbp, -64 ;# align "mx" to the start of a cache line + mov edx, ebp ;# edx = mx + prefetchnta byte ptr [rdi+rdx] + ror rbp, 32 ;# swap "ma" and "mx" + mov edx, ebp ;# edx = ma + lea rcx, [rdi+rdx] ;# dataset cache line + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] + \ No newline at end of file diff --git a/src/asm/program_store_flt.inc b/src/asm/program_store_flt.inc new file mode 100644 index 0000000..d6ca7f1 --- /dev/null +++ b/src/asm/program_store_flt.inc @@ -0,0 +1,11 @@ + and eax, 262080 + lea rcx, [rsi+rax] + mulpd xmm0, xmm4 + mulpd xmm1, xmm5 + mulpd xmm2, xmm6 + mulpd xmm3, xmm7 + movapd xmmword ptr [rcx+0], xmm0 + movapd xmmword ptr [rcx+16], xmm1 + movapd xmmword ptr [rcx+32], xmm2 + movapd xmmword ptr [rcx+48], xmm3 + diff --git a/src/asm/program_store_int.inc b/src/asm/program_store_int.inc new file mode 100644 index 0000000..75c973f --- /dev/null +++ b/src/asm/program_store_int.inc @@ -0,0 +1,10 @@ + and eax, 262080 + lea rcx, [rsi+rax] + mov qword ptr [rcx+0], r8 + mov qword ptr [rcx+8], r9 + mov qword ptr [rcx+16], r10 + mov qword ptr [rcx+24], r11 + mov qword ptr [rcx+32], r12 + mov qword ptr [rcx+40], r13 + mov qword ptr [rcx+48], r14 + mov qword ptr [rcx+56], r15 diff 
--git a/src/asm/program_xmm_constants.inc b/src/asm/program_xmm_constants.inc new file mode 100644 index 0000000..38c897c --- /dev/null +++ b/src/asm/program_xmm_constants.inc @@ -0,0 +1,6 @@ +minDbl: + db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0 +absMask: + db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127 +signMask: + db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128 \ No newline at end of file diff --git a/src/common.hpp b/src/common.hpp index bf235ec..053f2a1 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -81,6 +81,8 @@ namespace RandomX { constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(convertible_t); constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8; constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8; + constexpr int ScratchpadL1Mask16 = (ScratchpadL1 / 2 - 1) * 16; + constexpr int ScratchpadL2Mask16 = (ScratchpadL2 / 2 - 1) * 16; constexpr uint32_t TransformationCount = 90; constexpr int RegistersCount = 8; @@ -129,7 +131,7 @@ namespace RandomX { typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, RegisterFile&); - typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*); + typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t); extern "C" { void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t); diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 17e593d..be3bc82 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -21,14 +21,6 @@ _RANDOMX_EXECUTE_PROGRAM SEGMENT PAGE READ EXECUTE PUBLIC executeProgram -ALIGN 16 -minDbl: -db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0 -absMask: -db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127 -signMask: -db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128 - executeProgram PROC ; REGISTER ALLOCATION: ; rax -> temporary @@ -114,6 +106,17 @@ executeProgram PROC movapd xmm14, xmmword ptr 
[absMask] movapd xmm15, xmmword ptr [signMask] + jmp program_begin + +ALIGN 64 +minDbl: + db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0 +absMask: + db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127 +signMask: + db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128 + +ALIGN 64 program_begin: xor eax, r8d ;# read address register 1 and eax, 262080 @@ -144,7 +147,7 @@ program_begin: ;# 256 instructions include program.inc - + mov eax, r8d ;# read address register 1 xor eax, r9d ;# read address register 2 xor rbp, rax ;# modify "mx" diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 242b5bd..86285de 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -22,21 +22,21 @@ along with RandomX. If not, see. //Integer #define WT_IADD_R 10 #define WT_IADD_M 3 -#define WT_IADD_RC 12 +#define WT_IADD_RC 10 #define WT_ISUB_R 10 #define WT_ISUB_M 3 -#define WT_IMUL_9C 12 -#define WT_IMUL_R 24 -#define WT_IMUL_M 8 +#define WT_IMUL_9C 10 +#define WT_IMUL_R 20 +#define WT_IMUL_M 6 #define WT_IMULH_R 6 #define WT_IMULH_M 2 #define WT_ISMULH_R 6 #define WT_ISMULH_M 2 #define WT_IDIV_C 4 -#define WT_ISDIV_C 2 -#define WT_INEG_R 4 -#define WT_IXOR_R 15 -#define WT_IXOR_M 5 +#define WT_ISDIV_C 4 +#define WT_INEG_R 2 +#define WT_IXOR_R 12 +#define WT_IXOR_M 4 #define WT_IROR_R 10 #define WT_IROL_R 10 @@ -58,10 +58,14 @@ along with RandomX. If not, see. 
#define WT_FPSQRT_R 6 //Control -#define WT_COND_R 15 -#define WT_COND_M 5 +#define WT_COND_R 12 +#define WT_COND_M 4 #define WT_CFROUND 1 +//Store +#define WT_ISTORE 12 +#define WT_FSTORE 6 + #define WT_NOP 0 constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \ @@ -70,7 +74,7 @@ WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \ WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \ WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \ WT_FPNEG_R + WT_FPMUL_R + WT_FPMUL_M + WT_FPDIV_R + WT_FPDIV_M + \ -WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_NOP; +WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP; static_assert(wtSum == 256, "Sum of instruction weights must be 256"); @@ -116,3 +120,40 @@ static_assert(wtSum == 256, #define REPN(x,N) REPNX(x,N) #define NUM(x) x #define WT(x) NUM(WT_##x) + +#define REPCASE0(x) +#define REPCASE1(x) case __COUNTER__: +#define REPCASE2(x) REPCASE1(x) case __COUNTER__: +#define REPCASE3(x) REPCASE2(x) case __COUNTER__: +#define REPCASE4(x) REPCASE3(x) case __COUNTER__: +#define REPCASE5(x) REPCASE4(x) case __COUNTER__: +#define REPCASE6(x) REPCASE5(x) case __COUNTER__: +#define REPCASE7(x) REPCASE6(x) case __COUNTER__: +#define REPCASE8(x) REPCASE7(x) case __COUNTER__: +#define REPCASE9(x) REPCASE8(x) case __COUNTER__: +#define REPCASE10(x) REPCASE9(x) case __COUNTER__: +#define REPCASE11(x) REPCASE10(x) case __COUNTER__: +#define REPCASE12(x) REPCASE11(x) case __COUNTER__: +#define REPCASE13(x) REPCASE12(x) case __COUNTER__: +#define REPCASE14(x) REPCASE13(x) case __COUNTER__: +#define REPCASE15(x) REPCASE14(x) case __COUNTER__: +#define REPCASE16(x) REPCASE15(x) case __COUNTER__: +#define REPCASE17(x) REPCASE16(x) case __COUNTER__: +#define REPCASE18(x) REPCASE17(x) case __COUNTER__: +#define REPCASE19(x) REPCASE18(x) case __COUNTER__: +#define REPCASE20(x) REPCASE19(x) case __COUNTER__: +#define REPCASE21(x) REPCASE20(x) case 
__COUNTER__: +#define REPCASE22(x) REPCASE21(x) case __COUNTER__: +#define REPCASE23(x) REPCASE22(x) case __COUNTER__: +#define REPCASE24(x) REPCASE23(x) case __COUNTER__: +#define REPCASE25(x) REPCASE24(x) case __COUNTER__: +#define REPCASE26(x) REPCASE25(x) case __COUNTER__: +#define REPCASE27(x) REPCASE26(x) case __COUNTER__: +#define REPCASE28(x) REPCASE27(x) case __COUNTER__: +#define REPCASE29(x) REPCASE28(x) case __COUNTER__: +#define REPCASE30(x) REPCASE29(x) case __COUNTER__: +#define REPCASE31(x) REPCASE30(x) case __COUNTER__: +#define REPCASE32(x) REPCASE31(x) case __COUNTER__: +#define REPCASENX(x,N) REPCASE##N(x) +#define REPCASEN(x,N) REPCASENX(x,N) +#define CASE_REP(x) REPCASEN(x, WT(x)) \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 0b09a74..12e9cdb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -174,7 +174,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash for (int chain = 0; chain < 16; ++chain) { vm->initializeProgram(hash); int segment = hash[3] & 3; - vm->setScratchpad(scratchpad);// +segment * RandomX::ScratchpadSize / 4); + vm->setScratchpad(scratchpad + segment * RandomX::ScratchpadSize / 4); vm->execute(); vm->getResult(nullptr, 0, hash); } diff --git a/src/program.inc b/src/program.inc index a91240e..21f7d0b 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,745 +1,793 @@ - ; ISUB_R r0, r4 - sub r8, r12 - ; IROR_R r5, 15 - ror r13, 15 - ; ISUB_M r6, L1[r5] - mov eax, r13d - and eax, 16376 - sub r14, qword ptr [rsi+rax] - ; IMUL_R r7, r6 - imul r15, r14 - ; FPADD_R f3, a1 - addpd xmm3, xmm9 - ; FPMUL_R e1, a3 - mulpd xmm5, xmm11 - ; IMUL_R r2, r4 - imul r10, r12 - ; IADD_RC r4, r5, 1789610138 - lea r12, [r12+r13+1789610138] - ; IADD_R r1, r4 - add r9, r12 - ; IADD_R r6, r0 - add r14, r8 - ; IXOR_R r7, r2 - xor r15, r10 - ; ISMULH_M r6, L1[6816] - mov rax, r14 - imul qword ptr [rsi+6816] - mov r14, rdx - ; ISUB_R r0, r4 - sub r8, r12 - ; IXOR_R r7, r2 - xor r15, r10 - ; INEG_R 
r4 - neg r12 - ; IROL_R r3, r0 - mov ecx, r8d - rol r11, cl - ; IADD_RC r2, r5, -1667142135 - lea r10, [r10+r13-1667142135] - ; ISUB_R r6, r2 - sub r14, r10 - ; IDIV_C r3, 2650709570 - mov rax, 3736177069856446853 - mul r11 - shr rdx, 29 - add r11, rdx - ; IMULH_R r3, r0 - mov rax, r11 - mul r8 - mov r11, rdx - ; FPSUB_R f0, a2 - subpd xmm0, xmm10 - ; FPADD_M f3, L2[r4] - mov eax, r12d + ; FPMUL_R e0, a2 + mulpd xmm4, xmm10 + ; IADD_RC r2, r5, -1621224194 + lea r10, [r10+r13-1621224194] + ; ISTORE L2[r2], r7 + mov eax, r10d and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 - ; FPMUL_M e1, L1[r5] - mov eax, r13d + mov qword ptr [rsi+rax], r15 + ; FPMUL_R e2, a2 + mulpd xmm6, xmm10 + ; IMUL_R r6, r3 + imul r14, r11 + ; FPMUL_R e1, a0 + mulpd xmm5, xmm8 + ; IROR_R r5, r3 + mov ecx, r11d + ror r13, cl + ; FPMUL_R e2, a0 + mulpd xmm6, xmm8 + ; FPNEG_R f3 + xorps xmm3, xmm15 + ; IXOR_R r0, r4 + xor r8, r12 + ; ISMULH_R r3, r7 + mov rax, r11 + imul r15 + mov r11, rdx + ; FPSWAP_R f2 + shufpd xmm2, xmm2, 1 + ; ISMULH_R r6, r0 + mov rax, r14 + imul r8 + mov r14, rdx + ; FPMUL_R e0, a2 + mulpd xmm4, xmm10 + ; ISUB_R r3, r4 + sub r11, r12 + ; IADD_R r7, -1138617760 + add r15, -1138617760 + ; IROR_R r2, r6 + mov ecx, r14d + ror r10, cl + ; FPMUL_R e2, a1 + mulpd xmm6, xmm9 + ; IROR_R r7, r1 + mov ecx, r9d + ror r15, cl + ; COND_M r2, lt(L1[r7], -41618808) + xor ecx, ecx + mov eax, r15d and eax, 16376 + cmp dword ptr [rsi+rax], -41618808 + setl cl + add r10, rcx + ; FPMUL_M e3, L1[r0] + mov eax, r8d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm7, xmm12 + maxpd xmm7, xmm13 + ; CFROUND r1, 43 + mov rax, r9 + rol rax, 34 + and eax, 24576 + or eax, 40896 + mov dword ptr [rsp-8], eax + ldmxcsr dword ptr [rsp-8] + ; FPADD_R f2, a1 + addpd xmm2, xmm9 + ; FPNEG_R f0 + xorps xmm0, xmm15 + ; FSTORE L1[r6], f2 + mov eax, r14d + and eax, 16368 + movapd xmmword ptr [rsi+rax], xmm2 + ; IMUL_9C r6, -45112665 + lea r14, [r14+r14*8-45112665] + ; 
IADD_M r0, L1[r4] + mov eax, r12d + and eax, 16376 + add r8, qword ptr [rsi+rax] + ; ISTORE L1[r4], r3 + mov eax, r12d + and eax, 16376 + mov qword ptr [rsi+rax], r11 + ; ISTORE L1[r6], r6 + mov eax, r14d + and eax, 16376 + mov qword ptr [rsi+rax], r14 + ; COND_R r4, sg(r1, -1189096105) + xor ecx, ecx + cmp r9d, -1189096105 + sets cl + add r12, rcx + ; IXOR_R r2, r5 + xor r10, r13 + ; COND_R r1, be(r5, -965180434) + xor ecx, ecx + cmp r13d, -965180434 + setbe cl + add r9, rcx + ; FPMUL_M e1, L2[r3] + mov eax, r11d + and eax, 262136 cvtdq2pd xmm12, qword ptr [rsi+rax] mulpd xmm5, xmm12 maxpd xmm5, xmm13 - ; IMUL_9C r7, -778247271 - lea r15, [r15+r15*8-778247271] - ; IXOR_R r4, 1846379510 - xor r12, 1846379510 - ; COND_M r6, of(L1[r1], -397786451) + ; IMULH_R r7, r6 + mov rax, r15 + mul r14 + mov r15, rdx + ; ISMULH_M r0, L1[r4] + mov ecx, r12d + and ecx, 16376 + mov rax, r8 + imul qword ptr [rsi+rcx] + mov r8, rdx + ; IMUL_R r5, r3 + imul r13, r11 + ; COND_R r2, of(r0, -1045938770) xor ecx, ecx - mov eax, r9d - and eax, 16376 - cmp dword ptr [rsi+rax], -397786451 + cmp r8d, -1045938770 seto cl - add r14, rcx - ; COND_R r6, of(r3, -1033710571) - xor ecx, ecx - cmp r11d, -1033710571 - seto cl - add r14, rcx - ; COND_M r6, sg(L1[r6], 1413230028) - xor ecx, ecx - mov eax, r14d - and eax, 16376 - cmp dword ptr [rsi+rax], 1413230028 - sets cl - add r14, rcx - ; IDIV_C r0, 2791108943 - mov rax, 1774119268816201525 - mul r8 - shr rdx, 28 - add r8, rdx - ; FPSUB_M f1, L1[r6] - mov eax, r14d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; FPSWAP_R f0 - shufpd xmm0, xmm0, 1 - ; IADD_RC r6, r5, -640194892 - lea r14, [r14+r13-640194892] - ; FPADD_M f0, L1[r2] - mov eax, r10d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm0, xmm12 - ; IMUL_R r6, r5 - imul r14, r13 - ; IROL_R r4, r1 - mov ecx, r9d - rol r12, cl - ; FPDIV_R e2, a0 - divpd xmm6, xmm8 - maxpd xmm6, xmm13 - ; IADD_RC r0, r2, -487084195 - lea r8, [r8+r10-487084195] - ; 
FPADD_R f0, a0 - addpd xmm0, xmm8 - ; IXOR_R r5, r3 - xor r13, r11 - ; IMUL_R r2, r4 - imul r10, r12 - ; FPMUL_R e0, a0 - mulpd xmm4, xmm8 - ; FPSUB_R f3, a3 - subpd xmm3, xmm11 - ; IMUL_M r4, L1[4856] - imul r12, qword ptr [rsi+4856] - ; IMUL_9C r2, 7951348 - lea r10, [r10+r10*8+7951348] - ; COND_R r3, ab(r7, 984532162) - xor ecx, ecx - cmp r15d, 984532162 - seta cl - add r11, rcx - ; IXOR_M r7, L1[r4] + add r10, rcx + ; FPADD_M f3, L1[r4] mov eax, r12d and eax, 16376 - xor r15, qword ptr [rsi+rax] - ; IMUL_R r4, 248971329 - imul r12, 248971329 - ; IXOR_R r3, r1 - xor r11, r9 - ; IMUL_R r3, 2098482639 - imul r11, 2098482639 - ; IXOR_R r6, r3 - xor r14, r11 - ; IXOR_R r5, r4 - xor r13, r12 - ; IADD_R r5, r4 - add r13, r12 - ; IMUL_9C r7, 66530302 - lea r15, [r15+r15*8+66530302] - ; IMULH_R r0, r5 - mov rax, r8 - mul r13 - mov r8, rdx - ; IMUL_R r2, r7 - imul r10, r15 - ; IMUL_R r1, 770985098 - imul r9, 770985098 - ; COND_R r7, be(r5, 58538265) - xor ecx, ecx - cmp r13d, 58538265 - setbe cl - add r15, rcx - ; IMUL_9C r3, 245704334 - lea r11, [r11+r11*8+245704334] - ; ISMULH_R r2, r4 - mov rax, r10 - imul r12 - mov r10, rdx - ; FPDIV_R e3, a3 - divpd xmm7, xmm11 - maxpd xmm7, xmm13 - ; IMULH_R r5, r2 - mov rax, r13 - mul r10 - mov r13, rdx - ; ISUB_M r7, L1[r5] - mov eax, r13d + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm3, xmm12 + ; IADD_R r3, r2 + add r11, r10 + ; FPADD_R f1, a0 + addpd xmm1, xmm8 + ; FPSQRT_R e3 + sqrtpd xmm7, xmm7 + ; FPSUB_R f0, a1 + subpd xmm0, xmm9 + ; IMUL_M r5, L1[r6] + mov eax, r14d and eax, 16376 - sub r15, qword ptr [rsi+rax] - ; FPMUL_R e3, a3 - mulpd xmm7, xmm11 - ; IMUL_R r3, r4 - imul r11, r12 - ; FPSWAP_R f1 - shufpd xmm1, xmm1, 1 - ; IMULH_R r1, 633797287 - mov eax, 633797287 - mul r9 - add r9, rdx - ; IADD_R r4, r3 - add r12, r11 - ; IROR_R r2, r7 - mov ecx, r15d - ror r10, cl - ; FPSUB_R f0, a2 - subpd xmm0, xmm10 - ; FPSUB_R f2, a2 - subpd xmm2, xmm10 - ; FPMUL_R e0, a2 - mulpd xmm4, xmm10 - ; IMUL_M r4, L1[r3] - mov eax, 
r11d - and eax, 16376 - imul r12, qword ptr [rsi+rax] - ; IMUL_9C r1, -1901091890 - lea r9, [r9+r9*8-1901091890] - ; IROR_R r2, r6 - mov ecx, r14d - ror r10, cl - ; IMULH_R r5, r3 - mov rax, r13 - mul r11 - mov r13, rdx - ; FPSUB_M f1, L1[r7] + imul r13, qword ptr [rsi+rax] + ; ISUB_R r1, r2 + sub r9, r10 + ; IMUL_R r4, r6 + imul r12, r14 + ; FPSWAP_R e3 + shufpd xmm7, xmm7, 1 + ; IMUL_M r0, L1[r7] mov eax, r15d and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; IMUL_M r2, L1[r1] - mov eax, r9d - and eax, 16376 - imul r10, qword ptr [rsi+rax] - ; IMUL_R r6, r0 - imul r14, r8 - ; IADD_R r7, r6 - add r15, r14 - ; FPSUB_R f2, a3 - subpd xmm2, xmm11 - ; COND_R r5, no(r2, -1589295370) - xor ecx, ecx - cmp r10d, -1589295370 - setno cl - add r13, rcx - ; IMUL_9C r7, 420978486 - lea r15, [r15+r15*8+420978486] - ; IROL_R r4, r2 - mov ecx, r10d - rol r12, cl - ; IMUL_9C r0, -1084530831 - lea r8, [r8+r8*8-1084530831] - ; FPNEG_R f3 - xorps xmm3, xmm15 - ; IROR_R r6, r4 - mov ecx, r12d - ror r14, cl - ; IROL_R r4, r5 - mov ecx, r13d - rol r12, cl - ; FPSUB_R f2, a3 - subpd xmm2, xmm11 - ; FPMUL_R e2, a2 - mulpd xmm6, xmm10 - ; ISMULH_M r6, L2[98600] - mov rax, r14 - imul qword ptr [rsi+98600] - mov r14, rdx - ; IXOR_R r0, r6 - xor r8, r14 - ; FPSWAP_R f1 - shufpd xmm1, xmm1, 1 - ; FPADD_R f0, a1 - addpd xmm0, xmm9 - ; COND_R r1, ab(r3, -991705199) - xor ecx, ecx - cmp r11d, -991705199 - seta cl - add r9, rcx - ; IMULH_M r4, L2[r2] - mov ecx, r10d - and ecx, 262136 - mov rax, r12 - mul qword ptr [rsi+rcx] - mov r12, rdx - ; IROR_R r2, r6 - mov ecx, r14d - ror r10, cl - ; FPDIV_R e0, a1 - divpd xmm4, xmm9 - maxpd xmm4, xmm13 - ; IMUL_R r1, r7 - imul r9, r15 - ; COND_R r6, ns(r2, 939392855) - xor ecx, ecx - cmp r10d, 939392855 - setns cl - add r14, rcx - ; FPMUL_R e3, a1 - mulpd xmm7, xmm9 - ; COND_R r2, ab(r2, -499266314) - xor ecx, ecx - cmp r10d, -499266314 - seta cl - add r10, rcx - ; COND_M r7, lt(L1[r1], -1624420482) - xor ecx, ecx - mov eax, r9d - 
and eax, 16376 - cmp dword ptr [rsi+rax], -1624420482 - setl cl - add r15, rcx - ; COND_R r1, lt(r1, 1525413977) - xor ecx, ecx - cmp r9d, 1525413977 - setl cl - add r9, rcx - ; IMUL_R r4, r5 - imul r12, r13 - ; IMUL_R r4, r2 - imul r12, r10 - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 - ; ISUB_R r2, r6 - sub r10, r14 - ; FPDIV_R e1, a0 - divpd xmm5, xmm8 - maxpd xmm5, xmm13 - ; FPMUL_R e2, a3 - mulpd xmm6, xmm11 - ; IADD_R r6, 671627590 - add r14, 671627590 - ; COND_M r6, sg(L1[r4], -780452820) - xor ecx, ecx - mov eax, r12d - and eax, 16376 - cmp dword ptr [rsi+rax], -780452820 - sets cl - add r14, rcx - ; IMULH_R r4, r7 - mov rax, r12 - mul r15 - mov r12, rdx - ; FPMUL_R e3, a1 - mulpd xmm7, xmm9 - ; FPADD_R f0, a0 - addpd xmm0, xmm8 - ; FPMUL_R e0, a1 - mulpd xmm4, xmm9 - ; IMUL_R r7, r3 - imul r15, r11 - ; IROL_R r0, r7 - mov ecx, r15d - rol r8, cl - ; IMUL_R r1, r7 - imul r9, r15 - ; COND_R r0, no(r7, 449007464) - xor ecx, ecx - cmp r15d, 449007464 - setno cl - add r8, rcx - ; ISMULH_M r6, L2[134288] - mov rax, r14 - imul qword ptr [rsi+134288] - mov r14, rdx - ; IMULH_R r5, r2 - mov rax, r13 - mul r10 - mov r13, rdx - ; IMULH_R r7, r4 - mov rax, r15 - mul r12 - mov r15, rdx - ; FPDIV_R e3, a0 - divpd xmm7, xmm8 - maxpd xmm7, xmm13 - ; IXOR_R r3, r4 - xor r11, r12 - ; IDIV_C r1, 72349044 - mov rax, 8555331009525020641 - mul r9 - shr rdx, 25 - add r9, rdx - ; IADD_R r5, r4 - add r13, r12 - ; IROR_R r2, r4 - mov ecx, r12d - ror r10, cl - ; FPSUB_M f1, L1[r2] - mov eax, r10d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; FPMUL_R e2, a3 - mulpd xmm6, xmm11 - ; IADD_R r5, r6 - add r13, r14 - ; IXOR_M r1, L1[r4] - mov eax, r12d - and eax, 16376 - xor r9, qword ptr [rsi+rax] - ; ISUB_R r2, -1544880589 - sub r10, -1544880589 - ; FPNEG_R f0 - xorps xmm0, xmm15 + imul r8, qword ptr [rsi+rax] ; IROR_R r1, r6 mov ecx, r14d ror r9, cl - ; IMUL_R r6, r4 - imul r14, r12 - ; IMULH_M r4, L2[r1] - mov ecx, r9d - and ecx, 262136 - mov rax, r12 - mul qword 
ptr [rsi+rcx] - mov r12, rdx - ; IXOR_R r3, r0 - xor r11, r8 - ; FPSWAP_R f0 - shufpd xmm0, xmm0, 1 - ; FPSWAP_R f0 - shufpd xmm0, xmm0, 1 - ; COND_R r0, ns(r2, -308295242) - xor ecx, ecx - cmp r10d, -308295242 - setns cl - add r8, rcx - ; IMUL_9C r1, 591587965 - lea r9, [r9+r9*8+591587965] - ; FPADD_R f3, a1 - addpd xmm3, xmm9 - ; IMUL_R r5, r4 - imul r13, r12 - ; IMUL_M r7, L1[r0] - mov eax, r8d - and eax, 16376 - imul r15, qword ptr [rsi+rax] - ; COND_R r6, sg(r5, -1119525789) - xor ecx, ecx - cmp r13d, -1119525789 - sets cl - add r14, rcx - ; IMUL_M r0, L1[r1] - mov eax, r9d - and eax, 16376 - imul r8, qword ptr [rsi+rax] - ; IADD_M r3, L2[r7] - mov eax, r15d - and eax, 262136 - add r11, qword ptr [rsi+rax] - ; IADD_R r0, r1 - add r8, r9 - ; FPSUB_R f2, a1 - subpd xmm2, xmm9 - ; IXOR_M r0, L2[r7] - mov eax, r15d - and eax, 262136 - xor r8, qword ptr [rsi+rax] - ; COND_R r6, be(r6, 1481939391) - xor ecx, ecx - cmp r14d, 1481939391 - setbe cl - add r14, rcx - ; FPADD_R f0, a1 - addpd xmm0, xmm9 - ; IXOR_R r3, r2 - xor r11, r10 - ; FPSUB_R f0, a1 - subpd xmm0, xmm9 - ; IXOR_R r7, r3 - xor r15, r11 - ; IXOR_M r6, L1[r4] - mov eax, r12d - and eax, 16376 - xor r14, qword ptr [rsi+rax] - ; IMULH_R r2, r7 - mov rax, r10 - mul r15 - mov r10, rdx - ; ISUB_R r5, r1 - sub r13, r9 - ; FPMUL_R e1, a3 - mulpd xmm5, xmm11 - ; FPADD_R f3, a2 - addpd xmm3, xmm10 - ; FPSWAP_R f1 - shufpd xmm1, xmm1, 1 - ; FPSUB_R f1, a3 - subpd xmm1, xmm11 - ; FPSUB_M f0, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm0, xmm12 - ; FPMUL_R e1, a2 - mulpd xmm5, xmm10 - ; FPADD_R f3, a0 - addpd xmm3, xmm8 - ; IROL_R r2, r4 + ; IROR_R r2, r4 mov ecx, r12d - rol r10, cl - ; COND_M r7, ab(L2[r7], -2012390318) - xor ecx, ecx - mov eax, r15d - and eax, 262136 - cmp dword ptr [rsi+rax], -2012390318 - seta cl - add r15, rcx - ; IMUL_9C r4, -38079585 - lea r12, [r12+r12*8-38079585] - ; IXOR_R r0, r1 - xor r8, r9 - ; FPMUL_R e1, a3 - mulpd xmm5, xmm11 - ; FPMUL_R e1, 
a1 - mulpd xmm5, xmm9 - ; FPSUB_R f1, a2 - subpd xmm1, xmm10 - ; IMUL_9C r4, -847745598 - lea r12, [r12+r12*8-847745598] - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 - ; IADD_R r7, r6 - add r15, r14 - ; FPSUB_R f3, a0 - subpd xmm3, xmm8 - ; FPSUB_R f1, a1 - subpd xmm1, xmm9 - ; IADD_R r7, r6 - add r15, r14 - ; IROL_R r2, r5 - mov ecx, r13d - rol r10, cl - ; IADD_RC r4, r2, 1338806320 - lea r12, [r12+r10+1338806320] - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 - ; IMUL_R r5, r0 - imul r13, r8 - ; FPADD_R f2, a1 - addpd xmm2, xmm9 - ; INEG_R r6 - neg r14 - ; IXOR_M r6, L1[r2] - mov eax, r10d - and eax, 16376 - xor r14, qword ptr [rsi+rax] - ; FPSUB_R f2, a2 - subpd xmm2, xmm10 - ; FPADD_R f2, a2 - addpd xmm2, xmm10 - ; FPADD_R f1, a2 - addpd xmm1, xmm10 - ; COND_R r3, be(r4, 174667458) - xor ecx, ecx - cmp r12d, 174667458 - setbe cl - add r11, rcx - ; INEG_R r6 - neg r14 - ; IXOR_R r6, r3 - xor r14, r11 - ; COND_M r5, sg(L1[r0], -864345921) - xor ecx, ecx - mov eax, r8d - and eax, 16376 - cmp dword ptr [rsi+rax], -864345921 - sets cl - add r13, rcx - ; IROL_R r7, r3 - mov ecx, r11d - rol r15, cl - ; FPSUB_R f1, a2 - subpd xmm1, xmm10 - ; IADD_M r1, L1[r0] - mov eax, r8d - and eax, 16376 - add r9, qword ptr [rsi+rax] - ; IMULH_R r1, r3 - mov rax, r9 - mul r11 - mov r9, rdx - ; IMUL_R r0, -1489192296 - imul r8, -1489192296 - ; FPMUL_R e0, a2 - mulpd xmm4, xmm10 - ; COND_R r1, ge(r1, -1358904097) - xor ecx, ecx - cmp r9d, -1358904097 - setge cl - add r9, rcx - ; FPSUB_R f1, a1 - subpd xmm1, xmm9 - ; FPADD_R f2, a3 - addpd xmm2, xmm11 - ; IROR_R r4, r7 - mov ecx, r15d - ror r12, cl - ; ISDIV_C r1, -1368098113 - mov rax, -7238896260565957085 - imul r9 - xor eax, eax - sar rdx, 29 - sets al - add rdx, rax - add r9, rdx - ; IADD_M r4, L1[r1] - mov eax, r9d - and eax, 16376 - add r12, qword ptr [rsi+rax] - ; IMUL_R r0, -1011605520 - imul r8, -1011605520 + ror r10, cl ; FPSUB_R f3, a1 subpd xmm3, xmm9 - ; IADD_RC r1, r4, 272540736 - lea r9, [r9+r12+272540736] + ; FSTORE L1[r0], e1 + mov eax, 
r8d + and eax, 16368 + movapd xmmword ptr [rsi+rax], xmm5 + ; COND_R r2, sg(r3, 1269153133) + xor ecx, ecx + cmp r11d, 1269153133 + sets cl + add r10, rcx ; FPSWAP_R f2 shufpd xmm2, xmm2, 1 - ; IROR_R r3, r2 - mov ecx, r10d - ror r11, cl - ; IMUL_R r3, 2085105439 - imul r11, 2085105439 - ; FPMUL_R e0, a0 - mulpd xmm4, xmm8 - ; IMUL_9C r6, -483723153 - lea r14, [r14+r14*8-483723153] - ; FPSUB_M f3, L1[r7] - mov eax, r15d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm3, xmm12 - ; IMUL_R r3, r2 - imul r11, r10 - ; ISMULH_R r7, r1 - mov rax, r15 - imul r9 - mov r15, rdx - ; COND_R r1, of(r7, 778804236) + ; IADD_R r7, r5 + add r15, r13 + ; COND_R r0, be(r4, -1486502150) xor ecx, ecx - cmp r15d, 778804236 - seto cl - add r9, rcx - ; FPSUB_R f3, a2 - subpd xmm3, xmm10 - ; IROL_R r5, r7 - mov ecx, r15d - rol r13, cl - ; FPADD_R f1, a0 - addpd xmm1, xmm8 - ; FPADD_R f2, a3 - addpd xmm2, xmm11 - ; IMUL_R r6, r0 - imul r14, r8 - ; ISUB_M r2, L2[r4] - mov eax, r12d - and eax, 262136 - sub r10, qword ptr [rsi+rax] - ; IXOR_R r0, r6 - xor r8, r14 - ; INEG_R r6 - neg r14 - ; FPMUL_R e2, a3 - mulpd xmm6, xmm11 - ; IADD_RC r4, r6, -1312075035 - lea r12, [r12+r14-1312075035] - ; IMUL_R r1, r5 - imul r9, r13 - ; IXOR_M r7, L2[r6] - mov eax, r14d - and eax, 262136 - xor r15, qword ptr [rsi+rax] - ; IROR_R r2, 23 - ror r10, 23 - ; FPMUL_R e0, a2 - mulpd xmm4, xmm10 - ; ISMULH_M r5, L1[r2] - mov ecx, r10d - and ecx, 16376 - mov rax, r13 - imul qword ptr [rsi+rcx] - mov r13, rdx - ; ISUB_M r7, L1[r4] - mov eax, r12d - and eax, 16376 - sub r15, qword ptr [rsi+rax] - ; COND_R r0, sg(r2, 1538841628) - xor ecx, ecx - cmp r10d, 1538841628 - sets cl + cmp r12d, -1486502150 + setbe cl add r8, rcx - ; IMUL_R r6, r2 - imul r14, r10 - ; ISUB_R r0, r1 - sub r8, r9 + ; FPSUB_R f3, a1 + subpd xmm3, xmm9 + ; FPADD_R f0, a3 + addpd xmm0, xmm11 + ; IADD_R r2, r0 + add r10, r8 + ; FSTORE L1[r3], e2 + mov eax, r11d + and eax, 16368 + movapd xmmword ptr [rsi+rax], xmm6 + ; IXOR_R r1, r7 
+ xor r9, r15 ; IMUL_R r5, r7 imul r13, r15 - ; IADD_RC r1, r0, 516706834 - lea r9, [r9+r8+516706834] - ; INEG_R r5 - neg r13 + ; IXOR_R r7, 266992378 + xor r15, 266992378 + ; COND_R r7, no(r4, 1983804692) + xor ecx, ecx + cmp r12d, 1983804692 + setno cl + add r15, rcx + ; IMUL_M r2, L2[r0] + mov eax, r8d + and eax, 262136 + imul r10, qword ptr [rsi+rax] + ; FPDIV_R e3, a2 + divpd xmm7, xmm10 + maxpd xmm7, xmm13 + ; IMUL_M r0, L2[r6] + mov eax, r14d + and eax, 262136 + imul r8, qword ptr [rsi+rax] + ; ISTORE L1[r0], r7 + mov eax, r8d + and eax, 16376 + mov qword ptr [rsi+rax], r15 + ; FPMUL_R e0, a1 + mulpd xmm4, xmm9 + ; FPSUB_R f3, a1 + subpd xmm3, xmm9 + ; IROR_R r5, r4 + mov ecx, r12d + ror r13, cl + ; ISTORE L2[r7], r2 + mov eax, r15d + and eax, 262136 + mov qword ptr [rsi+rax], r10 + ; FPSWAP_R e2 + shufpd xmm6, xmm6, 1 + ; FPADD_M f3, L1[r2] + mov eax, r10d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm3, xmm12 + ; IDIV_C r5, 2218798981 + mov rax, 17853839665672790751 + mul r13 + shr rdx, 31 + add r13, rdx + ; IADD_RC r0, r4, -1321374359 + lea r8, [r8+r12-1321374359] + ; CFROUND r6, 28 + mov rax, r14 + rol rax, 49 + and eax, 24576 + or eax, 40896 + mov dword ptr [rsp-8], eax + ldmxcsr dword ptr [rsp-8] + ; FPADD_R f2, a2 + addpd xmm2, xmm10 + ; IROL_R r7, r6 + mov ecx, r14d + rol r15, cl + ; ISUB_R r2, r4 + sub r10, r12 + ; IMULH_M r0, L1[12400] + mov rax, r8 + mul qword ptr [rsi+12400] + mov r8, rdx + ; IADD_R r2, r3 + add r10, r11 + ; COND_R r6, lt(r1, -1124202227) + xor ecx, ecx + cmp r9d, -1124202227 + setl cl + add r14, rcx + ; IROR_R r7, r4 + mov ecx, r12d + ror r15, cl + ; IMUL_R r4, r2 + imul r12, r10 + ; ISUB_R r3, r7 + sub r11, r15 + ; IADD_R r2, r7 + add r10, r15 ; FPSQRT_R e3 sqrtpd xmm7, xmm7 - ; IADD_RC r5, r4, -1679394922 - lea r13, [r13+r12-1679394922] - ; FPSUB_R f1, a1 - subpd xmm1, xmm9 - ; IMUL_R r0, r2 - imul r8, r10 - ; ISUB_R r3, r2 - sub r11, r10 + ; ISUB_R r6, 540663146 + sub r14, 540663146 + ; IROL_R r5, 58 + rol 
r13, 58 + ; FPADD_R f2, a1 + addpd xmm2, xmm9 + ; FPADD_R f2, a2 + addpd xmm2, xmm10 + ; FPSQRT_R e1 + sqrtpd xmm5, xmm5 + ; FPADD_R f1, a2 + addpd xmm1, xmm10 + ; IADD_R r5, r3 + add r13, r11 + ; IADD_M r7, L1[880] + add r15, qword ptr [rsi+880] + ; ISUB_R r7, r0 + sub r15, r8 + ; ISTORE L2[r0], r7 + mov eax, r8d + and eax, 262136 + mov qword ptr [rsi+rax], r15 + ; IDIV_C r2, 1014940364 + mov rax, r10 + shr rax, 2 + mov rcx, 1219717022984988185 + mul rcx + shr rdx, 24 + add r10, rdx + ; FPMUL_R e0, a2 + mulpd xmm4, xmm10 + ; IDIV_C r2, 3059159304 + mov rax, 12949335853590502915 + mul r10 + shr rdx, 31 + add r10, rdx + ; IADD_R r0, r3 + add r8, r11 + ; IMUL_9C r7, -2124093035 + lea r15, [r15+r15*8-2124093035] + ; FPSUB_R f2, a0 + subpd xmm2, xmm8 + ; FPDIV_R e0, a2 + divpd xmm4, xmm10 + maxpd xmm4, xmm13 + ; FPSUB_R f2, a3 + subpd xmm2, xmm11 + ; IMUL_R r1, r2 + imul r9, r10 + ; ISMULH_R r7, r5 + mov rax, r15 + imul r13 + mov r15, rdx + ; IMULH_R r3, r2 + mov rax, r11 + mul r10 + mov r11, rdx + ; IXOR_M r1, L2[r0] + mov eax, r8d + and eax, 262136 + xor r9, qword ptr [rsi+rax] + ; FPMUL_R e0, a1 + mulpd xmm4, xmm9 + ; ISUB_R r4, 1456841848 + sub r12, 1456841848 + ; IXOR_M r3, L2[r2] + mov eax, r10d + and eax, 262136 + xor r11, qword ptr [rsi+rax] + ; COND_M r0, of(L1[r4], 1678513610) + xor ecx, ecx + mov eax, r12d + and eax, 16376 + cmp dword ptr [rsi+rax], 1678513610 + seto cl + add r8, rcx + ; IDIV_C r4, 2674394209 + mov rax, 925772300223658071 + mul r12 + shr rdx, 27 + add r12, rdx + ; IMUL_R r4, r1 + imul r12, r9 + ; FPADD_R f1, a2 + addpd xmm1, xmm10 + ; FPSUB_R f2, a0 + subpd xmm2, xmm8 + ; FPMUL_M e1, L2[r6] + mov eax, r14d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm5, xmm12 + maxpd xmm5, xmm13 + ; FPSUB_M f0, L2[r3] + mov eax, r11d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm0, xmm12 + ; IROR_R r0, r7 + mov ecx, r15d + ror r8, cl + ; FSTORE L2[r1], e0 + mov eax, r9d + and eax, 262128 + movapd xmmword ptr 
[rsi+rax], xmm4 + ; IROR_R r7, r6 + mov ecx, r14d + ror r15, cl + ; IMUL_9C r2, 266593902 + lea r10, [r10+r10*8+266593902] + ; IMUL_R r4, r6 + imul r12, r14 + ; FPSUB_R f2, a2 + subpd xmm2, xmm10 + ; FPMUL_R e3, a0 + mulpd xmm7, xmm8 + ; IXOR_M r7, L1[r2] + mov eax, r10d + and eax, 16376 + xor r15, qword ptr [rsi+rax] + ; IROR_R r0, r5 + mov ecx, r13d + ror r8, cl + ; FPADD_R f1, a2 + addpd xmm1, xmm10 + ; FPSQRT_R e3 + sqrtpd xmm7, xmm7 + ; FPADD_R f3, a1 + addpd xmm3, xmm9 + ; FPADD_R f1, a0 + addpd xmm1, xmm8 + ; COND_M r2, ge(L2[r2], -226330940) + xor ecx, ecx + mov eax, r10d + and eax, 262136 + cmp dword ptr [rsi+rax], -226330940 + setge cl + add r10, rcx + ; FPDIV_R e2, a3 + divpd xmm6, xmm11 + maxpd xmm6, xmm13 + ; FPMUL_R e2, a1 + mulpd xmm6, xmm9 + ; FPSUB_R f1, a0 + subpd xmm1, xmm8 + ; IMUL_R r7, r5 + imul r15, r13 + ; IMUL_R r0, r1 + imul r8, r9 + ; FPSUB_R f3, a1 + subpd xmm3, xmm9 + ; IROL_R r3, r5 + mov ecx, r13d + rol r11, cl + ; IADD_RC r5, r2, 795784298 + lea r13, [r13+r10+795784298] + ; ISUB_R r0, r4 + sub r8, r12 + ; IMUL_R r5, r4 + imul r13, r12 + ; FPSUB_R f0, a2 + subpd xmm0, xmm10 + ; FPMUL_R e3, a1 + mulpd xmm7, xmm9 + ; ISDIV_C r3, 1662492575 + mov rax, 2978515652703905219 + imul r11 + xor eax, eax + sar rdx, 28 + sets al + add rdx, rax + add r11, rdx + ; ISMULH_R r5, r0 + mov rax, r13 + imul r8 + mov r13, rdx + ; ISDIV_C r4, 1963597892 + mov rax, -8359627607928540073 + imul r12 + xor eax, eax + add rdx, r12 + sar rdx, 30 + sets al + add rdx, rax + add r12, rdx + ; IMUL_R r7, r0 + imul r15, r8 + ; IMULH_M r0, L1[r3] + mov ecx, r11d + and ecx, 16376 + mov rax, r8 + mul qword ptr [rsi+rcx] + mov r8, rdx + ; IXOR_R r3, r7 + xor r11, r15 + ; IDIV_C r4, 1146125335 + mov rax, 8640870253760721727 + mul r12 + shr rdx, 29 + add r12, rdx + ; FPSWAP_R f3 + shufpd xmm3, xmm3, 1 + ; IXOR_M r2, L1[r0] + mov eax, r8d + and eax, 16376 + xor r10, qword ptr [rsi+rax] + ; IROR_R r0, r1 + mov ecx, r9d + ror r8, cl + ; IXOR_R r7, r4 + xor r15, r12 + ; ISMULH_R 
r6, r2 + mov rax, r14 + imul r10 + mov r14, rdx + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IADD_RC r4, r2, 1704868083 + lea r12, [r12+r10+1704868083] + ; FPSUB_R f2, a0 + subpd xmm2, xmm8 + ; ISTORE L1[r0], r0 + mov eax, r8d + and eax, 16376 + mov qword ptr [rsi+rax], r8 + ; FPSUB_R f0, a3 + subpd xmm0, xmm11 ; FPDIV_R e0, a3 divpd xmm4, xmm11 maxpd xmm4, xmm13 - ; ISUB_R r1, r5 - sub r9, r13 - ; COND_M r2, be(L2[r2], 1840094725) + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; ISUB_R r7, 1302457878 + sub r15, 1302457878 + ; IMUL_9C r1, 1330165941 + lea r9, [r9+r9*8+1330165941] + ; FPMUL_R e1, a3 + mulpd xmm5, xmm11 + ; IROL_R r0, r4 + mov ecx, r12d + rol r8, cl + ; FPSUB_M f1, L1[r0] + mov eax, r8d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm1, xmm12 + ; IROL_R r5, r6 + mov ecx, r14d + rol r13, cl + ; COND_M r0, ab(L1[r1], -310933871) xor ecx, ecx + mov eax, r9d + and eax, 16376 + cmp dword ptr [rsi+rax], -310933871 + seta cl + add r8, rcx + ; CFROUND r7, 39 + mov rax, r15 + rol rax, 38 + and eax, 24576 + or eax, 40896 + mov dword ptr [rsp-8], eax + ldmxcsr dword ptr [rsp-8] + ; FPDIV_R e0, a1 + divpd xmm4, xmm9 + maxpd xmm4, xmm13 + ; IMUL_M r1, L1[r3] + mov eax, r11d + and eax, 16376 + imul r9, qword ptr [rsi+rax] + ; IMUL_9C r3, 1573236728 + lea r11, [r11+r11*8+1573236728] + ; FPNEG_R f3 + xorps xmm3, xmm15 + ; COND_R r1, lt(r4, -1805702334) + xor ecx, ecx + cmp r12d, -1805702334 + setl cl + add r9, rcx + ; FPSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; IADD_R r7, -1421188024 + add r15, -1421188024 + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; FPSUB_M f2, L2[r7] + mov eax, r15d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 + ; FPSUB_R f3, a1 + subpd xmm3, xmm9 + ; FPSQRT_R e1 + sqrtpd xmm5, xmm5 + ; ISUB_R r2, r4 + sub r10, r12 + ; ISMULH_R r4, r5 + mov rax, r12 + imul r13 + mov r12, rdx + ; COND_R r1, of(r7, 1294727006) + xor ecx, ecx + cmp r15d, 1294727006 + seto cl + add r9, rcx + ; IADD_M r5, L2[r2] mov eax, r10d and eax, 
262136 - cmp dword ptr [rsi+rax], 1840094725 - setbe cl + add r13, qword ptr [rsi+rax] + ; IMUL_9C r4, 401020510 + lea r12, [r12+r12*8+401020510] + ; IROL_R r3, r0 + mov ecx, r8d + rol r11, cl + ; ISTORE L1[r7], r0 + mov eax, r15d + and eax, 16376 + mov qword ptr [rsi+rax], r8 + ; FPSUB_R f2, a1 + subpd xmm2, xmm9 + ; FPSQRT_R e3 + sqrtpd xmm7, xmm7 + ; IMUL_R r3, 720965215 + imul r11, 720965215 + ; IMUL_R r6, r2 + imul r14, r10 + ; ISTORE L1[r7], r3 + mov eax, r15d + and eax, 16376 + mov qword ptr [rsi+rax], r11 + ; IROR_R r2, r6 + mov ecx, r14d + ror r10, cl + ; FPSQRT_R e3 + sqrtpd xmm7, xmm7 + ; IMUL_9C r4, 788211341 + lea r12, [r12+r12*8+788211341] + ; IMUL_9C r3, -67993446 + lea r11, [r11+r11*8-67993446] + ; FPSWAP_R e3 + shufpd xmm7, xmm7, 1 + ; IMUL_M r2, L1[r6] + mov eax, r14d + and eax, 16376 + imul r10, qword ptr [rsi+rax] + ; COND_M r2, ge(L1[r2], -1892157506) + xor ecx, ecx + mov eax, r10d + and eax, 16376 + cmp dword ptr [rsi+rax], -1892157506 + setge cl add r10, rcx + ; FPADD_M f1, L1[r3] + mov eax, r11d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm1, xmm12 + ; IADD_M r7, L1[r0] + mov eax, r8d + and eax, 16376 + add r15, qword ptr [rsi+rax] + ; ISDIV_C r1, 624867857 + mov rax, 7924491717200811467 + imul r9 + xor eax, eax + sar rdx, 28 + sets al + add rdx, rax + add r9, rdx + ; FPADD_R f0, a1 + addpd xmm0, xmm9 + ; ISUB_R r5, r7 + sub r13, r15 + ; FPNEG_R f0 + xorps xmm0, xmm15 + ; IMUL_R r6, r2 + imul r14, r10 + ; FPMUL_M e3, L1[r1] + mov eax, r9d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm7, xmm12 + maxpd xmm7, xmm13 + ; IADD_R r0, r4 + add r8, r12 + ; FPSUB_M f3, L1[r1] + mov eax, r9d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm3, xmm12 + ; FPMUL_R e2, a0 + mulpd xmm6, xmm8 + ; INEG_R r2 + neg r10 + ; FPMUL_R e2, a2 + mulpd xmm6, xmm10 + ; FPSUB_M f3, L1[r6] + mov eax, r14d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm3, xmm12 + ; FPADD_R f1, a3 + addpd xmm1, 
xmm11 + ; IMULH_R r3, r2 + mov rax, r11 + mul r10 + mov r11, rdx + ; FPSUB_R f0, a3 + subpd xmm0, xmm11 + ; IDIV_C r5, 2887845607 + mov rax, 13717520480010955377 + mul r13 + shr rdx, 31 + add r13, rdx + ; ISMULH_M r6, L1[r2] + mov ecx, r10d + and ecx, 16376 + mov rax, r14 + imul qword ptr [rsi+rcx] + mov r14, rdx + ; FPSUB_R f3, a3 + subpd xmm3, xmm11 ; IMUL_M r6, L1[r7] mov eax, r15d and eax, 16376 imul r14, qword ptr [rsi+rax] - ; IMULH_M r6, L1[r5] - mov ecx, r13d - and ecx, 16376 - mov rax, r14 - mul qword ptr [rsi+rcx] - mov r14, rdx - ; IMUL_9C r7, -1048659408 - lea r15, [r15+r15*8-1048659408] - ; IMUL_R r6, r3 - imul r14, r11 - ; FPADD_R f3, a0 - addpd xmm3, xmm8 - ; IMULH_R r0, r3 - mov rax, r8 - mul r11 - mov r8, rdx - ; FPSWAP_R f0 - shufpd xmm0, xmm0, 1 + ; FPNEG_R f0 + xorps xmm0, xmm15 + ; FPMUL_R e2, a0 + mulpd xmm6, xmm8 + ; IMUL_9C r6, 295130073 + lea r14, [r14+r14*8+295130073] + ; FPADD_R f1, a1 + addpd xmm1, xmm9 + ; IXOR_R r0, r5 + xor r8, r13 + ; FPADD_R f2, a1 + addpd xmm2, xmm9 + ; FPSWAP_R e3 + shufpd xmm7, xmm7, 1 ; FPSQRT_R e3 sqrtpd xmm7, xmm7 - ; IMULH_R r2, r0 - mov rax, r10 - mul r8 - mov r10, rdx - ; FPDIV_R e1, a1 - divpd xmm5, xmm9 - maxpd xmm5, xmm13 + ; IADD_RC r3, r6, -1317630728 + lea r11, [r11+r14-1317630728] + ; IMUL_M r2, L1[r3] + mov eax, r11d + and eax, 16376 + imul r10, qword ptr [rsi+rax] + ; IADD_RC r1, r4, 894105694 + lea r9, [r9+r12+894105694] + ; IMUL_R r7, r0 + imul r15, r8 + ; FPSUB_R f1, a0 + subpd xmm1, xmm8 + ; IMUL_M r7, L1[r1] + mov eax, r9d + and eax, 16376 + imul r15, qword ptr [rsi+rax] + ; IXOR_R r2, r4 + xor r10, r12 + ; ISUB_M r0, L1[r1] + mov eax, r9d + and eax, 16376 + sub r8, qword ptr [rsi+rax] + ; INEG_R r4 + neg r12 + ; IMUL_9C r4, -285272388 + lea r12, [r12+r12*8-285272388] + ; IMUL_R r7, r4 + imul r15, r12 + ; IMULH_M r5, L1[r7] + mov ecx, r15d + and ecx, 16376 + mov rax, r13 + mul qword ptr [rsi+rcx] + mov r13, rdx + ; IROL_R r1, r7 + mov ecx, r15d + rol r9, cl + ; IXOR_R r4, -757532727 + xor r12, 
-757532727 + ; IMUL_R r3, 1863959234 + imul r11, 1863959234 + ; IROL_R r4, 59 + rol r12, 59 + ; ISMULH_R r1, 2122681086 + mov rax, 2122681086 + imul r9 + add r9, rdx + ; ISTORE L2[r6], r7 + mov eax, r14d + and eax, 262136 + mov qword ptr [rsi+rax], r15 + ; ISTORE L1[r1], r5 + mov eax, r9d + and eax, 16376 + mov qword ptr [rsi+rax], r13 + ; FPMUL_R e0, a1 + mulpd xmm4, xmm9 + ; COND_R r2, ns(r1, 486049737) + xor ecx, ecx + cmp r9d, 486049737 + setns cl + add r10, rcx + ; FPMUL_M e0, L2[r7] + mov eax, r15d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm4, xmm12 + maxpd xmm4, xmm13 + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IROL_R r5, r2 + mov ecx, r10d + rol r13, cl + ; IADD_M r0, L1[r4] + mov eax, r12d + and eax, 16376 + add r8, qword ptr [rsi+rax] From 8f2abd6c05bf1be3c8667ca84dd7683c1cb9cde1 Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 27 Jan 2019 18:19:49 +0100 Subject: [PATCH 26/35] NOP instruction register load/store from L3 --- src/AssemblyGeneratorX86.cpp | 6 + src/AssemblyGeneratorX86.hpp | 1 + src/Instruction.cpp | 8 + src/Instruction.hpp | 1 + src/JitCompilerX86.cpp | 7 +- src/JitCompilerX86.hpp | 1 + src/asm/program_load_flt.inc | 2 +- src/asm/program_load_int.inc | 2 +- src/asm/program_store_flt.inc | 2 +- src/asm/program_store_int.inc | 2 +- src/common.hpp | 2 +- src/executeProgram-win64.asm | 8 +- src/instructionWeights.hpp | 51 +-- src/main.cpp | 4 +- src/program.inc | 760 ++++++++-------------------------- 15 files changed, 233 insertions(+), 624 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 11bb3f0..a46fe5d 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -491,6 +491,10 @@ namespace RandomX { asmCode << "\tmovapd xmmword ptr [rsi+rax], " << regFE[instr.src] << std::endl; } + void AssemblyGeneratorX86::h_NOP(Instruction& instr, int i) { + asmCode << "\tnop" << std::endl; + } + #include "instructionWeights.hpp" #define INST_HANDLE(x) 
REPN(&AssemblyGeneratorX86::h_##x, WT(x)) @@ -540,5 +544,7 @@ namespace RandomX { INST_HANDLE(ISTORE) INST_HANDLE(FSTORE) + + INST_HANDLE(NOP) }; } \ No newline at end of file diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 5c22142..6b0c505 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -79,5 +79,6 @@ namespace RandomX { void h_CFROUND(Instruction&, int); void h_ISTORE(Instruction&, int); void h_FSTORE(Instruction&, int); + void h_NOP(Instruction&, int); }; } \ No newline at end of file diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 13cfc1d..0aa0289 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -327,6 +327,10 @@ namespace RandomX { os << ", " << reg << srcIndex << std::endl; } + void Instruction::h_NOP(std::ostream& os) const { + os << std::endl; + } + #include "instructionWeights.hpp" #define INST_NAME(x) REPN(#x, WT(x)) #define INST_HANDLE(x) REPN(&Instruction::h_##x, WT(x)) @@ -377,6 +381,8 @@ namespace RandomX { INST_NAME(ISTORE) INST_NAME(FSTORE) + + INST_NAME(NOP) }; InstructionVisualizer Instruction::engine[256] = { @@ -425,6 +431,8 @@ namespace RandomX { INST_HANDLE(ISTORE) INST_HANDLE(FSTORE) + + INST_HANDLE(NOP) }; } \ No newline at end of file diff --git a/src/Instruction.hpp b/src/Instruction.hpp index 017d92f..ffa3880 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -86,6 +86,7 @@ namespace RandomX { void h_CFROUND(std::ostream&) const; void h_ISTORE(std::ostream&) const; void h_FSTORE(std::ostream&) const; + void h_NOP(std::ostream&) const; }; static_assert(sizeof(Instruction) == 8, "Invalid alignment of struct Instruction"); diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index e001464..30c6f73 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -181,7 +181,7 @@ namespace RandomX { static const uint8_t JMP = 0xe9; size_t JitCompilerX86::getCodeSize() { - return codePos - prologueSize + readDatasetSize; + return 
codePos - prologueSize; } JitCompilerX86::JitCompilerX86() { @@ -761,6 +761,10 @@ namespace RandomX { emitByte(0x06); } + void JitCompilerX86::h_NOP(Instruction& instr) { + emitByte(0x90); + } + #include "instructionWeights.hpp" #define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x)) @@ -800,6 +804,7 @@ namespace RandomX { INST_HANDLE(CFROUND) INST_HANDLE(ISTORE) INST_HANDLE(FSTORE) + INST_HANDLE(NOP) }; diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index fa5aa93..0aef990 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -125,6 +125,7 @@ namespace RandomX { void h_CFROUND(Instruction&); void h_ISTORE(Instruction&); void h_FSTORE(Instruction&); + void h_NOP(Instruction&); }; } \ No newline at end of file diff --git a/src/asm/program_load_flt.inc b/src/asm/program_load_flt.inc index af6f1b7..2c631ce 100644 --- a/src/asm/program_load_flt.inc +++ b/src/asm/program_load_flt.inc @@ -1,4 +1,4 @@ - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm1, qword ptr [rcx+8] diff --git a/src/asm/program_load_int.inc b/src/asm/program_load_int.inc index d139549..d9277ed 100644 --- a/src/asm/program_load_int.inc +++ b/src/asm/program_load_int.inc @@ -1,4 +1,4 @@ - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] xor r8, qword ptr [rcx+0] xor r9, qword ptr [rcx+8] diff --git a/src/asm/program_store_flt.inc b/src/asm/program_store_flt.inc index d6ca7f1..4bbab9f 100644 --- a/src/asm/program_store_flt.inc +++ b/src/asm/program_store_flt.inc @@ -1,4 +1,4 @@ - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] mulpd xmm0, xmm4 mulpd xmm1, xmm5 diff --git a/src/asm/program_store_int.inc b/src/asm/program_store_int.inc index 75c973f..03dd31a 100644 --- a/src/asm/program_store_int.inc +++ b/src/asm/program_store_int.inc @@ -1,4 +1,4 @@ - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 diff --git a/src/common.hpp b/src/common.hpp index 
053f2a1..bbd5a2b 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -72,7 +72,7 @@ namespace RandomX { convertible_t hi; }; - constexpr int ProgramLength = 256; + constexpr int ProgramLength = 128; constexpr uint32_t InstructionCount = 1024; constexpr uint32_t ScratchpadSize = 1024 * 1024; constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t); diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index be3bc82..e9bc30a 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -119,7 +119,7 @@ signMask: ALIGN 64 program_begin: xor eax, r8d ;# read address register 1 - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] xor r8, qword ptr [rcx+0] xor r9, qword ptr [rcx+8] @@ -130,7 +130,7 @@ program_begin: xor r14, qword ptr [rcx+48] xor r15, qword ptr [rcx+56] xor eax, r9d ;# read address register 2 - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm1, qword ptr [rcx+8] @@ -166,7 +166,7 @@ program_begin: xor r14, qword ptr [rcx+48] xor r15, qword ptr [rcx+56] mov eax, r12d ;# write address register 1 - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 @@ -177,7 +177,7 @@ program_begin: mov qword ptr [rcx+48], r14 mov qword ptr [rcx+56], r15 xor eax, r13d ;# write address register 2 - and eax, 262080 + and eax, 1048512 lea rcx, [rsi+rax] mulpd xmm0, xmm4 mulpd xmm1, xmm5 diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 86285de..55c9b79 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -20,51 +20,51 @@ along with RandomX. If not, see. 
#pragma once //Integer -#define WT_IADD_R 10 +#define WT_IADD_R 12 #define WT_IADD_M 3 -#define WT_IADD_RC 10 -#define WT_ISUB_R 10 +#define WT_IADD_RC 12 +#define WT_ISUB_R 12 #define WT_ISUB_M 3 #define WT_IMUL_9C 10 -#define WT_IMUL_R 20 -#define WT_IMUL_M 6 -#define WT_IMULH_R 6 -#define WT_IMULH_M 2 -#define WT_ISMULH_R 6 -#define WT_ISMULH_M 2 +#define WT_IMUL_R 16 +#define WT_IMUL_M 4 +#define WT_IMULH_R 4 +#define WT_IMULH_M 1 +#define WT_ISMULH_R 4 +#define WT_ISMULH_M 1 #define WT_IDIV_C 4 #define WT_ISDIV_C 4 #define WT_INEG_R 2 #define WT_IXOR_R 12 -#define WT_IXOR_M 4 -#define WT_IROR_R 10 -#define WT_IROL_R 10 +#define WT_IXOR_M 3 +#define WT_IROR_R 12 +#define WT_IROL_R 12 //Common floating point -#define WT_FPSWAP_R 6 +#define WT_FPSWAP_R 8 //Floating point group F -#define WT_FPADD_R 18 -#define WT_FPADD_M 3 -#define WT_FPSUB_R 18 -#define WT_FPSUB_M 3 -#define WT_FPNEG_R 5 +#define WT_FPADD_R 20 +#define WT_FPADD_M 5 +#define WT_FPSUB_R 20 +#define WT_FPSUB_M 5 +#define WT_FPNEG_R 6 //Floating point group E -#define WT_FPMUL_R 18 -#define WT_FPMUL_M 3 -#define WT_FPDIV_R 6 +#define WT_FPMUL_R 16 +#define WT_FPMUL_M 4 +#define WT_FPDIV_R 7 #define WT_FPDIV_M 1 #define WT_FPSQRT_R 6 //Control -#define WT_COND_R 12 -#define WT_COND_M 4 +#define WT_COND_R 7 +#define WT_COND_M 1 #define WT_CFROUND 1 //Store -#define WT_ISTORE 12 -#define WT_FSTORE 6 +#define WT_ISTORE 18 +#define WT_FSTORE 0 #define WT_NOP 0 @@ -115,6 +115,7 @@ static_assert(wtSum == 256, #define REP33(x) REP32(x) x, #define REP40(x) REP32(x) REP8(x) #define REP128(x) REP32(x) REP32(x) REP32(x) REP32(x) +#define REP232(x) REP128(x) REP40(x) REP40(x) REP24(x) #define REP256(x) REP128(x) REP128(x) #define REPNX(x,N) REP##N(x) #define REPN(x,N) REPNX(x,N) diff --git a/src/main.cpp b/src/main.cpp index 12e9cdb..4f5a021 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -169,12 +169,10 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash blake2b(hash, sizeof(hash), 
blockTemplate, sizeof(blockTemplate), nullptr, 0); int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8); vm->initializeScratchpad(scratchpad, spIndex); - //vm->initializeProgram(hash); + vm->setScratchpad(scratchpad); //dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt"); for (int chain = 0; chain < 16; ++chain) { vm->initializeProgram(hash); - int segment = hash[3] & 3; - vm->setScratchpad(scratchpad + segment * RandomX::ScratchpadSize / 4); vm->execute(); vm->getResult(nullptr, 0, hash); } diff --git a/src/program.inc b/src/program.inc index 21f7d0b..d901e9a 100644 --- a/src/program.inc +++ b/src/program.inc @@ -10,54 +10,54 @@ mulpd xmm6, xmm10 ; IMUL_R r6, r3 imul r14, r11 - ; FPMUL_R e1, a0 - mulpd xmm5, xmm8 - ; IROR_R r5, r3 + ; FPSUB_M f1, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm1, xmm12 + ; IROL_R r5, r3 mov ecx, r11d - ror r13, cl + rol r13, cl ; FPMUL_R e2, a0 mulpd xmm6, xmm8 - ; FPNEG_R f3 - xorps xmm3, xmm15 + ; FPSUB_R f3, a0 + subpd xmm3, xmm8 ; IXOR_R r0, r4 xor r8, r12 - ; ISMULH_R r3, r7 + ; ISMULH_M r3, L1[r7] + mov ecx, r15d + and ecx, 16376 mov rax, r11 - imul r15 + imul qword ptr [rsi+rcx] mov r11, rdx ; FPSWAP_R f2 shufpd xmm2, xmm2, 1 - ; ISMULH_R r6, r0 - mov rax, r14 - imul r8 - mov r14, rdx + ; IDIV_C r6, 1248528248 + mov rax, 15864311168205210203 + mul r14 + shr rdx, 30 + add r14, rdx ; FPMUL_R e0, a2 mulpd xmm4, xmm10 - ; ISUB_R r3, r4 - sub r11, r12 + ; IADD_RC r3, r4, -52260428 + lea r11, [r11+r12-52260428] ; IADD_R r7, -1138617760 add r15, -1138617760 - ; IROR_R r2, r6 + ; IROL_R r2, r6 mov ecx, r14d - ror r10, cl - ; FPMUL_R e2, a1 - mulpd xmm6, xmm9 + rol r10, cl + ; FPNEG_R f2 + xorps xmm2, xmm15 ; IROR_R r7, r1 mov ecx, r9d ror r15, cl - ; COND_M r2, lt(L1[r7], -41618808) + ; COND_R r2, lt(r7, -41618808) xor ecx, ecx - mov eax, r15d - and eax, 16376 - cmp dword ptr [rsi+rax], -41618808 + cmp r15d, -41618808 
setl cl add r10, rcx - ; FPMUL_M e3, L1[r0] - mov eax, r8d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm7, xmm12 - maxpd xmm7, xmm13 + ; FPMUL_R e3, a0 + mulpd xmm7, xmm8 ; CFROUND r1, 43 mov rax, r9 rol rax, 34 @@ -67,14 +67,17 @@ ldmxcsr dword ptr [rsp-8] ; FPADD_R f2, a1 addpd xmm2, xmm9 - ; FPNEG_R f0 - xorps xmm0, xmm15 - ; FSTORE L1[r6], f2 + ; FPSUB_M f0, L1[r7] + mov eax, r15d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm0, xmm12 + ; ISTORE L1[r6], r2 mov eax, r14d - and eax, 16368 - movapd xmmword ptr [rsi+rax], xmm2 - ; IMUL_9C r6, -45112665 - lea r14, [r14+r14*8-45112665] + and eax, 16376 + mov qword ptr [rsi+rax], r10 + ; ISUB_R r6, r5 + sub r14, r13 ; IADD_M r0, L1[r4] mov eax, r12d and eax, 16376 @@ -87,41 +90,30 @@ mov eax, r14d and eax, 16376 mov qword ptr [rsi+rax], r14 - ; COND_R r4, sg(r1, -1189096105) - xor ecx, ecx - cmp r9d, -1189096105 - sets cl - add r12, rcx + ; FPSQRT_R e0 + sqrtpd xmm4, xmm4 ; IXOR_R r2, r5 xor r10, r13 - ; COND_R r1, be(r5, -965180434) - xor ecx, ecx - cmp r13d, -965180434 - setbe cl - add r9, rcx - ; FPMUL_M e1, L2[r3] - mov eax, r11d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm5, xmm12 - maxpd xmm5, xmm13 + ; FPSQRT_R e1 + sqrtpd xmm5, xmm5 + ; FPMUL_R e1, a3 + mulpd xmm5, xmm11 ; IMULH_R r7, r6 mov rax, r15 mul r14 mov r15, rdx - ; ISMULH_M r0, L1[r4] - mov ecx, r12d - and ecx, 16376 - mov rax, r8 - imul qword ptr [rsi+rcx] - mov r8, rdx + ; ISDIV_C r0, -1706892622 + mov rax, -5802075764249827661 + imul r8 + xor eax, eax + sar rdx, 29 + sets al + add rdx, rax + add r8, rdx ; IMUL_R r5, r3 imul r13, r11 - ; COND_R r2, of(r0, -1045938770) - xor ecx, ecx - cmp r8d, -1045938770 - seto cl - add r10, rcx + ; FPSQRT_R e2 + sqrtpd xmm6, xmm6 ; FPADD_M f3, L1[r4] mov eax, r12d and eax, 16376 @@ -131,18 +123,19 @@ add r11, r10 ; FPADD_R f1, a0 addpd xmm1, xmm8 - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 + ; FPDIV_R e3, a2 + divpd xmm7, xmm10 + maxpd xmm7, xmm13 ; FPSUB_R 
f0, a1 subpd xmm0, xmm9 ; IMUL_M r5, L1[r6] mov eax, r14d and eax, 16376 imul r13, qword ptr [rsi+rax] - ; ISUB_R r1, r2 - sub r9, r10 - ; IMUL_R r4, r6 - imul r12, r14 + ; IADD_RC r1, r2, -1263285243 + lea r9, [r9+r10-1263285243] + ; IMUL_9C r4, 1994773931 + lea r12, [r12+r12*8+1994773931] ; FPSWAP_R e3 shufpd xmm7, xmm7, 1 ; IMUL_M r0, L1[r7] @@ -152,69 +145,72 @@ ; IROR_R r1, r6 mov ecx, r14d ror r9, cl - ; IROR_R r2, r4 + ; IROL_R r2, r4 mov ecx, r12d - ror r10, cl + rol r10, cl ; FPSUB_R f3, a1 subpd xmm3, xmm9 - ; FSTORE L1[r0], e1 + ; ISTORE L1[r0], r5 mov eax, r8d - and eax, 16368 - movapd xmmword ptr [rsi+rax], xmm5 - ; COND_R r2, sg(r3, 1269153133) - xor ecx, ecx - cmp r11d, 1269153133 - sets cl - add r10, rcx + and eax, 16376 + mov qword ptr [rsi+rax], r13 + ; FPDIV_M e2, L2[r3] + mov eax, r11d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + divpd xmm6, xmm12 + maxpd xmm6, xmm13 ; FPSWAP_R f2 shufpd xmm2, xmm2, 1 ; IADD_R r7, r5 add r15, r13 - ; COND_R r0, be(r4, -1486502150) - xor ecx, ecx - cmp r12d, -1486502150 - setbe cl - add r8, rcx - ; FPSUB_R f3, a1 - subpd xmm3, xmm9 + ; FPDIV_M e0, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + divpd xmm4, xmm12 + maxpd xmm4, xmm13 + ; FPADD_M f3, L1[r5] + mov eax, r13d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm3, xmm12 ; FPADD_R f0, a3 addpd xmm0, xmm11 ; IADD_R r2, r0 add r10, r8 - ; FSTORE L1[r3], e2 + ; ISTORE L1[r3], r6 mov eax, r11d - and eax, 16368 - movapd xmmword ptr [rsi+rax], xmm6 - ; IXOR_R r1, r7 - xor r9, r15 - ; IMUL_R r5, r7 - imul r13, r15 + and eax, 16376 + mov qword ptr [rsi+rax], r14 + ; IROR_R r1, r7 + mov ecx, r15d + ror r9, cl + ; IMUL_9C r5, 301671287 + lea r13, [r13+r13*8+301671287] ; IXOR_R r7, 266992378 xor r15, 266992378 - ; COND_R r7, no(r4, 1983804692) - xor ecx, ecx - cmp r12d, 1983804692 - setno cl - add r15, rcx + ; FPSQRT_R e3 + sqrtpd xmm7, xmm7 ; IMUL_M r2, L2[r0] mov eax, r8d and eax, 262136 imul r10, qword 
ptr [rsi+rax] - ; FPDIV_R e3, a2 - divpd xmm7, xmm10 - maxpd xmm7, xmm13 - ; IMUL_M r0, L2[r6] - mov eax, r14d - and eax, 262136 - imul r8, qword ptr [rsi+rax] + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IMUL_R r0, r6 + imul r8, r14 ; ISTORE L1[r0], r7 mov eax, r8d and eax, 16376 mov qword ptr [rsi+rax], r15 - ; FPMUL_R e0, a1 - mulpd xmm4, xmm9 - ; FPSUB_R f3, a1 - subpd xmm3, xmm9 + ; FPNEG_R f0 + xorps xmm0, xmm15 + ; FPADD_M f3, L1[r5] + mov eax, r13d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm3, xmm12 ; IROR_R r5, r4 mov ecx, r12d ror r13, cl @@ -222,17 +218,20 @@ mov eax, r15d and eax, 262136 mov qword ptr [rsi+rax], r10 - ; FPSWAP_R e2 - shufpd xmm6, xmm6, 1 + ; FPADD_R f2, a3 + addpd xmm2, xmm11 ; FPADD_M f3, L1[r2] mov eax, r10d and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] addpd xmm3, xmm12 - ; IDIV_C r5, 2218798981 - mov rax, 17853839665672790751 - mul r13 - shr rdx, 31 + ; ISDIV_C r5, -2076168315 + mov rax, -4770095103914078469 + imul r13 + xor eax, eax + sar rdx, 29 + sets al + add rdx, rax add r13, rdx ; IADD_RC r0, r4, -1321374359 lea r8, [r8+r12-1321374359] @@ -250,28 +249,26 @@ rol r15, cl ; ISUB_R r2, r4 sub r10, r12 - ; IMULH_M r0, L1[12400] - mov rax, r8 - mul qword ptr [rsi+12400] - mov r8, rdx + ; ISMULH_R r0, -1500893068 + mov rax, -1500893068 + imul r8 + add r8, rdx ; IADD_R r2, r3 add r10, r11 - ; COND_R r6, lt(r1, -1124202227) - xor ecx, ecx - cmp r9d, -1124202227 - setl cl - add r14, rcx - ; IROR_R r7, r4 + ; FPSQRT_R e2 + sqrtpd xmm6, xmm6 + ; IROL_R r7, r4 mov ecx, r12d - ror r15, cl + rol r15, cl ; IMUL_R r4, r2 imul r12, r10 ; ISUB_R r3, r7 sub r11, r15 ; IADD_R r2, r7 add r10, r15 - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 + ; FPDIV_R e3, a0 + divpd xmm7, xmm8 + maxpd xmm7, xmm13 ; ISUB_R r6, 540663146 sub r14, 540663146 ; IROL_R r5, 58 @@ -280,67 +277,65 @@ addpd xmm2, xmm9 ; FPADD_R f2, a2 addpd xmm2, xmm10 - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 + ; FPDIV_R e1, a2 + divpd xmm5, xmm10 + maxpd xmm5, xmm13 ; FPADD_R 
f1, a2 addpd xmm1, xmm10 ; IADD_R r5, r3 add r13, r11 - ; IADD_M r7, L1[880] - add r15, qword ptr [rsi+880] + ; IADD_R r7, -1780268176 + add r15, -1780268176 ; ISUB_R r7, r0 sub r15, r8 ; ISTORE L2[r0], r7 mov eax, r8d and eax, 262136 mov qword ptr [rsi+rax], r15 - ; IDIV_C r2, 1014940364 - mov rax, r10 - shr rax, 2 - mov rcx, 1219717022984988185 - mul rcx - shr rdx, 24 - add r10, rdx - ; FPMUL_R e0, a2 - mulpd xmm4, xmm10 - ; IDIV_C r2, 3059159304 - mov rax, 12949335853590502915 - mul r10 - shr rdx, 31 - add r10, rdx + ; INEG_R r2 + neg r10 + ; FPNEG_R f0 + xorps xmm0, xmm15 + ; INEG_R r2 + neg r10 ; IADD_R r0, r3 add r8, r11 ; IMUL_9C r7, -2124093035 lea r15, [r15+r15*8-2124093035] - ; FPSUB_R f2, a0 - subpd xmm2, xmm8 - ; FPDIV_R e0, a2 - divpd xmm4, xmm10 + ; FPADD_M f2, L1[r0] + mov eax, r8d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm2, xmm12 + ; FPMUL_M e0, L1[r6] + mov eax, r14d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm4, xmm12 maxpd xmm4, xmm13 ; FPSUB_R f2, a3 subpd xmm2, xmm11 ; IMUL_R r1, r2 imul r9, r10 - ; ISMULH_R r7, r5 - mov rax, r15 - imul r13 - mov r15, rdx + ; IDIV_C r7, 3214009572 + mov rax, 12325439725582798855 + mul r15 + shr rdx, 31 + add r15, rdx ; IMULH_R r3, r2 mov rax, r11 mul r10 mov r11, rdx - ; IXOR_M r1, L2[r0] - mov eax, r8d - and eax, 262136 - xor r9, qword ptr [rsi+rax] + ; IROR_R r1, r0 + mov ecx, r8d + ror r9, cl ; FPMUL_R e0, a1 mulpd xmm4, xmm9 - ; ISUB_R r4, 1456841848 - sub r12, 1456841848 - ; IXOR_M r3, L2[r2] - mov eax, r10d - and eax, 262136 - xor r11, qword ptr [rsi+rax] + ; IADD_RC r4, r4, 1456841848 + lea r12, [r12+r12+1456841848] + ; IROR_R r3, r2 + mov ecx, r10d + ror r11, cl ; COND_M r0, of(L1[r4], 1678513610) xor ecx, ecx mov eax, r12d @@ -348,446 +343,39 @@ cmp dword ptr [rsi+rax], 1678513610 seto cl add r8, rcx - ; IDIV_C r4, 2674394209 - mov rax, 925772300223658071 - mul r12 - shr rdx, 27 - add r12, rdx + ; INEG_R r4 + neg r12 ; IMUL_R r4, r1 imul r12, r9 ; FPADD_R 
f1, a2 addpd xmm1, xmm10 ; FPSUB_R f2, a0 subpd xmm2, xmm8 - ; FPMUL_M e1, L2[r6] - mov eax, r14d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm5, xmm12 - maxpd xmm5, xmm13 - ; FPSUB_M f0, L2[r3] - mov eax, r11d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm0, xmm12 + ; FPMUL_R e1, a2 + mulpd xmm5, xmm10 + ; FPSUB_R f0, a3 + subpd xmm0, xmm11 ; IROR_R r0, r7 mov ecx, r15d ror r8, cl - ; FSTORE L2[r1], e0 + ; ISTORE L2[r1], r4 mov eax, r9d - and eax, 262128 - movapd xmmword ptr [rsi+rax], xmm4 - ; IROR_R r7, r6 + and eax, 262136 + mov qword ptr [rsi+rax], r12 + ; IROL_R r7, r6 mov ecx, r14d - ror r15, cl + rol r15, cl ; IMUL_9C r2, 266593902 lea r10, [r10+r10*8+266593902] ; IMUL_R r4, r6 imul r12, r14 ; FPSUB_R f2, a2 subpd xmm2, xmm10 - ; FPMUL_R e3, a0 - mulpd xmm7, xmm8 - ; IXOR_M r7, L1[r2] - mov eax, r10d - and eax, 16376 - xor r15, qword ptr [rsi+rax] + ; FPNEG_R f3 + xorps xmm3, xmm15 + ; IROR_R r7, r2 + mov ecx, r10d + ror r15, cl ; IROR_R r0, r5 mov ecx, r13d ror r8, cl - ; FPADD_R f1, a2 - addpd xmm1, xmm10 - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 - ; FPADD_R f3, a1 - addpd xmm3, xmm9 - ; FPADD_R f1, a0 - addpd xmm1, xmm8 - ; COND_M r2, ge(L2[r2], -226330940) - xor ecx, ecx - mov eax, r10d - and eax, 262136 - cmp dword ptr [rsi+rax], -226330940 - setge cl - add r10, rcx - ; FPDIV_R e2, a3 - divpd xmm6, xmm11 - maxpd xmm6, xmm13 - ; FPMUL_R e2, a1 - mulpd xmm6, xmm9 - ; FPSUB_R f1, a0 - subpd xmm1, xmm8 - ; IMUL_R r7, r5 - imul r15, r13 - ; IMUL_R r0, r1 - imul r8, r9 - ; FPSUB_R f3, a1 - subpd xmm3, xmm9 - ; IROL_R r3, r5 - mov ecx, r13d - rol r11, cl - ; IADD_RC r5, r2, 795784298 - lea r13, [r13+r10+795784298] - ; ISUB_R r0, r4 - sub r8, r12 - ; IMUL_R r5, r4 - imul r13, r12 - ; FPSUB_R f0, a2 - subpd xmm0, xmm10 - ; FPMUL_R e3, a1 - mulpd xmm7, xmm9 - ; ISDIV_C r3, 1662492575 - mov rax, 2978515652703905219 - imul r11 - xor eax, eax - sar rdx, 28 - sets al - add rdx, rax - add r11, rdx - ; ISMULH_R r5, r0 - mov rax, 
r13 - imul r8 - mov r13, rdx - ; ISDIV_C r4, 1963597892 - mov rax, -8359627607928540073 - imul r12 - xor eax, eax - add rdx, r12 - sar rdx, 30 - sets al - add rdx, rax - add r12, rdx - ; IMUL_R r7, r0 - imul r15, r8 - ; IMULH_M r0, L1[r3] - mov ecx, r11d - and ecx, 16376 - mov rax, r8 - mul qword ptr [rsi+rcx] - mov r8, rdx - ; IXOR_R r3, r7 - xor r11, r15 - ; IDIV_C r4, 1146125335 - mov rax, 8640870253760721727 - mul r12 - shr rdx, 29 - add r12, rdx - ; FPSWAP_R f3 - shufpd xmm3, xmm3, 1 - ; IXOR_M r2, L1[r0] - mov eax, r8d - and eax, 16376 - xor r10, qword ptr [rsi+rax] - ; IROR_R r0, r1 - mov ecx, r9d - ror r8, cl - ; IXOR_R r7, r4 - xor r15, r12 - ; ISMULH_R r6, r2 - mov rax, r14 - imul r10 - mov r14, rdx - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; IADD_RC r4, r2, 1704868083 - lea r12, [r12+r10+1704868083] - ; FPSUB_R f2, a0 - subpd xmm2, xmm8 - ; ISTORE L1[r0], r0 - mov eax, r8d - and eax, 16376 - mov qword ptr [rsi+rax], r8 - ; FPSUB_R f0, a3 - subpd xmm0, xmm11 - ; FPDIV_R e0, a3 - divpd xmm4, xmm11 - maxpd xmm4, xmm13 - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; ISUB_R r7, 1302457878 - sub r15, 1302457878 - ; IMUL_9C r1, 1330165941 - lea r9, [r9+r9*8+1330165941] - ; FPMUL_R e1, a3 - mulpd xmm5, xmm11 - ; IROL_R r0, r4 - mov ecx, r12d - rol r8, cl - ; FPSUB_M f1, L1[r0] - mov eax, r8d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; IROL_R r5, r6 - mov ecx, r14d - rol r13, cl - ; COND_M r0, ab(L1[r1], -310933871) - xor ecx, ecx - mov eax, r9d - and eax, 16376 - cmp dword ptr [rsi+rax], -310933871 - seta cl - add r8, rcx - ; CFROUND r7, 39 - mov rax, r15 - rol rax, 38 - and eax, 24576 - or eax, 40896 - mov dword ptr [rsp-8], eax - ldmxcsr dword ptr [rsp-8] - ; FPDIV_R e0, a1 - divpd xmm4, xmm9 - maxpd xmm4, xmm13 - ; IMUL_M r1, L1[r3] - mov eax, r11d - and eax, 16376 - imul r9, qword ptr [rsi+rax] - ; IMUL_9C r3, 1573236728 - lea r11, [r11+r11*8+1573236728] - ; FPNEG_R f3 - xorps xmm3, xmm15 - ; COND_R r1, lt(r4, -1805702334) - xor 
ecx, ecx - cmp r12d, -1805702334 - setl cl - add r9, rcx - ; FPSWAP_R f1 - shufpd xmm1, xmm1, 1 - ; IADD_R r7, -1421188024 - add r15, -1421188024 - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; FPSUB_M f2, L2[r7] - mov eax, r15d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm2, xmm12 - ; FPSUB_R f3, a1 - subpd xmm3, xmm9 - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 - ; ISUB_R r2, r4 - sub r10, r12 - ; ISMULH_R r4, r5 - mov rax, r12 - imul r13 - mov r12, rdx - ; COND_R r1, of(r7, 1294727006) - xor ecx, ecx - cmp r15d, 1294727006 - seto cl - add r9, rcx - ; IADD_M r5, L2[r2] - mov eax, r10d - and eax, 262136 - add r13, qword ptr [rsi+rax] - ; IMUL_9C r4, 401020510 - lea r12, [r12+r12*8+401020510] - ; IROL_R r3, r0 - mov ecx, r8d - rol r11, cl - ; ISTORE L1[r7], r0 - mov eax, r15d - and eax, 16376 - mov qword ptr [rsi+rax], r8 - ; FPSUB_R f2, a1 - subpd xmm2, xmm9 - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 - ; IMUL_R r3, 720965215 - imul r11, 720965215 - ; IMUL_R r6, r2 - imul r14, r10 - ; ISTORE L1[r7], r3 - mov eax, r15d - and eax, 16376 - mov qword ptr [rsi+rax], r11 - ; IROR_R r2, r6 - mov ecx, r14d - ror r10, cl - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 - ; IMUL_9C r4, 788211341 - lea r12, [r12+r12*8+788211341] - ; IMUL_9C r3, -67993446 - lea r11, [r11+r11*8-67993446] - ; FPSWAP_R e3 - shufpd xmm7, xmm7, 1 - ; IMUL_M r2, L1[r6] - mov eax, r14d - and eax, 16376 - imul r10, qword ptr [rsi+rax] - ; COND_M r2, ge(L1[r2], -1892157506) - xor ecx, ecx - mov eax, r10d - and eax, 16376 - cmp dword ptr [rsi+rax], -1892157506 - setge cl - add r10, rcx - ; FPADD_M f1, L1[r3] - mov eax, r11d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm1, xmm12 - ; IADD_M r7, L1[r0] - mov eax, r8d - and eax, 16376 - add r15, qword ptr [rsi+rax] - ; ISDIV_C r1, 624867857 - mov rax, 7924491717200811467 - imul r9 - xor eax, eax - sar rdx, 28 - sets al - add rdx, rax - add r9, rdx - ; FPADD_R f0, a1 - addpd xmm0, xmm9 - ; ISUB_R r5, r7 - sub r13, r15 - ; FPNEG_R f0 - xorps xmm0, 
xmm15 - ; IMUL_R r6, r2 - imul r14, r10 - ; FPMUL_M e3, L1[r1] - mov eax, r9d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm7, xmm12 - maxpd xmm7, xmm13 - ; IADD_R r0, r4 - add r8, r12 - ; FPSUB_M f3, L1[r1] - mov eax, r9d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm3, xmm12 - ; FPMUL_R e2, a0 - mulpd xmm6, xmm8 - ; INEG_R r2 - neg r10 - ; FPMUL_R e2, a2 - mulpd xmm6, xmm10 - ; FPSUB_M f3, L1[r6] - mov eax, r14d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm3, xmm12 - ; FPADD_R f1, a3 - addpd xmm1, xmm11 - ; IMULH_R r3, r2 - mov rax, r11 - mul r10 - mov r11, rdx - ; FPSUB_R f0, a3 - subpd xmm0, xmm11 - ; IDIV_C r5, 2887845607 - mov rax, 13717520480010955377 - mul r13 - shr rdx, 31 - add r13, rdx - ; ISMULH_M r6, L1[r2] - mov ecx, r10d - and ecx, 16376 - mov rax, r14 - imul qword ptr [rsi+rcx] - mov r14, rdx - ; FPSUB_R f3, a3 - subpd xmm3, xmm11 - ; IMUL_M r6, L1[r7] - mov eax, r15d - and eax, 16376 - imul r14, qword ptr [rsi+rax] - ; FPNEG_R f0 - xorps xmm0, xmm15 - ; FPMUL_R e2, a0 - mulpd xmm6, xmm8 - ; IMUL_9C r6, 295130073 - lea r14, [r14+r14*8+295130073] - ; FPADD_R f1, a1 - addpd xmm1, xmm9 - ; IXOR_R r0, r5 - xor r8, r13 - ; FPADD_R f2, a1 - addpd xmm2, xmm9 - ; FPSWAP_R e3 - shufpd xmm7, xmm7, 1 - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 - ; IADD_RC r3, r6, -1317630728 - lea r11, [r11+r14-1317630728] - ; IMUL_M r2, L1[r3] - mov eax, r11d - and eax, 16376 - imul r10, qword ptr [rsi+rax] - ; IADD_RC r1, r4, 894105694 - lea r9, [r9+r12+894105694] - ; IMUL_R r7, r0 - imul r15, r8 - ; FPSUB_R f1, a0 - subpd xmm1, xmm8 - ; IMUL_M r7, L1[r1] - mov eax, r9d - and eax, 16376 - imul r15, qword ptr [rsi+rax] - ; IXOR_R r2, r4 - xor r10, r12 - ; ISUB_M r0, L1[r1] - mov eax, r9d - and eax, 16376 - sub r8, qword ptr [rsi+rax] - ; INEG_R r4 - neg r12 - ; IMUL_9C r4, -285272388 - lea r12, [r12+r12*8-285272388] - ; IMUL_R r7, r4 - imul r15, r12 - ; IMULH_M r5, L1[r7] - mov ecx, r15d - and ecx, 16376 - mov rax, r13 - 
mul qword ptr [rsi+rcx] - mov r13, rdx - ; IROL_R r1, r7 - mov ecx, r15d - rol r9, cl - ; IXOR_R r4, -757532727 - xor r12, -757532727 - ; IMUL_R r3, 1863959234 - imul r11, 1863959234 - ; IROL_R r4, 59 - rol r12, 59 - ; ISMULH_R r1, 2122681086 - mov rax, 2122681086 - imul r9 - add r9, rdx - ; ISTORE L2[r6], r7 - mov eax, r14d - and eax, 262136 - mov qword ptr [rsi+rax], r15 - ; ISTORE L1[r1], r5 - mov eax, r9d - and eax, 16376 - mov qword ptr [rsi+rax], r13 - ; FPMUL_R e0, a1 - mulpd xmm4, xmm9 - ; COND_R r2, ns(r1, 486049737) - xor ecx, ecx - cmp r9d, 486049737 - setns cl - add r10, rcx - ; FPMUL_M e0, L2[r7] - mov eax, r15d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm4, xmm12 - maxpd xmm4, xmm13 - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; IROL_R r5, r2 - mov ecx, r10d - rol r13, cl - ; IADD_M r0, L1[r4] - mov eax, r12d - and eax, 16376 - add r8, qword ptr [rsi+rax] From 20eb549725979519154986440a8502660a7c6535 Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 27 Jan 2019 19:33:55 +0100 Subject: [PATCH 27/35] Merged load/store of integer and FP registers --- src/JitCompilerX86-static.S | 24 ++++------ src/JitCompilerX86-static.asm | 32 +++++-------- src/JitCompilerX86-static.hpp | 8 ++-- src/JitCompilerX86.cpp | 45 +++++++------------ src/asm/program_load_int.inc | 10 ----- ...ram_load_flt.inc => program_loop_load.inc} | 14 ++++++ src/asm/program_loop_store.inc | 18 ++++++++ src/asm/program_prologue_linux.inc | 3 +- src/asm/program_prologue_load.inc | 2 + src/asm/program_prologue_win64.inc | 3 +- src/asm/program_read_dataset.inc | 1 + src/asm/program_store_flt.inc | 11 ----- src/asm/program_store_int.inc | 10 ----- src/executeProgram-win64.asm | 21 +++++---- 14 files changed, 88 insertions(+), 114 deletions(-) delete mode 100644 src/asm/program_load_int.inc rename src/asm/{program_load_flt.inc => program_loop_load.inc} (55%) create mode 100644 src/asm/program_loop_store.inc delete mode 100644 src/asm/program_store_flt.inc delete mode 100644 
src/asm/program_store_int.inc diff --git a/src/JitCompilerX86-static.S b/src/JitCompilerX86-static.S index a799e11..9bf06ba 100644 --- a/src/JitCompilerX86-static.S +++ b/src/JitCompilerX86-static.S @@ -27,13 +27,11 @@ #define DECL(x) x #endif .global DECL(randomx_program_prologue) -.global DECL(randomx_loop_begin) -.global DECL(randomx_program_load_int) -.global DECL(randomx_program_load_flt) +.global DECL(randomx_program_loop_begin) +.global DECL(randomx_program_loop_load) .global DECL(randomx_program_start) .global DECL(randomx_program_read_dataset) -.global DECL(randomx_program_store_int) -.global DECL(randomx_program_store_flt) +.global DECL(randomx_program_loop_store) .global DECL(randomx_program_loop_end) .global DECL(randomx_program_epilogue) .global DECL(randomx_program_end) @@ -48,14 +46,11 @@ DECL(randomx_program_prologue): #include "asm/program_xmm_constants.inc" .align 64 -DECL(randomx_loop_begin): +DECL(randomx_program_loop_begin): nop -DECL(randomx_program_load_int): - #include "asm/program_load_int.inc" - -DECL(randomx_program_load_flt): - #include "asm/program_load_flt.inc" +DECL(randomx_program_loop_load): + #include "asm/program_loop_load.inc" DECL(randomx_program_start): nop @@ -63,11 +58,8 @@ DECL(randomx_program_start): DECL(randomx_program_read_dataset): #include "asm/program_read_dataset.inc" -DECL(randomx_program_store_int): - #include "asm/program_store_int.inc" - -DECL(randomx_program_store_flt): - #include "asm/program_store_flt.inc" +DECL(randomx_program_loop_store): + #include "asm/program_loop_store.inc" DECL(randomx_program_loop_end): nop diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index 8d5a4fe..5b2d387 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -20,13 +20,11 @@ IFDEF RAX _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE PUBLIC randomx_program_prologue -PUBLIC randomx_loop_begin -PUBLIC randomx_program_load_int -PUBLIC randomx_program_load_flt +PUBLIC 
randomx_program_loop_begin +PUBLIC randomx_program_loop_load PUBLIC randomx_program_start PUBLIC randomx_program_read_dataset -PUBLIC randomx_program_store_int -PUBLIC randomx_program_store_flt +PUBLIC randomx_program_loop_store PUBLIC randomx_program_loop_end PUBLIC randomx_program_epilogue PUBLIC randomx_program_end @@ -40,17 +38,13 @@ ALIGN 64 include asm/program_xmm_constants.inc ALIGN 64 -randomx_loop_begin PROC +randomx_program_loop_begin PROC nop -randomx_loop_begin ENDP +randomx_program_loop_begin ENDP -randomx_program_load_int PROC - include asm/program_load_int.inc -randomx_program_load_int ENDP - -randomx_program_load_flt PROC - include asm/program_load_flt.inc -randomx_program_load_flt ENDP +randomx_program_loop_load PROC + include asm/program_loop_load.inc +randomx_program_loop_load ENDP randomx_program_start PROC nop @@ -60,13 +54,9 @@ randomx_program_read_dataset PROC include asm/program_read_dataset.inc randomx_program_read_dataset ENDP -randomx_program_store_int PROC - include asm/program_store_int.inc -randomx_program_store_int ENDP - -randomx_program_store_flt PROC - include asm/program_store_flt.inc -randomx_program_store_flt ENDP +randomx_program_loop_store PROC + include asm/program_loop_store.inc +randomx_program_loop_store ENDP randomx_program_loop_end PROC nop diff --git a/src/JitCompilerX86-static.hpp b/src/JitCompilerX86-static.hpp index df5cd28..64abfa3 100644 --- a/src/JitCompilerX86-static.hpp +++ b/src/JitCompilerX86-static.hpp @@ -19,13 +19,11 @@ along with RandomX. If not, see. 
extern "C" { void randomx_program_prologue(); - void randomx_loop_begin(); - void randomx_program_load_int(); - void randomx_program_load_flt(); + void randomx_program_loop_begin(); + void randomx_program_loop_load(); void randomx_program_start(); void randomx_program_read_dataset(); - void randomx_program_store_int(); - void randomx_program_store_flt(); + void randomx_program_loop_store(); void randomx_program_loop_end(); void randomx_program_epilogue(); void randomx_program_end(); diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 30c6f73..cf50582 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -94,13 +94,11 @@ namespace RandomX { #include "JitCompilerX86-static.hpp" const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; - const uint8_t* codeLoopBegin = (uint8_t*)&randomx_loop_begin; - const uint8_t* codeLoadInt = (uint8_t*)&randomx_program_load_int; - const uint8_t* codeLoadFlt = (uint8_t*)&randomx_program_load_flt; + const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin; + const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load; const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset; - const uint8_t* codeStoreInt = (uint8_t*)&randomx_program_store_int; - const uint8_t* codeStoreFlt = (uint8_t*)&randomx_program_store_flt; + const uint8_t* codeLoopStore = (uint8_t*)&randomx_program_loop_store; const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end; const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue; const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; @@ -108,11 +106,9 @@ namespace RandomX { const int32_t prologueSize = codeLoopBegin - codePrologue; const int32_t epilogueSize = codeProgramEnd - codeEpilogue; - const int32_t loadIntSize = codeLoadFlt - codeLoadInt; - const int32_t loadFltSize = codeProgamStart - codeLoadFlt; - const int32_t readDatasetSize = codeStoreInt - 
codeReadDataset; - const int32_t storeIntSize = codeStoreFlt - codeStoreInt; - const int32_t storeFltSize = codeLoopEnd - codeStoreFlt; + const int32_t loopLoadSize = codeProgamStart - codeLoopLoad; + const int32_t readDatasetSize = codeLoopStore - codeReadDataset; + const int32_t loopStoreSize = codeLoopEnd - codeLoopStore; const int32_t epilogueOffset = CodeSize - epilogueSize; @@ -179,6 +175,7 @@ namespace RandomX { static const uint8_t SUB_EBX[] = { 0x83, 0xEB, 0x01 }; static const uint8_t JNZ[] = { 0x0f, 0x85 }; static const uint8_t JMP = 0xe9; + static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 }; size_t JitCompilerX86::getCodeSize() { return codePos - prologueSize; @@ -204,18 +201,16 @@ namespace RandomX { addressRegisters >>= 1; int readReg2 = 2 + (addressRegisters & 1); addressRegisters >>= 1; - int writeReg1 = 4 + (addressRegisters & 1); + int readReg3 = 4 + (addressRegisters & 1); addressRegisters >>= 1; - int writeReg2 = 6 + (addressRegisters & 1); + int readReg4 = 6 + (addressRegisters & 1); codePos = prologueSize; - emit(REX_XOR_EAX); + emit(REX_XOR_RAX_R64); emitByte(0xc0 + readReg1); - memcpy(code + codePos, codeLoadInt, loadIntSize); - codePos += loadIntSize; - emit(REX_XOR_EAX); + emit(REX_XOR_RAX_R64); emitByte(0xc0 + readReg2); - memcpy(code + codePos, codeLoadFlt, loadFltSize); - codePos += loadFltSize; + memcpy(code + codePos, codeLoopLoad, loopLoadSize); + codePos += loopLoadSize; Instruction instr; for (unsigned i = 0; i < ProgramLength; ++i) { for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) { @@ -226,19 +221,13 @@ namespace RandomX { generateCode(instr); } emit(REX_MOV_RR); - emitByte(0xc0 + readReg1); + emitByte(0xc0 + readReg3); emit(REX_XOR_EAX); - emitByte(0xc0 + readReg2); + emitByte(0xc0 + readReg4); memcpy(code + codePos, codeReadDataset, readDatasetSize); codePos += readDatasetSize; - emit(REX_MOV_RR); - emitByte(0xc0 + writeReg1); - memcpy(code + codePos, codeStoreInt, storeIntSize); - codePos += 
storeIntSize; - emit(REX_XOR_EAX); - emitByte(0xc0 + writeReg2); - memcpy(code + codePos, codeStoreFlt, storeFltSize); - codePos += storeFltSize; + memcpy(code + codePos, codeLoopStore, loopStoreSize); + codePos += loopStoreSize; emit(SUB_EBX); emit(JNZ); emit32(prologueSize - codePos - 4); diff --git a/src/asm/program_load_int.inc b/src/asm/program_load_int.inc deleted file mode 100644 index d9277ed..0000000 --- a/src/asm/program_load_int.inc +++ /dev/null @@ -1,10 +0,0 @@ - and eax, 1048512 - lea rcx, [rsi+rax] - xor r8, qword ptr [rcx+0] - xor r9, qword ptr [rcx+8] - xor r10, qword ptr [rcx+16] - xor r11, qword ptr [rcx+24] - xor r12, qword ptr [rcx+32] - xor r13, qword ptr [rcx+40] - xor r14, qword ptr [rcx+48] - xor r15, qword ptr [rcx+56] diff --git a/src/asm/program_load_flt.inc b/src/asm/program_loop_load.inc similarity index 55% rename from src/asm/program_load_flt.inc rename to src/asm/program_loop_load.inc index 2c631ce..c4c1fed 100644 --- a/src/asm/program_load_flt.inc +++ b/src/asm/program_loop_load.inc @@ -1,5 +1,19 @@ + mov rdx, rax and eax, 1048512 lea rcx, [rsi+rax] + push rcx + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] + ror rdx, 32 + and edx, 1048512 + lea rcx, [rsi+rdx] + push rcx cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm1, qword ptr [rcx+8] cvtdq2pd xmm2, qword ptr [rcx+16] diff --git a/src/asm/program_loop_store.inc b/src/asm/program_loop_store.inc new file mode 100644 index 0000000..a0acebc --- /dev/null +++ b/src/asm/program_loop_store.inc @@ -0,0 +1,18 @@ + pop rcx + mov qword ptr [rcx+0], r8 + mov qword ptr [rcx+8], r9 + mov qword ptr [rcx+16], r10 + mov qword ptr [rcx+24], r11 + mov qword ptr [rcx+32], r12 + mov qword ptr [rcx+40], r13 + mov qword ptr [rcx+48], r14 + mov qword ptr [rcx+56], r15 + pop rcx + mulpd xmm0, xmm4 + mulpd xmm1, xmm5 + 
mulpd xmm2, xmm6 + mulpd xmm3, xmm7 + movapd xmmword ptr [rcx+0], xmm0 + movapd xmmword ptr [rcx+16], xmm1 + movapd xmmword ptr [rcx+32], xmm2 + movapd xmmword ptr [rcx+48], xmm3 diff --git a/src/asm/program_prologue_linux.inc b/src/asm/program_prologue_linux.inc index 67a967d..bdde664 100644 --- a/src/asm/program_prologue_linux.inc +++ b/src/asm/program_prologue_linux.inc @@ -11,10 +11,9 @@ push rdi ;# RegisterFile& registerFile mov rcx, rdi mov rbp, qword ptr [rsi] ;# "mx", "ma" - mov eax, ebp ;# "mx" mov rdi, qword ptr [rsi+8] ;# uint8_t* dataset mov rsi, rdx ;# convertible_t* scratchpad #include "program_prologue_load.inc" - jmp DECL(randomx_loop_begin) \ No newline at end of file + jmp DECL(randomx_program_loop_begin) \ No newline at end of file diff --git a/src/asm/program_prologue_load.inc b/src/asm/program_prologue_load.inc index ecdd4f9..3a994ab 100644 --- a/src/asm/program_prologue_load.inc +++ b/src/asm/program_prologue_load.inc @@ -1,3 +1,5 @@ + mov rax, rbp + ;# zero integer registers xor r8, r8 xor r9, r9 diff --git a/src/asm/program_prologue_win64.inc b/src/asm/program_prologue_win64.inc index 83ae2a5..b1da4d7 100644 --- a/src/asm/program_prologue_win64.inc +++ b/src/asm/program_prologue_win64.inc @@ -23,11 +23,10 @@ ; function arguments push rcx ; RegisterFile& registerFile mov rbp, qword ptr [rdx] ; "mx", "ma" - mov eax, ebp ; "mx" mov rdi, qword ptr [rdx+8] ; uint8_t* dataset mov rsi, r8 ; convertible_t* scratchpad mov rbx, r9 ; loop counter include program_prologue_load.inc - jmp randomx_loop_begin \ No newline at end of file + jmp randomx_program_loop_begin \ No newline at end of file diff --git a/src/asm/program_read_dataset.inc b/src/asm/program_read_dataset.inc index bae4817..061d32c 100644 --- a/src/asm/program_read_dataset.inc +++ b/src/asm/program_read_dataset.inc @@ -1,4 +1,5 @@ xor rbp, rax ;# modify "mx" + xor eax, eax and rbp, -64 ;# align "mx" to the start of a cache line mov edx, ebp ;# edx = mx prefetchnta byte ptr [rdi+rdx] diff 
--git a/src/asm/program_store_flt.inc b/src/asm/program_store_flt.inc deleted file mode 100644 index 4bbab9f..0000000 --- a/src/asm/program_store_flt.inc +++ /dev/null @@ -1,11 +0,0 @@ - and eax, 1048512 - lea rcx, [rsi+rax] - mulpd xmm0, xmm4 - mulpd xmm1, xmm5 - mulpd xmm2, xmm6 - mulpd xmm3, xmm7 - movapd xmmword ptr [rcx+0], xmm0 - movapd xmmword ptr [rcx+16], xmm1 - movapd xmmword ptr [rcx+32], xmm2 - movapd xmmword ptr [rcx+48], xmm3 - diff --git a/src/asm/program_store_int.inc b/src/asm/program_store_int.inc deleted file mode 100644 index 03dd31a..0000000 --- a/src/asm/program_store_int.inc +++ /dev/null @@ -1,10 +0,0 @@ - and eax, 1048512 - lea rcx, [rsi+rax] - mov qword ptr [rcx+0], r8 - mov qword ptr [rcx+8], r9 - mov qword ptr [rcx+16], r10 - mov qword ptr [rcx+24], r11 - mov qword ptr [rcx+32], r12 - mov qword ptr [rcx+40], r13 - mov qword ptr [rcx+48], r14 - mov qword ptr [rcx+56], r15 diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index e9bc30a..ac49e50 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -118,8 +118,11 @@ signMask: ALIGN 64 program_begin: - xor eax, r8d ;# read address register 1 + xor rax, r8 ;# read address register 1 + xor rax, r9 + mov rdx, rax and eax, 1048512 + push rax lea rcx, [rsi+rax] xor r8, qword ptr [rcx+0] xor r9, qword ptr [rcx+8] @@ -129,9 +132,10 @@ program_begin: xor r13, qword ptr [rcx+40] xor r14, qword ptr [rcx+48] xor r15, qword ptr [rcx+56] - xor eax, r9d ;# read address register 2 - and eax, 1048512 - lea rcx, [rsi+rax] + ror rdx, 32 + and edx, 1048512 + push rdx + lea rcx, [rsi+rdx] cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm1, qword ptr [rcx+8] cvtdq2pd xmm2, qword ptr [rcx+16] @@ -164,9 +168,8 @@ program_begin: xor r12, qword ptr [rcx+32] xor r13, qword ptr [rcx+40] xor r14, qword ptr [rcx+48] - xor r15, qword ptr [rcx+56] - mov eax, r12d ;# write address register 1 - and eax, 1048512 + xor r15, qword ptr [rcx+56] + pop rax lea rcx, [rsi+rax] mov qword 
ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 @@ -176,8 +179,7 @@ program_begin: mov qword ptr [rcx+40], r13 mov qword ptr [rcx+48], r14 mov qword ptr [rcx+56], r15 - xor eax, r13d ;# write address register 2 - and eax, 1048512 + pop rax lea rcx, [rsi+rax] mulpd xmm0, xmm4 mulpd xmm1, xmm5 @@ -187,6 +189,7 @@ program_begin: movapd xmmword ptr [rcx+16], xmm1 movapd xmmword ptr [rcx+32], xmm2 movapd xmmword ptr [rcx+48], xmm3 + xor eax, eax dec ebx jnz program_begin From 1ee94bef2a3f6f57c1d77e6fd953061a332b2e44 Mon Sep 17 00:00:00 2001 From: tevador Date: Mon, 4 Feb 2019 17:07:00 +0100 Subject: [PATCH 28/35] Added ISWAP instruction Scratchpad -> 2 MiB New scratchpad initialization New dataset initialization --- makefile | 5 +- src/AssemblyGeneratorX86.cpp | 18 +- src/AssemblyGeneratorX86.hpp | 1 + src/CompiledVirtualMachine.cpp | 2 +- src/Instruction.cpp | 18 +- src/Instruction.hpp | 43 ++++- src/InterpretedVirtualMachine.cpp | 39 ++-- src/InterpretedVirtualMachine.hpp | 17 +- src/JitCompilerX86.cpp | 19 +- src/JitCompilerX86.hpp | 1 + src/asm/program_loop_load.inc | 4 +- src/asm/squareHash.inc | 87 +++++++++ src/common.hpp | 17 +- src/dataset.cpp | 60 +++--- src/hashAes1Rx4.cpp | 41 ++++ src/hashAes1Rx4.hpp | 5 +- src/instructionWeights.hpp | 7 +- src/instructionsPortable.cpp | 299 ++++++++++-------------------- src/intrinPortable.h | 29 ++- src/main.cpp | 9 +- src/squareHash.S | 17 ++ src/squareHash.asm | 9 + src/squareHash.h | 71 +++++++ 23 files changed, 528 insertions(+), 290 deletions(-) create mode 100644 src/asm/squareHash.inc create mode 100644 src/squareHash.S create mode 100644 src/squareHash.asm create mode 100644 src/squareHash.h diff --git a/makefile b/makefile index f805724..87fef86 100644 --- a/makefile +++ b/makefile @@ -13,7 +13,7 @@ LDFLAGS=-lpthread TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o 
JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o AddressTransform.o hashAes1Rx4.o) ifeq ($(PLATFORM),x86_64) - ROBJS += $(OBJDIR)/JitCompilerX86-static.o + ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o endif all: release test @@ -77,6 +77,9 @@ $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompile $(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read.inc)) | $(OBJDIR) $(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@ +$(OBJDIR)/squareHash.o: $(addprefix $(SRCDIR)/,squareHash.S $(addprefix asm/, squareHash.inc)) | $(OBJDIR) + $(CXX) -x assembler-with-cpp -c $(SRCDIR)/squareHash.S -o $@ + $(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/instructionsPortable.cpp -o $@ diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index a46fe5d..3092e4d 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -72,16 +72,16 @@ namespace RandomX { void AssemblyGeneratorX86::genAddressReg(Instruction& instr, const char* reg = "eax") { asmCode << "\tmov " << reg << ", " << regR32[instr.src] << std::endl; - asmCode << "\tand " << reg << ", " << ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl; + asmCode << "\tand " << reg << ", " << ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl; } void AssemblyGeneratorX86::genAddressRegDst(Instruction& instr, int maskAlign = 8) { asmCode << "\tmov eax" << ", " << regR32[instr.dst] << std::endl; - asmCode << "\tand eax" << ", " << ((instr.alt % 4) ? 
(ScratchpadL1Mask & (-maskAlign)) : (ScratchpadL2Mask & (-maskAlign))) << std::endl; + asmCode << "\tand eax" << ", " << ((instr.mod % 4) ? (ScratchpadL1Mask & (-maskAlign)) : (ScratchpadL2Mask & (-maskAlign))) << std::endl; } int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) { - return instr.imm32 & ((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + return instr.imm32 & ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); } //1 uOP @@ -348,6 +348,13 @@ namespace RandomX { } } + //2 uOPs + void AssemblyGeneratorX86::h_ISWAP_R(Instruction& instr, int i) { + if (instr.src != instr.dst) { + asmCode << "\txchg " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + } + } + //1 uOPs void AssemblyGeneratorX86::h_FPSWAP_R(Instruction& instr, int i) { asmCode << "\tshufpd " << regFE[instr.dst] << ", " << regFE[instr.dst] << ", 1" << std::endl; @@ -431,7 +438,7 @@ namespace RandomX { //6 uOPs void AssemblyGeneratorX86::h_CFROUND(Instruction& instr, int i) { asmCode << "\tmov rax, " << regR[instr.src] << std::endl; - int rotate = (13 - (instr.alt & 63)) & 63; + int rotate = (13 - (instr.imm32 & 63)) & 63; if (rotate != 0) asmCode << "\trol rax, " << rotate << std::endl; asmCode << "\tand eax, 24576" << std::endl; @@ -441,7 +448,7 @@ namespace RandomX { } static inline const char* condition(Instruction& instr, bool invert = false) { - switch (((instr.alt >> 2) & 7) ^ invert) + switch (((instr.mod >> 2) & 7) ^ invert) { case 0: return "be"; @@ -519,6 +526,7 @@ namespace RandomX { INST_HANDLE(IXOR_M) INST_HANDLE(IROR_R) INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) //Common floating point INST_HANDLE(FPSWAP_R) diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 6b0c505..a8e062c 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -63,6 +63,7 @@ namespace RandomX { void h_IXOR_M(Instruction&, int); void h_IROR_R(Instruction&, int); void h_IROL_R(Instruction&, int); + void h_ISWAP_R(Instruction&, 
int); void h_FPSWAP_R(Instruction&, int); void h_FPADD_R(Instruction&, int); void h_FPADD_M(Instruction&, int); diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index f5d33d0..ebacf42 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -57,7 +57,7 @@ namespace RandomX { for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) { *(((uint32_t*)&reg) + i) = gen(); } - FPINIT(); + initFpu(); /*for (int i = 0; i < RegistersCount / 2; ++i) { reg.f[i].lo.f64 = (double)reg.f[i].lo.i64; reg.f[i].hi.f64 = (double)reg.f[i].hi.i64; diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 0aa0289..ce75f43 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -29,15 +29,15 @@ namespace RandomX { } void Instruction::genAddressReg(std::ostream& os) const { - os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)src << "]"; + os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)src << "]"; } void Instruction::genAddressRegDst(std::ostream& os) const { - os << ((alt % 4) ? "L1" : "L2") << "[r" << (int)dst << "]"; + os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)dst << "]"; } void Instruction::genAddressImm(std::ostream& os) const { - os << ((alt % 4) ? "L1" : "L2") << "[" << (imm32 & ((alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]"; + os << ((mod % 4) ? "L1" : "L2") << "[" << (imm32 & ((mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]"; } void Instruction::h_IADD_R(std::ostream& os) const { @@ -211,6 +211,10 @@ namespace RandomX { os << "r" << (int)dst << ", " << imm32 << std::endl; } + void Instruction::h_ISWAP_R(std::ostream& os) const { + os << "r" << (int)dst << ", r" << (int)src << std::endl; + } + void Instruction::h_FPSWAP_R(std::ostream& os) const { const char reg = (dst >= 4) ? 
'e' : 'f'; auto dstIndex = dst % 4; @@ -280,7 +284,7 @@ namespace RandomX { } void Instruction::h_CFROUND(std::ostream& os) const { - os << "r" << (int)src << ", " << (alt & 63) << std::endl; + os << "r" << (int)src << ", " << (imm32 & 63) << std::endl; } static inline const char* condition(int index) { @@ -306,11 +310,11 @@ namespace RandomX { } void Instruction::h_COND_R(std::ostream& os) const { - os << "r" << (int)dst << ", " << condition((alt >> 2) & 7) << "(r" << (int)src << ", " << imm32 << ")" << std::endl; + os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << imm32 << ")" << std::endl; } void Instruction::h_COND_M(std::ostream& os) const { - os << "r" << (int)dst << ", " << condition((alt >> 2) & 7) << "("; + os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "("; genAddressReg(os); os << ", " << imm32 << ")" << std::endl; } @@ -356,6 +360,7 @@ namespace RandomX { INST_NAME(IXOR_M) INST_NAME(IROR_R) INST_NAME(IROL_R) + INST_NAME(ISWAP_R) //Common floating point INST_NAME(FPSWAP_R) @@ -406,6 +411,7 @@ namespace RandomX { INST_HANDLE(IXOR_M) INST_HANDLE(IROR_R) INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) //Common floating point INST_HANDLE(FPSWAP_R) diff --git a/src/Instruction.hpp b/src/Instruction.hpp index ffa3880..987f326 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -28,12 +28,52 @@ namespace RandomX { typedef void(Instruction::*InstructionVisualizer)(std::ostream&) const; + namespace InstructionType { + constexpr int IADD_R = 0; + constexpr int IADD_M = 1; + constexpr int IADD_RC = 2; + constexpr int ISUB_R = 3; + constexpr int ISUB_M = 4; + constexpr int IMUL_9C = 5; + constexpr int IMUL_R = 6; + constexpr int IMUL_M = 7; + constexpr int IMULH_R = 8; + constexpr int IMULH_M = 9; + constexpr int ISMULH_R = 10; + constexpr int ISMULH_M = 11; + constexpr int IDIV_C = 12; + constexpr int ISDIV_C = 13; + constexpr int INEG_R = 14; + constexpr int IXOR_R = 15; + constexpr int IXOR_M = 16; + 
constexpr int IROR_R = 17; + constexpr int IROL_R = 18; + constexpr int ISWAP_R = 19; + constexpr int FPSWAP_R = 20; + constexpr int FPADD_R = 21; + constexpr int FPADD_M = 22; + constexpr int FPSUB_R = 23; + constexpr int FPSUB_M = 24; + constexpr int FPNEG_R = 25; + constexpr int FPMUL_R = 26; + constexpr int FPMUL_M = 27; + constexpr int FPDIV_R = 28; + constexpr int FPDIV_M = 29; + constexpr int FPSQRT_R = 30; + constexpr int COND_R = 31; + constexpr int COND_M = 32; + constexpr int CFROUND = 33; + constexpr int ISTORE = 34; + constexpr int FSTORE = 35; + constexpr int NOP = 36; + } + class Instruction { public: uint8_t opcode; uint8_t dst; uint8_t src; - uint8_t alt; + uint8_t mod; int32_t imm32; const char* getName() const { return names[opcode]; @@ -70,6 +110,7 @@ namespace RandomX { void h_IXOR_M(std::ostream&) const; void h_IROR_R(std::ostream&) const; void h_IROL_R(std::ostream&) const; + void h_ISWAP_R(std::ostream&) const; void h_FPSWAP_R(std::ostream&) const; void h_FPADD_R(std::ostream&) const; void h_FPADD_M(std::ostream&) const; diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index d145e78..af01183 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -30,6 +30,7 @@ along with RandomX. If not, see. 
#include #include #include +#include "intrinPortable.h" #ifdef STATS #include #endif @@ -98,7 +99,7 @@ namespace RandomX { for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) { *(((uint32_t*)&reg) + i) = gen(); } - FPINIT(); + initFpu(); for (int i = 0; i < RegistersCount; ++i) { reg.f[i].lo.f64 = (double)reg.f[i].lo.i64; reg.f[i].hi.f64 = (double)reg.f[i].hi.i64; @@ -114,24 +115,32 @@ namespace RandomX { } void InterpretedVirtualMachine::execute() { - while (ic > 0) { -#ifdef STATS - count_instructions[pc]++; -#endif - auto& inst = p(pc); - if(trace) std::cout << inst.getName() << " (" << std::dec << pc << ")" << std::endl; - pc = (pc + 1) % ProgramLength; - auto handler = engine[inst.opcode]; - (this->*handler)(inst); - ic--; + for(int i = 0; i < InstructionCount; ++i) { + for (int j = 0; j < ProgramLength; ++j) { + auto& ibc = byteCode[j]; + switch (ibc.type) + { + case InstructionType::CFROUND: { + uint64_t rcFlag = rotr(ibc.isrc->u64, ibc.imm.i32); + setRoundMode(rcFlag); + } + break; + } + } } -#ifdef STATS - count_endstack += stack.size(); -#endif + } #include "instructionWeights.hpp" -#define INST_HANDLE(x) REPN(&InterpretedVirtualMachine::h_##x, WT(x)) + + void InterpretedVirtualMachine::executeInstruction(Instruction& instr) { + switch (instr.opcode) + { + CASE_REP(IADD_R) + + break; + } + } InstructionHandler InterpretedVirtualMachine::engine[256] = { diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index fba081a..2eee73d 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -33,10 +33,24 @@ namespace RandomX { virtual std::ostream& printCxx(std::ostream&) const = 0; }; + struct InstructionByteCode; class InterpretedVirtualMachine; typedef void(InterpretedVirtualMachine::*InstructionHandler)(Instruction&); + struct alignas(64) InstructionByteCode { + convertible_t* idst; + convertible_t* isrc; + convertible_t imm; + fpu_reg_t* fdst; + fpu_reg_t* fsrc; + uint32_t 
condition; + uint32_t memMask; + uint32_t type; + }; + + constexpr int asedwfagdewsa = sizeof(InstructionByteCode); + class InterpretedVirtualMachine : public VirtualMachine { public: InterpretedVirtualMachine(bool soft, bool async) : softAes(soft), asyncWorker(async) {} @@ -53,6 +67,7 @@ namespace RandomX { static const ITransform* addressTransformations[TransformationCount]; bool softAes, asyncWorker; Program p; + InstructionByteCode byteCode[ProgramLength]; std::vector stack; uint64_t pc, ic; const ITransform* currentTransform; @@ -106,7 +121,7 @@ namespace RandomX { int count_FPMUL_nop2 = 0; int datasetAccess[256] = { 0 }; #endif - + void executeInstruction(Instruction&); convertible_t loada(Instruction&); convertible_t loadbiashift(Instruction&); convertible_t loadbiadiv(Instruction&); diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index cf50582..d8e7a42 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -176,6 +176,7 @@ namespace RandomX { static const uint8_t JNZ[] = { 0x0f, 0x85 }; static const uint8_t JMP = 0xe9; static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 }; + static const uint8_t REX_XCHG[] = { 0x4d, 0x87 }; size_t JitCompilerX86::getCodeSize() { return codePos - prologueSize; @@ -248,7 +249,7 @@ namespace RandomX { emitByte(AND_EAX_I); else emit(AND_ECX_I); - emit32((instr.alt % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + emit32((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); } void JitCompilerX86::genAddressRegDst(Instruction& instr, bool align16 = false) { @@ -257,11 +258,11 @@ namespace RandomX { emitByte(AND_EAX_I); int32_t maskL1 = align16 ? ScratchpadL1Mask16 : ScratchpadL1Mask; int32_t maskL2 = align16 ? ScratchpadL2Mask16 : ScratchpadL2Mask; - emit32((instr.alt % 4) ? maskL1 : maskL2); + emit32((instr.mod % 4) ? maskL1 : maskL2); } void JitCompilerX86::genAddressImm(Instruction& instr) { - emit32(instr.imm32 & ((instr.alt % 4) ? 
ScratchpadL1Mask : ScratchpadL2Mask)); + emit32(instr.imm32 & ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)); } void JitCompilerX86::h_IADD_R(Instruction& instr) { @@ -595,6 +596,13 @@ namespace RandomX { } } + void JitCompilerX86::h_ISWAP_R(Instruction& instr) { + if (instr.src != instr.dst) { + emit(REX_XCHG); + emitByte(0xc0 + instr.dst + 8 * instr.src); + } + } + void JitCompilerX86::h_FPSWAP_R(Instruction& instr) { emit(SHUFPD); emitByte(0xc0 + 9 * instr.dst); @@ -682,7 +690,7 @@ namespace RandomX { void JitCompilerX86::h_CFROUND(Instruction& instr) { emit(REX_MOV_RR64); emitByte(0xc0 + instr.src); - int rotate = (13 - (instr.alt & 63)) & 63; + int rotate = (13 - (instr.imm32 & 63)) & 63; if (rotate != 0) { emit(ROL_RAX); emitByte(rotate); @@ -691,7 +699,7 @@ namespace RandomX { } static inline uint8_t condition(Instruction& instr, bool invert = false) { - switch ((instr.alt & 7) ^ invert) + switch ((instr.mod & 7) ^ invert) { case 0: return 0x96; //setbe @@ -777,6 +785,7 @@ namespace RandomX { INST_HANDLE(IXOR_M) INST_HANDLE(IROR_R) INST_HANDLE(IROL_R) + INST_HANDLE(ISWAP_R) INST_HANDLE(FPSWAP_R) INST_HANDLE(FPADD_R) INST_HANDLE(FPADD_M) diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index 0aef990..9c85667 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -109,6 +109,7 @@ namespace RandomX { void h_IXOR_M(Instruction&); void h_IROR_R(Instruction&); void h_IROL_R(Instruction&); + void h_ISWAP_R(Instruction&); void h_FPSWAP_R(Instruction&); void h_FPADD_R(Instruction&); void h_FPADD_M(Instruction&); diff --git a/src/asm/program_loop_load.inc b/src/asm/program_loop_load.inc index c4c1fed..76b8f3d 100644 --- a/src/asm/program_loop_load.inc +++ b/src/asm/program_loop_load.inc @@ -1,5 +1,5 @@ mov rdx, rax - and eax, 1048512 + and eax, 2097088 lea rcx, [rsi+rax] push rcx xor r8, qword ptr [rcx+0] @@ -11,7 +11,7 @@ xor r14, qword ptr [rcx+48] xor r15, qword ptr [rcx+56] ror rdx, 32 - and edx, 1048512 + and edx, 2097088 lea 
rcx, [rsi+rdx] push rcx cvtdq2pd xmm0, qword ptr [rcx+0] diff --git a/src/asm/squareHash.inc b/src/asm/squareHash.inc new file mode 100644 index 0000000..b62dc9e --- /dev/null +++ b/src/asm/squareHash.inc @@ -0,0 +1,87 @@ + mov rax, 1613783669344650115 + add rax, rcx + mul rax + sub rax, rdx ;# 1 + mul rax + sub rax, rdx ;# 2 + mul rax + sub rax, rdx ;# 3 + mul rax + sub rax, rdx ;# 4 + mul rax + sub rax, rdx ;# 5 + mul rax + sub rax, rdx ;# 6 + mul rax + sub rax, rdx ;# 7 + mul rax + sub rax, rdx ;# 8 + mul rax + sub rax, rdx ;# 9 + mul rax + sub rax, rdx ;# 10 + mul rax + sub rax, rdx ;# 11 + mul rax + sub rax, rdx ;# 12 + mul rax + sub rax, rdx ;# 13 + mul rax + sub rax, rdx ;# 14 + mul rax + sub rax, rdx ;# 15 + mul rax + sub rax, rdx ;# 16 + mul rax + sub rax, rdx ;# 17 + mul rax + sub rax, rdx ;# 18 + mul rax + sub rax, rdx ;# 19 + mul rax + sub rax, rdx ;# 20 + mul rax + sub rax, rdx ;# 21 + mul rax + sub rax, rdx ;# 22 + mul rax + sub rax, rdx ;# 23 + mul rax + sub rax, rdx ;# 24 + mul rax + sub rax, rdx ;# 25 + mul rax + sub rax, rdx ;# 26 + mul rax + sub rax, rdx ;# 27 + mul rax + sub rax, rdx ;# 28 + mul rax + sub rax, rdx ;# 29 + mul rax + sub rax, rdx ;# 30 + mul rax + sub rax, rdx ;# 31 + mul rax + sub rax, rdx ;# 32 + mul rax + sub rax, rdx ;# 33 + mul rax + sub rax, rdx ;# 34 + mul rax + sub rax, rdx ;# 35 + mul rax + sub rax, rdx ;# 36 + mul rax + sub rax, rdx ;# 37 + mul rax + sub rax, rdx ;# 38 + mul rax + sub rax, rdx ;# 39 + mul rax + sub rax, rdx ;# 40 + mul rax + sub rax, rdx ;# 41 + mul rax + sub rax, rdx ;# 42 + ret \ No newline at end of file diff --git a/src/common.hpp b/src/common.hpp index bbd5a2b..e52dbc2 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -26,11 +26,6 @@ namespace RandomX { using addr_t = uint32_t; - constexpr int RoundToNearest = 0; - constexpr int RoundDown = 1; - constexpr int RoundUp = 2; - constexpr int RoundToZero = 3; - constexpr int SeedSize = 32; constexpr int ResultSize = 32; @@ -46,7 +41,7 @@ namespace 
RandomX { constexpr int CacheBlockCount = CacheSize / CacheLineSize; constexpr int BlockExpansionRatio = DatasetSize / CacheSize; constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount; - constexpr int DatasetIterations = 3; + constexpr int DatasetIterations = 10; #ifdef TRACE @@ -72,12 +67,12 @@ namespace RandomX { convertible_t hi; }; - constexpr int ProgramLength = 128; + constexpr int ProgramLength = 256; constexpr uint32_t InstructionCount = 1024; - constexpr uint32_t ScratchpadSize = 1024 * 1024; + constexpr uint32_t ScratchpadSize = 2 * 1024 * 1024; constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t); - constexpr uint32_t ScratchpadL1 = ScratchpadSize / 64 / sizeof(convertible_t); - constexpr uint32_t ScratchpadL2 = ScratchpadSize / 4 / sizeof(convertible_t); + constexpr uint32_t ScratchpadL1 = ScratchpadSize / 128 / sizeof(convertible_t); + constexpr uint32_t ScratchpadL2 = ScratchpadSize / 8 / sizeof(convertible_t); constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(convertible_t); constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8; constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8; @@ -133,6 +128,8 @@ namespace RandomX { typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t); + typedef bool(*Condition)(convertible_t&, convertible_t&); + extern "C" { void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t); } diff --git a/src/dataset.cpp b/src/dataset.cpp index 6029611..b941a75 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -28,10 +28,11 @@ along with RandomX. If not, see. 
#include "Cache.hpp" #include "virtualMemory.hpp" #include "softAes.h" +#include "squareHash.h" #if defined(__SSE2__) #include -#define PREFETCH(memory) _mm_prefetch((const char *)((memory).ds.dataset + (memory).ma), _MM_HINT_NTA) +#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) #else #define PREFETCH(memory) #endif @@ -49,42 +50,37 @@ namespace RandomX { template void initBlock(const uint8_t* intermediate, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) { - __m128i x0, x1, x2, x3; + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; - __m128i* xit = (__m128i*)intermediate; - __m128i* xout = (__m128i*)out; + r0 = 4ULL * blockNumber; + r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0; - x0 = _mm_cvtsi32_si128(blockNumber); - constexpr int mask = (CacheSize / CacheLineSize) - 1; + constexpr int mask = (CacheSize - 1) & -64; for (auto i = 0; i < DatasetIterations; ++i) { - x0 = aesenc(x0, keys[0]); - //x0 = aesenc(x0, keys[1]); - x1 = aesenc(x0, keys[2]); - //x1 = aesenc(x1, keys[3]); - x2 = aesenc(x1, keys[4]); - //x2 = aesenc(x2, keys[5]); - x3 = aesenc(x2, keys[6]); - //x3 = aesenc(x3, keys[7]); - - int index = _mm_cvtsi128_si32(x3); - index &= mask; - - __m128i t0 = _mm_load_si128(xit + 4 * index + 0); - __m128i t1 = _mm_load_si128(xit + 4 * index + 1); - __m128i t2 = _mm_load_si128(xit + 4 * index + 2); - __m128i t3 = _mm_load_si128(xit + 4 * index + 3); - - x0 = _mm_xor_si128(x0, t0); - x1 = _mm_xor_si128(x1, t1); - x2 = _mm_xor_si128(x2, t2); - x3 = _mm_xor_si128(x3, t3); + uint64_t* mix = (uint64_t*)(intermediate + (r0 & mask)); + PREFETCHNTA(mix); + r0 = squareHash(r0); + r0 ^= mix[0]; + r1 ^= mix[1]; + r2 ^= mix[2]; + r3 ^= mix[3]; + r4 ^= mix[4]; + r5 ^= mix[5]; + r6 ^= mix[6]; + r7 ^= mix[7]; } - _mm_store_si128(xout + 0, x0); - _mm_store_si128(xout + 1, x1); - _mm_store_si128(xout + 2, x2); - _mm_store_si128(xout + 3, x3); + uint64_t* out64 = (uint64_t*)out; + + out64[0] = r0; + out64[1] = r1; + out64[2] = r2; + out64[3] = r3; + out64[4] = 
r4; + out64[5] = r5; + out64[6] = r6; + out64[7] = r7; } template @@ -98,7 +94,7 @@ namespace RandomX { memory.mx ^= addr; memory.mx &= -64; //align to cache line std::swap(memory.mx, memory.ma); - PREFETCH(memory); + PREFETCHNTA(memory.ds.dataset + memory.ma); for (int i = 0; i < RegistersCount; ++i) reg.r[i].u64 ^= datasetLine[i]; } diff --git a/src/hashAes1Rx4.cpp b/src/hashAes1Rx4.cpp index 1f25335..623d4b6 100644 --- a/src/hashAes1Rx4.cpp +++ b/src/hashAes1Rx4.cpp @@ -71,3 +71,44 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) { template void hashAes1Rx4(const void *input, size_t inputSize, void *hash); template void hashAes1Rx4(const void *input, size_t inputSize, void *hash); + +template +void fillAes1Rx4(void *state, size_t outputSize, void *buffer) { + const uint8_t* outptr = (uint8_t*)buffer; + const uint8_t* outputEnd = outptr + outputSize; + + __m128i state0, state1, state2, state3; + __m128i key0, key1, key2, key3; + + key0 = _mm_set_epi32(0x9274f206, 0x79498d2f, 0x7d2de6ab, 0x67a04d26); + key1 = _mm_set_epi32(0xe1f7af05, 0x2a3a6f1d, 0x86658a15, 0x4f719812); + key2 = _mm_set_epi32(0xd1b1f791, 0x9e2ec914, 0x14c77bce, 0xba90750e); + key3 = _mm_set_epi32(0x179d0fd9, 0x6e57883c, 0xa53bbe4f, 0xaa07621f); + + state0 = _mm_load_si128((__m128i*)state + 0); + state1 = _mm_load_si128((__m128i*)state + 1); + state2 = _mm_load_si128((__m128i*)state + 2); + state3 = _mm_load_si128((__m128i*)state + 3); + + while (outptr < outputEnd) { + state0 = aesdec(state0, key0); + state1 = aesenc(state1, key1); + state2 = aesdec(state2, key2); + state3 = aesenc(state3, key3); + + _mm_store_si128((__m128i*)outptr + 0, state0); + _mm_store_si128((__m128i*)outptr + 1, state1); + _mm_store_si128((__m128i*)outptr + 2, state2); + _mm_store_si128((__m128i*)outptr + 3, state3); + + outptr += 64; + } + + _mm_store_si128((__m128i*)state + 0, state0); + _mm_store_si128((__m128i*)state + 1, state1); + _mm_store_si128((__m128i*)state + 2, state2); + 
_mm_store_si128((__m128i*)state + 3, state3); +} + +template void fillAes1Rx4(void *state, size_t outputSize, void *buffer); +template void fillAes1Rx4(void *state, size_t outputSize, void *buffer); diff --git a/src/hashAes1Rx4.hpp b/src/hashAes1Rx4.hpp index a9af1fc..8c0c156 100644 --- a/src/hashAes1Rx4.hpp +++ b/src/hashAes1Rx4.hpp @@ -20,4 +20,7 @@ along with RandomX. If not, see. #include "softAes.h" template -void hashAes1Rx4(const void *input, size_t inputSize, void *hash); \ No newline at end of file +void hashAes1Rx4(const void *input, size_t inputSize, void *hash); + +template +void fillAes1Rx4(void *state, size_t outputSize, void *buffer); diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 55c9b79..d24800e 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -37,8 +37,9 @@ along with RandomX. If not, see. #define WT_INEG_R 2 #define WT_IXOR_R 12 #define WT_IXOR_M 3 -#define WT_IROR_R 12 -#define WT_IROL_R 12 +#define WT_IROR_R 10 +#define WT_IROL_R 10 +#define WT_ISWAP_R 4 //Common floating point #define WT_FPSWAP_R 8 @@ -72,7 +73,7 @@ constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \ WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \ WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \ WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \ -WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \ +WT_ISWAP_R + WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \ WT_FPNEG_R + WT_FPMUL_R + WT_FPMUL_M + WT_FPDIV_R + WT_FPDIV_M + \ WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP; diff --git a/src/instructionsPortable.cpp b/src/instructionsPortable.cpp index 78bdb6f..9e1eff1 100644 --- a/src/instructionsPortable.cpp +++ b/src/instructionsPortable.cpp @@ -17,7 +17,6 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. 
*/ //#define DEBUG -#include "instructions.hpp" #include "intrinPortable.h" #pragma STDC FENV_ACCESS on #include @@ -29,14 +28,14 @@ along with RandomX. If not, see. #if defined(__SIZEOF_INT128__) typedef unsigned __int128 uint128_t; typedef __int128 int128_t; - static inline uint64_t __umulhi64(uint64_t a, uint64_t b) { + uint64_t mulh(uint64_t a, uint64_t b) { return ((uint128_t)a * b) >> 64; } - static inline uint64_t __imulhi64(int64_t a, int64_t b) { + int64_t smulh(int64_t a, int64_t b) { return ((int128_t)a * b) >> 64; } - #define umulhi64 __umulhi64 - #define imulhi64 __imulhi64 + #define HAVE_MULH + #define HAVE_SMULH #endif #if defined(_MSC_VER) @@ -44,62 +43,62 @@ along with RandomX. If not, see. #define EVAL_DEFINE(X) HAS_VALUE(X) #include #include - #define ror64 _rotr64 - #define rol64 _rotl64 + + uint64_t rotl(uint64_t x, int c) { + return _rotl64(x, c); + } + uint64_t rotr(uint64_t x , int c) { + return _rotr64(x, c); + } + #define HAVE_ROTL + #define HAVE_ROTR + #if EVAL_DEFINE(__MACHINEARM64_X64(1)) - #define umulhi64 __umulh + uint64_t mulh(uint64_t a, uint64_t b) { + return __umulh(a, b); + } + #define HAVE_MULH #endif + #if EVAL_DEFINE(__MACHINEX64(1)) - static inline uint64_t __imulhi64(int64_t a, int64_t b) { + int64_t smulh(int64_t a, int64_t b) { int64_t hi; _mul128(a, b, &hi); return hi; } - #define imulhi64 __imulhi64 + #define HAVE_SMULH #endif - static inline uint32_t _setRoundMode(uint32_t mode) { - return _controlfp(mode, _MCW_RC); + + static void setRoundMode__(uint32_t mode) { + _controlfp(mode, _MCW_RC); } - #define setRoundMode _setRoundMode + #define HAVE_SETROUNDMODE_IMPL #endif -#ifndef setRoundMode - #define setRoundMode fesetround +#ifndef HAVE_SETROUNDMODE_IMPL + static void setRoundMode__(uint32_t mode) { + fesetround(mode); + } #endif -#ifndef ror64 - static inline uint64_t __ror64(uint64_t a, int b) { +#ifndef HAVE_ROTR + uint64_t rotr(uint64_t a, int b) { return (a >> b) | (a << (64 - b)); } - #define ror64 __ror64 + 
#define HAS_ROTR #endif -#ifndef rol64 - static inline uint64_t __rol64(uint64_t a, int b) { +#ifndef HAVE_ROTL + uint64_t rotl(uint64_t a, int b) { return (a << b) | (a >> (64 - b)); } - #define rol64 __rol64 + #define HAS_ROTL #endif -#ifndef sar64 - #include - constexpr int64_t builtintShr64(int64_t value, int shift) noexcept { - return value >> shift; - } - - struct UsesArithmeticShift : std::integral_constant { - }; - - static inline int64_t __sar64(int64_t a, int b) { - return UsesArithmeticShift::value ? builtintShr64(a, b) : (a < 0 ? ~(~a >> b) : a >> b); - } - #define sar64 __sar64 -#endif - -#ifndef umulhi64 +#ifndef HAVE_MULH #define LO(x) ((x)&0xffffffff) #define HI(x) ((x)>>32) - static inline uint64_t __umulhi64(uint64_t a, uint64_t b) { + uint64_t mulh(uint64_t a, uint64_t b) { uint64_t ah = HI(a), al = LO(a); uint64_t bh = HI(b), bl = LO(b); uint64_t x00 = al * bl; @@ -112,17 +111,17 @@ along with RandomX. If not, see. return (m3 << 32) + LO(m2); } - #define umulhi64 __umulhi64 + #define HAVE_MULH #endif -#ifndef imulhi64 - static inline int64_t __imulhi64(int64_t a, int64_t b) { - int64_t hi = umulhi64(a, b); +#ifndef HAVE_SMULH + int64_t smulh(int64_t a, int64_t b) { + int64_t hi = mulh(a, b); if (a < 0LL) hi -= b; if (b < 0LL) hi -= a; return hi; } - #define imulhi64 __imulhi64 + #define HAVE_SMULH #endif // avoid undefined behavior of signed overflow @@ -137,20 +136,20 @@ static inline int32_t safeSub(int32_t a, int32_t b) { #if defined(__has_builtin) #if __has_builtin(__builtin_sub_overflow) - static inline bool __subOverflow(int32_t a, int32_t b) { + static inline bool subOverflow__(int32_t a, int32_t b) { int32_t temp; return __builtin_sub_overflow(a, b, &temp); } - #define subOverflow __subOverflow + #define HAVE_SUB_OVERFLOW #endif #endif -#ifndef subOverflow - static inline bool __subOverflow(int32_t a, int32_t b) { +#ifndef HAVE_SUB_OVERFLOW + static inline bool subOverflow__(int32_t a, int32_t b) { auto c = safeSub(a, b); return (c < a) 
!= (b > 0); } - #define subOverflow __subOverflow + #define HAVE_SUB_OVERFLOW #endif static inline double FlushDenormalNaN(double x) { @@ -165,47 +164,57 @@ static inline double FlushNaN(double x) { return x != x ? 0.0 : x; } +void setRoundMode(uint32_t rcflag) { + switch (rcflag & 3) { + case RoundDown: + setRoundMode__(FE_DOWNWARD); + break; + case RoundUp: + setRoundMode__(FE_UPWARD); + break; + case RoundToZero: + setRoundMode__(FE_TOWARDZERO); + break; + default: + setRoundMode__(FE_TONEAREST); + break; + } +} + +bool condition(uint32_t type, int32_t value, int32_t imm32) { + switch (type & 7) + { + case 0: + return (uint32_t)value <= (uint32_t)imm32; + case 1: + return (uint32_t)value > (uint32_t)imm32; + case 2: + return safeSub(value, imm32) < 0; + case 3: + return safeSub(value, imm32) >= 0; + case 4: + return subOverflow__(value, imm32); + case 5: + return !subOverflow__(value, imm32); + case 6: + return value < imm32; + case 7: + return value >= imm32; + } +} + +void initFpu() { +#ifdef __SSE2__ + _mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled +#else + setRoundMode(FE_TONEAREST); +#endif +} + namespace RandomX { extern "C" { - - void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 + b.u64; - } - - void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u32 + b.u32; - } - - void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 - b.u64; - } - - void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u32 - b.u32; - } - - void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 * b.u64; - } - - void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = umulhi64(a.u64, b.u64); - } - - void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = (uint64_t)a.u32 * b.u32; - } - - void IMUL_32(convertible_t& a, convertible_t& b, 
convertible_t& c) { - c.i64 = (int64_t)a.i32 * b.i32; - } - - void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.i64 = imulhi64(a.i64, b.i64); - } - - void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) { + /*void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) { c.u64 = a.u64 / (b.u32 != 0 ? b.u32 : 1U); } @@ -216,80 +225,6 @@ namespace RandomX { c.i64 = a.i64 / (b.i32 != 0 ? b.i32 : 1); } - void AND_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 & b.u64; - } - - void AND_32(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u32 & b.u32; - } - - void OR_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 | b.u64; - } - - void OR_32(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u32 | b.u32; - } - - void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 ^ b.u64; - } - - void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u32 ^ b.u32; - } - - void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 << (b.u64 & 63); - } - - void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 >> (b.u64 & 63); - } - - void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = sar64(a.i64, b.u64 & 63); - } - - void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = rol64(a.u64, (b.u64 & 63)); - } - - void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = ror64(a.u64, (b.u64 & 63)); - } - - bool JMP_COND(uint8_t type, convertible_t& regb, int32_t imm32) { - switch (type & 7) - { - case 0: - return regb.u32 <= (uint32_t)imm32; - case 1: - return regb.u32 > (uint32_t)imm32; - case 2: - return safeSub(regb.i32, imm32) < 0; - case 3: - return safeSub(regb.i32, imm32) >= 0; - case 4: - return subOverflow(regb.i32, imm32); - case 5: - return !subOverflow(regb.i32, imm32); - case 
6: - return regb.i32 < imm32; - case 7: - return regb.i32 >= imm32; - } - } - - void FPINIT() { -#ifdef __SSE2__ - _mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled -#else - setRoundMode(FE_TONEAREST); -#endif - } - void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { #ifdef __SSE2__ __m128i ai = _mm_loadl_epi64((const __m128i*)&a); @@ -368,48 +303,8 @@ namespace RandomX { c.lo.f64 = sqrt(std::abs(alo)); c.hi.f64 = sqrt(std::abs(ahi)); #endif - } + }*/ + - void FPROUND(convertible_t a, uint8_t rot) { - uint64_t flag = ror64(a.u64, rot); - switch (flag & 3) { - case RoundDown: -#ifdef DEBUG - std::cout << "Round FE_DOWNWARD (" << FE_DOWNWARD << ") = " << -#endif - setRoundMode(FE_DOWNWARD); -#ifdef DEBUG - std::cout << std::endl; -#endif - break; - case RoundUp: -#ifdef DEBUG - std::cout << "Round FE_UPWARD (" << FE_UPWARD << ") = " << -#endif - setRoundMode(FE_UPWARD); -#ifdef DEBUG - std::cout << std::endl; -#endif - break; - case RoundToZero: -#ifdef DEBUG - std::cout << "Round FE_TOWARDZERO (" << FE_TOWARDZERO << ") = " << -#endif - setRoundMode(FE_TOWARDZERO); -#ifdef DEBUG - std::cout << std::endl; -#endif - break; - default: -#ifdef DEBUG - std::cout << "Round FE_TONEAREST (" << FE_TONEAREST << ") = " << -#endif - setRoundMode(FE_TONEAREST); -#ifdef DEBUG - std::cout << std::endl; -#endif - break; - } - } } } \ No newline at end of file diff --git a/src/intrinPortable.h b/src/intrinPortable.h index 3a473a2..3d2136c 100644 --- a/src/intrinPortable.h +++ b/src/intrinPortable.h @@ -19,6 +19,8 @@ along with RandomX. If not, see. 
#pragma once +#include + #if defined(_MSC_VER) #if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2) #define __SSE2__ 1 @@ -45,6 +47,18 @@ typedef union { uint8_t u8[16]; } __m128i; +typedef struct { + double lo; + double hi; +} __m128d; + +inline __m128d _mm_load_pd(const double* pd) { + __m128d x; + x.lo = *(pd + 0); + x.hi = *(pd + 1); + return x; +} + static const char* platformError = "Platform doesn't support hardware AES"; inline __m128i _mm_aeskeygenassist_si128(__m128i key, uint8_t rcon) { @@ -131,4 +145,17 @@ inline __m128i _mm_slli_si128(__m128i _A, int _Imm) { return _A; } -#endif \ No newline at end of file +#endif + +constexpr int RoundToNearest = 0; +constexpr int RoundDown = 1; +constexpr int RoundUp = 2; +constexpr int RoundToZero = 3; + +uint64_t mulh(uint64_t, uint64_t); +int64_t smulh(int64_t, int64_t); +uint64_t rotl(uint64_t, int); +uint64_t rotr(uint64_t, int); +void initFpu(); +void setRoundMode(uint32_t); +bool condition(uint32_t, int32_t, int32_t); diff --git a/src/main.cpp b/src/main.cpp index 4f5a021..c761b97 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -35,6 +35,7 @@ along with RandomX. If not, see. 
#include "dataset.hpp" #include "Cache.hpp" #include "Pcg32.hpp" +#include "hashAes1Rx4.hpp" const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; @@ -153,7 +154,7 @@ void generateNative(int nonce) { } void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash& result, int noncesCount, int thread, uint8_t* scratchpad) { - uint64_t hash[4]; + alignas(16) uint64_t hash[8]; unsigned char blockTemplate[] = { 0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14, 0x5a, 0xc5, 0xfa, 0xd3, 0xaa, 0x3a, 0xf6, 0xea, 0x44, 0xc1, 0x18, 0x69, 0xdc, 0x4f, 0x85, 0x3f, 0x00, 0x2b, 0x2e, @@ -167,8 +168,8 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash //std::cout << "Thread " << thread << " nonce " << nonce << std::endl; *noncePtr = nonce; blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); - int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 15) << 8); - vm->initializeScratchpad(scratchpad, spIndex); + fillAes1Rx4((void*)hash, RandomX::ScratchpadSize, scratchpad); + //vm->initializeScratchpad(scratchpad, spIndex); vm->setScratchpad(scratchpad); //dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt"); for (int chain = 0; chain < 16; ++chain) { @@ -309,7 +310,7 @@ int main(int argc, char** argv) { } uint8_t* scratchpadMem; if (largePages) { - scratchpadMem = (uint8_t*)allocLargePagesMemory(RandomX::ScratchpadSize * (threadCount + 1) / 2); + scratchpadMem = (uint8_t*)allocLargePagesMemory(threadCount * RandomX::ScratchpadSize); } else { scratchpadMem = (uint8_t*)_mm_malloc(threadCount * RandomX::ScratchpadSize, RandomX::CacheLineSize); diff --git a/src/squareHash.S b/src/squareHash.S new file mode 100644 index 0000000..4cd3b54 --- /dev/null +++ b/src/squareHash.S @@ 
-0,0 +1,17 @@ +.intel_syntax noprefix +#if defined(__APPLE__) +.text +#else +.section .text +#endif +#if defined(__WIN32__) || defined(__APPLE__) +#define DECL(x) _##x +#else +#define DECL(x) x +#endif + +.global DECL(squareHash) + +DECL(squareHash): + mov rcx, rsi + #include "asm/squareHash.inc" diff --git a/src/squareHash.asm b/src/squareHash.asm new file mode 100644 index 0000000..4433719 --- /dev/null +++ b/src/squareHash.asm @@ -0,0 +1,9 @@ +PUBLIC squareHash + +.code + +squareHash PROC + include asm/squareHash.inc +squareHash ENDP + +END \ No newline at end of file diff --git a/src/squareHash.h b/src/squareHash.h new file mode 100644 index 0000000..f80b492 --- /dev/null +++ b/src/squareHash.h @@ -0,0 +1,71 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. 
+*/ + +#include + +#if !defined(_M_X64) && !defined(__x86_64__) + +typedef struct { + uint64_t lo; + uint64_t hi; +} uint128_t; + +#define LO(x) ((x)&0xffffffff) +#define HI(x) ((x)>>32) +static inline uint128_t square128(uint64_t x) { + uint64_t xh = HI(x), xl = LO(x); + uint64_t xll = xl * xl; + uint64_t xlh = xl * xh; + uint64_t xhh = xh * xh; + uint64_t m1 = 2 * LO(xlh) + HI(xll); + uint64_t m2 = 2 * HI(xlh) + LO(xhh) + HI(m1); + uint64_t m3 = HI(xhh) + HI(m2); + + uint128_t x2; + + x2.lo = (m1 << 32) + LO(xll); + x2.hi = (m3 << 32) + LO(m2); + + return x2; +} +#undef LO(x) +#undef HI(x) + +inline uint64_t squareHash(uint64_t x) { + x += 1613783669344650115; + for (int i = 0; i < 42; ++i) { + uint128_t x2 = square128(x); + x = x2.lo - x2.hi; + } + return x; +} + +#else + +#if defined(__cplusplus) +extern "C" { +#endif + +uint64_t squareHash(uint64_t); + +#if defined(__cplusplus) +} +#endif + +#endif \ No newline at end of file From b417fd08eaf8036fafc0faeb85c6d88d807b09cc Mon Sep 17 00:00:00 2001 From: tevador Date: Tue, 5 Feb 2019 23:06:44 +0100 Subject: [PATCH 29/35] 16 -> 8 chained programs constant address loads are always from L3 --- src/AssemblyGeneratorX86.cpp | 2 +- src/Instruction.cpp | 2 +- src/JitCompilerX86.cpp | 2 +- src/common.hpp | 3 +- src/instructionWeights.hpp | 4 +- src/main.cpp | 2 +- src/program.inc | 1095 +++++++++++++++++++++++----------- 7 files changed, 749 insertions(+), 361 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 3092e4d..70d396b 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -81,7 +81,7 @@ namespace RandomX { } int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) { - return instr.imm32 & ((instr.mod % 4) ? 
ScratchpadL1Mask : ScratchpadL2Mask); + return instr.imm32 & ScratchpadL3Mask; } //1 uOP diff --git a/src/Instruction.cpp b/src/Instruction.cpp index ce75f43..8a175fc 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -37,7 +37,7 @@ namespace RandomX { } void Instruction::genAddressImm(std::ostream& os) const { - os << ((mod % 4) ? "L1" : "L2") << "[" << (imm32 & ((mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)) << "]"; + os << "L3" << "[" << (imm32 & ScratchpadL3Mask) << "]"; } void Instruction::h_IADD_R(std::ostream& os) const { diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index d8e7a42..e926e4a 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -262,7 +262,7 @@ namespace RandomX { } void JitCompilerX86::genAddressImm(Instruction& instr) { - emit32(instr.imm32 & ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask)); + emit32(instr.imm32 & ScratchpadL3Mask); } void JitCompilerX86::h_IADD_R(Instruction& instr) { diff --git a/src/common.hpp b/src/common.hpp index e52dbc2..ea67ff9 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -68,7 +68,7 @@ namespace RandomX { }; constexpr int ProgramLength = 256; - constexpr uint32_t InstructionCount = 1024; + constexpr uint32_t InstructionCount = 2048; constexpr uint32_t ScratchpadSize = 2 * 1024 * 1024; constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t); constexpr uint32_t ScratchpadL1 = ScratchpadSize / 128 / sizeof(convertible_t); @@ -78,6 +78,7 @@ namespace RandomX { constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8; constexpr int ScratchpadL1Mask16 = (ScratchpadL1 / 2 - 1) * 16; constexpr int ScratchpadL2Mask16 = (ScratchpadL2 / 2 - 1) * 16; + constexpr int ScratchpadL3Mask = (ScratchpadLength - 1) * 8; constexpr uint32_t TransformationCount = 90; constexpr int RegistersCount = 8; diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index d24800e..0bb26ff 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp 
@@ -25,7 +25,7 @@ along with RandomX. If not, see. #define WT_IADD_RC 12 #define WT_ISUB_R 12 #define WT_ISUB_M 3 -#define WT_IMUL_9C 10 +#define WT_IMUL_9C 9 #define WT_IMUL_R 16 #define WT_IMUL_M 4 #define WT_IMULH_R 4 @@ -36,7 +36,7 @@ along with RandomX. If not, see. #define WT_ISDIV_C 4 #define WT_INEG_R 2 #define WT_IXOR_R 12 -#define WT_IXOR_M 3 +#define WT_IXOR_M 4 #define WT_IROR_R 10 #define WT_IROL_R 10 #define WT_ISWAP_R 4 diff --git a/src/main.cpp b/src/main.cpp index c761b97..58b381c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -172,7 +172,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash //vm->initializeScratchpad(scratchpad, spIndex); vm->setScratchpad(scratchpad); //dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt"); - for (int chain = 0; chain < 16; ++chain) { + for (int chain = 0; chain < 8; ++chain) { vm->initializeProgram(hash); vm->execute(); vm->getResult(nullptr, 0, hash); diff --git a/src/program.inc b/src/program.inc index d901e9a..e4de06f 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,381 +1,768 @@ - ; FPMUL_R e0, a2 - mulpd xmm4, xmm10 - ; IADD_RC r2, r5, -1621224194 - lea r10, [r10+r13-1621224194] - ; ISTORE L2[r2], r7 - mov eax, r10d + ; IMUL_R r0, r7 + imul r8, r15 + ; ISMULH_R r2, r1 + mov rax, r10 + imul r9 + mov r10, rdx + ; IMUL_R r2, r4 + imul r10, r12 + ; IADD_R r7, r0 + add r15, r8 + ; FPSQRT_R e0 + sqrtpd xmm4, xmm4 + ; IMUL_R r3, r6 + imul r11, r14 + ; FPMUL_R e3, a1 + mulpd xmm7, xmm9 + ; IMULH_M r6, L1[r3] + mov ecx, r11d + and ecx, 16376 + mov rax, r14 + mul qword ptr [rsi+rcx] + mov r14, rdx + ; IMUL_R r5, r1 + imul r13, r9 + ; FPADD_M f0, L2[r6] + mov eax, r14d and eax, 262136 - mov qword ptr [rsi+rax], r15 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm0, xmm12 + ; IROR_R r4, r3 + mov ecx, r11d + ror r12, cl + ; IXOR_M r4, L3[984888] + xor r12, qword ptr [rsi+984888] + ; IROR_R r0, r3 + mov ecx, r11d + ror r8, cl + ; 
IROR_R r0, r4 + mov ecx, r12d + ror r8, cl + ; FPMUL_R e0, a1 + mulpd xmm4, xmm9 + ; IMUL_R r0, r2 + imul r8, r10 + ; ISUB_M r0, L1[r3] + mov eax, r11d + and eax, 16376 + sub r8, qword ptr [rsi+rax] + ; FPSUB_R f3, a1 + subpd xmm3, xmm9 + ; ISWAP_R r7, r4 + xchg r15, r12 + ; IDIV_C r1, 3690475308 + mov rax, r9 + shr rax, 2 + mov rcx, 5367070356934653253 + mul rcx + shr rdx, 28 + add r9, rdx + ; IROL_R r4, r2 + mov ecx, r10d + rol r12, cl + ; IMUL_M r5, L1[r4] + mov eax, r12d + and eax, 16376 + imul r13, qword ptr [rsi+rax] + ; IROL_R r4, r7 + mov ecx, r15d + rol r12, cl + ; ISUB_R r3, r1 + sub r11, r9 + ; IADD_R r7, r0 + add r15, r8 + ; IADD_M r1, L1[r3] + mov eax, r11d + and eax, 16376 + add r9, qword ptr [rsi+rax] ; FPMUL_R e2, a2 mulpd xmm6, xmm10 - ; IMUL_R r6, r3 - imul r14, r11 - ; FPSUB_M f1, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; IROL_R r5, r3 - mov ecx, r11d - rol r13, cl - ; FPMUL_R e2, a0 - mulpd xmm6, xmm8 - ; FPSUB_R f3, a0 - subpd xmm3, xmm8 - ; IXOR_R r0, r4 - xor r8, r12 - ; ISMULH_M r3, L1[r7] - mov ecx, r15d - and ecx, 16376 - mov rax, r11 - imul qword ptr [rsi+rcx] - mov r11, rdx - ; FPSWAP_R f2 - shufpd xmm2, xmm2, 1 - ; IDIV_C r6, 1248528248 - mov rax, 15864311168205210203 - mul r14 - shr rdx, 30 - add r14, rdx - ; FPMUL_R e0, a2 - mulpd xmm4, xmm10 - ; IADD_RC r3, r4, -52260428 - lea r11, [r11+r12-52260428] - ; IADD_R r7, -1138617760 - add r15, -1138617760 - ; IROL_R r2, r6 - mov ecx, r14d - rol r10, cl - ; FPNEG_R f2 - xorps xmm2, xmm15 - ; IROR_R r7, r1 - mov ecx, r9d - ror r15, cl - ; COND_R r2, lt(r7, -41618808) - xor ecx, ecx - cmp r15d, -41618808 - setl cl - add r10, rcx - ; FPMUL_R e3, a0 - mulpd xmm7, xmm8 - ; CFROUND r1, 43 - mov rax, r9 - rol rax, 34 - and eax, 24576 - or eax, 40896 - mov dword ptr [rsp-8], eax - ldmxcsr dword ptr [rsp-8] - ; FPADD_R f2, a1 - addpd xmm2, xmm9 - ; FPSUB_M f0, L1[r7] - mov eax, r15d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - 
subpd xmm0, xmm12 - ; ISTORE L1[r6], r2 - mov eax, r14d - and eax, 16376 - mov qword ptr [rsi+rax], r10 - ; ISUB_R r6, r5 - sub r14, r13 - ; IADD_M r0, L1[r4] - mov eax, r12d - and eax, 16376 - add r8, qword ptr [rsi+rax] + ; IADD_R r6, -1115286770 + add r14, -1115286770 + ; FPDIV_R e2, a3 + divpd xmm6, xmm11 + maxpd xmm6, xmm13 + ; FPADD_R f1, a2 + addpd xmm1, xmm10 + ; IXOR_R r3, r7 + xor r11, r15 ; ISTORE L1[r4], r3 mov eax, r12d and eax, 16376 mov qword ptr [rsi+rax], r11 - ; ISTORE L1[r6], r6 - mov eax, r14d - and eax, 16376 - mov qword ptr [rsi+rax], r14 - ; FPSQRT_R e0 - sqrtpd xmm4, xmm4 - ; IXOR_R r2, r5 - xor r10, r13 - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 - ; FPMUL_R e1, a3 - mulpd xmm5, xmm11 - ; IMULH_R r7, r6 - mov rax, r15 - mul r14 - mov r15, rdx - ; ISDIV_C r0, -1706892622 - mov rax, -5802075764249827661 - imul r8 - xor eax, eax - sar rdx, 29 - sets al - add rdx, rax - add r8, rdx - ; IMUL_R r5, r3 - imul r13, r11 - ; FPSQRT_R e2 - sqrtpd xmm6, xmm6 - ; FPADD_M f3, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 - ; IADD_R r3, r2 - add r11, r10 - ; FPADD_R f1, a0 - addpd xmm1, xmm8 - ; FPDIV_R e3, a2 - divpd xmm7, xmm10 - maxpd xmm7, xmm13 - ; FPSUB_R f0, a1 - subpd xmm0, xmm9 - ; IMUL_M r5, L1[r6] - mov eax, r14d - and eax, 16376 - imul r13, qword ptr [rsi+rax] - ; IADD_RC r1, r2, -1263285243 - lea r9, [r9+r10-1263285243] - ; IMUL_9C r4, 1994773931 - lea r12, [r12+r12*8+1994773931] - ; FPSWAP_R e3 - shufpd xmm7, xmm7, 1 - ; IMUL_M r0, L1[r7] - mov eax, r15d - and eax, 16376 - imul r8, qword ptr [rsi+rax] - ; IROR_R r1, r6 + ; IROR_R r3, r6 mov ecx, r14d - ror r9, cl - ; IROL_R r2, r4 - mov ecx, r12d - rol r10, cl - ; FPSUB_R f3, a1 - subpd xmm3, xmm9 - ; ISTORE L1[r0], r5 - mov eax, r8d - and eax, 16376 - mov qword ptr [rsi+rax], r13 - ; FPDIV_M e2, L2[r3] - mov eax, r11d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - divpd xmm6, xmm12 - maxpd xmm6, xmm13 - ; FPSWAP_R f2 - shufpd xmm2, 
xmm2, 1 - ; IADD_R r7, r5 - add r15, r13 - ; FPDIV_M e0, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - divpd xmm4, xmm12 - maxpd xmm4, xmm13 - ; FPADD_M f3, L1[r5] - mov eax, r13d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 - ; FPADD_R f0, a3 - addpd xmm0, xmm11 - ; IADD_R r2, r0 - add r10, r8 - ; ISTORE L1[r3], r6 - mov eax, r11d - and eax, 16376 - mov qword ptr [rsi+rax], r14 - ; IROR_R r1, r7 - mov ecx, r15d - ror r9, cl - ; IMUL_9C r5, 301671287 - lea r13, [r13+r13*8+301671287] - ; IXOR_R r7, 266992378 - xor r15, 266992378 - ; FPSQRT_R e3 - sqrtpd xmm7, xmm7 - ; IMUL_M r2, L2[r0] - mov eax, r8d - and eax, 262136 - imul r10, qword ptr [rsi+rax] + ror r11, cl + ; ISMULH_R r0, r6 + mov rax, r8 + imul r14 + mov r8, rdx + ; IROR_R r6, r5 + mov ecx, r13d + ror r14, cl + ; IMULH_M r6, L2[r0] + mov ecx, r8d + and ecx, 262136 + mov rax, r14 + mul qword ptr [rsi+rcx] + mov r14, rdx + ; ISUB_R r2, 1512125960 + sub r10, 1512125960 + ; IMUL_R r7, r6 + imul r15, r14 + ; IMULH_R r6, r7 + mov rax, r14 + mul r15 + mov r14, rdx + ; ISUB_R r4, r1 + sub r12, r9 ; FPMUL_R e3, a2 mulpd xmm7, xmm10 - ; IMUL_R r0, r6 - imul r8, r14 - ; ISTORE L1[r0], r7 - mov eax, r8d - and eax, 16376 - mov qword ptr [rsi+rax], r15 - ; FPNEG_R f0 - xorps xmm0, xmm15 - ; FPADD_M f3, L1[r5] - mov eax, r13d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 - ; IROR_R r5, r4 - mov ecx, r12d - ror r13, cl - ; ISTORE L2[r7], r2 - mov eax, r15d - and eax, 262136 - mov qword ptr [rsi+rax], r10 - ; FPADD_R f2, a3 - addpd xmm2, xmm11 - ; FPADD_M f3, L1[r2] - mov eax, r10d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 - ; ISDIV_C r5, -2076168315 - mov rax, -4770095103914078469 - imul r13 - xor eax, eax - sar rdx, 29 - sets al - add rdx, rax - add r13, rdx - ; IADD_RC r0, r4, -1321374359 - lea r8, [r8+r12-1321374359] - ; CFROUND r6, 28 - mov rax, r14 - rol rax, 49 - and eax, 24576 - or eax, 40896 
- mov dword ptr [rsp-8], eax - ldmxcsr dword ptr [rsp-8] - ; FPADD_R f2, a2 - addpd xmm2, xmm10 - ; IROL_R r7, r6 - mov ecx, r14d - rol r15, cl - ; ISUB_R r2, r4 - sub r10, r12 - ; ISMULH_R r0, -1500893068 - mov rax, -1500893068 - imul r8 - add r8, rdx - ; IADD_R r2, r3 - add r10, r11 - ; FPSQRT_R e2 - sqrtpd xmm6, xmm6 - ; IROL_R r7, r4 - mov ecx, r12d - rol r15, cl - ; IMUL_R r4, r2 - imul r12, r10 - ; ISUB_R r3, r7 - sub r11, r15 - ; IADD_R r2, r7 - add r10, r15 - ; FPDIV_R e3, a0 - divpd xmm7, xmm8 - maxpd xmm7, xmm13 - ; ISUB_R r6, 540663146 - sub r14, 540663146 - ; IROL_R r5, 58 - rol r13, 58 - ; FPADD_R f2, a1 - addpd xmm2, xmm9 - ; FPADD_R f2, a2 - addpd xmm2, xmm10 - ; FPDIV_R e1, a2 - divpd xmm5, xmm10 - maxpd xmm5, xmm13 - ; FPADD_R f1, a2 - addpd xmm1, xmm10 - ; IADD_R r5, r3 - add r13, r11 - ; IADD_R r7, -1780268176 - add r15, -1780268176 - ; ISUB_R r7, r0 - sub r15, r8 - ; ISTORE L2[r0], r7 - mov eax, r8d - and eax, 262136 - mov qword ptr [rsi+rax], r15 - ; INEG_R r2 - neg r10 - ; FPNEG_R f0 - xorps xmm0, xmm15 - ; INEG_R r2 - neg r10 - ; IADD_R r0, r3 - add r8, r11 - ; IMUL_9C r7, -2124093035 - lea r15, [r15+r15*8-2124093035] + ; FPSQRT_R e1 + sqrtpd xmm5, xmm5 + ; IXOR_R r5, r2 + xor r13, r10 ; FPADD_M f2, L1[r0] mov eax, r8d and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] addpd xmm2, xmm12 - ; FPMUL_M e0, L1[r6] + ; IMULH_R r6, r1 + mov rax, r14 + mul r9 + mov r14, rdx + ; ISUB_M r5, L1[r0] + mov eax, r8d + and eax, 16376 + sub r13, qword ptr [rsi+rax] + ; FPMUL_R e2, a3 + mulpd xmm6, xmm11 + ; IMUL_R r4, r6 + imul r12, r14 + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; ISUB_R r3, r2 + sub r11, r10 + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IROL_R r7, r0 + mov ecx, r8d + rol r15, cl + ; FPSUB_R f3, a2 + subpd xmm3, xmm10 + ; IROL_R r3, r7 + mov ecx, r15d + rol r11, cl + ; ISWAP_R r5, r7 + xchg r13, r15 + ; IDIV_C r5, 749951529 + mov rax, 13205547200481862341 + mul r13 + shr rdx, 29 + add r13, rdx + ; FPADD_R f3, a0 + addpd xmm3, xmm8 + ; IMUL_M r0, 
L1[r4] + mov eax, r12d + and eax, 16376 + imul r8, qword ptr [rsi+rax] + ; FPADD_R f1, a1 + addpd xmm1, xmm9 + ; IROR_R r2, 60 + ror r10, 60 + ; IROR_R r5, r4 + mov ecx, r12d + ror r13, cl + ; FPADD_R f2, a0 + addpd xmm2, xmm8 + ; IXOR_M r4, L1[r6] mov eax, r14d and eax, 16376 + xor r12, qword ptr [rsi+rax] + ; IXOR_R r2, r6 + xor r10, r14 + ; FPADD_M f3, L1[r0] + mov eax, r8d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm3, xmm12 + ; ISUB_R r7, r6 + sub r15, r14 + ; IMUL_9C r2, -962375579 + lea r10, [r10+r10*8-962375579] + ; FPSUB_R f3, a2 + subpd xmm3, xmm10 + ; FPSUB_R f3, a0 + subpd xmm3, xmm8 + ; IMUL_R r1, r5 + imul r9, r13 + ; IMUL_R r6, r4 + imul r14, r12 + ; ISWAP_R r0, r2 + xchg r8, r10 + ; ISUB_R r6, r5 + sub r14, r13 + ; FPSUB_R f2, a1 + subpd xmm2, xmm9 + ; ISDIV_C r6, 652931802 + mov rax, -3278972671018643631 + imul r14 + xor eax, eax + add rdx, r14 + sar rdx, 29 + sets al + add rdx, rax + add r14, rdx + ; IMUL_9C r5, -1142924545 + lea r13, [r13+r13*8-1142924545] + ; ISUB_R r7, 1085161834 + sub r15, 1085161834 + ; IMUL_R r4, r6 + imul r12, r14 + ; FPMUL_M e1, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm5, xmm12 + maxpd xmm5, xmm13 + ; FPMUL_M e3, L2[r1] + mov eax, r9d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm7, xmm12 + maxpd xmm7, xmm13 + ; COND_R r2, lt(r5, 1635027096) + xor ecx, ecx + cmp r13d, 1635027096 + setl cl + add r10, rcx + ; IMUL_R r5, -1219696062 + imul r13, -1219696062 + ; IXOR_R r5, r0 + xor r13, r8 + ; FPNEG_R f2 + xorps xmm2, xmm15 + ; FPADD_R f3, a2 + addpd xmm3, xmm10 + ; FPSUB_R f1, a3 + subpd xmm1, xmm11 + ; FPADD_R f1, a2 + addpd xmm1, xmm10 + ; FPDIV_R e1, a3 + divpd xmm5, xmm11 + maxpd xmm5, xmm13 + ; IXOR_M r6, L1[r0] + mov eax, r8d + and eax, 16376 + xor r14, qword ptr [rsi+rax] + ; ISUB_R r7, r4 + sub r15, r12 + ; ISUB_M r6, L1[r1] + mov eax, r9d + and eax, 16376 + sub r14, qword ptr [rsi+rax] + ; ISTORE L1[r5], r3 + mov eax, r13d + and 
eax, 16376 + mov qword ptr [rsi+rax], r11 + ; IMUL_R r5, r1 + imul r13, r9 + ; IROR_R r3, r2 + mov ecx, r10d + ror r11, cl + ; IMUL_R r4, r7 + imul r12, r15 + ; ISDIV_C r6, -54134756 + mov rax, 7012869325244995177 + imul r14 + xor eax, eax + sub rdx, r14 + sar rdx, 25 + sets al + add rdx, rax + add r14, rdx + ; FPMUL_R e1, a2 + mulpd xmm5, xmm10 + ; FPSUB_M f2, L2[r4] + mov eax, r12d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 + ; IMUL_R r0, r5 + imul r8, r13 + ; FPMUL_R e3, a0 + mulpd xmm7, xmm8 + ; COND_R r5, be(r4, 1545677311) + xor ecx, ecx + cmp r12d, 1545677311 + setbe cl + add r13, rcx + ; IMUL_R r6, r3 + imul r14, r11 + ; IROL_R r6, r2 + mov ecx, r10d + rol r14, cl + ; FPDIV_R e3, a1 + divpd xmm7, xmm9 + maxpd xmm7, xmm13 + ; IXOR_M r5, L1[r1] + mov eax, r9d + and eax, 16376 + xor r13, qword ptr [rsi+rax] + ; COND_R r3, ab(r2, 1734636060) + xor ecx, ecx + cmp r10d, 1734636060 + seta cl + add r11, rcx + ; ISTORE L1[r2], r7 + mov eax, r10d + and eax, 16376 + mov qword ptr [rsi+rax], r15 + ; IADD_R r5, r6 + add r13, r14 + ; FPSUB_R f1, a2 + subpd xmm1, xmm10 + ; FPADD_R f2, a1 + addpd xmm2, xmm9 + ; FPSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; IROL_R r2, r6 + mov ecx, r14d + rol r10, cl + ; IMUL_R r0, r4 + imul r8, r12 + ; FPSUB_R f0, a2 + subpd xmm0, xmm10 + ; ISUB_R r6, r7 + sub r14, r15 + ; IROL_R r4, r7 + mov ecx, r15d + rol r12, cl + ; FPMUL_R e2, a0 + mulpd xmm6, xmm8 + ; ISUB_R r1, r3 + sub r9, r11 + ; FPDIV_R e0, a1 + divpd xmm4, xmm9 + maxpd xmm4, xmm13 + ; FPADD_R f0, a1 + addpd xmm0, xmm9 + ; FPMUL_R e0, a2 + mulpd xmm4, xmm10 + ; FPSUB_R f2, a2 + subpd xmm2, xmm10 + ; FPSUB_M f2, L1[r6] + mov eax, r14d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 + ; FPMUL_R e0, a0 + mulpd xmm4, xmm8 + ; IXOR_M r4, L2[r7] + mov eax, r15d + and eax, 262136 + xor r12, qword ptr [rsi+rax] + ; FPSUB_R f3, a3 + subpd xmm3, xmm11 + ; ISMULH_R r1, r6 + mov rax, r9 + imul r14 + mov r9, rdx + ; COND_R r4, be(r7, 
224524971) + xor ecx, ecx + cmp r15d, 224524971 + setbe cl + add r12, rcx + ; FPADD_M f2, L1[r1] + mov eax, r9d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm2, xmm12 + ; IMUL_R r5, r4 + imul r13, r12 + ; IADD_RC r1, r5, 370966979 + lea r9, [r9+r13+370966979] + ; IADD_RC r7, r3, -1762209698 + lea r15, [r15+r11-1762209698] + ; FPMUL_M e3, L2[r2] + mov eax, r10d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm7, xmm12 + maxpd xmm7, xmm13 + ; ISUB_R r2, r7 + sub r10, r15 + ; IMUL_9C r3, 171157280 + lea r11, [r11+r11*8+171157280] + ; ISUB_R r3, r5 + sub r11, r13 + ; FPNEG_R f3 + xorps xmm3, xmm15 + ; FPNEG_R f2 + xorps xmm2, xmm15 + ; ISTORE L1[r4], r1 + mov eax, r12d + and eax, 16376 + mov qword ptr [rsi+rax], r9 + ; IADD_R r0, r2 + add r8, r10 + ; IXOR_R r7, r6 + xor r15, r14 + ; IROR_R r0, r4 + mov ecx, r12d + ror r8, cl + ; FPMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IXOR_M r4, L1[r7] + mov eax, r15d + and eax, 16376 + xor r12, qword ptr [rsi+rax] + ; ISTORE L1[r5], r7 + mov eax, r13d + and eax, 16376 + mov qword ptr [rsi+rax], r15 + ; IMUL_9C r7, -1206742834 + lea r15, [r15+r15*8-1206742834] + ; ISMULH_R r0, r4 + mov rax, r8 + imul r12 + mov r8, rdx + ; FPADD_R f2, a0 + addpd xmm2, xmm8 + ; FPSUB_R f1, a0 + subpd xmm1, xmm8 + ; INEG_R r7 + neg r15 + ; COND_M r0, of(L1[r5], -2056260506) + xor ecx, ecx + mov eax, r13d + and eax, 16376 + cmp dword ptr [rsi+rax], -2056260506 + seto cl + add r8, rcx + ; FPSQRT_R e2 + sqrtpd xmm6, xmm6 + ; IMUL_R r3, r4 + imul r11, r12 + ; FPNEG_R f1 + xorps xmm1, xmm15 + ; FPADD_M f2, L1[r5] + mov eax, r13d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm2, xmm12 + ; FPSUB_R f3, a0 + subpd xmm3, xmm8 + ; FPNEG_R f3 + xorps xmm3, xmm15 + ; FPMUL_M e3, L2[r5] + mov eax, r13d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm7, xmm12 + maxpd xmm7, xmm13 + ; ISTORE L1[r2], r2 + mov eax, r10d + and eax, 16376 + mov qword ptr [rsi+rax], r10 + ; IMUL_M r3, L2[r4] + mov eax, 
r12d + and eax, 262136 + imul r11, qword ptr [rsi+rax] + ; IROL_R r5, r6 + mov ecx, r14d + rol r13, cl + ; IADD_RC r4, r3, -904431293 + lea r12, [r12+r11-904431293] + ; FPSUB_R f1, a1 + subpd xmm1, xmm9 + ; IROL_R r7, r0 + mov ecx, r8d + rol r15, cl + ; ISTORE L2[r1], r7 + mov eax, r9d + and eax, 262136 + mov qword ptr [rsi+rax], r15 + ; IROL_R r4, r3 + mov ecx, r11d + rol r12, cl + ; IADD_R r5, r2 + add r13, r10 + ; COND_R r3, ge(r6, -444806705) + xor ecx, ecx + cmp r14d, -444806705 + setge cl + add r11, rcx + ; FPADD_R f0, a1 + addpd xmm0, xmm9 + ; IROL_R r0, 57 + rol r8, 57 + ; IADD_R r0, r2 + add r8, r10 + ; IADD_R r7, r4 + add r15, r12 + ; IROL_R r1, r7 + mov ecx, r15d + rol r9, cl + ; IXOR_M r7, L2[r5] + mov eax, r13d + and eax, 262136 + xor r15, qword ptr [rsi+rax] + ; ISTORE L1[r2], r0 + mov eax, r10d + and eax, 16376 + mov qword ptr [rsi+rax], r8 + ; FPADD_R f1, a2 + addpd xmm1, xmm10 + ; ISUB_R r1, r4 + sub r9, r12 + ; IXOR_R r5, r0 + xor r13, r8 + ; IXOR_M r7, L2[r1] + mov eax, r9d + and eax, 262136 + xor r15, qword ptr [rsi+rax] + ; FPSUB_R f0, a0 + subpd xmm0, xmm8 + ; IXOR_M r1, L1[r4] + mov eax, r12d + and eax, 16376 + xor r9, qword ptr [rsi+rax] + ; FPMUL_R e3, a0 + mulpd xmm7, xmm8 + ; ISDIV_C r1, 1473744194 + mov rax, -5006799265644655925 + imul r9 + xor eax, eax + add rdx, r9 + sar rdx, 30 + sets al + add rdx, rax + add r9, rdx + ; IMUL_9C r1, 1626151459 + lea r9, [r9+r9*8+1626151459] + ; IXOR_M r6, L1[r4] + mov eax, r12d + and eax, 16376 + xor r14, qword ptr [rsi+rax] + ; FPADD_R f0, a0 + addpd xmm0, xmm8 + ; FPADD_R f3, a2 + addpd xmm3, xmm10 + ; ISUB_R r6, r7 + sub r14, r15 + ; IADD_RC r1, r5, 2075955307 + lea r9, [r9+r13+2075955307] + ; IROL_R r6, r3 + mov ecx, r11d + rol r14, cl + ; IMULH_R r2, -1135671124 + mov eax, -1135671124 + mul r10 + add r10, rdx + ; ISUB_R r5, r2 + sub r13, r10 + ; IMULH_R r3, r5 + mov rax, r11 + mul r13 + mov r11, rdx + ; IADD_M r4, L3[386040] + add r12, qword ptr [rsi+386040] + ; COND_R r6, ge(r4, 1518758207) + xor 
ecx, ecx + cmp r12d, 1518758207 + setge cl + add r14, rcx + ; FPDIV_R e3, a1 + divpd xmm7, xmm9 + maxpd xmm7, xmm13 + ; FPNEG_R f2 + xorps xmm2, xmm15 + ; FPADD_M f1, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm1, xmm12 + ; FPMUL_M e0, L1[r4] + mov eax, r12d + and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] mulpd xmm4, xmm12 maxpd xmm4, xmm13 - ; FPSUB_R f2, a3 - subpd xmm2, xmm11 - ; IMUL_R r1, r2 - imul r9, r10 - ; IDIV_C r7, 3214009572 - mov rax, 12325439725582798855 - mul r15 - shr rdx, 31 - add r15, rdx - ; IMULH_R r3, r2 - mov rax, r11 - mul r10 - mov r11, rdx - ; IROR_R r1, r0 + ; FPSQRT_R e2 + sqrtpd xmm6, xmm6 + ; IROL_R r5, r1 + mov ecx, r9d + rol r13, cl + ; FPADD_R f3, a0 + addpd xmm3, xmm8 + ; IROL_R r3, r0 mov ecx, r8d - ror r9, cl - ; FPMUL_R e0, a1 - mulpd xmm4, xmm9 - ; IADD_RC r4, r4, 1456841848 - lea r12, [r12+r12+1456841848] - ; IROR_R r3, r2 - mov ecx, r10d - ror r11, cl - ; COND_M r0, of(L1[r4], 1678513610) - xor ecx, ecx - mov eax, r12d - and eax, 16376 - cmp dword ptr [rsi+rax], 1678513610 - seto cl - add r8, rcx - ; INEG_R r4 - neg r12 - ; IMUL_R r4, r1 - imul r12, r9 - ; FPADD_R f1, a2 - addpd xmm1, xmm10 - ; FPSUB_R f2, a0 - subpd xmm2, xmm8 - ; FPMUL_R e1, a2 - mulpd xmm5, xmm10 - ; FPSUB_R f0, a3 - subpd xmm0, xmm11 + rol r11, cl + ; FPMUL_R e3, a1 + mulpd xmm7, xmm9 ; IROR_R r0, r7 mov ecx, r15d ror r8, cl - ; ISTORE L2[r1], r4 - mov eax, r9d + ; FPADD_R f2, a2 + addpd xmm2, xmm10 + ; IXOR_R r7, r0 + xor r15, r8 + ; ISTORE L1[r4], r1 + mov eax, r12d + and eax, 16376 + mov qword ptr [rsi+rax], r9 + ; ISTORE L2[r0], r4 + mov eax, r8d and eax, 262136 mov qword ptr [rsi+rax], r12 - ; IROL_R r7, r6 - mov ecx, r14d - rol r15, cl - ; IMUL_9C r2, 266593902 - lea r10, [r10+r10*8+266593902] - ; IMUL_R r4, r6 - imul r12, r14 - ; FPSUB_R f2, a2 - subpd xmm2, xmm10 - ; FPNEG_R f3 - xorps xmm3, xmm15 - ; IROR_R r7, r2 - mov ecx, r10d + ; FPDIV_R e3, a3 + divpd xmm7, xmm11 + maxpd xmm7, xmm13 + ; ISTORE 
L2[r4], r6 + mov eax, r12d + and eax, 262136 + mov qword ptr [rsi+rax], r14 + ; IMUL_R r3, r1 + imul r11, r9 + ; IXOR_R r2, r4 + xor r10, r12 + ; ISTORE L2[r3], r5 + mov eax, r11d + and eax, 262136 + mov qword ptr [rsi+rax], r13 + ; FPMUL_M e2, L2[r4] + mov eax, r12d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + mulpd xmm6, xmm12 + maxpd xmm6, xmm13 + ; FPSUB_R f3, a0 + subpd xmm3, xmm8 + ; COND_R r1, ab(r7, -229570354) + xor ecx, ecx + cmp r15d, -229570354 + seta cl + add r9, rcx + ; IROR_R r7, r3 + mov ecx, r11d ror r15, cl - ; IROR_R r0, r5 - mov ecx, r13d - ror r8, cl + ; FPDIV_R e2, a0 + divpd xmm6, xmm8 + maxpd xmm6, xmm13 + ; IADD_R r2, r5 + add r10, r13 + ; FPDIV_R e1, a3 + divpd xmm5, xmm11 + maxpd xmm5, xmm13 + ; FPSQRT_R e2 + sqrtpd xmm6, xmm6 + ; ISUB_R r3, r7 + sub r11, r15 + ; FPADD_R f0, a0 + addpd xmm0, xmm8 + ; IMUL_M r0, L3[98136] + imul r8, qword ptr [rsi+98136] + ; IMUL_9C r5, -895487055 + lea r13, [r13+r13*8-895487055] + ; IMULH_R r2, r7 + mov rax, r10 + mul r15 + mov r10, rdx + ; IADD_R r4, r1 + add r12, r9 + ; ISDIV_C r0, 494395999 + mov rax, 5007888582388710937 + imul r8 + xor eax, eax + sar rdx, 27 + sets al + add rdx, rax + add r8, rdx + ; FPSWAP_R e0 + shufpd xmm4, xmm4, 1 + ; IXOR_R r1, r5 + xor r9, r13 + ; COND_R r2, ab(r3, 1932234501) + xor ecx, ecx + cmp r11d, 1932234501 + seta cl + add r10, rcx + ; FPMUL_R e1, a0 + mulpd xmm5, xmm8 + ; FPSUB_M f1, L1[r1] + mov eax, r9d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm1, xmm12 + ; FPSUB_R f0, a0 + subpd xmm0, xmm8 + ; IROL_R r1, r7 + mov ecx, r15d + rol r9, cl + ; IADD_RC r0, r5, -2051588680 + lea r8, [r8+r13-2051588680] + ; COND_R r6, of(r5, -795593984) + xor ecx, ecx + cmp r13d, -795593984 + seto cl + add r14, rcx + ; FPADD_R f1, a0 + addpd xmm1, xmm8 + ; IMULH_R r7, r3 + mov rax, r15 + mul r11 + mov r15, rdx + ; ISUB_R r7, r4 + sub r15, r12 + ; IROL_R r0, r6 + mov ecx, r14d + rol r8, cl + ; ISDIV_C r1, -675825513 + mov rax, -7326980207007250257 + imul 
r9 + xor eax, eax + sar rdx, 28 + sets al + add rdx, rax + add r9, rdx + ; ISTORE L1[r6], r3 + mov eax, r14d + and eax, 16376 + mov qword ptr [rsi+rax], r11 + ; IROR_R r4, r3 + mov ecx, r11d + ror r12, cl + ; IDIV_C r4, 3919226376 + mov rax, r12 + shr rax, 3 + mov rcx, 2526906936258851663 + mul rcx + shr rdx, 26 + add r12, rdx + ; FPSUB_R f1, a1 + subpd xmm1, xmm9 + ; FPSUB_R f0, a0 + subpd xmm0, xmm8 + ; IADD_R r0, r2 + add r8, r10 + ; IADD_M r4, L1[r2] + mov eax, r10d + and eax, 16376 + add r12, qword ptr [rsi+rax] + ; ISTORE L1[r7], r2 + mov eax, r15d + and eax, 16376 + mov qword ptr [rsi+rax], r10 + ; FPSQRT_R e1 + sqrtpd xmm5, xmm5 + ; IADD_R r5, r4 + add r13, r12 + ; IXOR_R r6, r7 + xor r14, r15 + ; ISMULH_R r4, r7 + mov rax, r12 + imul r15 + mov r12, rdx + ; FPSQRT_R e1 + sqrtpd xmm5, xmm5 From ac4462ad42fe2e94f98a2f007eb57612391e2fe0 Mon Sep 17 00:00:00 2001 From: tevador Date: Tue, 5 Feb 2019 23:43:57 +0100 Subject: [PATCH 30/35] Renamed floating point instructions Fixed negative source operand for FMUL_M and FDIV_M --- src/AssemblyGeneratorX86.cpp | 46 ++++++++++++------------ src/AssemblyGeneratorX86.hpp | 20 +++++------ src/Instruction.cpp | 60 +++++++++++++++---------------- src/Instruction.hpp | 40 ++++++++++----------- src/InterpretedVirtualMachine.hpp | 32 ++++++++--------- src/JitCompilerX86.cpp | 43 +++++++++++----------- src/JitCompilerX86.hpp | 20 +++++------ src/instructionWeights.hpp | 26 +++++++------- src/instructions.hpp | 10 +++--- src/instructionsPortable.cpp | 10 +++--- 10 files changed, 156 insertions(+), 151 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 70d396b..ff812e7 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -356,19 +356,19 @@ namespace RandomX { } //1 uOPs - void AssemblyGeneratorX86::h_FPSWAP_R(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_FSWAP_R(Instruction& instr, int i) { asmCode << "\tshufpd " << regFE[instr.dst] << ", " << 
regFE[instr.dst] << ", 1" << std::endl; } //1 uOP - void AssemblyGeneratorX86::h_FPADD_R(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_FADD_R(Instruction& instr, int i) { instr.dst %= 4; instr.src %= 4; asmCode << "\taddpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl; } //5 uOPs - void AssemblyGeneratorX86::h_FPADD_M(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_FADD_M(Instruction& instr, int i) { instr.dst %= 4; genAddressReg(instr); asmCode << "\tcvtdq2pd xmm12, qword ptr [rsi+rax]" << std::endl; @@ -376,14 +376,14 @@ namespace RandomX { } //1 uOP - void AssemblyGeneratorX86::h_FPSUB_R(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_FSUB_R(Instruction& instr, int i) { instr.dst %= 4; instr.src %= 4; asmCode << "\tsubpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl; } //5 uOPs - void AssemblyGeneratorX86::h_FPSUB_M(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_FSUB_M(Instruction& instr, int i) { instr.dst %= 4; genAddressReg(instr); asmCode << "\tcvtdq2pd xmm12, qword ptr [rsi+rax]" << std::endl; @@ -397,40 +397,42 @@ namespace RandomX { } //1 uOPs - void AssemblyGeneratorX86::h_FPMUL_R(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_FMUL_R(Instruction& instr, int i) { instr.dst %= 4; instr.src %= 4; asmCode << "\tmulpd " << regE[instr.dst] << ", " << regA[instr.src] << std::endl; } - //6 uOPs - void AssemblyGeneratorX86::h_FPMUL_M(Instruction& instr, int i) { + //7 uOPs + void AssemblyGeneratorX86::h_FMUL_M(Instruction& instr, int i) { instr.dst %= 4; genAddressReg(instr); asmCode << "\tcvtdq2pd xmm12, qword ptr [rsi+rax]" << std::endl; + asmCode << "\tandps xmm12, xmm14" << std::endl; asmCode << "\tmulpd " << regE[instr.dst] << ", xmm12" << std::endl; asmCode << "\tmaxpd " << regE[instr.dst] << ", " << dblMin << std::endl; } //2 uOPs - void AssemblyGeneratorX86::h_FPDIV_R(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_FDIV_R(Instruction& instr, int i) { 
instr.dst %= 4; instr.src %= 4; asmCode << "\tdivpd " << regE[instr.dst] << ", " << regA[instr.src] << std::endl; asmCode << "\tmaxpd " << regE[instr.dst] << ", " << dblMin << std::endl; } - //6 uOPs - void AssemblyGeneratorX86::h_FPDIV_M(Instruction& instr, int i) { + //7 uOPs + void AssemblyGeneratorX86::h_FDIV_M(Instruction& instr, int i) { instr.dst %= 4; genAddressReg(instr); asmCode << "\tcvtdq2pd xmm12, qword ptr [rsi+rax]" << std::endl; + asmCode << "\tandps xmm12, xmm14" << std::endl; asmCode << "\tdivpd " << regE[instr.dst] << ", xmm12" << std::endl; asmCode << "\tmaxpd " << regE[instr.dst] << ", " << dblMin << std::endl; } //1 uOP - void AssemblyGeneratorX86::h_FPSQRT_R(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_FSQRT_R(Instruction& instr, int i) { instr.dst %= 4; asmCode << "\tsqrtpd " << regE[instr.dst] << ", " << regE[instr.dst] << std::endl; } @@ -529,21 +531,21 @@ namespace RandomX { INST_HANDLE(ISWAP_R) //Common floating point - INST_HANDLE(FPSWAP_R) + INST_HANDLE(FSWAP_R) //Floating point group F - INST_HANDLE(FPADD_R) - INST_HANDLE(FPADD_M) - INST_HANDLE(FPSUB_R) - INST_HANDLE(FPSUB_M) + INST_HANDLE(FADD_R) + INST_HANDLE(FADD_M) + INST_HANDLE(FSUB_R) + INST_HANDLE(FSUB_M) INST_HANDLE(FPNEG_R) //Floating point group E - INST_HANDLE(FPMUL_R) - INST_HANDLE(FPMUL_M) - INST_HANDLE(FPDIV_R) - INST_HANDLE(FPDIV_M) - INST_HANDLE(FPSQRT_R) + INST_HANDLE(FMUL_R) + INST_HANDLE(FMUL_M) + INST_HANDLE(FDIV_R) + INST_HANDLE(FDIV_M) + INST_HANDLE(FSQRT_R) //Control INST_HANDLE(COND_R) diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index a8e062c..5abebc1 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -64,17 +64,17 @@ namespace RandomX { void h_IROR_R(Instruction&, int); void h_IROL_R(Instruction&, int); void h_ISWAP_R(Instruction&, int); - void h_FPSWAP_R(Instruction&, int); - void h_FPADD_R(Instruction&, int); - void h_FPADD_M(Instruction&, int); - void h_FPSUB_R(Instruction&, int); - 
void h_FPSUB_M(Instruction&, int); + void h_FSWAP_R(Instruction&, int); + void h_FADD_R(Instruction&, int); + void h_FADD_M(Instruction&, int); + void h_FSUB_R(Instruction&, int); + void h_FSUB_M(Instruction&, int); void h_FPNEG_R(Instruction&, int); - void h_FPMUL_R(Instruction&, int); - void h_FPMUL_M(Instruction&, int); - void h_FPDIV_R(Instruction&, int); - void h_FPDIV_M(Instruction&, int); - void h_FPSQRT_R(Instruction&, int); + void h_FMUL_R(Instruction&, int); + void h_FMUL_M(Instruction&, int); + void h_FDIV_R(Instruction&, int); + void h_FDIV_M(Instruction&, int); + void h_FSQRT_R(Instruction&, int); void h_COND_R(Instruction&, int); void h_COND_M(Instruction&, int); void h_CFROUND(Instruction&, int); diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 8a175fc..18017e7 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -215,32 +215,32 @@ namespace RandomX { os << "r" << (int)dst << ", r" << (int)src << std::endl; } - void Instruction::h_FPSWAP_R(std::ostream& os) const { + void Instruction::h_FSWAP_R(std::ostream& os) const { const char reg = (dst >= 4) ? 
'e' : 'f'; auto dstIndex = dst % 4; os << reg << dstIndex << std::endl; } - void Instruction::h_FPADD_R(std::ostream& os) const { + void Instruction::h_FADD_R(std::ostream& os) const { auto dstIndex = dst % 4; auto srcIndex = src % 4; os << "f" << dstIndex << ", a" << srcIndex << std::endl; } - void Instruction::h_FPADD_M(std::ostream& os) const { + void Instruction::h_FADD_M(std::ostream& os) const { auto dstIndex = dst % 4; os << "f" << dstIndex << ", "; genAddressReg(os); os << std::endl; } - void Instruction::h_FPSUB_R(std::ostream& os) const { + void Instruction::h_FSUB_R(std::ostream& os) const { auto dstIndex = dst % 4; auto srcIndex = src % 4; os << "f" << dstIndex << ", a" << srcIndex << std::endl; } - void Instruction::h_FPSUB_M(std::ostream& os) const { + void Instruction::h_FSUB_M(std::ostream& os) const { auto dstIndex = dst % 4; os << "f" << dstIndex << ", "; genAddressReg(os); @@ -252,33 +252,33 @@ namespace RandomX { os << "f" << dstIndex << std::endl; } - void Instruction::h_FPMUL_R(std::ostream& os) const { + void Instruction::h_FMUL_R(std::ostream& os) const { auto dstIndex = dst % 4; auto srcIndex = src % 4; os << "e" << dstIndex << ", a" << srcIndex << std::endl; } - void Instruction::h_FPMUL_M(std::ostream& os) const { + void Instruction::h_FMUL_M(std::ostream& os) const { auto dstIndex = dst % 4; os << "e" << dstIndex << ", "; genAddressReg(os); os << std::endl; } - void Instruction::h_FPDIV_R(std::ostream& os) const { + void Instruction::h_FDIV_R(std::ostream& os) const { auto dstIndex = dst % 4; auto srcIndex = src % 4; os << "e" << dstIndex << ", a" << srcIndex << std::endl; } - void Instruction::h_FPDIV_M(std::ostream& os) const { + void Instruction::h_FDIV_M(std::ostream& os) const { auto dstIndex = dst % 4; os << "e" << dstIndex << ", "; genAddressReg(os); os << std::endl; } - void Instruction::h_FPSQRT_R(std::ostream& os) const { + void Instruction::h_FSQRT_R(std::ostream& os) const { auto dstIndex = dst % 4; os << "e" << dstIndex << 
std::endl; } @@ -363,21 +363,21 @@ namespace RandomX { INST_NAME(ISWAP_R) //Common floating point - INST_NAME(FPSWAP_R) + INST_NAME(FSWAP_R) //Floating point group F - INST_NAME(FPADD_R) - INST_NAME(FPADD_M) - INST_NAME(FPSUB_R) - INST_NAME(FPSUB_M) + INST_NAME(FADD_R) + INST_NAME(FADD_M) + INST_NAME(FSUB_R) + INST_NAME(FSUB_M) INST_NAME(FPNEG_R) //Floating point group E - INST_NAME(FPMUL_R) - INST_NAME(FPMUL_M) - INST_NAME(FPDIV_R) - INST_NAME(FPDIV_M) - INST_NAME(FPSQRT_R) + INST_NAME(FMUL_R) + INST_NAME(FMUL_M) + INST_NAME(FDIV_R) + INST_NAME(FDIV_M) + INST_NAME(FSQRT_R) //Control INST_NAME(COND_R) @@ -414,21 +414,21 @@ namespace RandomX { INST_HANDLE(ISWAP_R) //Common floating point - INST_HANDLE(FPSWAP_R) + INST_HANDLE(FSWAP_R) //Floating point group F - INST_HANDLE(FPADD_R) - INST_HANDLE(FPADD_M) - INST_HANDLE(FPSUB_R) - INST_HANDLE(FPSUB_M) + INST_HANDLE(FADD_R) + INST_HANDLE(FADD_M) + INST_HANDLE(FSUB_R) + INST_HANDLE(FSUB_M) INST_HANDLE(FPNEG_R) //Floating point group E - INST_HANDLE(FPMUL_R) - INST_HANDLE(FPMUL_M) - INST_HANDLE(FPDIV_R) - INST_HANDLE(FPDIV_M) - INST_HANDLE(FPSQRT_R) + INST_HANDLE(FMUL_R) + INST_HANDLE(FMUL_M) + INST_HANDLE(FDIV_R) + INST_HANDLE(FDIV_M) + INST_HANDLE(FSQRT_R) //Control INST_HANDLE(COND_R) diff --git a/src/Instruction.hpp b/src/Instruction.hpp index 987f326..f530bbc 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -49,17 +49,17 @@ namespace RandomX { constexpr int IROR_R = 17; constexpr int IROL_R = 18; constexpr int ISWAP_R = 19; - constexpr int FPSWAP_R = 20; - constexpr int FPADD_R = 21; - constexpr int FPADD_M = 22; - constexpr int FPSUB_R = 23; - constexpr int FPSUB_M = 24; + constexpr int FSWAP_R = 20; + constexpr int FADD_R = 21; + constexpr int FADD_M = 22; + constexpr int FSUB_R = 23; + constexpr int FSUB_M = 24; constexpr int FPNEG_R = 25; - constexpr int FPMUL_R = 26; - constexpr int FPMUL_M = 27; - constexpr int FPDIV_R = 28; - constexpr int FPDIV_M = 29; - constexpr int FPSQRT_R = 30; + constexpr 
int FMUL_R = 26; + constexpr int FMUL_M = 27; + constexpr int FDIV_R = 28; + constexpr int FDIV_M = 29; + constexpr int FSQRT_R = 30; constexpr int COND_R = 31; constexpr int COND_M = 32; constexpr int CFROUND = 33; @@ -111,17 +111,17 @@ namespace RandomX { void h_IROR_R(std::ostream&) const; void h_IROL_R(std::ostream&) const; void h_ISWAP_R(std::ostream&) const; - void h_FPSWAP_R(std::ostream&) const; - void h_FPADD_R(std::ostream&) const; - void h_FPADD_M(std::ostream&) const; - void h_FPSUB_R(std::ostream&) const; - void h_FPSUB_M(std::ostream&) const; + void h_FSWAP_R(std::ostream&) const; + void h_FADD_R(std::ostream&) const; + void h_FADD_M(std::ostream&) const; + void h_FSUB_R(std::ostream&) const; + void h_FSUB_M(std::ostream&) const; void h_FPNEG_R(std::ostream&) const; - void h_FPMUL_R(std::ostream&) const; - void h_FPMUL_M(std::ostream&) const; - void h_FPDIV_R(std::ostream&) const; - void h_FPDIV_M(std::ostream&) const; - void h_FPSQRT_R(std::ostream&) const; + void h_FMUL_R(std::ostream&) const; + void h_FMUL_M(std::ostream&) const; + void h_FDIV_R(std::ostream&) const; + void h_FDIV_M(std::ostream&) const; + void h_FSQRT_R(std::ostream&) const; void h_COND_R(std::ostream&) const; void h_COND_M(std::ostream&) const; void h_CFROUND(std::ostream&) const; diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index 2eee73d..f698d97 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -94,11 +94,11 @@ namespace RandomX { int count_SAR_64 = 0; int count_ROL_64 = 0; int count_ROR_64 = 0; - int count_FPADD = 0; - int count_FPSUB = 0; - int count_FPMUL = 0; - int count_FPDIV = 0; - int count_FPSQRT = 0; + int count_FADD = 0; + int count_FSUB = 0; + int count_FMUL = 0; + int count_FDIV = 0; + int count_FSQRT = 0; int count_FPROUND = 0; int count_JUMP_taken = 0; int count_JUMP_not_taken = 0; @@ -113,12 +113,12 @@ namespace RandomX { int count_retdepth_max = 0; int count_endstack = 0; int 
count_instructions[ProgramLength] = { 0 }; - int count_FPADD_nop = 0; - int count_FPADD_nop2 = 0; - int count_FPSUB_nop = 0; - int count_FPSUB_nop2 = 0; - int count_FPMUL_nop = 0; - int count_FPMUL_nop2 = 0; + int count_FADD_nop = 0; + int count_FADD_nop2 = 0; + int count_FSUB_nop = 0; + int count_FSUB_nop2 = 0; + int count_FMUL_nop = 0; + int count_FMUL_nop2 = 0; int datasetAccess[256] = { 0 }; #endif void executeInstruction(Instruction&); @@ -173,11 +173,11 @@ namespace RandomX { void h_SAR_64(Instruction&); void h_ROL_64(Instruction&); void h_ROR_64(Instruction&); - void h_FPADD(Instruction&); - void h_FPSUB(Instruction&); - void h_FPMUL(Instruction&); - void h_FPDIV(Instruction&); - void h_FPSQRT(Instruction&); + void h_FADD(Instruction&); + void h_FSUB(Instruction&); + void h_FMUL(Instruction&); + void h_FDIV(Instruction&); + void h_FSQRT(Instruction&); void h_FPROUND(Instruction&); void h_JUMP(Instruction&); void h_CALL(Instruction&); diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index e926e4a..e891a27 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -177,6 +177,7 @@ namespace RandomX { static const uint8_t JMP = 0xe9; static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 }; static const uint8_t REX_XCHG[] = { 0x4d, 0x87 }; + static const uint8_t REX_ANDPS_XMM12[] = { 0x41, 0x0f, 0x54, 0xe6 }; size_t JitCompilerX86::getCodeSize() { return codePos - prologueSize; @@ -603,20 +604,20 @@ namespace RandomX { } } - void JitCompilerX86::h_FPSWAP_R(Instruction& instr) { + void JitCompilerX86::h_FSWAP_R(Instruction& instr) { emit(SHUFPD); emitByte(0xc0 + 9 * instr.dst); emitByte(1); } - void JitCompilerX86::h_FPADD_R(Instruction& instr) { + void JitCompilerX86::h_FADD_R(Instruction& instr) { instr.dst %= 4; instr.src %= 4; emit(REX_ADDPD); emitByte(0xc0 + instr.src + 8 * instr.dst); } - void JitCompilerX86::h_FPADD_M(Instruction& instr) { + void JitCompilerX86::h_FADD_M(Instruction& instr) { instr.dst %= 4; genAddressReg(instr); 
emit(REX_CVTDQ2PD_XMM12); @@ -624,14 +625,14 @@ namespace RandomX { emitByte(0xc4 + 8 * instr.dst); } - void JitCompilerX86::h_FPSUB_R(Instruction& instr) { + void JitCompilerX86::h_FSUB_R(Instruction& instr) { instr.dst %= 4; instr.src %= 4; emit(REX_SUBPD); emitByte(0xc0 + instr.src + 8 * instr.dst); } - void JitCompilerX86::h_FPSUB_M(Instruction& instr) { + void JitCompilerX86::h_FSUB_M(Instruction& instr) { instr.dst %= 4; genAddressReg(instr); emit(REX_CVTDQ2PD_XMM12); @@ -645,24 +646,25 @@ namespace RandomX { emitByte(0xc7 + 8 * instr.dst); } - void JitCompilerX86::h_FPMUL_R(Instruction& instr) { + void JitCompilerX86::h_FMUL_R(Instruction& instr) { instr.dst %= 4; instr.src %= 4; emit(REX_MULPD); emitByte(0xe0 + instr.src + 8 * instr.dst); } - void JitCompilerX86::h_FPMUL_M(Instruction& instr) { + void JitCompilerX86::h_FMUL_M(Instruction& instr) { instr.dst %= 4; genAddressReg(instr); emit(REX_CVTDQ2PD_XMM12); + emit(REX_ANDPS_XMM12); emit(REX_MULPD); emitByte(0xe4 + 8 * instr.dst); emit(REX_MAXPD); emitByte(0xe5 + 8 * instr.dst); } - void JitCompilerX86::h_FPDIV_R(Instruction& instr) { + void JitCompilerX86::h_FDIV_R(Instruction& instr) { instr.dst %= 4; instr.src %= 4; emit(REX_DIVPD); @@ -671,17 +673,18 @@ namespace RandomX { emitByte(0xe5 + 8 * instr.dst); } - void JitCompilerX86::h_FPDIV_M(Instruction& instr) { + void JitCompilerX86::h_FDIV_M(Instruction& instr) { instr.dst %= 4; genAddressReg(instr); emit(REX_CVTDQ2PD_XMM12); + emit(REX_ANDPS_XMM12); emit(REX_DIVPD); emitByte(0xe4 + 8 * instr.dst); emit(REX_MAXPD); emitByte(0xe5 + 8 * instr.dst); } - void JitCompilerX86::h_FPSQRT_R(Instruction& instr) { + void JitCompilerX86::h_FSQRT_R(Instruction& instr) { instr.dst %= 4; emit(SQRTPD); emitByte(0xe4 + 9 * instr.dst); @@ -786,17 +789,17 @@ namespace RandomX { INST_HANDLE(IROR_R) INST_HANDLE(IROL_R) INST_HANDLE(ISWAP_R) - INST_HANDLE(FPSWAP_R) - INST_HANDLE(FPADD_R) - INST_HANDLE(FPADD_M) - INST_HANDLE(FPSUB_R) - INST_HANDLE(FPSUB_M) + 
INST_HANDLE(FSWAP_R) + INST_HANDLE(FADD_R) + INST_HANDLE(FADD_M) + INST_HANDLE(FSUB_R) + INST_HANDLE(FSUB_M) INST_HANDLE(FPNEG_R) - INST_HANDLE(FPMUL_R) - INST_HANDLE(FPMUL_M) - INST_HANDLE(FPDIV_R) - INST_HANDLE(FPDIV_M) - INST_HANDLE(FPSQRT_R) + INST_HANDLE(FMUL_R) + INST_HANDLE(FMUL_M) + INST_HANDLE(FDIV_R) + INST_HANDLE(FDIV_M) + INST_HANDLE(FSQRT_R) INST_HANDLE(COND_R) INST_HANDLE(COND_M) INST_HANDLE(CFROUND) diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index 9c85667..4303cfd 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -110,17 +110,17 @@ namespace RandomX { void h_IROR_R(Instruction&); void h_IROL_R(Instruction&); void h_ISWAP_R(Instruction&); - void h_FPSWAP_R(Instruction&); - void h_FPADD_R(Instruction&); - void h_FPADD_M(Instruction&); - void h_FPSUB_R(Instruction&); - void h_FPSUB_M(Instruction&); + void h_FSWAP_R(Instruction&); + void h_FADD_R(Instruction&); + void h_FADD_M(Instruction&); + void h_FSUB_R(Instruction&); + void h_FSUB_M(Instruction&); void h_FPNEG_R(Instruction&); - void h_FPMUL_R(Instruction&); - void h_FPMUL_M(Instruction&); - void h_FPDIV_R(Instruction&); - void h_FPDIV_M(Instruction&); - void h_FPSQRT_R(Instruction&); + void h_FMUL_R(Instruction&); + void h_FMUL_M(Instruction&); + void h_FDIV_R(Instruction&); + void h_FDIV_M(Instruction&); + void h_FSQRT_R(Instruction&); void h_COND_R(Instruction&); void h_COND_M(Instruction&); void h_CFROUND(Instruction&); diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 0bb26ff..32225e7 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -42,21 +42,21 @@ along with RandomX. If not, see. 
#define WT_ISWAP_R 4 //Common floating point -#define WT_FPSWAP_R 8 +#define WT_FSWAP_R 8 //Floating point group F -#define WT_FPADD_R 20 -#define WT_FPADD_M 5 -#define WT_FPSUB_R 20 -#define WT_FPSUB_M 5 +#define WT_FADD_R 20 +#define WT_FADD_M 5 +#define WT_FSUB_R 20 +#define WT_FSUB_M 5 #define WT_FPNEG_R 6 //Floating point group E -#define WT_FPMUL_R 16 -#define WT_FPMUL_M 4 -#define WT_FPDIV_R 7 -#define WT_FPDIV_M 1 -#define WT_FPSQRT_R 6 +#define WT_FMUL_R 16 +#define WT_FMUL_M 4 +#define WT_FDIV_R 7 +#define WT_FDIV_M 1 +#define WT_FSQRT_R 6 //Control #define WT_COND_R 7 @@ -73,9 +73,9 @@ constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \ WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \ WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \ WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \ -WT_ISWAP_R + WT_FPSWAP_R + WT_FPADD_R + WT_FPADD_M + WT_FPSUB_R + WT_FPSUB_M + \ -WT_FPNEG_R + WT_FPMUL_R + WT_FPMUL_M + WT_FPDIV_R + WT_FPDIV_M + \ -WT_FPSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP; +WT_ISWAP_R + WT_FSWAP_R + WT_FADD_R + WT_FADD_M + WT_FSUB_R + WT_FSUB_M + \ +WT_FPNEG_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \ +WT_FSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP; static_assert(wtSum == 256, "Sum of instruction weights must be 256"); diff --git a/src/instructions.hpp b/src/instructions.hpp index dc5d4ee..6d9a98f 100644 --- a/src/instructions.hpp +++ b/src/instructions.hpp @@ -48,10 +48,10 @@ namespace RandomX { bool JMP_COND(uint8_t, convertible_t&, int32_t); void FPINIT(); void FPROUND(convertible_t, uint8_t); - void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - void FPSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - void FPMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - void FPDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - void FPSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); + void 
FADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); + void FSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); + void FMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); + void FDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); + void FSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); } } \ No newline at end of file diff --git a/src/instructionsPortable.cpp b/src/instructionsPortable.cpp index 9e1eff1..ca85ffc 100644 --- a/src/instructionsPortable.cpp +++ b/src/instructionsPortable.cpp @@ -225,7 +225,7 @@ namespace RandomX { c.i64 = a.i64 / (b.i32 != 0 ? b.i32 : 1); } - void FPADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { + void FADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { #ifdef __SSE2__ __m128i ai = _mm_loadl_epi64((const __m128i*)&a); __m128d ad = _mm_cvtepi32_pd(ai); @@ -240,7 +240,7 @@ namespace RandomX { #endif } - void FPSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { + void FSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { #ifdef __SSE2__ __m128i ai = _mm_loadl_epi64((const __m128i*)&a); __m128d ad = _mm_cvtepi32_pd(ai); @@ -255,7 +255,7 @@ namespace RandomX { #endif } - void FPMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { + void FMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { #ifdef __SSE2__ __m128i ai = _mm_loadl_epi64((const __m128i*)&a); __m128d ad = _mm_cvtepi32_pd(ai); @@ -272,7 +272,7 @@ namespace RandomX { #endif } - void FPDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { + void FDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { #ifdef __SSE2__ __m128i ai = _mm_loadl_epi64((const __m128i*)&a); __m128d ad = _mm_cvtepi32_pd(ai); @@ -289,7 +289,7 @@ namespace RandomX { #endif } - void FPSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { + void FSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { #ifdef __SSE2__ __m128i ai = _mm_loadl_epi64((const __m128i*)&a); __m128d ad = _mm_cvtepi32_pd(ai); From a586751f6b0e2f639ac83c8103051121a969d410 Mon Sep 17 00:00:00 2001 From: tevador Date: Thu, 7 Feb 2019 16:11:27 
+0100 Subject: [PATCH 31/35] Removed FPNEG instruction Optimized instruction frequencies Increased the range for A registers from [1,65536) to [1, 4294967296) --- src/AssemblyGeneratorX86.cpp | 10 +- src/AssemblyGeneratorX86.hpp | 2 +- src/CompiledVirtualMachine.cpp | 2 +- src/Instruction.cpp | 8 +- src/Instruction.hpp | 4 +- src/JitCompilerX86.cpp | 14 +- src/JitCompilerX86.hpp | 2 +- src/asm/program_loop_store.inc | 4 + src/asm/program_prologue_load.inc | 2 +- src/executeProgram-win64.asm | 4 +- src/instructionWeights.hpp | 26 +- src/program.inc | 1419 ++++++++++++++--------------- 12 files changed, 738 insertions(+), 759 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index ff812e7..e2eaf44 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -35,6 +35,8 @@ namespace RandomX { static const char* regE[4] = { "xmm4", "xmm5", "xmm6", "xmm7" }; static const char* regA[4] = { "xmm8", "xmm9", "xmm10", "xmm11" }; + static const char* fsumInstr[4] = { "paddb", "paddw", "paddd", "paddq" }; + static const char* regA4 = "xmm12"; static const char* dblMin = "xmm13"; static const char* absMask = "xmm14"; @@ -365,6 +367,7 @@ namespace RandomX { instr.dst %= 4; instr.src %= 4; asmCode << "\taddpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl; + //asmCode << "\t" << fsumInstr[instr.mod % 4] << " " << signMask << ", " << regF[instr.dst] << std::endl; } //5 uOPs @@ -380,6 +383,7 @@ namespace RandomX { instr.dst %= 4; instr.src %= 4; asmCode << "\tsubpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl; + //asmCode << "\t" << fsumInstr[instr.mod % 4] << " " << signMask << ", " << regF[instr.dst] << std::endl; } //5 uOPs @@ -391,9 +395,9 @@ namespace RandomX { } //1 uOP - void AssemblyGeneratorX86::h_FPNEG_R(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_CFSUM_R(Instruction& instr, int i) { instr.dst %= 4; - asmCode << "\txorps " << regF[instr.dst] << ", " << signMask << 
std::endl; + asmCode << "\t" << fsumInstr[instr.mod % 4] << " " << signMask << ", " << regF[instr.dst] << std::endl; } //1 uOPs @@ -538,7 +542,7 @@ namespace RandomX { INST_HANDLE(FADD_M) INST_HANDLE(FSUB_R) INST_HANDLE(FSUB_M) - INST_HANDLE(FPNEG_R) + INST_HANDLE(CFSUM_R) //Floating point group E INST_HANDLE(FMUL_R) diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 5abebc1..5abf707 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -69,7 +69,7 @@ namespace RandomX { void h_FADD_M(Instruction&, int); void h_FSUB_R(Instruction&, int); void h_FSUB_M(Instruction&, int); - void h_FPNEG_R(Instruction&, int); + void h_CFSUM_R(Instruction&, int); void h_FMUL_R(Instruction&, int); void h_FMUL_M(Instruction&, int); void h_FDIV_R(Instruction&, int); diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index ebacf42..3bf3371 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -44,7 +44,7 @@ namespace RandomX { } static uint64_t getSmallPositiveFloatBits(uint64_t entropy) { - auto exponent = entropy >> 60; //0..15 + auto exponent = entropy >> 59; //0..31 auto mantissa = entropy & mantissaMask; exponent += exponentBias; exponent &= exponentMask; diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 18017e7..5784c99 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -247,9 +247,9 @@ namespace RandomX { os << std::endl; } - void Instruction::h_FPNEG_R(std::ostream& os) const { + void Instruction::h_CFSUM_R(std::ostream& os) const { auto dstIndex = dst % 4; - os << "f" << dstIndex << std::endl; + os << "f" << dstIndex << ", " << (1 << ((mod % 4) + 3)) << std::endl; } void Instruction::h_FMUL_R(std::ostream& os) const { @@ -370,7 +370,7 @@ namespace RandomX { INST_NAME(FADD_M) INST_NAME(FSUB_R) INST_NAME(FSUB_M) - INST_NAME(FPNEG_R) + INST_NAME(CFSUM_R) //Floating point group E INST_NAME(FMUL_R) @@ -421,7 +421,7 @@ namespace RandomX { 
INST_HANDLE(FADD_M) INST_HANDLE(FSUB_R) INST_HANDLE(FSUB_M) - INST_HANDLE(FPNEG_R) + INST_HANDLE(CFSUM_R) //Floating point group E INST_HANDLE(FMUL_R) diff --git a/src/Instruction.hpp b/src/Instruction.hpp index f530bbc..4f9e178 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -54,7 +54,7 @@ namespace RandomX { constexpr int FADD_M = 22; constexpr int FSUB_R = 23; constexpr int FSUB_M = 24; - constexpr int FPNEG_R = 25; + constexpr int CFSUM_R = 25; constexpr int FMUL_R = 26; constexpr int FMUL_M = 27; constexpr int FDIV_R = 28; @@ -116,7 +116,7 @@ namespace RandomX { void h_FADD_M(std::ostream&) const; void h_FSUB_R(std::ostream&) const; void h_FSUB_M(std::ostream&) const; - void h_FPNEG_R(std::ostream&) const; + void h_CFSUM_R(std::ostream&) const; void h_FMUL_R(std::ostream&) const; void h_FMUL_M(std::ostream&) const; void h_FDIV_R(std::ostream&) const; diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index e891a27..de803be 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -87,7 +87,7 @@ namespace RandomX { ; xmm12 -> temporary ; xmm13 -> DBL_MIN ; xmm14 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff - ; xmm15 -> sign mask 0x80000000000000008000000000000000 + ; xmm15 -> unused */ @@ -178,6 +178,8 @@ namespace RandomX { static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 }; static const uint8_t REX_XCHG[] = { 0x4d, 0x87 }; static const uint8_t REX_ANDPS_XMM12[] = { 0x41, 0x0f, 0x54, 0xe6 }; + static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f }; + static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 }; size_t JitCompilerX86::getCodeSize() { return codePos - prologueSize; @@ -615,6 +617,9 @@ namespace RandomX { instr.src %= 4; emit(REX_ADDPD); emitByte(0xc0 + instr.src + 8 * instr.dst); + //emit(REX_PADD); + //emitByte(PADD_OPCODES[instr.mod % 4]); + //emitByte(0xf8 + instr.dst); } void JitCompilerX86::h_FADD_M(Instruction& instr) { @@ -630,6 +635,9 @@ namespace RandomX { instr.src %= 4; 
emit(REX_SUBPD); emitByte(0xc0 + instr.src + 8 * instr.dst); + //emit(REX_PADD); + //emitByte(PADD_OPCODES[instr.mod % 4]); + //emitByte(0xf8 + instr.dst); } void JitCompilerX86::h_FSUB_M(Instruction& instr) { @@ -640,7 +648,7 @@ namespace RandomX { emitByte(0xc4 + 8 * instr.dst); } - void JitCompilerX86::h_FPNEG_R(Instruction& instr) { + void JitCompilerX86::h_CFSUM_R(Instruction& instr) { instr.dst %= 4; emit(REX_XORPS); emitByte(0xc7 + 8 * instr.dst); @@ -794,7 +802,7 @@ namespace RandomX { INST_HANDLE(FADD_M) INST_HANDLE(FSUB_R) INST_HANDLE(FSUB_M) - INST_HANDLE(FPNEG_R) + INST_HANDLE(CFSUM_R) INST_HANDLE(FMUL_R) INST_HANDLE(FMUL_M) INST_HANDLE(FDIV_R) diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index 4303cfd..feba888 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -115,7 +115,7 @@ namespace RandomX { void h_FADD_M(Instruction&); void h_FSUB_R(Instruction&); void h_FSUB_M(Instruction&); - void h_FPNEG_R(Instruction&); + void h_CFSUM_R(Instruction&); void h_FMUL_R(Instruction&); void h_FMUL_M(Instruction&); void h_FDIV_R(Instruction&); diff --git a/src/asm/program_loop_store.inc b/src/asm/program_loop_store.inc index a0acebc..bd2bbdd 100644 --- a/src/asm/program_loop_store.inc +++ b/src/asm/program_loop_store.inc @@ -12,6 +12,10 @@ mulpd xmm1, xmm5 mulpd xmm2, xmm6 mulpd xmm3, xmm7 + ;# xorpd xmm0, xmm15 + ;# xorpd xmm1, xmm15 + ;# xorpd xmm2, xmm15 + ;# xorpd xmm3, xmm15 movapd xmmword ptr [rcx+0], xmm0 movapd xmmword ptr [rcx+16], xmm1 movapd xmmword ptr [rcx+32], xmm2 diff --git a/src/asm/program_prologue_load.inc b/src/asm/program_prologue_load.inc index 3a994ab..74c2a08 100644 --- a/src/asm/program_prologue_load.inc +++ b/src/asm/program_prologue_load.inc @@ -18,5 +18,5 @@ movapd xmm11, xmmword ptr [rcx+120] movapd xmm13, xmmword ptr [minDbl] movapd xmm14, xmmword ptr [absMask] - movapd xmm15, xmmword ptr [signMask] + ;# xorpd xmm15, xmm15 diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index 
ac49e50..ff43578 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -54,7 +54,7 @@ executeProgram PROC ; xmm12 -> temporary ; xmm13 -> DBL_MIN ; xmm14 -> absolute value mask - ; xmm15 -> sign mask + ; xmm15 -> unused ; store callee-saved registers push rbx @@ -104,7 +104,7 @@ executeProgram PROC movapd xmm11, xmmword ptr [rcx+120] movapd xmm13, xmmword ptr [minDbl] movapd xmm14, xmmword ptr [absMask] - movapd xmm15, xmmword ptr [signMask] + ;# xorps xmm15, xmm15 jmp program_begin diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 32225e7..3998d07 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -21,10 +21,10 @@ along with RandomX. If not, see. //Integer #define WT_IADD_R 12 -#define WT_IADD_M 3 -#define WT_IADD_RC 12 +#define WT_IADD_M 7 +#define WT_IADD_RC 16 #define WT_ISUB_R 12 -#define WT_ISUB_M 3 +#define WT_ISUB_M 7 #define WT_IMUL_9C 9 #define WT_IMUL_R 16 #define WT_IMUL_M 4 @@ -35,10 +35,10 @@ along with RandomX. If not, see. #define WT_IDIV_C 4 #define WT_ISDIV_C 4 #define WT_INEG_R 2 -#define WT_IXOR_R 12 +#define WT_IXOR_R 16 #define WT_IXOR_M 4 -#define WT_IROR_R 10 -#define WT_IROL_R 10 +#define WT_IROR_R 8 +#define WT_IROL_R 8 #define WT_ISWAP_R 4 //Common floating point @@ -49,22 +49,22 @@ along with RandomX. If not, see. 
#define WT_FADD_M 5 #define WT_FSUB_R 20 #define WT_FSUB_M 5 -#define WT_FPNEG_R 6 //Floating point group E -#define WT_FMUL_R 16 -#define WT_FMUL_M 4 -#define WT_FDIV_R 7 -#define WT_FDIV_M 1 +#define WT_FMUL_R 20 +#define WT_FMUL_M 0 +#define WT_FDIV_R 0 +#define WT_FDIV_M 4 #define WT_FSQRT_R 6 //Control #define WT_COND_R 7 #define WT_COND_M 1 #define WT_CFROUND 1 +#define WT_CFSUM_R 0 //Store -#define WT_ISTORE 18 +#define WT_ISTORE 16 #define WT_FSTORE 0 #define WT_NOP 0 @@ -74,7 +74,7 @@ WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \ WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \ WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \ WT_ISWAP_R + WT_FSWAP_R + WT_FADD_R + WT_FADD_M + WT_FSUB_R + WT_FSUB_M + \ -WT_FPNEG_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \ +WT_CFSUM_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \ WT_FSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP; static_assert(wtSum == 256, diff --git a/src/program.inc b/src/program.inc index e4de06f..ba4b937 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,768 +1,731 @@ - ; IMUL_R r0, r7 - imul r8, r15 - ; ISMULH_R r2, r1 - mov rax, r10 - imul r9 - mov r10, rdx - ; IMUL_R r2, r4 - imul r10, r12 - ; IADD_R r7, r0 - add r15, r8 - ; FPSQRT_R e0 - sqrtpd xmm4, xmm4 - ; IMUL_R r3, r6 - imul r11, r14 - ; FPMUL_R e3, a1 - mulpd xmm7, xmm9 - ; IMULH_M r6, L1[r3] - mov ecx, r11d - and ecx, 16376 - mov rax, r14 - mul qword ptr [rsi+rcx] - mov r14, rdx - ; IMUL_R r5, r1 - imul r13, r9 - ; FPADD_M f0, L2[r6] - mov eax, r14d + ; FMUL_R e0, a2 + mulpd xmm4, xmm10 + ; IADD_RC r2, r5, -1621224194 + lea r10, [r10+r13-1621224194] + ; ISTORE L2[r2], r7 + mov eax, r10d and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm0, xmm12 - ; IROR_R r4, r3 - mov ecx, r11d - ror r12, cl - ; IXOR_M r4, L3[984888] - xor r12, qword ptr [rsi+984888] - ; IROR_R r0, r3 - mov ecx, r11d - ror r8, cl - ; IROR_R r0, r4 - mov ecx, 
r12d - ror r8, cl - ; FPMUL_R e0, a1 - mulpd xmm4, xmm9 - ; IMUL_R r0, r2 - imul r8, r10 - ; ISUB_M r0, L1[r3] - mov eax, r11d + mov qword ptr [rsi+rax], r15 + ; FSUB_M f2, L1[r2] + mov eax, r10d and eax, 16376 - sub r8, qword ptr [rsi+rax] - ; FPSUB_R f3, a1 - subpd xmm3, xmm9 - ; ISWAP_R r7, r4 - xchg r15, r12 - ; IDIV_C r1, 3690475308 - mov rax, r9 - shr rax, 2 - mov rcx, 5367070356934653253 - mul rcx - shr rdx, 28 - add r9, rdx - ; IROL_R r4, r2 - mov ecx, r10d - rol r12, cl - ; IMUL_M r5, L1[r4] + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 + ; IMUL_9C r6, -1003503212 + lea r14, [r14+r14*8-1003503212] + ; FSUB_R f1, a0 + subpd xmm1, xmm8 + ; IXOR_M r5, L2[r3] + mov eax, r11d + and eax, 262136 + xor r13, qword ptr [rsi+rax] + ; FSUB_M f2, L1[r4] mov eax, r12d and eax, 16376 - imul r13, qword ptr [rsi+rax] - ; IROL_R r4, r7 - mov ecx, r15d - rol r12, cl - ; ISUB_R r3, r1 - sub r11, r9 - ; IADD_R r7, r0 - add r15, r8 - ; IADD_M r1, L1[r3] - mov eax, r11d + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 + ; FSUB_R f3, a0 + subpd xmm3, xmm8 + ; ISDIV_C r0, 1400272688 + mov rax, 7072565507528518045 + imul r8 + xor eax, eax + sar rdx, 29 + sets al + add rdx, rax + add r8, rdx + ; IMUL_M r3, L1[r7] + mov eax, r15d and eax, 16376 - add r9, qword ptr [rsi+rax] - ; FPMUL_R e2, a2 - mulpd xmm6, xmm10 - ; IADD_R r6, -1115286770 - add r14, -1115286770 - ; FPDIV_R e2, a3 - divpd xmm6, xmm11 - maxpd xmm6, xmm13 - ; FPADD_R f1, a2 - addpd xmm1, xmm10 - ; IXOR_R r3, r7 - xor r11, r15 + imul r11, qword ptr [rsi+rax] + ; IROL_R r2, r3 + mov ecx, r11d + rol r10, cl + ; IMULH_R r6, r0 + mov rax, r14 + mul r8 + mov r14, rdx + ; FMUL_R e0, a2 + mulpd xmm4, xmm10 + ; IADD_RC r3, r4, -52260428 + lea r11, [r11+r12-52260428] + ; IADD_R r7, -1138617760 + add r15, -1138617760 + ; IXOR_M r2, L1[r6] + mov eax, r14d + and eax, 16376 + xor r10, qword ptr [rsi+rax] + ; FSUB_R f2, a1 + subpd xmm2, xmm9 + ; IXOR_R r7, r1 + xor r15, r9 + ; COND_R r2, lt(r7, -41618808) + xor 
ecx, ecx + cmp r15d, -41618808 + setl cl + add r10, rcx + ; FMUL_R e3, a0 + mulpd xmm7, xmm8 + ; COND_R r4, sg(r1, -961190365) + xor ecx, ecx + cmp r9d, -961190365 + sets cl + add r12, rcx + ; FADD_R f2, a1 + addpd xmm2, xmm9 + ; FSUB_R f0, a3 + subpd xmm0, xmm11 + ; ISTORE L1[r6], r2 + mov eax, r14d + and eax, 16376 + mov qword ptr [rsi+rax], r10 + ; ISUB_R r6, r5 + sub r14, r13 + ; IADD_M r0, L1[r4] + mov eax, r12d + and eax, 16376 + add r8, qword ptr [rsi+rax] ; ISTORE L1[r4], r3 mov eax, r12d and eax, 16376 mov qword ptr [rsi+rax], r11 - ; IROR_R r3, r6 - mov ecx, r14d - ror r11, cl - ; ISMULH_R r0, r6 - mov rax, r8 - imul r14 - mov r8, rdx - ; IROR_R r6, r5 - mov ecx, r13d - ror r14, cl - ; IMULH_M r6, L2[r0] - mov ecx, r8d - and ecx, 262136 - mov rax, r14 - mul qword ptr [rsi+rcx] - mov r14, rdx - ; ISUB_R r2, 1512125960 - sub r10, 1512125960 + ; COND_M r6, sg(L1[r6], 1048782623) + xor ecx, ecx + mov eax, r14d + and eax, 16376 + cmp dword ptr [rsi+rax], 1048782623 + sets cl + add r14, rcx + ; FSQRT_R e0 + sqrtpd xmm4, xmm4 + ; INEG_R r2 + neg r10 + ; FSQRT_R e1 + sqrtpd xmm5, xmm5 + ; FMUL_R e1, a3 + mulpd xmm5, xmm11 ; IMUL_R r7, r6 imul r15, r14 - ; IMULH_R r6, r7 - mov rax, r14 - mul r15 - mov r14, rdx - ; ISUB_R r4, r1 - sub r12, r9 - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 - ; IXOR_R r5, r2 - xor r13, r10 - ; FPADD_M f2, L1[r0] - mov eax, r8d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm2, xmm12 - ; IMULH_R r6, r1 - mov rax, r14 - mul r9 - mov r14, rdx - ; ISUB_M r5, L1[r0] - mov eax, r8d - and eax, 16376 - sub r13, qword ptr [rsi+rax] - ; FPMUL_R e2, a3 - mulpd xmm6, xmm11 - ; IMUL_R r4, r6 - imul r12, r14 - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; ISUB_R r3, r2 - sub r11, r10 - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; IROL_R r7, r0 - mov ecx, r8d - rol r15, cl - ; FPSUB_R f3, a2 - subpd xmm3, xmm10 - ; IROL_R r3, r7 - mov ecx, r15d - rol r11, cl - ; ISWAP_R r5, r7 - xchg r13, r15 - ; IDIV_C r5, 
749951529 - mov rax, 13205547200481862341 - mul r13 - shr rdx, 29 - add r13, rdx - ; FPADD_R f3, a0 - addpd xmm3, xmm8 - ; IMUL_M r0, L1[r4] - mov eax, r12d - and eax, 16376 - imul r8, qword ptr [rsi+rax] - ; FPADD_R f1, a1 - addpd xmm1, xmm9 - ; IROR_R r2, 60 - ror r10, 60 - ; IROR_R r5, r4 - mov ecx, r12d - ror r13, cl - ; FPADD_R f2, a0 - addpd xmm2, xmm8 - ; IXOR_M r4, L1[r6] - mov eax, r14d - and eax, 16376 - xor r12, qword ptr [rsi+rax] - ; IXOR_R r2, r6 - xor r10, r14 - ; FPADD_M f3, L1[r0] - mov eax, r8d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 - ; ISUB_R r7, r6 - sub r15, r14 - ; IMUL_9C r2, -962375579 - lea r10, [r10+r10*8-962375579] - ; FPSUB_R f3, a2 - subpd xmm3, xmm10 - ; FPSUB_R f3, a0 - subpd xmm3, xmm8 - ; IMUL_R r1, r5 - imul r9, r13 - ; IMUL_R r6, r4 - imul r14, r12 - ; ISWAP_R r0, r2 - xchg r8, r10 - ; ISUB_R r6, r5 - sub r14, r13 - ; FPSUB_R f2, a1 - subpd xmm2, xmm9 - ; ISDIV_C r6, 652931802 - mov rax, -3278972671018643631 - imul r14 - xor eax, eax - add rdx, r14 - sar rdx, 29 - sets al - add rdx, rax - add r14, rdx - ; IMUL_9C r5, -1142924545 - lea r13, [r13+r13*8-1142924545] - ; ISUB_R r7, 1085161834 - sub r15, 1085161834 - ; IMUL_R r4, r6 - imul r12, r14 - ; FPMUL_M e1, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm5, xmm12 - maxpd xmm5, xmm13 - ; FPMUL_M e3, L2[r1] - mov eax, r9d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm7, xmm12 - maxpd xmm7, xmm13 - ; COND_R r2, lt(r5, 1635027096) - xor ecx, ecx - cmp r13d, 1635027096 - setl cl - add r10, rcx - ; IMUL_R r5, -1219696062 - imul r13, -1219696062 - ; IXOR_R r5, r0 - xor r13, r8 - ; FPNEG_R f2 - xorps xmm2, xmm15 - ; FPADD_R f3, a2 - addpd xmm3, xmm10 - ; FPSUB_R f1, a3 - subpd xmm1, xmm11 - ; FPADD_R f1, a2 - addpd xmm1, xmm10 - ; FPDIV_R e1, a3 - divpd xmm5, xmm11 - maxpd xmm5, xmm13 - ; IXOR_M r6, L1[r0] - mov eax, r8d - and eax, 16376 - xor r14, qword ptr [rsi+rax] - ; ISUB_R r7, r4 - sub 
r15, r12 - ; ISUB_M r6, L1[r1] - mov eax, r9d - and eax, 16376 - sub r14, qword ptr [rsi+rax] - ; ISTORE L1[r5], r3 - mov eax, r13d - and eax, 16376 - mov qword ptr [rsi+rax], r11 - ; IMUL_R r5, r1 - imul r13, r9 - ; IROR_R r3, r2 - mov ecx, r10d - ror r11, cl - ; IMUL_R r4, r7 - imul r12, r15 - ; ISDIV_C r6, -54134756 - mov rax, 7012869325244995177 - imul r14 - xor eax, eax - sub rdx, r14 - sar rdx, 25 - sets al - add rdx, rax - add r14, rdx - ; FPMUL_R e1, a2 - mulpd xmm5, xmm10 - ; FPSUB_M f2, L2[r4] - mov eax, r12d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm2, xmm12 - ; IMUL_R r0, r5 - imul r8, r13 - ; FPMUL_R e3, a0 - mulpd xmm7, xmm8 - ; COND_R r5, be(r4, 1545677311) - xor ecx, ecx - cmp r12d, 1545677311 - setbe cl - add r13, rcx - ; IMUL_R r6, r3 - imul r14, r11 - ; IROL_R r6, r2 - mov ecx, r10d - rol r14, cl - ; FPDIV_R e3, a1 - divpd xmm7, xmm9 - maxpd xmm7, xmm13 - ; IXOR_M r5, L1[r1] - mov eax, r9d - and eax, 16376 - xor r13, qword ptr [rsi+rax] - ; COND_R r3, ab(r2, 1734636060) - xor ecx, ecx - cmp r10d, 1734636060 - seta cl - add r11, rcx - ; ISTORE L1[r2], r7 - mov eax, r10d - and eax, 16376 - mov qword ptr [rsi+rax], r15 - ; IADD_R r5, r6 - add r13, r14 - ; FPSUB_R f1, a2 - subpd xmm1, xmm10 - ; FPADD_R f2, a1 - addpd xmm2, xmm9 - ; FPSWAP_R f1 - shufpd xmm1, xmm1, 1 - ; IROL_R r2, r6 - mov ecx, r14d - rol r10, cl - ; IMUL_R r0, r4 - imul r8, r12 - ; FPSUB_R f0, a2 - subpd xmm0, xmm10 - ; ISUB_R r6, r7 - sub r14, r15 - ; IROL_R r4, r7 - mov ecx, r15d - rol r12, cl - ; FPMUL_R e2, a0 - mulpd xmm6, xmm8 - ; ISUB_R r1, r3 - sub r9, r11 - ; FPDIV_R e0, a1 - divpd xmm4, xmm9 - maxpd xmm4, xmm13 - ; FPADD_R f0, a1 - addpd xmm0, xmm9 - ; FPMUL_R e0, a2 - mulpd xmm4, xmm10 - ; FPSUB_R f2, a2 - subpd xmm2, xmm10 - ; FPSUB_M f2, L1[r6] - mov eax, r14d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm2, xmm12 - ; FPMUL_R e0, a0 - mulpd xmm4, xmm8 - ; IXOR_M r4, L2[r7] - mov eax, r15d - and eax, 262136 - xor r12, qword 
ptr [rsi+rax] - ; FPSUB_R f3, a3 - subpd xmm3, xmm11 - ; ISMULH_R r1, r6 - mov rax, r9 - imul r14 - mov r9, rdx - ; COND_R r4, be(r7, 224524971) - xor ecx, ecx - cmp r15d, 224524971 - setbe cl - add r12, rcx - ; FPADD_M f2, L1[r1] - mov eax, r9d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm2, xmm12 - ; IMUL_R r5, r4 - imul r13, r12 - ; IADD_RC r1, r5, 370966979 - lea r9, [r9+r13+370966979] - ; IADD_RC r7, r3, -1762209698 - lea r15, [r15+r11-1762209698] - ; FPMUL_M e3, L2[r2] - mov eax, r10d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm7, xmm12 - maxpd xmm7, xmm13 - ; ISUB_R r2, r7 - sub r10, r15 - ; IMUL_9C r3, 171157280 - lea r11, [r11+r11*8+171157280] - ; ISUB_R r3, r5 - sub r11, r13 - ; FPNEG_R f3 - xorps xmm3, xmm15 - ; FPNEG_R f2 - xorps xmm2, xmm15 - ; ISTORE L1[r4], r1 - mov eax, r12d - and eax, 16376 - mov qword ptr [rsi+rax], r9 - ; IADD_R r0, r2 - add r8, r10 - ; IXOR_R r7, r6 - xor r15, r14 - ; IROR_R r0, r4 - mov ecx, r12d - ror r8, cl - ; FPMUL_R e3, a2 - mulpd xmm7, xmm10 - ; IXOR_M r4, L1[r7] - mov eax, r15d - and eax, 16376 - xor r12, qword ptr [rsi+rax] - ; ISTORE L1[r5], r7 - mov eax, r13d - and eax, 16376 - mov qword ptr [rsi+rax], r15 - ; IMUL_9C r7, -1206742834 - lea r15, [r15+r15*8-1206742834] - ; ISMULH_R r0, r4 + ; IMULH_R r0, r4 mov rax, r8 - imul r12 + mul r12 mov r8, rdx - ; FPADD_R f2, a0 - addpd xmm2, xmm8 - ; FPSUB_R f1, a0 - subpd xmm1, xmm8 - ; INEG_R r7 - neg r15 - ; COND_M r0, of(L1[r5], -2056260506) - xor ecx, ecx - mov eax, r13d - and eax, 16376 - cmp dword ptr [rsi+rax], -2056260506 - seto cl - add r8, rcx - ; FPSQRT_R e2 + ; IMUL_R r5, r3 + imul r13, r11 + ; FSQRT_R e2 sqrtpd xmm6, xmm6 - ; IMUL_R r3, r4 - imul r11, r12 - ; FPNEG_R f1 - xorps xmm1, xmm15 - ; FPADD_M f2, L1[r5] - mov eax, r13d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm2, xmm12 - ; FPSUB_R f3, a0 - subpd xmm3, xmm8 - ; FPNEG_R f3 - xorps xmm3, xmm15 - ; FPMUL_M e3, L2[r5] - mov eax, r13d - and 
eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm7, xmm12 - maxpd xmm7, xmm13 - ; ISTORE L1[r2], r2 - mov eax, r10d - and eax, 16376 - mov qword ptr [rsi+rax], r10 - ; IMUL_M r3, L2[r4] - mov eax, r12d - and eax, 262136 - imul r11, qword ptr [rsi+rax] - ; IROL_R r5, r6 - mov ecx, r14d - rol r13, cl - ; IADD_RC r4, r3, -904431293 - lea r12, [r12+r11-904431293] - ; FPSUB_R f1, a1 - subpd xmm1, xmm9 - ; IROL_R r7, r0 - mov ecx, r8d - rol r15, cl - ; ISTORE L2[r1], r7 - mov eax, r9d - and eax, 262136 - mov qword ptr [rsi+rax], r15 - ; IROL_R r4, r3 - mov ecx, r11d - rol r12, cl - ; IADD_R r5, r2 - add r13, r10 - ; COND_R r3, ge(r6, -444806705) - xor ecx, ecx - cmp r14d, -444806705 - setge cl - add r11, rcx - ; FPADD_R f0, a1 - addpd xmm0, xmm9 - ; IROL_R r0, 57 - rol r8, 57 - ; IADD_R r0, r2 - add r8, r10 - ; IADD_R r7, r4 - add r15, r12 - ; IROL_R r1, r7 - mov ecx, r15d - rol r9, cl - ; IXOR_M r7, L2[r5] - mov eax, r13d - and eax, 262136 - xor r15, qword ptr [rsi+rax] - ; ISTORE L1[r2], r0 - mov eax, r10d - and eax, 16376 - mov qword ptr [rsi+rax], r8 - ; FPADD_R f1, a2 - addpd xmm1, xmm10 - ; ISUB_R r1, r4 - sub r9, r12 - ; IXOR_R r5, r0 - xor r13, r8 - ; IXOR_M r7, L2[r1] - mov eax, r9d - and eax, 262136 - xor r15, qword ptr [rsi+rax] - ; FPSUB_R f0, a0 - subpd xmm0, xmm8 - ; IXOR_M r1, L1[r4] - mov eax, r12d - and eax, 16376 - xor r9, qword ptr [rsi+rax] - ; FPMUL_R e3, a0 - mulpd xmm7, xmm8 - ; ISDIV_C r1, 1473744194 - mov rax, -5006799265644655925 - imul r9 - xor eax, eax - add rdx, r9 - sar rdx, 30 - sets al - add rdx, rax - add r9, rdx - ; IMUL_9C r1, 1626151459 - lea r9, [r9+r9*8+1626151459] - ; IXOR_M r6, L1[r4] - mov eax, r12d - and eax, 16376 - xor r14, qword ptr [rsi+rax] - ; FPADD_R f0, a0 - addpd xmm0, xmm8 - ; FPADD_R f3, a2 - addpd xmm3, xmm10 - ; ISUB_R r6, r7 - sub r14, r15 - ; IADD_RC r1, r5, 2075955307 - lea r9, [r9+r13+2075955307] - ; IROL_R r6, r3 - mov ecx, r11d - rol r14, cl - ; IMULH_R r2, -1135671124 - mov eax, -1135671124 - mul 
r10 - add r10, rdx - ; ISUB_R r5, r2 - sub r13, r10 - ; IMULH_R r3, r5 - mov rax, r11 - mul r13 - mov r11, rdx - ; IADD_M r4, L3[386040] - add r12, qword ptr [rsi+386040] - ; COND_R r6, ge(r4, 1518758207) - xor ecx, ecx - cmp r12d, 1518758207 - setge cl - add r14, rcx - ; FPDIV_R e3, a1 - divpd xmm7, xmm9 - maxpd xmm7, xmm13 - ; FPNEG_R f2 - xorps xmm2, xmm15 - ; FPADD_M f1, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm1, xmm12 - ; FPMUL_M e0, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm4, xmm12 - maxpd xmm4, xmm13 - ; FPSQRT_R e2 - sqrtpd xmm6, xmm6 - ; IROL_R r5, r1 - mov ecx, r9d - rol r13, cl - ; FPADD_R f3, a0 + ; FADD_R f3, a0 addpd xmm3, xmm8 - ; IROL_R r3, r0 - mov ecx, r8d - rol r11, cl - ; FPMUL_R e3, a1 - mulpd xmm7, xmm9 - ; IROR_R r0, r7 - mov ecx, r15d - ror r8, cl - ; FPADD_R f2, a2 - addpd xmm2, xmm10 - ; IXOR_R r7, r0 - xor r15, r8 - ; ISTORE L1[r4], r1 + ; IADD_R r3, r2 + add r11, r10 + ; FADD_R f1, a0 + addpd xmm1, xmm8 + ; FMUL_R e3, a2 + mulpd xmm7, xmm10 + ; FADD_R f0, a1 + addpd xmm0, xmm9 + ; IMUL_R r5, r6 + imul r13, r14 + ; IADD_RC r1, r2, -1263285243 + lea r9, [r9+r10-1263285243] + ; ISUB_M r4, L1[r6] + mov eax, r14d + and eax, 16376 + sub r12, qword ptr [rsi+rax] + ; IROL_R r7, r2 + mov ecx, r10d + rol r15, cl + ; IMUL_R r0, r7 + imul r8, r15 + ; IXOR_R r1, r6 + xor r9, r14 + ; IXOR_M r2, L1[r4] mov eax, r12d and eax, 16376 - mov qword ptr [rsi+rax], r9 - ; ISTORE L2[r0], r4 + xor r10, qword ptr [rsi+rax] + ; FSUB_R f3, a1 + subpd xmm3, xmm9 + ; ISTORE L1[r0], r5 mov eax, r8d - and eax, 262136 - mov qword ptr [rsi+rax], r12 - ; FPDIV_R e3, a3 - divpd xmm7, xmm11 - maxpd xmm7, xmm13 - ; ISTORE L2[r4], r6 - mov eax, r12d - and eax, 262136 - mov qword ptr [rsi+rax], r14 - ; IMUL_R r3, r1 - imul r11, r9 - ; IXOR_R r2, r4 - xor r10, r12 - ; ISTORE L2[r3], r5 + and eax, 16376 + mov qword ptr [rsi+rax], r13 + ; FDIV_M e2, L2[r3] mov eax, r11d and eax, 
262136 - mov qword ptr [rsi+rax], r13 - ; FPMUL_M e2, L2[r4] - mov eax, r12d - and eax, 262136 cvtdq2pd xmm12, qword ptr [rsi+rax] - mulpd xmm6, xmm12 + andps xmm12, xmm14 + divpd xmm6, xmm12 maxpd xmm6, xmm13 - ; FPSUB_R f3, a0 - subpd xmm3, xmm8 - ; COND_R r1, ab(r7, -229570354) - xor ecx, ecx - cmp r15d, -229570354 - seta cl - add r9, rcx - ; IROR_R r7, r3 - mov ecx, r11d - ror r15, cl - ; FPDIV_R e2, a0 - divpd xmm6, xmm8 - maxpd xmm6, xmm13 - ; IADD_R r2, r5 - add r10, r13 - ; FPDIV_R e1, a3 - divpd xmm5, xmm11 - maxpd xmm5, xmm13 - ; FPSQRT_R e2 - sqrtpd xmm6, xmm6 - ; ISUB_R r3, r7 - sub r11, r15 - ; FPADD_R f0, a0 - addpd xmm0, xmm8 - ; IMUL_M r0, L3[98136] - imul r8, qword ptr [rsi+98136] - ; IMUL_9C r5, -895487055 - lea r13, [r13+r13*8-895487055] - ; IMULH_R r2, r7 - mov rax, r10 - mul r15 - mov r10, rdx - ; IADD_R r4, r1 - add r12, r9 - ; ISDIV_C r0, 494395999 - mov rax, 5007888582388710937 - imul r8 + ; IROL_R r2, r0 + mov ecx, r8d + rol r10, cl + ; IADD_R r7, r5 + add r15, r13 + ; FDIV_M e0, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + andps xmm12, xmm14 + divpd xmm4, xmm12 + maxpd xmm4, xmm13 + ; FADD_R f3, a1 + addpd xmm3, xmm9 + ; FADD_R f0, a3 + addpd xmm0, xmm11 + ; IADD_R r2, r0 + add r10, r8 + ; ISTORE L1[r3], r6 + mov eax, r11d + and eax, 16376 + mov qword ptr [rsi+rax], r14 + ; IXOR_R r1, r7 + xor r9, r15 + ; ISUB_M r5, L2[r7] + mov eax, r15d + and eax, 262136 + sub r13, qword ptr [rsi+rax] + ; ISDIV_C r7, 266992378 + mov rax, -9173520256920442565 + imul r15 xor eax, eax + add rdx, r15 sar rdx, 27 sets al add rdx, rax - add r8, rdx - ; FPSWAP_R e0 - shufpd xmm4, xmm4, 1 - ; IXOR_R r1, r5 - xor r9, r13 - ; COND_R r2, ab(r3, 1932234501) + add r15, rdx + ; FDIV_M e3, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + andps xmm12, xmm14 + divpd xmm7, xmm12 + maxpd xmm7, xmm13 + ; IMUL_R r2, r0 + imul r10, r8 + ; FMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IMUL_R r0, r6 + imul r8, r14 + ; 
ISTORE L1[r0], r7 + mov eax, r8d + and eax, 16376 + mov qword ptr [rsi+rax], r15 + ; FSUB_R f0, a1 + subpd xmm0, xmm9 + ; FADD_R f3, a1 + addpd xmm3, xmm9 + ; IXOR_R r5, r4 + xor r13, r12 + ; ISTORE L2[r7], r2 + mov eax, r15d + and eax, 262136 + mov qword ptr [rsi+rax], r10 + ; ISWAP_R r6, r7 + xchg r14, r15 + ; FADD_R f3, a2 + addpd xmm3, xmm10 + ; ISMULH_R r5, r0 + mov rax, r13 + imul r8 + mov r13, rdx + ; IADD_M r0, L1[r4] + mov eax, r12d + and eax, 16376 + add r8, qword ptr [rsi+rax] + ; COND_R r7, ge(r6, -1972898485) xor ecx, ecx - cmp r11d, 1932234501 - seta cl + cmp r14d, -1972898485 + setge cl + add r15, rcx + ; FADD_R f2, a2 + addpd xmm2, xmm10 + ; IROR_R r7, r6 + mov ecx, r14d + ror r15, cl + ; IADD_RC r2, r4, -117457973 + lea r10, [r10+r12-117457973] + ; IMUL_R r0, -1500893068 + imul r8, -1500893068 + ; IADD_R r2, r3 + add r10, r11 + ; FSQRT_R e2 + sqrtpd xmm6, xmm6 + ; IROR_R r7, r4 + mov ecx, r12d + ror r15, cl + ; IMUL_9C r4, 381194890 + lea r12, [r12+r12*8+381194890] + ; IADD_RC r3, r7, 1050899263 + lea r11, [r11+r15+1050899263] + ; IADD_R r2, r7 + add r10, r15 + ; FMUL_R e3, a0 + mulpd xmm7, xmm8 + ; IADD_RC r6, r6, 540663146 + lea r14, [r14+r14+540663146] + ; IROR_R r5, 58 + ror r13, 58 + ; FSWAP_R f2 + shufpd xmm2, xmm2, 1 + ; FSWAP_R f2 + shufpd xmm2, xmm2, 1 + ; FMUL_R e1, a2 + mulpd xmm5, xmm10 + ; ISWAP_R r5, r6 + xchg r13, r14 + ; IADD_R r5, r3 + add r13, r11 + ; IADD_R r7, -1780268176 + add r15, -1780268176 + ; IADD_RC r7, r0, -1497756854 + lea r15, [r15+r8-1497756854] + ; ISTORE L2[r0], r7 + mov eax, r8d + and eax, 262136 + mov qword ptr [rsi+rax], r15 + ; ISMULH_R r2, r4 + mov rax, r10 + imul r12 + mov r10, rdx + ; FSUB_R f0, a2 + subpd xmm0, xmm10 + ; ISMULH_R r2, r3 + mov rax, r10 + imul r11 + mov r10, rdx + ; IADD_R r0, r3 + add r8, r11 + ; ISUB_R r7, r2 + sub r15, r10 + ; FADD_R f2, a0 + addpd xmm2, xmm8 + ; FMUL_R e0, a2 + mulpd xmm4, xmm10 + ; FADD_R f2, a3 + addpd xmm2, xmm11 + ; IMUL_R r1, r2 + imul r9, r10 + ; IMUL_M r7, L1[r5] + 
mov eax, r13d + and eax, 16376 + imul r15, qword ptr [rsi+rax] + ; IMUL_R r3, r2 + imul r11, r10 + ; IXOR_R r1, r0 + xor r9, r8 + ; FSUB_M f0, L1[r5] + mov eax, r13d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm0, xmm12 + ; IADD_RC r4, r4, 1456841848 + lea r12, [r12+r12+1456841848] + ; IXOR_R r3, r2 + xor r11, r10 + ; COND_R r0, of(r4, 1678513610) + xor ecx, ecx + cmp r12d, 1678513610 + seto cl + add r8, rcx + ; ISMULH_R r4, -1620573087 + mov rax, -1620573087 + imul r12 + add r12, rdx + ; IMUL_R r4, r1 + imul r12, r9 + ; FSWAP_R e1 + shufpd xmm5, xmm5, 1 + ; FADD_M f2, L1[r0] + mov eax, r8d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm2, xmm12 + ; FMUL_R e1, a2 + mulpd xmm5, xmm10 + ; FSUB_R f0, a3 + subpd xmm0, xmm11 + ; IXOR_R r0, r7 + xor r8, r15 + ; ISTORE L2[r1], r4 + mov eax, r9d + and eax, 262136 + mov qword ptr [rsi+rax], r12 + ; IXOR_M r7, L1[r6] + mov eax, r14d + and eax, 16376 + xor r15, qword ptr [rsi+rax] + ; ISUB_R r2, r4 + sub r10, r12 + ; ISUB_M r4, L1[r6] + mov eax, r14d + and eax, 16376 + sub r12, qword ptr [rsi+rax] + ; FADD_R f2, a2 + addpd xmm2, xmm10 + ; FSUB_R f3, a0 + subpd xmm3, xmm8 + ; IXOR_R r7, r2 + xor r15, r10 + ; IXOR_R r0, r5 + xor r8, r13 + ; FSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; FMUL_R e3, a2 + mulpd xmm7, xmm10 + ; ISWAP_R r7, r1 + xchg r15, r9 + ; ISWAP_R r1, r4 + xchg r9, r12 + ; COND_R r2, ge(r2, -226330940) + xor ecx, ecx + cmp r10d, -226330940 + setge cl add r10, rcx - ; FPMUL_R e1, a0 - mulpd xmm5, xmm8 - ; FPSUB_M f1, L1[r1] + ; FMUL_R e2, a3 + mulpd xmm6, xmm11 + ; FSUB_R f2, a1 + subpd xmm2, xmm9 + ; FADD_R f1, a0 + addpd xmm1, xmm8 + ; ISUB_R r7, r5 + sub r15, r13 + ; ISUB_M r0, L1[r1] + mov eax, r9d + and eax, 16376 + sub r8, qword ptr [rsi+rax] + ; FSUB_R f3, a1 + subpd xmm3, xmm9 + ; IROL_R r3, r5 + mov ecx, r13d + rol r11, cl + ; IADD_RC r5, r2, 795784298 + lea r13, [r13+r10+795784298] + ; IADD_RC r0, r4, -2050178553 + lea r8, [r8+r12-2050178553] + ; IMUL_9C r5, 1062534001 + 
lea r13, [r13+r13*8+1062534001] + ; FADD_R f0, a2 + addpd xmm0, xmm10 + ; FMUL_R e3, a1 + mulpd xmm7, xmm9 + ; IDIV_C r3, 1662492575 + mov rax, 11914062610815620875 + mul r11 + shr rdx, 30 + add r11, rdx + ; IMUL_M r5, L1[r0] + mov eax, r8d + and eax, 16376 + imul r13, qword ptr [rsi+rax] + ; IDIV_C r4, 1963597892 + mov rax, r12 + shr rax, 2 + mov rcx, 1260889558222626443 + mul rcx + shr rdx, 25 + add r12, rdx + ; IMUL_9C r7, 1820045218 + lea r15, [r15+r15*8+1820045218] + ; IMUL_M r0, L1[r3] + mov eax, r11d + and eax, 16376 + imul r8, qword ptr [rsi+rax] + ; IXOR_R r3, r7 + xor r11, r15 + ; ISMULH_R r4, r2 + mov rax, r12 + imul r10 + mov r12, rdx + ; IROL_R r3, r0 + mov ecx, r8d + rol r11, cl + ; IXOR_R r2, r0 + xor r10, r8 + ; IXOR_M r0, L2[r1] + mov eax, r9d + and eax, 262136 + xor r8, qword ptr [rsi+rax] + ; ISDIV_C r7, -935446980 + mov rax, 7859804860668271393 + imul r15 + xor eax, eax + sub rdx, r15 + sar rdx, 29 + sets al + add rdx, rax + add r15, rdx + ; IMUL_M r6, L1[r2] + mov eax, r10d + and eax, 16376 + imul r14, qword ptr [rsi+rax] + ; FSUB_M f3, L1[r6] + mov eax, r14d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm3, xmm12 + ; IADD_RC r4, r2, 1704868083 + lea r12, [r12+r10+1704868083] + ; FADD_R f2, a0 + addpd xmm2, xmm8 + ; ISTORE L1[r0], r0 + mov eax, r8d + and eax, 16376 + mov qword ptr [rsi+rax], r8 + ; FADD_R f0, a3 + addpd xmm0, xmm11 + ; FMUL_R e0, a3 + mulpd xmm4, xmm11 + ; FSUB_R f3, a2 + subpd xmm3, xmm10 + ; IADD_RC r7, r7, 1302457878 + lea r15, [r15+r15+1302457878] + ; ISUB_R r1, 1330165941 + sub r9, 1330165941 + ; FSUB_R f1, a3 + subpd xmm1, xmm11 + ; IROR_R r0, r4 + mov ecx, r12d + ror r8, cl + ; FSUB_R f1, a0 + subpd xmm1, xmm8 + ; IROR_R r5, r6 + mov ecx, r14d + ror r13, cl + ; COND_R r0, ab(r1, -310933871) + xor ecx, ecx + cmp r9d, -310933871 + seta cl + add r8, rcx + ; COND_R r4, ab(r7, 757929676) + xor ecx, ecx + cmp r15d, 757929676 + seta cl + add r12, rcx + ; FMUL_R e0, a1 + mulpd xmm4, xmm9 + ; IMUL_R r1, r3 + 
imul r9, r11 + ; ISUB_R r3, r2 + sub r11, r10 + ; FSUB_R f3, a2 + subpd xmm3, xmm10 + ; FDIV_M e1, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + andps xmm12, xmm14 + divpd xmm5, xmm12 + maxpd xmm5, xmm13 + ; IROL_R r1, 5 + rol r9, 5 + ; IADD_R r7, -1421188024 + add r15, -1421188024 + ; FSUB_R f3, a2 + subpd xmm3, xmm10 + ; FSUB_R f2, a3 + subpd xmm2, xmm11 + ; FADD_M f3, L1[r1] mov eax, r9d and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; FPSUB_R f0, a0 - subpd xmm0, xmm8 - ; IROL_R r1, r7 - mov ecx, r15d - rol r9, cl - ; IADD_RC r0, r5, -2051588680 - lea r8, [r8+r13-2051588680] - ; COND_R r6, of(r5, -795593984) + addpd xmm3, xmm12 + ; FMUL_R e1, a3 + mulpd xmm5, xmm11 + ; IADD_RC r2, r4, -317832028 + lea r10, [r10+r12-317832028] + ; IMUL_M r4, L1[r5] + mov eax, r13d + and eax, 16376 + imul r12, qword ptr [rsi+rax] + ; FDIV_M e1, L1[r7] + mov eax, r15d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + andps xmm12, xmm14 + divpd xmm5, xmm12 + maxpd xmm5, xmm13 + ; IADD_R r5, r2 + add r13, r10 + ; ISUB_R r4, 401020510 + sub r12, 401020510 + ; IROR_R r3, r0 + mov ecx, r8d + ror r11, cl + ; ISTORE L1[r7], r0 + mov eax, r15d + and eax, 16376 + mov qword ptr [rsi+rax], r8 + ; FSUB_R f2, a1 + subpd xmm2, xmm9 + ; FMUL_R e3, a1 + mulpd xmm7, xmm9 + ; IMUL_9C r3, 720965215 + lea r11, [r11+r11*8+720965215] + ; IMUL_9C r6, 74948046 + lea r14, [r14+r14*8+74948046] + ; ISTORE L1[r7], r3 + mov eax, r15d + and eax, 16376 + mov qword ptr [rsi+rax], r11 + ; IXOR_R r2, r6 + xor r10, r14 + ; FMUL_R e3, a1 + mulpd xmm7, xmm9 + ; ISUB_R r4, r1 + sub r12, r9 + ; ISUB_R r3, r0 + sub r11, r8 + ; IROL_R r7, r5 + mov ecx, r13d + rol r15, cl + ; IMUL_R r2, r6 + imul r10, r14 + ; COND_R r2, ge(r2, -1892157506) xor ecx, ecx - cmp r13d, -795593984 - seto cl - add r14, rcx - ; FPADD_R f1, a0 - addpd xmm1, xmm8 - ; IMULH_R r7, r3 - mov rax, r15 - mul r11 - mov r15, rdx - ; ISUB_R r7, r4 - sub r15, r12 - ; IROL_R r0, r6 - mov ecx, 
r14d - rol r8, cl - ; ISDIV_C r1, -675825513 - mov rax, -7326980207007250257 - imul r9 + cmp r10d, -1892157506 + setge cl + add r10, rcx + ; FADD_R f1, a3 + addpd xmm1, xmm11 + ; IADD_R r7, r0 + add r15, r8 + ; IDIV_C r1, 624867857 + mov rax, 15848983434401622933 + mul r9 + shr rdx, 29 + add r9, rdx + ; FADD_R f0, a1 + addpd xmm0, xmm9 + ; IADD_RC r5, r7, -477591118 + lea r13, [r13+r15-477591118] + ; FSUB_R f0, a3 + subpd xmm0, xmm11 + ; ISUB_M r6, L1[r2] + mov eax, r10d + and eax, 16376 + sub r14, qword ptr [rsi+rax] + ; FMUL_R e3, a1 + mulpd xmm7, xmm9 + ; IADD_R r0, r4 + add r8, r12 + ; FSUB_R f3, a1 + subpd xmm3, xmm9 + ; FSUB_R f2, a0 + subpd xmm2, xmm8 + ; ISDIV_C r2, -396711688 + mov rax, 5964731804029407733 + imul r10 xor eax, eax + sub rdx, r10 sar rdx, 28 sets al add rdx, rax - add r9, rdx - ; ISTORE L1[r6], r3 - mov eax, r14d - and eax, 16376 - mov qword ptr [rsi+rax], r11 - ; IROR_R r4, r3 - mov ecx, r11d - ror r12, cl - ; IDIV_C r4, 3919226376 - mov rax, r12 - shr rax, 3 - mov rcx, 2526906936258851663 - mul rcx - shr rdx, 26 - add r12, rdx - ; FPSUB_R f1, a1 - subpd xmm1, xmm9 - ; FPSUB_R f0, a0 + add r10, rdx + ; FSUB_R f2, a2 + subpd xmm2, xmm10 + ; FSUB_R f3, a2 + subpd xmm3, xmm10 + ; FADD_R f1, a3 + addpd xmm1, xmm11 + ; IMUL_R r3, r2 + imul r11, r10 + ; FADD_R f0, a3 + addpd xmm0, xmm11 + ; ISMULH_R r5, r2 + mov rax, r13 + imul r10 + mov r13, rdx + ; IMULH_R r6, r2 + mov rax, r14 + mul r10 + mov r14, rdx + ; FADD_R f3, a3 + addpd xmm3, xmm11 + ; IMUL_R r6, r7 + imul r14, r15 + ; FSUB_R f0, a0 subpd xmm0, xmm8 - ; IADD_R r0, r2 - add r8, r10 - ; IADD_M r4, L1[r2] - mov eax, r10d + ; FSUB_R f2, a0 + subpd xmm2, xmm8 + ; ISUB_R r6, r4 + sub r14, r12 + ; FSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; IXOR_R r0, r5 + xor r8, r13 + ; FADD_R f2, a1 + addpd xmm2, xmm9 + ; IROL_R r7, r5 + mov ecx, r13d + rol r15, cl + ; FMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IADD_RC r3, r6, -1317630728 + lea r11, [r11+r14-1317630728] + ; IMUL_R r2, r3 + imul r10, r11 + ; IADD_RC 
r1, r4, 894105694 + lea r9, [r9+r12+894105694] + ; IMUL_9C r7, 504293473 + lea r15, [r15+r15*8+504293473] + ; FADD_M f1, L2[r0] + mov eax, r8d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm1, xmm12 + ; IMUL_R r7, r1 + imul r15, r9 + ; IXOR_R r2, r4 + xor r10, r12 + ; IADD_RC r0, r1, 392362094 + lea r8, [r8+r9+392362094] + ; IDIV_C r4, 1645771433 + mov rax, 376097195048767223 + mul r12 + shr rdx, 25 + add r12, rdx + ; ISUB_R r4, r3 + sub r12, r11 + ; ISUB_M r7, L1[r4] + mov eax, r12d and eax, 16376 - add r12, qword ptr [rsi+rax] - ; ISTORE L1[r7], r2 + sub r15, qword ptr [rsi+rax] + ; IMUL_M r5, L1[r7] mov eax, r15d and eax, 16376 - mov qword ptr [rsi+rax], r10 - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 - ; IADD_R r5, r4 - add r13, r12 - ; IXOR_R r6, r7 - xor r14, r15 - ; ISMULH_R r4, r7 - mov rax, r12 - imul r15 - mov r12, rdx - ; FPSQRT_R e1 - sqrtpd xmm5, xmm5 + imul r13, qword ptr [rsi+rax] + ; IROR_R r1, r7 + mov ecx, r15d + ror r9, cl + ; INEG_R r4 + neg r12 + ; IMUL_R r3, 1863959234 + imul r11, 1863959234 + ; IROR_R r4, 59 + ror r12, 59 + ; IMUL_M r1, L3[363256] + imul r9, qword ptr [rsi+363256] + ; ISTORE L2[r6], r7 + mov eax, r14d + and eax, 262136 + mov qword ptr [rsi+rax], r15 + ; ISTORE L1[r1], r5 + mov eax, r9d + and eax, 16376 + mov qword ptr [rsi+rax], r13 + ; FSUB_M f0, L1[r1] + mov eax, r9d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm0, xmm12 + ; FSQRT_R e2 + sqrtpd xmm6, xmm6 + ; FMUL_R e0, a3 + mulpd xmm4, xmm11 + ; FMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IROL_R r5, r2 + mov ecx, r10d + rol r13, cl + ; IADD_R r0, r4 + add r8, r12 From 32d827d0a63c66f13cade760aeb0894e4868ee26 Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 9 Feb 2019 15:45:26 +0100 Subject: [PATCH 32/35] Interpreter with bytecode Fixed some undefined behavior with signed types Fixed different results on big endian systems Removed unused code files Restored FNEG_R instructions Updated documentation --- README.md | 134 ++-- doc/dataset.md | 85 +- 
doc/isa-ops.md | 187 ++--- doc/isa.md | 237 ++---- src/AssemblyGeneratorX86.cpp | 65 +- src/AssemblyGeneratorX86.hpp | 5 +- src/Cache.cpp | 1 - src/CompiledVirtualMachine.cpp | 44 +- src/CompiledVirtualMachine.hpp | 3 +- src/Instruction.cpp | 22 +- src/Instruction.hpp | 6 +- src/InterpretedVirtualMachine.cpp | 725 +++++++++++++++-- src/InterpretedVirtualMachine.hpp | 100 +-- src/JitCompilerX86.cpp | 79 +- src/JitCompilerX86.hpp | 7 +- src/LightClientAsyncWorker.cpp | 27 +- src/Pcg32.hpp | 72 -- src/Program.cpp | 8 +- src/Program.hpp | 11 +- src/VirtualMachine.cpp | 68 +- src/VirtualMachine.hpp | 20 +- src/asm/program_loop_store.inc | 4 - src/asm/program_prologue_load.inc | 3 +- src/blake2/blake2-impl.h | 105 +-- src/blake2/blake2b.c | 10 +- src/blake2/blamka-round-ref.h | 2 +- src/blake2/endian.h | 99 +++ src/common.hpp | 53 +- src/dataset.cpp | 89 +-- src/dataset.hpp | 6 +- src/executeProgram-win64.asm | 4 +- src/instructionWeights.hpp | 8 +- src/instructions.hpp | 57 -- src/instructionsPortable.cpp | 179 ++--- src/intrinPortable.h | 30 +- src/main.cpp | 19 +- src/program.inc | 281 +++---- src/squareHash.h | 5 + src/t1ha/t1ha.h | 723 ----------------- src/t1ha/t1ha2.c | 329 -------- src/t1ha/t1ha_bits.h | 1226 ----------------------------- 41 files changed, 1517 insertions(+), 3621 deletions(-) delete mode 100644 src/Pcg32.hpp create mode 100644 src/blake2/endian.h delete mode 100644 src/instructions.hpp delete mode 100644 src/t1ha/t1ha.h delete mode 100644 src/t1ha/t1ha2.c delete mode 100644 src/t1ha/t1ha_bits.h diff --git a/README.md b/README.md index fc18ed8..ed5d594 100644 --- a/README.md +++ b/README.md @@ -1,111 +1,78 @@ + + + # RandomX -RandomX is an experimental proof of work (PoW) algorithm that uses random code execution. +RandomX is a proof-of-work (PoW) algorithm that is optimized for general-purpose CPUs. 
RandomX uses random code execution (hence the name) together with several memory-hard techniques to achieve the following goals: -### Key features +* Prevent the development of a single-chip [ASIC](https://en.wikipedia.org/wiki/Application-specific_integrated_circuit) +* Minimize the efficiency advantage of specialized hardware compared to a general-purpose CPU -* Memory-hard (requires >4 GiB of memory) -* CPU-friendly (especially for x86 and ARM architectures) -* arguably ASIC-resistant -* inefficient on GPUs -* unusable for web-mining +## Design -## Virtual machine +The core of RandomX is a virtual machine (VM), which can be summarized by the following schematic: -RandomX is intended to be run efficiently on a general-purpose CPU. The virtual machine (VM) which runs RandomX code attempts to simulate a generic CPU using the following set of components: +![Imgur](https://i.imgur.com/8RYNWLk.png) -![Imgur](https://i.imgur.com/ZAfbX9m.png) +Notable parts of the RandomX VM are: -Full description: [vm.md](doc/vm.md). +* a large read-only 4 GiB dataset +* a 2 MiB scratchpad (read/write), which is structured into three levels L1, L2 and L3 +* 8 integer and 12 floating point registers +* an arithmetic logic unit (ALU) +* a floating point unit (FPU) +* a 2 KiB program buffer -## Dataset +The structure of the VM mimics the components that are found in a typical general purpose computer equipped with a CPU and a large amount of DRAM. The scratchpad is designed to fit into the CPU cache. The first 16 KiB and 256 KiB of the scratchpad are used more often to take advantage of the faster L1 and L2 caches. The ratio of random reads from L1/L2/L3 is approximately 9:3:1, which matches the inverse latencies of typical CPU caches. -RandomX uses a 4 GiB read-only dataset. 
The dataset is constructed using a combination of the [Argon2d](https://en.wikipedia.org/wiki/Argon2) hashing function, [AES](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) encryption/decryption and a random permutation. The dataset is regenerated every ~34 hours. +The VM executes programs in a special instruction set, which was designed in such a way that any random 8-byte word is a valid instruction and any sequence of valid instructions is a valid program. For more details see [RandomX ISA documentation](doc/isa.md). Because there are no "syntax" rules, generating a random program is as easy as filling the program buffer with random data. A RandomX program consists of 256 instructions. See [program.inc](src/program.inc) as an example of a RandomX program translated into x86-64 assembly. -Full description: [dataset.md](doc/dataset.md). +#### Hash calculation -## Instruction set +Calculating a RandomX hash consists of initializing the 2 MiB scratchpad with random data, executing 8 RandomX loops and calculating a hash of the scratchpad. -RandomX uses a simple low-level language (instruction set), which was designed so that any random bitstring forms a valid program. Each RandomX instruction has a length of 128 bits. +Each RandomX loop is repeated 2048 times. The loop body has 4 parts: +1. The values of all registers are loaded randomly from the scratchpad (L3) +2. The RandomX program is executed +3. A random block is loaded from the dataset and mixed with integer registers +4. All register values are stored into the scratchpad (L3) -Full description: [isa.md](doc/isa.md). +The hash of the register state after 2048 iterations is used to initialize the random program for the next loop. The use of 8 different programs in the course of a single hash calculation prevents mining strategies that search for "easy" programs. -## Implementation -Proof-of-concept implementation is written in C++. 
-``` -> bin/randomx --help -Usage: bin/randomx [OPTIONS] -Supported options: - --help shows this message - --compiled use x86-64 JIT-compiled VM (default: interpreted VM) - --lightClient use 'light-client' mode (default: full dataset mode) - --softAes use software AES (default: x86 AES-NI) - --threads T use T threads (default: 1) - --nonces N run N nonces (default: 1000) - --genAsm generate x86 asm code for nonce N -``` +The loads from the dataset are fully prefetched, so they don't slow down the loop. -Two RandomX virtual machines are implemented: +RandomX uses the [Blake2b](https://en.wikipedia.org/wiki/BLAKE_%28hash_function%29#BLAKE2) cryptographic hash function. Special hashing functions based on [AES](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) encryption are used to initialize and hash the scratchpad. -### Interpreted VM -The interpreted VM is the reference implementation, which aims for maximum portability. +#### Hash verification -The VM has been tested for correctness on the following platforms: -* Linux: x86-64, ARMv7 (32-bit), ARMv8 (64-bit) -* Windows: x86, x86-64 -* MacOS: x86-64 +RandomX is a symmetric PoW algorithm, so the verifying party has to repeat the same steps as when a hash is calculated. -The interpreted VM supports two modes: "full dataset" mode, which requires more than 4 GiB of virtual memory, and a "light-client" mode, which requires about 64 MiB of memory, but runs significantly slower because dataset blocks are created on the fly rather than simply fetched from memory. +However, to allow hash verification on devices that cannot store the whole 4 GiB dataset, RandomX allows a time-memory tradeoff by using just 256 MiB of memory at the cost of 16 times more random memory accesses. See [Dataset initialization](doc/dataset.md) for more details. -Software AES implementation is available for CPUs which don't support [AES-NI](https://en.wikipedia.org/wiki/AES_instruction_set). 
+#### Documentation +* [RandomX ISA](doc/isa.md) +* [RandomX instruction listing](doc/isa-ops.md) +* [Dataset initialization](doc/dataset.md) -The following table lists the performance for Intel Core i5-3230M (Ivy Bridge) CPU using a single core on Windows 64-bit, compiled with Visual Studio 2017: +# FAQ -|mode|required memory|AES|initialization time [s]|performance [programs/s]| -|------|----|-----|-------------------------|------------------| -|light client|64 MiB|software|1.0|9.2| -|light client|64 MiB|AES-NI|1.0|16| -|full dataset|4 GiB|software|54|40| -|full dataset|4 GiB|AES-NI|26|40| +### Can RandomX run on a GPU? -### JIT-compiled VM -A JIT compiler is available for x86-64 CPUs. This implementation shows the approximate performance that can be achieved using optimized mining software. The JIT compiler generates generic x86-64 code without any architecture-specific optimizations. Only "full dataset" mode is supported. +We don't expect GPUs will ever be competitive in mining RandomX. The reference miner is CPU-only. -For optimal performance, an x86-64 CPU needs: -* 32 KiB of L1 instruction cache per thread -* 16 KiB of L1 data cache per thread -* 240 KiB of L2 cache (exclusive) per thread +RandomX was designed to be efficient on CPUs. Designing an algorithm compatible with both CPUs and GPUs brings too many limitations and ultimately decreases ASIC resistance. CPUs have the advantage of not needing proprietary drivers and most CPU architectures support a large common subset of primitive operations. -The following table lists the performance of AMD Ryzen 7 1700 (clock fixed at 3350 MHz, 1.05 Vcore, dual channel DDR4 2400 MHz) on Linux 64-bit (compiled with GCC 5.4.0). +Additionally, targeting CPUs allows for more decentralized mining for several reasons: -Power consumption was measured for the whole system using a wall socket wattmeter (±1W). Table lists difference over idle power consumption. 
[Prime95](https://en.wikipedia.org/wiki/Prime95#Use_for_stress_testing) (small/in-place FFT) and [Cryptonight V2](https://github.com/monero-project/monero/pull/4218) power consumption are listed for comparison. +* Every computer has a CPU and even laptops will be able to mine efficiently. +* CPU mining is easier to set up - no driver compatibility issues, BIOS flashing etc. +* CPU mining is more difficult to centralize because computers can usually have only one CPU except for expensive server parts. -||threads|initialization time [s]|performance [programs/s]|power [W] -|-|------|----|-----|-------------------------| -|RandomX (interpreted)|1|27|52|16| -|RandomX (interpreted)|8|4.0|390|63| -|RandomX (interpreted)|16|3.5|620|74| -|RandomX (compiled)|1|27|407|17| -|RandomX (compiled)|2|14|810|26| -|RandomX (compiled)|4|7.3|1620|42| -|RandomX (compiled)|6|5.1|2410|56| -|RandomX (compiled)|8|4.0|3200|71| -|RandomX (compiled)|12|4.0|3670|82| -|RandomX (compiled)|16|3.5|4110|92| -|Cryptonight v2|8|-|-|47| -|Prime95|8|-|-|77| -|Prime95|16|-|-|81| +### Does RandomX facilitate botnets/malware mining or web mining? +Quite the opposite. Efficient mining requires 4 GiB of memory, which is very difficult to hide in an infected computer and disqualifies many low-end machines. Web mining is nearly impossible due to the large memory requirement and the need for a rather lengthy initialization of the dataset. -## Proof of work +### Since RandomX uses floating point calculations, how can it give reproducible results on different platforms? -RandomX VM can be used for PoW using the following steps: - -1. Initialize the VM using a 256-bit hash of any data. -2. Execute the RandomX program. -3. Calculate `blake2b(RegisterFile || t1ha2(Scratchpad))`* - -\* [blake2b](https://en.wikipedia.org/wiki/BLAKE_%28hash_function%29#BLAKE2) is a cryptographic hash function, [t1ha2](https://github.com/leo-yuriev/t1ha) is a fast hashing function. 
- -The above steps can be chained multiple times to prevent mining strategies that search for programs with particular properties (for example, without division). +RandomX uses only operations that are guaranteed to give correctly rounded results by the [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754) standard: addition, subtraction, multiplication, division and square root. Special care is taken to avoid corner cases such as NaN values or denormals. ## Acknowledgements The following people have contributed to the design of RandomX: @@ -114,13 +81,10 @@ The following people have contributed to the design of RandomX: RandomX uses some source code from the following 3rd party repositories: * Argon2d, Blake2b hashing functions: https://github.com/P-H-C/phc-winner-argon2 -* PCG32 random number generator: https://github.com/imneme/pcg-c-basic * Software AES implementation https://github.com/fireice-uk/xmr-stak -* t1ha2 hashing function: https://github.com/leo-yuriev/t1ha ## Donations - XMR: ``` -4B9nWtGhZfAWsTxWujPDGoWfVpJvADxkxJJTmMQp3zk98n8PdLkEKXA5g7FEUjB8JPPHdP959WDWMem3FPDTK2JUU1UbVHo -``` +845xHUh5GvfHwc2R8DVJCE7BT2sd4YEcmjG8GNSdmeNsP5DTEjXd1CNgxTcjHjiFuthRHAoVEJjM7GyKzQKLJtbd56xbh7V +``` \ No newline at end of file diff --git a/doc/dataset.md b/doc/dataset.md index b3c0ee3..bb562bd 100644 --- a/doc/dataset.md +++ b/doc/dataset.md @@ -1,13 +1,13 @@ ## Dataset -The dataset serves as the source of the first operand of all instructions and provides the memory-hardness of RandomX. The size of the dataset is fixed at 4 GiB and it's divided into 65536 blocks, each 64 KiB in size. +The dataset is randomly accessed 16384 times during each hash calculation, which significantly increases memory-hardness of RandomX. The size of the dataset is fixed at 4 GiB and it's divided into 67108864 block of 64 bytes. 
-In order to allow PoW verification with less than 4 GiB of memory, the dataset is constructed from a 64 MiB cache, which can be used to calculate dataset blocks on the fly. To facilitate this, all random reads from the dataset are aligned to the beginning of a block. +In order to allow PoW verification with less than 4 GiB of memory, the dataset is constructed from a 256 MiB cache, which can be used to calculate dataset rows on the fly. -Because the initialization of the dataset is computationally intensive, it's recalculated on average every 1024 blocks (~34 hours). The following figure visualizes the construction of the dataset: +Because the initialization of the dataset is computationally intensive, it is recalculated only every 1024 blocks (~34 hours). The following figure visualizes the construction of the dataset: -![Imgur](https://i.imgur.com/JgLCjeq.png) +![Imgur](https://i.imgur.com/b9WHOwo.png) ### Seed block The whole dataset is constructed from a 256-bit hash of the last block whose height is divisible by 1024 **and** has at least 64 confirmations. @@ -21,7 +21,7 @@ The whole dataset is constructed from a 256-bit hash of the last block whose hei ### Cache construction -The 32-byte seed block hash is expanded into the 64 MiB cache using the "memory fill" function of Argon2d. [Argon2](https://github.com/P-H-C/phc-winner-argon2) is a memory-hard password hashing function, which is highly customizable. The variant with "d" suffix uses a data-dependent memory access pattern and provides the highest resistance against time-memory tradeoffs. +The 32-byte seed block hash is expanded into the 256 MiB cache using the "memory fill" function of Argon2d. [Argon2](https://github.com/P-H-C/phc-winner-argon2) is a memory-hard password hashing function, which is highly customizable. The variant with "d" suffix uses a data-dependent memory access pattern and provides the highest resistance against time-memory tradeoffs. 
Argon2 is used with the following parameters: @@ -29,8 +29,8 @@ Argon2 is used with the following parameters: |------------|--| |parallelism|1| |output size|0| -|memory|65536 (64 MiB)| -|iterations|12| +|memory|262144 (256 MiB)| +|iterations|3| |version|`0x13`| |hash type|0 (Argon2d) |password|seed block hash (32 bytes) @@ -40,43 +40,66 @@ Argon2 is used with the following parameters: The finalizer and output calculation steps of Argon2 are omitted. The output is the filled memory array. -The use of 12 iterations makes time-memory tradeoffs infeasible and thus 64 MiB is the minimum amount of memory required by RandomX. - -When the memory fill is complete, the whole memory array is cyclically shifted backwards by 512 bytes (i.e. bytes 0-511 are moved to the end of the array). This is done to misalign the array so that each 1024-byte cache block spans two subsequent Argon2 blocks. +The use of 3 iterations makes time-memory tradeoffs infeasible and thus 256 MiB is the minimum amount of memory required by RandomX. ### Dataset block generation -The full 4 GiB dataset can be generated from the 64 MiB cache. Each block is generated separately: a 1024 byte block of the cache is expanded into 64 KiB of the dataset. The algorithm has 3 steps: expansion, AES and shuffle. +The full 4 GiB dataset can be generated from the 256 MiB cache. Each 64-byte block is generated independently by XORing 16 pseudorandom Cache blocks selected by the `SquareHash` function. -#### Expansion -The 1024 cache bytes are split into 128 quadwords and interleaved with 504-byte chunks of null bytes. The resulting sequence is: 8 cache bytes + 504 null bytes + 8 cache bytes + 504 null bytes etc. Total length of the expanded block is 65536 bytes. +#### SquareHash +`SquareHash` is a custom hash function with 64-bit input and 64-bit output. It is calculated by repeatedly squaring the input, splitting the 128-bit result into two 64-bit halves and subtracting the high half from the low half. 
This is repeated 42 times. It's available as a [portable C implementation](../src/squareHash.h) and [x86-64 assembly version](../src/asm/squareHash.inc). -#### AES -The 256-bit seed block hash is expanded into 10 AES round keys `k0`-`k9`. Let `i = 0...65535` be the index of the block that is being expanded. If `i` is an even number, this step uses AES *decryption* and if `i` is an odd number, it uses AES *encryption*. Since both encryption and decryption scramble random data, no distinction is made between them in the text below. +Properties of `SquareHash`: -The AES encryption is performed with 10 identical rounds using round keys `k0`-`k9`. Note that this is different from the typical AES procedure, which uses a different key schedule for decryption and a modified last round. +* It achieves full [Avalanche effect](https://en.wikipedia.org/wiki/Avalanche_effect). +* Since the whole calculation is a long dependency chain, which uses only multiplication and subtraction, the performance gains by using custom hardware are very limited. +* A single `SquareHash` calculation takes 40-80 ns, which is about the same time as DRAM access latency. Devices using low-latency memory will be bottlenecked by `SquareHash`, while CPUs will finish the hash calculation in about the same time it takes to fetch data from RAM. -Before the AES encryption is applied, each 16-byte chunk is XORed with the ciphertext of the previous chunk. This is similar to the [AES-CBC](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Cipher_Block_Chaining_%28CBC%29) mode of operation and forces the encryption to be sequential. For XORing the initial block, an initialization vector is formed by zero-extending `i` to 128 bits. +The output of 16 chained SquareHash calculations is used to determine Cache blocks that are XORed together to produce a Dataset block: -#### Shuffle -When the AES step is complete, the last 16-byte chunk of the block is used to initialize a PCG32 random number generator. 
Bits 0-63 are used as the initial state and bits 64-127 are used as the increment. The least-significant bit of the increment is always set to 1 to form an odd number. +```c++ +void initBlock(const uint8_t* cache, uint8_t* out, uint32_t blockNumber) { + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; -The whole block is then divided into 16384 doublewords (4 bytes) and the [Fisher–Yates shuffle](https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle) algorithm is applied to it. The algorithm generates a random in-place permutation of the 16384 doublewords. The result of the shuffle is the `i`-th block of the dataset. + r0 = 4ULL * blockNumber; + r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0; -The shuffle algorithm requires a uniform distribution of random numbers. The output of the PCG32 generator is always properly filtered to avoid the [modulo bias](https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#Modulo_bias). + constexpr uint32_t mask = (CacheSize - 1) & CacheLineAlignMask; + + for (auto i = 0; i < DatasetIterations; ++i) { + const uint8_t* mixBlock = cache + (r0 & mask); + PREFETCHNTA(mixBlock); + r0 = squareHash(r0); + r0 ^= load64(mixBlock + 0); + r1 ^= load64(mixBlock + 8); + r2 ^= load64(mixBlock + 16); + r3 ^= load64(mixBlock + 24); + r4 ^= load64(mixBlock + 32); + r5 ^= load64(mixBlock + 40); + r6 ^= load64(mixBlock + 48); + r7 ^= load64(mixBlock + 56); + } + + store64(out + 0, r0); + store64(out + 8, r1); + store64(out + 16, r2); + store64(out + 24, r3); + store64(out + 32, r4); + store64(out + 40, r5); + store64(out + 48, r6); + store64(out + 56, r7); +} +``` + +*Note: `SquareHash` doesn't calculate squaring modulo 2<sup>64</sup>+1 because the subtraction is performed modulo 2<sup>64</sup>. Squaring modulo 2<sup>64</sup>+1 can be calculated by adding the carry bit in every iteration (i.e. 
the sequence in x86-64 assembly would have to be: `mul rax; sub rax, rdx; adc rax, 0`), but this would decrease ASIC-resistance of `SquareHash`.* ### Performance -The initial 64-MiB cache construction using Argon2d takes around 1 second using an older laptop with an Intel i5-3230M CPU (Ivy Bridge). Cache generation is strictly serial and cannot be easily parallelized. +The initial 256-MiB cache construction using Argon2d takes around 1 second using an older laptop with an Intel i5-3230M CPU (Ivy Bridge). Cache generation is strictly serial and cannot be parallelized. -Dataset generation performance depends on the support of the AES-NI instruction set. The following table lists the generation runtimes using the same Ivy Bridge laptop with a single thread: +On the same laptop, full Dataset initialization takes around 100 seconds using a single thread (1.5 µs per block). -|AES|4 GiB dataset generation|single block generation| -|-----|-----------------------------|----------------| -|hardware (AES-NI)|25 s|380 µs| -|software|53 s|810 µs| +While the generation of a single block is strictly serial, multiple blocks can be easily generated in parallel, so the Dataset generation time decreases linearly with the number of threads. Using an 8-core AMD Ryzen CPU, the whole dataset can be generated in under 10 seconds. -While the generation of a single block is strictly serial, multiple blocks can be easily generated in parallel, so the dataset generation time decreases linearly with the number of threads. Using a recent 6-core CPU with AES-NI support, the whole dataset can be generated in about 4 seconds. - -Moreover, the seed block hash is known up to 64 blocks in advance, so miners can slowly precalculate the whole dataset by generating ~512 dataset blocks per minute (corresponds to less than 1% utilization of a single CPU core). 
+Moreover, the seed block hash is known up to 64 blocks in advance, so miners can slowly precalculate the whole dataset by generating 524288 dataset blocks per minute (corresponds to about 1% utilization of a single CPU core). ### Light clients -Light clients, who cannot or do not want to generate and keep the whole dataset in memory, can generate just the cache and then generate blocks on the fly as the program is being executed. In this case, the program execution time will be increased by roughly 100 times the single block generation time. For the Intel Ivy Bridge laptop, this amounts to around 40 milliseconds per program. \ No newline at end of file +Light clients, who cannot or do not want to generate and keep the whole dataset in memory, can generate just the cache and then generate blocks on the fly during hash calculation. In this case, the hash calculation time will be increased by 16384 times the single block generation time. For the Intel Ivy Bridge laptop, this amounts to around 24.5 milliseconds per hash. \ No newline at end of file diff --git a/doc/isa-ops.md b/doc/isa-ops.md index 5e389e3..4a1cca5 100644 --- a/doc/isa-ops.md +++ b/doc/isa-ops.md @@ -1,130 +1,103 @@ - # RandomX instruction listing -There are 31 unique instructions divided into 3 groups: - -|group|# operations|# opcodes|| -|---------|-----------------|----|-| -|integer (IA)|22|144|56.3%| -|floating point (FP)|5|76|29.7%| -|control (CL)|4|36|14.0% -||**31**|**256**|**100%** - ## Integer instructions -There are 22 integer instructions. They are divided into 3 classes (MATH, DIV, SHIFT) with different B operand selection rules. +For integer instructions, the destination is always an integer register (register group R). Source operand (if applicable) can be either an integer register or memory value. If `dst` and `src` refer to the same register, most instructions use `imm32` as the source operand instead of the register. This is indicated in the 'src == dst' column. 
-|# opcodes|instruction|class|signed|A width|B width|C|C width| +Memory operands are loaded as 8-byte values from the address indicated by `src`. This indirect addressing is marked with square brackets: `[src]`. + +|frequency|instruction|dst|src|`src == dst ?`|operation| |-|-|-|-|-|-|-|-| -|12|ADD_64|MATH|no|64|64|`A + B`|64| -|2|ADD_32|MATH|no|32|32|`A + B`|32| -|12|SUB_64|MATH|no|64|64|`A - B`|64| -|2|SUB_32|MATH|no|32|32|`A - B`|32| -|21|MUL_64|MATH|no|64|64|`A * B`|64| -|10|MULH_64|MATH|no|64|64|`A * B`|64| -|15|MUL_32|MATH|no|32|32|`A * B`|64| -|15|IMUL_32|MATH|yes|32|32|`A * B`|64| -|10|IMULH_64|MATH|yes|64|64|`A * B`|64| -|4|DIV_64|DIV|no|64|32|`A / B`|64| -|4|IDIV_64|DIV|yes|64|32|`A / B`|64| -|4|AND_64|MATH|no|64|64|`A & B`|64| -|2|AND_32|MATH|no|32|32|`A & B`|32| -|4|OR_64|MATH|no|64|64|`A | B`|64| -|2|OR_32|MATH|no|32|32|`A | B`|32| -|4|XOR_64|MATH|no|64|64|`A ^ B`|64| -|2|XOR_32|MATH|no|32|32|`A ^ B`|32| -|3|SHL_64|SHIFT|no|64|6|`A << B`|64| -|3|SHR_64|SHIFT|no|64|6|`A >> B`|64| -|3|SAR_64|SHIFT|yes|64|6|`A >> B`|64| -|6|ROL_64|SHIFT|no|64|6|`A <<< B`|64| -|6|ROR_64|SHIFT|no|64|6|`A >>> B`|64| +|12/256|IADD_R|R|R|`src = imm32`|`dst = dst + src`| +|7/256|IADD_M|R|mem|`src = imm32`|`dst = dst + [src]`| +|16/256|IADD_RC|R|R|`src = dst`|`dst = dst + src + imm32`| +|12/256|ISUB_R|R|R|`src = imm32`|`dst = dst - src`| +|7/256|ISUB_M|R|mem|`src = imm32`|`dst = dst - [src]`| +|9/256|IMUL_9C|R|-|-|`dst = 9 * dst + imm32`| +|16/256|IMUL_R|R|R|`src = imm32`|`dst = dst * src`| +|4/256|IMUL_M|R|mem|`src = imm32`|`dst = dst * [src]`| +|4/256|IMULH_R|R|R|`src = dst`|`dst = (dst * src) >> 64`| +|1/256|IMULH_M|R|mem|`src = imm32`|`dst = (dst * [src]) >> 64`| +|4/256|ISMULH_R|R|R|`src = dst`|`dst = (dst * src) >> 64` (signed)| +|1/256|ISMULH_M|R|mem|`src = imm32`|`dst = (dst * [src]) >> 64` (signed)| +|4/256|IDIV_C|R|-|-|`dst = dst + dst / imm32`| +|4/256|ISDIV_C|R|-|-|`dst = dst + dst / imm32` (signed)| +|2/256|INEG_R|R|-|-|`dst = -dst`| +|16/256|IXOR_R|R|R|`src = 
imm32`|`dst = dst ^ src`| +|4/256|IXOR_M|R|mem|`src = imm32`|`dst = dst ^ [src]`| +|10/256|IROR_R|R|R|`src = imm32`|`dst = dst >>> src`| +|4/256|ISWAP_R|R|R|`src = dst`|`temp = src; src = dst; dst = temp`| -#### 32-bit operations -Instructions ADD_32, SUB_32, AND_32, OR_32, XOR_32 only use the low-order 32 bits of the input operands. The result of these operations is 32 bits long and bits 32-63 of C are set to zero. +#### IMULH and ISMULH +These instructions output the high 64 bits of the whole 128-bit multiplication result. The result differs for signed and unsigned multiplication (`IMULH` is unsigned, `ISMULH` is signed). The variants with a register source operand do not use `imm32` (they perform a squaring operation if `dst` equals `src`). -#### Multiplication -There are 5 different multiplication operations. MUL_64 and MULH_64 both take 64-bit unsigned operands, but MUL_64 produces the low 64 bits of the result and MULH_64 produces the high 64 bits. MUL_32 and IMUL_32 use only the low-order 32 bits of the operands and produce a 64-bit result. The signed variant interprets the arguments as signed integers. IMULH_64 takes two 64-bit signed operands and produces the high-order 64 bits of the result. +#### IDIV_C and ISDIV_C +The division instructions use a constant divisor, so they can be optimized into a [multiplication by fixed-point reciprocal](https://en.wikipedia.org/wiki/Division_algorithm#Division_by_a_constant). `IDIV_C` performs unsigned division (`imm32` is zero-extended to 64 bits), while `ISDIV_C` performs signed division. In the case of division by zero, the instructions become a no-op. In the very rare case of signed overflow, the destination register is set to zero. -#### Division -For the division instructions, the dividend is 64 bits long and the divisor 32 bits long. The IDIV_64 instruction interprets both operands as signed integers. In case of division by zero or signed overflow, the result is equal to the dividend `A`. 
- -75% of division instructions use a runtime-constant divisor and can be optimized using a multiplication and shifts. - -#### Shift and rotate -The shift/rotate instructions use just the bottom 6 bits of the `B` operand (`imm8` is used as the immediate value). All treat `A` as unsigned except SAR_64, which performs an arithmetic right shift by copying the sign bit. +#### ISWAP_R +This instruction swaps the values of two registers. If source and destination refer to the same register, the result is a no-op. ## Floating point instructions -There are 5 floating point instructions. All floating point instructions are vector instructions that operate on two packed double precision floating point values. +For floating point instructions, the destination can be a group F or group E register. Source operand is either a group A register or a memory value. -|# opcodes|instruction|C| -|-|-|-| -|20|FPADD|`A + B`| -|20|FPSUB|`A - B`| -|22|FPMUL|`A * B`| -|8|FPDIV|`A / B`| -|6|FPSQRT|`sqrt(abs(A))`| +Memory operands are loaded as 8-byte values from the address indicated by `src`. The 8 byte value is interpreted as two 32-bit signed integers and implicitly converted to floating point format. The lower and upper memory operands are marked as `[src][0]` and `[src][1]`. -#### Conversion of operand A -Operand A is loaded from memory as a 64-bit value. All floating point instructions interpret A as two packed 32-bit signed integers and convert them into two packed double precision floating point values. 
+|frequency|instruction|dst|src|operation| +|-|-|-|-|-|-|-| +|8/256|FSWAP_R|F+E|-|`(dst0, dst1) = (dst1, dst0)`| +|20/256|FADD_R|F|A|`(dst0, dst1) = (dst0 + src0, dst1 + src1)`| +|5/256|FADD_M|F|mem|`(dst0, dst1) = (dst0 + [src][0], dst1 + [src][1])`| +|20/256|FSUB_R|F|A|`(dst0, dst1) = (dst0 - src0, dst1 - src1)`| +|5/256|FSUB_M|F|mem|`(dst0, dst1) = (dst0 - [src][0], dst1 - [src][1])`| +|6/256|FNEG_R|F|-|`(dst0, dst1) = (-dst0, -dst1)`| +|20/256|FMUL_R|E|A|`(dst0, dst1) = (dst0 * src0, dst1 * src1)`| +|4/256|FDIV_M|E|mem|`(dst0, dst1) = (dst0 / [src][0], dst1 / [src][1])`| +|6/256|FSQRT_R|E|-|`(dst0, dst1) = (√dst0, √dst1)`| + +#### Denormal and NaN values +Due to restrictions on the values of the floating point registers, no operation results in `NaN`. +`FDIV_M` can produce a denormal result. In that case, the result is set to `DBL_MIN = 2.22507385850720138309e-308`, which is the smallest positive normal number. #### Rounding -FPU instructions conform to the IEEE-754 specification, so they must give correctly rounded results. Initial rounding mode is *roundTiesToEven*. Rounding mode can be changed by the `FPROUND` control instruction. Denormal values must be always flushed to zero. +All floating point instructions give correctly rounded results. The rounding mode depends on the value of the `fprc` register: -#### NaN -If an operation produces NaN, the result is converted into positive zero. NaN results may never be written into registers or memory. Only division and multiplication must be checked for NaN results (`0.0 / 0.0` and `0.0 * Infinity` result in NaN). - -## Control instructions -There are 4 control instructions. 
- -|# opcodes|instruction|description|condition| -|-|-|-|-| -|2|FPROUND|change floating point rounding mode|- -|11|JUMP|conditional jump|(see condition table below) -|11|CALL|conditional procedure call|(see condition table below) -|12|RET|return from procedure|stack is not empty - -All control instructions behave as 'arithmetic no-op' and simply copy the input operand A into the destination C. - -The JUMP and CALL instructions use a condition function, which takes the lower 32 bits of operand B (register) and the value `imm32` and evaluates a condition based on the `B.LOC.C` flag: - -|`B.LOC.C`|signed|jump condition|probability|*x86*|*ARM* -|---|---|----------|-----|--|----| -|0|no|`B <= imm32`|0% - 100%|`JBE`|`BLS` -|1|no|`B > imm32`|0% - 100%|`JA`|`BHI` -|2|yes|`B - imm32 < 0`|50%|`JS`|`BMI` -|3|yes|`B - imm32 >= 0`|50%|`JNS`|`BPL` -|4|yes|`B - imm32` overflows|0% - 50%|`JO`|`BVS` -|5|yes|`B - imm32` doesn't overflow|50% - 100%|`JNO`|`BVC` -|6|yes|`B < imm32`|0% - 100%|`JL`|`BLT` -|7|yes|`B >= imm32`|0% - 100%|`JGE`|`BGE` - -The 'signed' column specifies if the operands are interpreted as signed or unsigned 32-bit numbers. Column 'probability' lists the expected jump probability (range means that the actual value for a specific instruction depends on `imm32`). *Columns 'x86' and 'ARM' list the corresponding hardware instructions (following a `CMP` instruction).* - -### FPROUND -The FPROUND instruction changes the rounding mode for all subsequent FPU operations depending on a two-bit flag. The flag is calculated by rotating A `imm8` bits to the right and taking the two least-significant bits: - -``` -rounding flag = (A >>> imm8)[1:0] -``` - -|rounding flag|rounding mode| +|`fprc`|rounding mode| |-------|------------| -|00|roundTiesToEven| -|01|roundTowardNegative| -|10|roundTowardPositive| -|11|roundTowardZero| +|0|roundTiesToEven| +|1|roundTowardNegative| +|2|roundTowardPositive| +|3|roundTowardZero| The rounding modes are defined by the IEEE-754 standard. 
-*The two-bit flag value exactly corresponds to bits 13-14 of the x86 `MXCSR` register and bits 23 and 22 (reversed) of the ARM `FPSCR` register.* +## Other instructions +There are 4 special instructions that have more than one source operand or whose destination operand is a memory value. -### JUMP -If the jump condition is `true`, the JUMP instruction performs a forward jump relative to the value of `pc`. The forward offset is equal to `16 * (imm8[6:0] + 1)` bytes (1-128 instructions forward). +|frequency|instruction|dst|src|operation| +|-|-|-|-|-| +|7/256|COND_R|R|R, `imm32`|`if(condition(src, imm32)) dst = dst + 1` +|1/256|COND_M|R|mem, `imm32`|`if(condition([src], imm32)) dst = dst + 1` +|1/256|CFROUND|`fprc`|R, `imm32`|`fprc = src >>> imm32` +|16/256|ISTORE|mem|R|`[dst] = src` -### CALL -If the jump condition is `true`, the CALL instruction pushes the value of `pc` (program counter) onto the stack and then performs a forward jump relative to the value of `pc`. The forward offset is equal to `16 * (imm8[6:0] + 1)` bytes (1-128 instructions forward). +#### COND -### RET -If the stack is not empty, the RET instruction pops the return address from the stack (it's the instruction following the previous CALL) and jumps to it. +These instructions conditionally increment the destination register. The condition function depends on the `mod.cond` flag and takes the lower 32 bits of the source operand and the value `imm32`. -## Reference implementation -A portable C++ implementation of all integer and floating point instructions is available in [instructionsPortable.cpp](../src/instructionsPortable.cpp).
\ No newline at end of file +|`mod.cond`|signed|`condition`|probability|*x86*|*ARM* +|---|---|----------|-----|--|----| +|0|no|`src <= imm32`|0% - 100%|`JBE`|`BLS` +|1|no|`src > imm32`|0% - 100%|`JA`|`BHI` +|2|yes|`src - imm32 < 0`|50%|`JS`|`BMI` +|3|yes|`src - imm32 >= 0`|50%|`JNS`|`BPL` +|4|yes|`src - imm32` overflows|0% - 50%|`JO`|`BVS` +|5|yes|`src - imm32` doesn't overflow|50% - 100%|`JNO`|`BVC` +|6|yes|`src < imm32`|0% - 100%|`JL`|`BLT` +|7|yes|`src >= imm32`|0% - 100%|`JGE`|`BGE` + +The 'signed' column specifies if the operands are interpreted as signed or unsigned 32-bit numbers. Column 'probability' lists the expected probability the condition is true (range means that the actual value for a specific instruction depends on `imm32`). *Columns 'x86' and 'ARM' list the corresponding hardware instructions (following a `CMP` instruction).* + +#### CFROUND +This instruction sets the value of the `fprc` register to the 2 least significant bits of the source register rotated right by `imm32`. This changes the rounding mode of all subsequent floating point instructions. + +#### ISTORE +The `ISTORE` instruction stores the value of the source integer register to the memory at the address specified by the destination register. The `src` and `dst` register can be the same. diff --git a/doc/isa.md b/doc/isa.md index d46b16e..36a2634 100644 --- a/doc/isa.md +++ b/doc/isa.md @@ -1,182 +1,91 @@ -# RandomX instruction encoding -The instruction set was designed in such way that any random 16-byte word is a valid instruction and any sequence of valid instructions is a valid program. There are no syntax rules. -The encoding of each 128-bit instruction word is following: +# RandomX instruction set architecture +RandomX VM is a complex instruction set computer ([CISC](https://en.wikipedia.org/wiki/Complex_instruction_set_computer)). All data are loaded and stored in little-endian byte order. 
Signed integer numbers are represented using [two's complement](https://en.wikipedia.org/wiki/Two%27s_complement). Floating point numbers are represented using the [IEEE-754 double precision format](https://en.wikipedia.org/wiki/Double-precision_floating-point_format). -![Imgur](https://i.imgur.com/xi8zuAZ.png) +## Registers -## opcode -There are 256 opcodes, which are distributed between 3 groups of instructions. There are 31 distinct operations (each operation can be encoded using multiple opcodes - for example opcodes `0x00` to `0x0d` correspond to integer addition). +RandomX has 8 integer registers `r0`-`r7` (group R) and a total of 12 floating point registers split into 3 groups: `a0`-`a3` (group A), `f0`-`f3` (group F) and `e0`-`e3` (group E). Integer registers are 64 bits wide, while floating point registers are 128 bits wide and contain a pair of floating point numbers. The lower and upper half of floating point registers are not separately addressable. -**Table 1: Instruction groups** +*Table 1: Addressable register groups* -|group|# operations|# opcodes|| +|index|R|A|F|E|F+E| +|--|--|--|--|--|--| +|0|`r0`|`a0`|`f0`|`e0`|`f0`| +|1|`r1`|`a1`|`f1`|`e1`|`f1`| +|2|`r2`|`a2`|`f2`|`e2`|`f2`| +|3|`r3`|`a3`|`f3`|`e3`|`f3`| +|4|`r4`||||`e0`| +|5|`r5`||||`e1`| +|6|`r6`||||`e2`| +|7|`r7`||||`e3`| + +Besides the directly addressable registers above, there is a 2-bit `fprc` register for rounding control, which is an implicit destination register of the `CFROUND` instruction, and two architectural 32-bit registers `ma` and `mx`, which are not accessible to any instruction. + +Integer registers `r0`-`r7` can be the source or the destination operands of integer instructions or may be used as address registers for loading the source operand from the memory (scratchpad). + +Floating point registers `a0`-`a3` are read-only and may not be written to except at the moment a program is loaded into the VM. They can be the source operand of any floating point instruction. 
The value of these registers is restricted to the interval `[1, 4294967296)`. + +Floating point registers `f0`-`f3` are the *additive* registers, which can be the destination of floating point addition and subtraction instructions. The absolute value of these registers will not exceed `1.0e+12`. + +Floating point registers `e0`-`e3` are the *multiplicative* registers, which can be the destination of floating point multiplication, division and square root instructions. Their value is always positive. + +## Instruction encoding + +Each instruction word is 64 bits long and has the following format: + +![Imgur](https://i.imgur.com/FtkWRwe.png) + +### opcode +There are 256 opcodes, which are distributed between 35 distinct instructions. Each instruction can be encoded using multiple opcodes (the number of opcodes specifies the frequency of the instruction in a random program). + +*Table 2: Instruction groups* + +|group|# instructions|# opcodes|| |---------|-----------------|----|-| -|integer (IA)|22|144|56.3%| -|floating point (FP)|5|76|29.7%| -|control (CL)|4|36|14.0% -||**31**|**256**|**100%** +|integer |20|143|55.9%| +|floating point |11|88|34.4%| +|other |4|25|9.7%| +||**35**|**256**|**100%** Full description of all instructions: [isa-ops.md](isa-ops.md). -## A.LOC -**Table 2: `A.LOC` encoding** +### dst +Destination register. Only bits 0-1 (register groups A, F, E) or 0-2 (groups R, F+E) are used to encode a register according to Table 1. -|bits|description| +### src + +The `src` flag encodes a source operand register according to Table 1 (only bits 0-1 or 0-2 are used). + +Immediate value `imm32` is used as the source operand in cases when `dst` and `src` encode the same register. + +For register-memory instructions, the source operand determines the `address_base` value for calculating the memory address (see below). 
+ +### mod + +The `mod` flag is encoded as: + +*Table 3: mod flag encoding* + +|`mod`|description| |----|--------| -|0-1|`A.LOC.W` flag| -|2-5|Reserved| -|6-7|`A.LOC.X` flag| +|0-1|`mod.mem` flag| +|2-4|`mod.cond` flag| +|5-7|Reserved| -The `A.LOC.W` flag determines the address width when reading operand A from the scratchpad: +The `mod.mem` flag determines the address mask when reading from or writing to memory: -**Table 3: Operand A read address width** +*Table 4: memory address mask* -|`A.LOC.W`|address width (W)| -|---------|-| -|0|15 bits (256 KiB)| -|1-3|11 bits (16 KiB)| +|`mod.mem`|`address_mask`|(scratchpad level)| +|---------|-|---| +|0|262136|(L2)| +|1-3|16376|(L1)| -If the `A.LOC.W` flag is zero, the address space covers the whole 256 KiB scratchpad. Otherwise, just the first 16 KiB of the scratchpad are addressed. +Table 4 applies to all memory accesses except for cases when the source operand is an immediate value. In that case, `address_mask` is equal to 2097144 (L3). -If the `A.LOC.X` flag is zero, the instruction mixes the scratchpad read address into the `mx` register using XOR. This mixing happens before the address is truncated to W bits (see pseudocode below). +The address for reading/writing is calculated by applying bitwise AND operation to `address_base` and `address_mask`. -## A.REG -**Table 4: `A.REG` encoding** +The `mod.cond` flag is used only by the `COND` instruction to select a condition to be tested. -|bits|description| -|----|--------| -|0-2|`A.REG.R` flag| -|3-7|Reserved| - -The `A.REG.R` flag encodes "readAddressRegister", which is an integer register `r0`-`r7` to be used for scratchpad read address generation.
Read address is generated as follows (pseudocode): - -```python -readAddressRegister = IntegerRegister(A.REG.R) -readAddressRegister = readAddressRegister XOR SignExtend(A.mask32) -readAddress = readAddressRegister[31:0] -# dataset is read if the ic register is divisible by 64 -IF ic mod 64 == 0: - DatasetRead(readAddress) -# optional mixing into the mx register -IF A.LOC.X == 0: - mx = mx XOR readAddress -# truncate to W bits -W = GetAddressWidth(A.LOC.W) -readAddress = readAddress[W-1:0] -``` - -Note that the value of the read address register is modified during address generation. - -## B.LOC -**Table 5: `B.LOC` encoding** - -|bits|description| -|----|--------| -|0-1|`B.LOC.L` flag| -|0-2|`B.LOC.C` flag| -|3-7|Reserved| - -The `B.LOC.L` flag determines the B operand. It can be either a register or immediate value. - -**Table 6: Operand B** - -|`B.LOC.L`|IA/DIV|IA/SHIFT|IA/MATH|FP|CL| -|----|--------|----|------|----|---| -|0|register|`imm8`|`imm32`|register|register| -|1|`imm32`|register|register|register|register| -|2|`imm32`|`imm8`|register|register|register| -|3|`imm32`|register|register|register|register| - -Integer instructions are split into 3 classes: integer division (IA/DIV), shift and rotate (IA/SHIFT) and other (IA/MATH). Floating point (FP) and control (CL) instructions always use a register operand. - -Register to be used as operand B is encoded in the `B.REG.R` flag (see below). - -The `B.LOC.C` flag determines the condition for the JUMP and CALL instructions. The flag partially overlaps with the `B.LOC.L` flag. 
- -## B.REG -**Table 7: `B.REG` encoding** - -|bits|description| -|----|--------| -|0-2|`B.REG.R` flag| -|3-7|Reserved| - -Register encoded by the `B.REG.R` depends on the instruction group: - -**Table 8: Register operands by group** - -|group|registers| -|----|--------| -|IA|`r0`-`r7`| -|FP|`f0`-`f7`| -|CL|`r0`-`r7`| - -## C.LOC -**Table 9: `C.LOC` encoding** - -|bits|description| -|----|--------| -|0-1|`C.LOC.W` flag| -|2|`C.LOC.R` flag| -|3-6|Reserved| -|7|`C.LOC.H` flag| - -The `C.LOC.W` flag determines the address width when writing operand C to the scratchpad: - -**Table 10: Operand C write address width** - -|`C.LOC.W`|address width (W)| -|---------|-| -|0|15 bits (256 KiB)| -|1-3|11 bits (16 KiB)| - -If the `C.LOC.W` flag is zero, the address space covers the whole 256 KiB scratchpad. Otherwise, just the first 16 KiB of the scratchpad are addressed. - -The `C.LOC.R` determines the destination where operand C is written: - -**Table 11: Operand C destination** - -|`C.LOC.R`|groups IA, CL|group FP -|---------|-|-| -|0|scratchpad|register -|1|register|register + scratchpad - -Integer and control instructions (groups IA and CL) write either to the scratchpad or to a register. Floating point instructions always write to a register and can also write to the scratchpad. In that case, flag `C.LOC.H` determines if the low or high half of the register is written: - -**Table 12: Floating point register write** - -|`C.LOC.H`|write bits| -|---------|----------| -|0|0-63| -|1|64-127| - -## C.REG -**Table 13: `C.REG` encoding** - -|bits|description| -|----|--------| -|0-2|`C.REG.R` flag| -|3-7|Reserved| - -The destination register encoded in the `C.REG.R` flag encodes both the write address register (if writing to the scratchpad) and the destination register (if writing to a register). The destination register depends on the instruction group (see Table 8). 
Write address is always generated from an integer register: - -```python -writeAddressRegister = IntegerRegister(C.REG.R) -writeAddress = writeAddressRegister[31:0] XOR C.mask32 -# truncate to W bits -W = GetAddressWidth(C.LOC.W) -writeAddress = writeAddress [W-1:0] -``` - -## imm8 -`imm8` is an 8-bit immediate value that is used as the B operand by IA/SHIFT instructions (see Table 6). Additionally, it's used by some control instructions. - -## A.mask32 -`A.mask32` is a 32-bit address mask that is used to calculate the read address for the A operand. It's sign-extended to 64 bits before use. - -## imm32 -`imm32` is a 32-bit immediate value which is used for integer instructions from groups IA/DIV and IA/OTHER (see Table 6). The immediate value is sign-extended for instructions that expect 64-bit operands. - -## C.mask32 -`C.mask32` is a 32-bit address mask that is used to calculate the write address for the C operand. `C.mask32` is equal to `imm32`. +### imm32 +A 32-bit immediate value that can be used as the source operand. The immediate value is sign-extended to 64 bits in most cases. diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index e2eaf44..a2d1b32 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -19,12 +19,12 @@ along with RandomX. If not, see. 
//#define TRACE #define MAGIC_DIVISION #include "AssemblyGeneratorX86.hpp" -#include "Pcg32.hpp" #include "common.hpp" #include "instructions.hpp" #ifdef MAGIC_DIVISION #include "divideByConstantCodegen.h" #endif +#include "Program.hpp" namespace RandomX { @@ -48,17 +48,10 @@ namespace RandomX { static const char* regDatasetAddr = "rdi"; static const char* regScratchpadAddr = "rsi"; - void AssemblyGeneratorX86::generateProgram(const void* seed) { + void AssemblyGeneratorX86::generateProgram(Program& prog) { asmCode.str(std::string()); //clear - Pcg32 gen(seed); - for (unsigned i = 0; i < sizeof(RegisterFile) / sizeof(Pcg32::result_type); ++i) { - gen(); - } - Instruction instr; for (unsigned i = 0; i < ProgramLength; ++i) { - for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) { - *(((uint32_t*)&instr) + j) = gen(); - } + Instruction& instr = prog(i); instr.src %= RegistersCount; instr.dst %= RegistersCount; generateCode(instr, i); @@ -83,7 +76,7 @@ namespace RandomX { } int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) { - return instr.imm32 & ScratchpadL3Mask; + return (int32_t)instr.imm32 & ScratchpadL3Mask; } //1 uOP @@ -92,7 +85,7 @@ namespace RandomX { asmCode << "\tadd " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; } else { - asmCode << "\tadd " << regR[instr.dst] << ", " << instr.imm32 << std::endl; + asmCode << "\tadd " << regR[instr.dst] << ", " << (int32_t)instr.imm32 << std::endl; } } @@ -109,7 +102,7 @@ namespace RandomX { //1 uOP void AssemblyGeneratorX86::h_IADD_RC(Instruction& instr, int i) { - asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << std::showpos << instr.imm32 << std::noshowpos << "]" << std::endl; + asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << std::showpos << (int32_t)instr.imm32 << std::noshowpos << "]" << std::endl; } //1 uOP @@ -118,7 +111,7 @@ namespace RandomX { asmCode << "\tsub " << 
regR[instr.dst] << ", " << regR[instr.src] << std::endl; } else { - asmCode << "\tsub " << regR[instr.dst] << ", " << instr.imm32 << std::endl; + asmCode << "\tsub " << regR[instr.dst] << ", " << (int32_t)instr.imm32 << std::endl; } } @@ -135,7 +128,7 @@ namespace RandomX { //1 uOP void AssemblyGeneratorX86::h_IMUL_9C(Instruction& instr, int i) { - asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.dst] << "*8" << std::showpos << instr.imm32 << std::noshowpos << "]" << std::endl; + asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.dst] << "*8" << std::showpos << (int32_t)instr.imm32 << std::noshowpos << "]" << std::endl; } //1 uOP @@ -144,7 +137,7 @@ namespace RandomX { asmCode << "\timul " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; } else { - asmCode << "\timul " << regR[instr.dst] << ", " << instr.imm32 << std::endl; + asmCode << "\timul " << regR[instr.dst] << ", " << (int32_t)instr.imm32 << std::endl; } } @@ -161,16 +154,9 @@ namespace RandomX { //4 uOPs void AssemblyGeneratorX86::h_IMULH_R(Instruction& instr, int i) { - if (instr.src != instr.dst) { - asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; - asmCode << "\tmul " << regR[instr.src] << std::endl; - asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; - } - else { - asmCode << "\tmov eax, " << instr.imm32 << std::endl; - asmCode << "\tmul " << regR[instr.dst] << std::endl; - asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl; - } + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\tmul " << regR[instr.src] << std::endl; + asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; } //5.75 uOPs @@ -189,16 +175,9 @@ namespace RandomX { //4 uOPs void AssemblyGeneratorX86::h_ISMULH_R(Instruction& instr, int i) { - if (instr.src != instr.dst) { - asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; - asmCode << "\timul " << regR[instr.src] << std::endl; - 
asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; - } - else { - asmCode << "\tmov rax, " << instr.imm32 << std::endl; - asmCode << "\timul " << regR[instr.dst] << std::endl; - asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl; - } + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\timul " << regR[instr.src] << std::endl; + asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; } //5.75 uOPs @@ -226,7 +205,7 @@ namespace RandomX { asmCode << "\txor " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; } else { - asmCode << "\txor " << regR[instr.dst] << ", " << instr.imm32 << std::endl; + asmCode << "\txor " << regR[instr.dst] << ", " << (int32_t)instr.imm32 << std::endl; } } @@ -300,7 +279,7 @@ namespace RandomX { //~8.5 uOPs void AssemblyGeneratorX86::h_ISDIV_C(Instruction& instr, int i) { - int64_t divisor = instr.imm32; + int64_t divisor = (int32_t)instr.imm32; if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; // +/- power of two @@ -395,9 +374,9 @@ namespace RandomX { } //1 uOP - void AssemblyGeneratorX86::h_CFSUM_R(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_FNEG_R(Instruction& instr, int i) { instr.dst %= 4; - asmCode << "\t" << fsumInstr[instr.mod % 4] << " " << signMask << ", " << regF[instr.dst] << std::endl; + asmCode << "\txorps " << regF[instr.dst] << ", " << signMask << std::endl; } //1 uOPs @@ -478,7 +457,7 @@ namespace RandomX { //4 uOPs void AssemblyGeneratorX86::h_COND_R(Instruction& instr, int i) { asmCode << "\txor ecx, ecx" << std::endl; - asmCode << "\tcmp " << regR32[instr.src] << ", " << instr.imm32 << std::endl; + asmCode << "\tcmp " << regR32[instr.src] << ", " << (int32_t)instr.imm32 << std::endl; asmCode << "\tset" << condition(instr) << " cl" << std::endl; asmCode << "\tadd " << regR[instr.dst] << ", rcx" << std::endl; } @@ -487,7 +466,7 @@ namespace RandomX { void 
AssemblyGeneratorX86::h_COND_M(Instruction& instr, int i) { asmCode << "\txor ecx, ecx" << std::endl; genAddressReg(instr); - asmCode << "\tcmp dword ptr [rsi+rax], " << instr.imm32 << std::endl; + asmCode << "\tcmp dword ptr [rsi+rax], " << (int32_t)instr.imm32 << std::endl; asmCode << "\tset" << condition(instr) << " cl" << std::endl; asmCode << "\tadd " << regR[instr.dst] << ", rcx" << std::endl; } @@ -542,7 +521,7 @@ namespace RandomX { INST_HANDLE(FADD_M) INST_HANDLE(FSUB_R) INST_HANDLE(FSUB_M) - INST_HANDLE(CFSUM_R) + INST_HANDLE(FNEG_R) //Floating point group E INST_HANDLE(FMUL_R) diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 5abf707..0c1844e 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -24,13 +24,14 @@ along with RandomX. If not, see. namespace RandomX { + class Program; class AssemblyGeneratorX86; typedef void(AssemblyGeneratorX86::*InstructionGenerator)(Instruction&, int); class AssemblyGeneratorX86 { public: - void generateProgram(const void* seed); + void generateProgram(Program&); void printCode(std::ostream& os) { os << asmCode.rdbuf(); } @@ -69,7 +70,7 @@ namespace RandomX { void h_FADD_M(Instruction&, int); void h_FSUB_R(Instruction&, int); void h_FSUB_M(Instruction&, int); - void h_CFSUM_R(Instruction&, int); + void h_FNEG_R(Instruction&, int); void h_FMUL_R(Instruction&, int); void h_FMUL_M(Instruction&, int); void h_FDIV_R(Instruction&, int); diff --git a/src/Cache.cpp b/src/Cache.cpp index bb1758f..85d481e 100644 --- a/src/Cache.cpp +++ b/src/Cache.cpp @@ -23,7 +23,6 @@ along with RandomX. If not, see. #include "Cache.hpp" #include "softAes.h" #include "argon2.h" -#include "Pcg32.hpp" #include "argon2_core.h" namespace RandomX { diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index 3bf3371..b3b5db8 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -18,19 +18,12 @@ along with RandomX. If not, see. 
*/ #include "CompiledVirtualMachine.hpp" -#include "Pcg32.hpp" #include "common.hpp" #include "instructions.hpp" #include namespace RandomX { - constexpr int mantissaSize = 52; - constexpr int exponentSize = 11; - constexpr uint64_t mantissaMask = (1ULL << mantissaSize) - 1; - constexpr uint64_t exponentMask = (1ULL << exponentSize) - 1; - constexpr int exponentBias = 1023; - CompiledVirtualMachine::CompiledVirtualMachine() { totalSize = 0; } @@ -39,40 +32,9 @@ namespace RandomX { mem.ds = ds; } - void CompiledVirtualMachine::initializeScratchpad(uint8_t* scratchpad, int32_t index) { - memcpy(scratchpad, mem.ds.dataset + ScratchpadSize * index, ScratchpadSize); - } - - static uint64_t getSmallPositiveFloatBits(uint64_t entropy) { - auto exponent = entropy >> 59; //0..31 - auto mantissa = entropy & mantissaMask; - exponent += exponentBias; - exponent &= exponentMask; - exponent <<= mantissaSize; - return exponent | mantissa; - } - - void CompiledVirtualMachine::initializeProgram(const void* seed) { - Pcg32 gen(seed); - for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) { - *(((uint32_t*)®) + i) = gen(); - } - initFpu(); - /*for (int i = 0; i < RegistersCount / 2; ++i) { - reg.f[i].lo.f64 = (double)reg.f[i].lo.i64; - reg.f[i].hi.f64 = (double)reg.f[i].hi.i64; - } - for (int i = 0; i < RegistersCount / 2; ++i) { - reg.g[i].lo.f64 = std::abs((double)reg.g[i].lo.i64); - reg.g[i].hi.f64 = std::abs((double)reg.g[i].hi.i64); - }*/ - for (int i = 0; i < RegistersCount / 2; ++i) { - reg.a[i].lo.u64 = getSmallPositiveFloatBits(reg.f[i].lo.u64); - reg.a[i].hi.u64 = getSmallPositiveFloatBits(reg.f[i].hi.u64); - } - compiler.generateProgram(gen); - mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & -64; - mem.mx = *(((uint32_t*)seed) + 5); + void CompiledVirtualMachine::initialize() { + VirtualMachine::initialize(); + compiler.generateProgram(program); } void CompiledVirtualMachine::execute() { diff --git a/src/CompiledVirtualMachine.hpp 
b/src/CompiledVirtualMachine.hpp index f969732..e3b6bf0 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -39,8 +39,7 @@ namespace RandomX { } CompiledVirtualMachine(); void setDataset(dataset_t ds) override; - void initializeScratchpad(uint8_t* scratchpad, int32_t index) override; - void initializeProgram(const void* seed) override; + void initialize() override; virtual void execute() override; void* getProgram() { return compiler.getCode(); diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 5784c99..35cc737 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -116,12 +116,7 @@ namespace RandomX { } void Instruction::h_IMULH_R(std::ostream& os) const { - if (src != dst) { - os << "r" << (int)dst << ", r" << (int)src << std::endl; - } - else { - os << "r" << (int)dst << ", " << imm32 << std::endl; - } + os << "r" << (int)dst << ", r" << (int)src << std::endl; } void Instruction::h_IMULH_M(std::ostream& os) const { @@ -138,12 +133,7 @@ namespace RandomX { } void Instruction::h_ISMULH_R(std::ostream& os) const { - if (src != dst) { - os << "r" << (int)dst << ", r" << (int)src << std::endl; - } - else { - os << "r" << (int)dst << ", " << imm32 << std::endl; - } + os << "r" << (int)dst << ", r" << (int)src << std::endl; } void Instruction::h_ISMULH_M(std::ostream& os) const { @@ -247,9 +237,9 @@ namespace RandomX { os << std::endl; } - void Instruction::h_CFSUM_R(std::ostream& os) const { + void Instruction::h_FNEG_R(std::ostream& os) const { auto dstIndex = dst % 4; - os << "f" << dstIndex << ", " << (1 << ((mod % 4) + 3)) << std::endl; + os << "f" << dstIndex << std::endl; } void Instruction::h_FMUL_R(std::ostream& os) const { @@ -370,7 +360,7 @@ namespace RandomX { INST_NAME(FADD_M) INST_NAME(FSUB_R) INST_NAME(FSUB_M) - INST_NAME(CFSUM_R) + INST_NAME(FNEG_R) //Floating point group E INST_NAME(FMUL_R) @@ -421,7 +411,7 @@ namespace RandomX { INST_HANDLE(FADD_M) INST_HANDLE(FSUB_R) INST_HANDLE(FSUB_M) - 
INST_HANDLE(CFSUM_R) + INST_HANDLE(FNEG_R) //Floating point group E INST_HANDLE(FMUL_R) diff --git a/src/Instruction.hpp b/src/Instruction.hpp index 4f9e178..5cfd833 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -54,7 +54,7 @@ namespace RandomX { constexpr int FADD_M = 22; constexpr int FSUB_R = 23; constexpr int FSUB_M = 24; - constexpr int CFSUM_R = 25; + constexpr int FNEG_R = 25; constexpr int FMUL_R = 26; constexpr int FMUL_M = 27; constexpr int FDIV_R = 28; @@ -74,7 +74,7 @@ namespace RandomX { uint8_t dst; uint8_t src; uint8_t mod; - int32_t imm32; + uint32_t imm32; const char* getName() const { return names[opcode]; } @@ -116,7 +116,7 @@ namespace RandomX { void h_FADD_M(std::ostream&) const; void h_FSUB_R(std::ostream&) const; void h_FSUB_M(std::ostream&) const; - void h_CFSUM_R(std::ostream&) const; + void h_FNEG_R(std::ostream&) const; void h_FMUL_R(std::ostream&) const; void h_FMUL_M(std::ostream&) const; void h_FDIV_R(std::ostream&) const; diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index af01183..9e0d5e2 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -19,7 +19,6 @@ along with RandomX. If not, see. //#define TRACE //#define FPUCHECK #include "InterpretedVirtualMachine.hpp" -#include "Pcg32.hpp" #include "instructions.hpp" #include "dataset.hpp" #include "Cache.hpp" @@ -34,6 +33,7 @@ along with RandomX. If not, see. 
#ifdef STATS #include #endif +#include "divideByConstantCodegen.h" #ifdef FPUCHECK constexpr bool fpuCheck = true; @@ -61,88 +61,683 @@ namespace RandomX { } else { mem.ds = ds; - if (softAes) { - readDataset = &datasetReadLight; - } - else { - readDataset = &datasetReadLight; - } + readDataset = &datasetReadLight; } } - void InterpretedVirtualMachine::initializeScratchpad(uint8_t* scratchpad, int32_t index) { - uint32_t startingBlock = (ScratchpadSize / CacheLineSize) * index; - if (asyncWorker) { - ILightClientAsyncWorker* worker = mem.ds.asyncWorker; - const uint32_t blocksPerThread = (ScratchpadSize / CacheLineSize) / 2; - worker->prepareBlocks(scratchpad, startingBlock, blocksPerThread); //async first half - worker->getBlocks(scratchpad + ScratchpadLength / 2, startingBlock + blocksPerThread, blocksPerThread); //sync second half - worker->sync(); - } - else { - auto cache = mem.ds.cache; - if (softAes) { - for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { - initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); - } - } - else { - for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { - initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); - } - } + void InterpretedVirtualMachine::initialize() { + VirtualMachine::initialize(); + for (unsigned i = 0; i < ProgramLength; ++i) { + program(i).src %= RegistersCount; + program(i).dst %= RegistersCount; } } - void InterpretedVirtualMachine::initializeProgram(const void* seed) { - Pcg32 gen(seed); - for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) { - *(((uint32_t*)®) + i) = gen(); + template + void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + executeBytecode(N, r, f, e, a); + executeBytecode(r, f, e, a); + } + + template<> + void 
InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + } + + FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + auto& ibc = byteCode[i]; + switch (ibc.type) + { + case InstructionType::IADD_R: { + *ibc.idst += *ibc.isrc; + } break; + + case InstructionType::IADD_M: { + *ibc.idst += load64(scratchpad + (*ibc.isrc & ibc.memMask)); + } break; + + case InstructionType::IADD_RC: { + *ibc.idst += *ibc.isrc + ibc.imm; + } break; + + case InstructionType::ISUB_R: { + *ibc.idst -= *ibc.isrc; + } break; + + case InstructionType::ISUB_M: { + *ibc.idst -= load64(scratchpad + (*ibc.isrc & ibc.memMask)); + } break; + + case InstructionType::IMUL_9C: { + *ibc.idst += 9 * *ibc.idst + ibc.imm; + } break; + + case InstructionType::IMUL_R: { + *ibc.idst *= *ibc.isrc; + } break; + + case InstructionType::IMUL_M: { + *ibc.idst *= load64(scratchpad + (*ibc.isrc & ibc.memMask)); + } break; + + case InstructionType::IMULH_R: { + *ibc.idst = mulh(*ibc.idst, *ibc.isrc); + } break; + + case InstructionType::IMULH_M: { + *ibc.idst = mulh(*ibc.idst, load64(scratchpad + (*ibc.isrc & ibc.memMask))); + } break; + + case InstructionType::ISMULH_R: { + *ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(*ibc.isrc)); + } break; + + case InstructionType::ISMULH_M: { + *ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(load64(scratchpad + (*ibc.isrc & ibc.memMask)))); + } break; + + case InstructionType::IDIV_C: { + if (ibc.signedMultiplier != 0) { + int_reg_t dividend = *ibc.idst; + int_reg_t quotient = dividend >> ibc.preShift; + if (ibc.increment) { + quotient = quotient == UINT64_MAX ? 
UINT64_MAX : quotient + 1; + } + quotient = mulh(quotient, ibc.signedMultiplier); + quotient >>= ibc.postShift; + *ibc.idst += quotient; + } + else { + *ibc.idst += *ibc.idst >> ibc.shift; + } + } break; + + case InstructionType::ISDIV_C: { + + } break; + + case InstructionType::INEG_R: { + *ibc.idst = ~(*ibc.idst) + 1; //two's complement negative + } break; + + case InstructionType::IXOR_R: { + *ibc.idst ^= *ibc.isrc; + } break; + + case InstructionType::IXOR_M: { + *ibc.idst ^= load64(scratchpad + (*ibc.isrc & ibc.memMask)); + } break; + + case InstructionType::IROR_R: { + *ibc.idst = rotr(*ibc.idst, *ibc.isrc & 63); + } break; + + case InstructionType::IROL_R: { + *ibc.idst = rotl(*ibc.idst, *ibc.isrc & 63); + } break; + + case InstructionType::ISWAP_R: { + int_reg_t temp = *ibc.isrc; + *ibc.isrc = *ibc.idst; + *ibc.idst = temp; + } break; + + case InstructionType::FSWAP_R: { + *ibc.fdst = _mm_shuffle_pd(*ibc.fdst, *ibc.fdst, 1); + } break; + + case InstructionType::FADD_R: { + *ibc.fdst = _mm_add_pd(*ibc.fdst, *ibc.fsrc); + } break; + + case InstructionType::FADD_M: { + __m128d fsrc = load_cvt_i32x2(scratchpad + (*ibc.isrc & ibc.memMask)); + *ibc.fdst = _mm_add_pd(*ibc.fdst, fsrc); + } break; + + case InstructionType::FSUB_R: { + *ibc.fdst = _mm_sub_pd(*ibc.fdst, *ibc.fsrc); + } break; + + case InstructionType::FSUB_M: { + __m128d fsrc = load_cvt_i32x2(scratchpad + (*ibc.isrc & ibc.memMask)); + *ibc.fdst = _mm_sub_pd(*ibc.fdst, fsrc); + } break; + + case InstructionType::FNEG_R: { + const __m128d signMask = _mm_castsi128_pd(_mm_set1_epi64x(1ULL << 63)); + *ibc.fdst = _mm_xor_pd(*ibc.fdst, signMask); + } break; + + case InstructionType::FMUL_R: { + *ibc.fdst = _mm_mul_pd(*ibc.fdst, *ibc.fsrc); + } break; + + case InstructionType::FDIV_M: { + __m128d fsrc = load_cvt_i32x2(scratchpad + (*ibc.isrc & ibc.memMask)); + __m128d fdst = _mm_div_pd(*ibc.fdst, fsrc); + *ibc.fdst = _mm_max_pd(fdst, _mm_set_pd(DBL_MIN, DBL_MIN)); + } break; + + case InstructionType::FSQRT_R: 
{ + *ibc.fdst = _mm_sqrt_pd(*ibc.fdst); + } break; + + case InstructionType::COND_R: { + *ibc.idst += condition(*ibc.isrc, ibc.imm, ibc.condition) ? 1 : 0; + } break; + + case InstructionType::COND_M: { + *ibc.idst += condition(load64(scratchpad + (*ibc.isrc & ibc.memMask)), ibc.imm, ibc.condition) ? 1 : 0; + } break; + + case InstructionType::CFROUND: { + setRoundMode(rotr(*ibc.isrc, ibc.imm) % 4); + } break; + + case InstructionType::ISTORE: { + store64(scratchpad + (*ibc.idst & ibc.memMask), *ibc.isrc); + } break; + + case InstructionType::NOP: { + //nothing + } break; + + default: + UNREACHABLE; } - initFpu(); - for (int i = 0; i < RegistersCount; ++i) { - reg.f[i].lo.f64 = (double)reg.f[i].lo.i64; - reg.f[i].hi.f64 = (double)reg.f[i].hi.i64; - } - //std::cout << reg; - p.initialize(gen); - currentTransform = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; - mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7; - mem.mx = *(((uint32_t*)seed) + 5); - pc = 0; - ic = InstructionCount; - stack.clear(); } void InterpretedVirtualMachine::execute() { - for(int i = 0; i < InstructionCount; ++i) { - for (int j = 0; j < ProgramLength; ++j) { - auto& ibc = byteCode[j]; - switch (ibc.type) - { - case InstructionType::CFROUND: { - uint64_t rcFlag = rotr(ibc.isrc->u64, ibc.imm.i32); - setRoundMode(rcFlag); - } - break; - } + int_reg_t r[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + __m128d f[4]; + __m128d e[4]; + __m128d a[4]; + + a[0] = _mm_load_pd(®.a[0].lo); + a[1] = _mm_load_pd(®.a[1].lo); + a[2] = _mm_load_pd(®.a[2].lo); + a[3] = _mm_load_pd(®.a[3].lo); + + precompileProgram(r, f, e, a); + + uint32_t spAddr0 = mem.mx; + uint32_t spAddr1 = mem.ma; + + for(int iter = 0; iter < InstructionCount; ++iter) { + //std::cout << "Iteration " << iter << std::endl; + spAddr0 ^= r[readReg0]; + spAddr0 &= ScratchpadL3Mask64; + + r[0] ^= load64(scratchpad + spAddr0 + 0); + r[1] ^= load64(scratchpad + spAddr0 + 8); + r[2] ^= load64(scratchpad + spAddr0 + 16); + r[3] ^= 
load64(scratchpad + spAddr0 + 24); + r[4] ^= load64(scratchpad + spAddr0 + 32); + r[5] ^= load64(scratchpad + spAddr0 + 40); + r[6] ^= load64(scratchpad + spAddr0 + 48); + r[7] ^= load64(scratchpad + spAddr0 + 56); + + spAddr1 ^= r[readReg1]; + spAddr1 &= ScratchpadL3Mask64; + + f[0] = load_cvt_i32x2(scratchpad + spAddr1 + 0); + f[1] = load_cvt_i32x2(scratchpad + spAddr1 + 8); + f[2] = load_cvt_i32x2(scratchpad + spAddr1 + 16); + f[3] = load_cvt_i32x2(scratchpad + spAddr1 + 24); + e[0] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 32)); + e[1] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 40)); + e[2] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 48)); + e[3] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 56)); + + executeBytecode<0>(r, f, e, a); + + if (asyncWorker) { + ILightClientAsyncWorker* aw = mem.ds.asyncWorker; + const uint64_t* datasetLine = aw->getBlock(mem.ma); + for (int i = 0; i < RegistersCount; ++i) + r[i] ^= datasetLine[i]; + mem.mx ^= r[readReg2] ^ r[readReg3]; + mem.mx &= CacheLineAlignMask; //align to cache line + std::swap(mem.mx, mem.ma); + aw->prepareBlock(mem.ma); } + else { + mem.mx ^= r[readReg2] ^ r[readReg3]; + mem.mx &= CacheLineAlignMask; + Cache* cache = mem.ds.cache; + uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; + initBlock(cache->getCache(), (uint8_t*)datasetLine, mem.ma / CacheLineSize, cache->getKeys()); + for (int i = 0; i < RegistersCount; ++i) + r[i] ^= datasetLine[i]; + std::swap(mem.mx, mem.ma); + } + + store64(scratchpad + spAddr1 + 0, r[0]); + store64(scratchpad + spAddr1 + 8, r[1]); + store64(scratchpad + spAddr1 + 16, r[2]); + store64(scratchpad + spAddr1 + 24, r[3]); + store64(scratchpad + spAddr1 + 32, r[4]); + store64(scratchpad + spAddr1 + 40, r[5]); + store64(scratchpad + spAddr1 + 48, r[6]); + store64(scratchpad + spAddr1 + 56, r[7]); + + _mm_store_pd((double*)(scratchpad + spAddr0 + 0), _mm_mul_pd(f[0], e[0])); + _mm_store_pd((double*)(scratchpad + spAddr0 + 16), _mm_mul_pd(f[1], e[1])); + 
_mm_store_pd((double*)(scratchpad + spAddr0 + 32), _mm_mul_pd(f[2], e[2])); + _mm_store_pd((double*)(scratchpad + spAddr0 + 48), _mm_mul_pd(f[3], e[3])); + + spAddr0 = 0; + spAddr1 = 0; } + store64(®.r[0], r[0]); + store64(®.r[1], r[1]); + store64(®.r[2], r[2]); + store64(®.r[3], r[3]); + store64(®.r[4], r[4]); + store64(®.r[5], r[5]); + store64(®.r[6], r[6]); + store64(®.r[7], r[7]); + + _mm_store_pd(®.f[0].lo, f[0]); + _mm_store_pd(®.f[1].lo, f[1]); + _mm_store_pd(®.f[2].lo, f[2]); + _mm_store_pd(®.f[3].lo, f[3]); + _mm_store_pd(®.e[0].lo, e[0]); + _mm_store_pd(®.e[1].lo, e[1]); + _mm_store_pd(®.e[2].lo, e[2]); + _mm_store_pd(®.e[3].lo, e[3]); } #include "instructionWeights.hpp" - void InterpretedVirtualMachine::executeInstruction(Instruction& instr) { - switch (instr.opcode) - { - CASE_REP(IADD_R) + void InterpretedVirtualMachine::precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + for (unsigned i = 0; i < ProgramLength; ++i) { + auto& instr = program(i); + auto& ibc = byteCode[i]; + switch (instr.opcode) { + CASE_REP(IADD_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IADD_R; + ibc.idst = &r[dst]; + if (src != dst) { + ibc.isrc = &r[src]; + } + else { + ibc.imm = signExtend2sCompl(instr.imm32); + ibc.isrc = &ibc.imm; + } + } break; - break; + CASE_REP(IADD_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IADD_M; + ibc.idst = &r[dst]; + if (instr.src != instr.dst) { + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? 
ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + ibc.memMask = ScratchpadL3Mask; + } + } break; + + CASE_REP(IADD_RC) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IADD_RC; + ibc.idst = &r[dst]; + ibc.isrc = &r[src]; + ibc.imm = signExtend2sCompl(instr.imm32); + } break; + + CASE_REP(ISUB_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISUB_R; + ibc.idst = &r[dst]; + if (src != dst) { + ibc.isrc = &r[src]; + } + else { + ibc.imm = signExtend2sCompl(instr.imm32); + ibc.isrc = &ibc.imm; + } + } break; + + CASE_REP(ISUB_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISUB_M; + ibc.idst = &r[dst]; + if (instr.src != instr.dst) { + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + ibc.memMask = ScratchpadL3Mask; + } + } break; + + CASE_REP(IMUL_9C) { + auto dst = instr.dst % RegistersCount; + ibc.type = InstructionType::IMUL_9C; + ibc.idst = &r[dst]; + ibc.imm = signExtend2sCompl(instr.imm32); + } break; + + CASE_REP(IMUL_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMUL_R; + ibc.idst = &r[dst]; + if (src != dst) { + ibc.isrc = &r[src]; + } + else { + ibc.imm = signExtend2sCompl(instr.imm32); + ibc.isrc = &ibc.imm; + } + } break; + + CASE_REP(IMUL_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMUL_M; + ibc.idst = &r[dst]; + if (instr.src != instr.dst) { + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? 
ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + ibc.memMask = ScratchpadL3Mask; + } + } break; + + CASE_REP(IMULH_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMULH_R; + ibc.idst = &r[dst]; + ibc.isrc = &r[src]; + } break; + + CASE_REP(IMULH_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMULH_M; + ibc.idst = &r[dst]; + if (instr.src != instr.dst) { + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + ibc.memMask = ScratchpadL3Mask; + } + } break; + + CASE_REP(ISMULH_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISMULH_R; + ibc.idst = &r[dst]; + ibc.isrc = &r[src]; + } break; + + CASE_REP(ISMULH_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISMULH_M; + ibc.idst = &r[dst]; + if (instr.src != instr.dst) { + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? 
ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + ibc.memMask = ScratchpadL3Mask; + } + } break; + + CASE_REP(IDIV_C) { + uint32_t divisor = instr.imm32; + if (divisor != 0) { + auto dst = instr.dst % RegistersCount; + ibc.type = InstructionType::IDIV_C; + ibc.idst = &r[dst]; + if (divisor & (divisor - 1)) { + magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8); + ibc.signedMultiplier = mi.multiplier; + ibc.preShift = mi.pre_shift; + ibc.postShift = mi.post_shift; + ibc.increment = mi.increment; + } + else { + ibc.signedMultiplier = 0; + int shift = 0; + while (divisor >>= 1) + ++shift; + ibc.shift = shift; + } + } + else { + ibc.type = InstructionType::NOP; + } + } break; + + CASE_REP(ISDIV_C) { + ibc.type = InstructionType::NOP; + } break; + + CASE_REP(INEG_R) { + auto dst = instr.dst % RegistersCount; + ibc.type = InstructionType::INEG_R; + ibc.idst = &r[dst]; + } break; + + CASE_REP(IXOR_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IXOR_R; + ibc.idst = &r[dst]; + if (src != dst) { + ibc.isrc = &r[src]; + } + else { + ibc.imm = signExtend2sCompl(instr.imm32); + ibc.isrc = &ibc.imm; + } + } break; + + CASE_REP(IXOR_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IXOR_M; + ibc.idst = &r[dst]; + if (instr.src != instr.dst) { + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? 
ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + ibc.memMask = ScratchpadL3Mask; + } + } break; + + CASE_REP(IROR_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IROR_R; + ibc.idst = &r[dst]; + if (src != dst) { + ibc.isrc = &r[src]; + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + } + } break; + + CASE_REP(IROL_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IROL_R; + ibc.idst = &r[dst]; + if (src != dst) { + ibc.isrc = &r[src]; + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + } + } break; + + CASE_REP(ISWAP_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + if (src != dst) { + ibc.idst = &r[dst]; + ibc.isrc = &r[src]; + ibc.type = InstructionType::ISWAP_R; + } + else { + ibc.type = InstructionType::NOP; + } + } break; + + CASE_REP(FSWAP_R) { + auto dst = instr.dst % RegistersCount; + ibc.type = InstructionType::FSWAP_R; + ibc.fdst = &f[dst]; + } break; + + CASE_REP(FADD_R) { + auto dst = instr.dst % 4; + auto src = instr.src % 4; + ibc.type = InstructionType::FADD_R; + ibc.fdst = &f[dst]; + ibc.fsrc = &a[src]; + } break; + + CASE_REP(FADD_M) { + auto dst = instr.dst % 4; + auto src = instr.src % 8; + ibc.type = InstructionType::FADD_M; + ibc.fdst = &f[dst]; + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } break; + + CASE_REP(FSUB_R) { + auto dst = instr.dst % 4; + auto src = instr.src % 4; + ibc.type = InstructionType::FSUB_R; + ibc.fdst = &f[dst]; + ibc.fsrc = &a[src]; + } break; + + CASE_REP(FSUB_M) { + auto dst = instr.dst % 4; + auto src = instr.src % 8; + ibc.type = InstructionType::FSUB_M; + ibc.fdst = &f[dst]; + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? 
ScratchpadL1Mask : ScratchpadL2Mask); + } break; + + CASE_REP(FNEG_R) { + auto dst = instr.dst % 4; + ibc.fdst = &f[dst]; + ibc.type = InstructionType::FNEG_R; + } break; + + CASE_REP(FMUL_R) { + auto dst = instr.dst % 4; + auto src = instr.src % 4; + ibc.type = InstructionType::FMUL_R; + ibc.fdst = &e[dst]; + ibc.fsrc = &a[src]; + } break; + + CASE_REP(FMUL_M) { + } break; + + CASE_REP(FDIV_R) { + } break; + + CASE_REP(FDIV_M) { + auto dst = instr.dst % 4; + auto src = instr.src % 8; + ibc.type = InstructionType::FDIV_M; + ibc.fdst = &e[dst]; + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } break; + + CASE_REP(FSQRT_R) { + auto dst = instr.dst % 4; + ibc.type = InstructionType::FSQRT_R; + ibc.fdst = &e[dst]; + } break; + + CASE_REP(COND_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::COND_R; + ibc.idst = &r[dst]; + ibc.isrc = &r[src]; + ibc.condition = (instr.mod >> 2) & 7; + ibc.imm = instr.imm32; + } break; + + CASE_REP(COND_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::COND_M; + ibc.idst = &r[dst]; + ibc.isrc = &r[src]; + ibc.condition = (instr.mod >> 2) & 7; + ibc.imm = instr.imm32; + ibc.memMask = ((instr.mod % 4) ? 
ScratchpadL1Mask : ScratchpadL2Mask); + } break; + + CASE_REP(CFROUND) { + auto src = instr.src % 8; + ibc.isrc = &r[src]; + ibc.type = InstructionType::CFROUND; + ibc.imm = instr.imm32 & 63; + } break; + + CASE_REP(ISTORE) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISTORE; + ibc.idst = &r[dst]; + ibc.isrc = &r[src]; + } break; + + CASE_REP(FSTORE) { + } break; + + CASE_REP(NOP) { + ibc.type = InstructionType::NOP; + } break; + + default: + UNREACHABLE; + } } } - - InstructionHandler InterpretedVirtualMachine::engine[256] = { - - }; } \ No newline at end of file diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index f698d97..4db4ae4 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -21,7 +21,7 @@ along with RandomX. If not, see. //#define STATS #include "VirtualMachine.hpp" #include "Program.hpp" -#include +#include "intrinPortable.h" namespace RandomX { @@ -38,15 +38,23 @@ namespace RandomX { typedef void(InterpretedVirtualMachine::*InstructionHandler)(Instruction&); - struct alignas(64) InstructionByteCode { - convertible_t* idst; - convertible_t* isrc; - convertible_t imm; - fpu_reg_t* fdst; - fpu_reg_t* fsrc; + struct alignas(16) InstructionByteCode { + int_reg_t* idst; + int_reg_t* isrc; + int_reg_t imm; + __m128d* fdst; + __m128d* fsrc; uint32_t condition; uint32_t memMask; uint32_t type; + union { + uint64_t unsignedMultiplier; + int64_t signedMultiplier; + }; + unsigned shift; + unsigned preShift; + unsigned postShift; + bool increment; }; constexpr int asedwfagdewsa = sizeof(InstructionByteCode); @@ -56,21 +64,14 @@ namespace RandomX { InterpretedVirtualMachine(bool soft, bool async) : softAes(soft), asyncWorker(async) {} ~InterpretedVirtualMachine(); void setDataset(dataset_t ds) override; - void initializeScratchpad(uint8_t* scratchpad, int32_t index) override; - void initializeProgram(const void* seed) 
override; + void initialize() override; void execute() override; - const Program& getProgam() { - return p; - } private: static InstructionHandler engine[256]; - static const ITransform* addressTransformations[TransformationCount]; + DatasetReadFunc readDataset; bool softAes, asyncWorker; - Program p; InstructionByteCode byteCode[ProgramLength]; - std::vector stack; - uint64_t pc, ic; - const ITransform* currentTransform; + #ifdef STATS int count_ADD_64 = 0; int count_ADD_32 = 0; @@ -121,66 +122,9 @@ namespace RandomX { int count_FMUL_nop2 = 0; int datasetAccess[256] = { 0 }; #endif - void executeInstruction(Instruction&); - convertible_t loada(Instruction&); - convertible_t loadbiashift(Instruction&); - convertible_t loadbiadiv(Instruction&); - convertible_t loadbia(Instruction&); - convertible_t& getcr(Instruction&); - void writecf(Instruction&, fpu_reg_t&); - - void stackPush(convertible_t& c) { - stack.push_back(c); - } - - void stackPush(uint64_t x) { - convertible_t c; - c.u64 = x; - stack.push_back(c); - } - - convertible_t stackPopValue() { - convertible_t top = stack.back(); - stack.pop_back(); - return top; - } - - uint64_t stackPopAddress() { - convertible_t top = stack.back(); - stack.pop_back(); - return top.u64; - } - - void h_ADD_64(Instruction&); - void h_ADD_32(Instruction&); - void h_SUB_64(Instruction&); - void h_SUB_32(Instruction&); - void h_MUL_64(Instruction&); - void h_MULH_64(Instruction&); - void h_MUL_32(Instruction&); - void h_IMUL_32(Instruction&); - void h_IMULH_64(Instruction&); - void h_DIV_64(Instruction&); - void h_IDIV_64(Instruction&); - void h_AND_64(Instruction&); - void h_AND_32(Instruction&); - void h_OR_64(Instruction&); - void h_OR_32(Instruction&); - void h_XOR_64(Instruction&); - void h_XOR_32(Instruction&); - void h_SHL_64(Instruction&); - void h_SHR_64(Instruction&); - void h_SAR_64(Instruction&); - void h_ROL_64(Instruction&); - void h_ROR_64(Instruction&); - void h_FADD(Instruction&); - void h_FSUB(Instruction&); - 
void h_FMUL(Instruction&); - void h_FDIV(Instruction&); - void h_FSQRT(Instruction&); - void h_FPROUND(Instruction&); - void h_JUMP(Instruction&); - void h_CALL(Instruction&); - void h_RET(Instruction&); + void precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + template + void executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + void executeBytecode(int i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); }; } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index de803be..b77da17 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -19,7 +19,7 @@ along with RandomX. If not, see. #define MAGIC_DIVISION #include "JitCompilerX86.hpp" -#include "Pcg32.hpp" +#include "Program.hpp" #include #include #ifdef MAGIC_DIVISION @@ -43,7 +43,7 @@ namespace RandomX { //throw std::runtime_error("JIT compiler only supports x86-64 CPUs"); } - void JitCompilerX86::generateProgram(Pcg32& gen) { + void JitCompilerX86::generateProgram(Program& p) { } @@ -87,7 +87,7 @@ namespace RandomX { ; xmm12 -> temporary ; xmm13 -> DBL_MIN ; xmm14 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff - ; xmm15 -> unused + ; xmm15 -> sign mask 0x80000000000000008000000000000000 */ @@ -199,35 +199,32 @@ namespace RandomX { memcpy(code + CodeSize - epilogueSize, codeEpilogue, epilogueSize); } - void JitCompilerX86::generateProgram(Pcg32& gen) { - auto addressRegisters = gen(); - int readReg1 = addressRegisters & 1; + void JitCompilerX86::generateProgram(Program& prog) { + auto addressRegisters = prog.getEntropy(12); + uint32_t readReg0 = 0 + (addressRegisters & 1); addressRegisters >>= 1; - int readReg2 = 2 + (addressRegisters & 1); + uint32_t readReg1 = 2 + (addressRegisters & 1); addressRegisters >>= 1; - int readReg3 = 4 + (addressRegisters & 1); + uint32_t readReg2 = 4 + (addressRegisters & 1); addressRegisters >>= 1; - int readReg4 = 6 + 
(addressRegisters & 1); + uint32_t readReg3 = 6 + (addressRegisters & 1); codePos = prologueSize; emit(REX_XOR_RAX_R64); - emitByte(0xc0 + readReg1); + emitByte(0xc0 + readReg0); emit(REX_XOR_RAX_R64); - emitByte(0xc0 + readReg2); + emitByte(0xc0 + readReg1); memcpy(code + codePos, codeLoopLoad, loopLoadSize); codePos += loopLoadSize; - Instruction instr; for (unsigned i = 0; i < ProgramLength; ++i) { - for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) { - *(((uint32_t*)&instr) + j) = gen(); - } + Instruction& instr = prog(i); instr.src %= RegistersCount; instr.dst %= RegistersCount; generateCode(instr); } emit(REX_MOV_RR); - emitByte(0xc0 + readReg3); + emitByte(0xc0 + readReg2); emit(REX_XOR_EAX); - emitByte(0xc0 + readReg4); + emitByte(0xc0 + readReg3); memcpy(code + codePos, codeReadDataset, readDatasetSize); codePos += readDatasetSize; memcpy(code + codePos, codeLoopStore, loopStoreSize); @@ -365,22 +362,12 @@ namespace RandomX { } void JitCompilerX86::h_IMULH_R(Instruction& instr) { - if (instr.src != instr.dst) { - emit(REX_MOV_RR64); - emitByte(0xc0 + instr.dst); - emit(REX_MUL_R); - emitByte(0xe0 + instr.src); - emit(REX_MOV_R64R); - emitByte(0xc2 + 8 * instr.dst); - } - else { - emitByte(MOV_EAX_I); - emit32(instr.imm32); - emit(REX_MUL_R); - emitByte(0xe0 + instr.dst); - emit(REX_ADD_RM); - emitByte(0xc2 + 8 * instr.dst); - } + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe0 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); } void JitCompilerX86::h_IMULH_M(Instruction& instr) { @@ -402,22 +389,12 @@ namespace RandomX { } void JitCompilerX86::h_ISMULH_R(Instruction& instr) { - if (instr.src != instr.dst) { - emit(REX_MOV_RR64); - emitByte(0xc0 + instr.dst); - emit(REX_MUL_R); - emitByte(0xe8 + instr.src); - emit(REX_MOV_R64R); - emitByte(0xc2 + 8 * instr.dst); - } - else { - emitByte(MOV_EAX_I); - emit32(instr.imm32); - emit(REX_MUL_R); - emitByte(0xe8 + instr.dst); - 
emit(REX_ADD_RM); - emitByte(0xc2 + 8 * instr.dst); - } + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe8 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); } void JitCompilerX86::h_ISMULH_M(Instruction& instr) { @@ -648,7 +625,7 @@ namespace RandomX { emitByte(0xc4 + 8 * instr.dst); } - void JitCompilerX86::h_CFSUM_R(Instruction& instr) { + void JitCompilerX86::h_FNEG_R(Instruction& instr) { instr.dst %= 4; emit(REX_XORPS); emitByte(0xc7 + 8 * instr.dst); @@ -802,7 +779,7 @@ namespace RandomX { INST_HANDLE(FADD_M) INST_HANDLE(FSUB_R) INST_HANDLE(FSUB_M) - INST_HANDLE(CFSUM_R) + INST_HANDLE(FNEG_R) INST_HANDLE(FMUL_R) INST_HANDLE(FMUL_M) INST_HANDLE(FDIV_R) diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index feba888..e790cfe 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -24,10 +24,9 @@ along with RandomX. If not, see. #include #include -class Pcg32; - namespace RandomX { + class Program; class JitCompilerX86; typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&); @@ -37,7 +36,7 @@ namespace RandomX { class JitCompilerX86 { public: JitCompilerX86(); - void generateProgram(Pcg32&); + void generateProgram(Program&); ProgramFunc getProgramFunc() { return (ProgramFunc)code; } @@ -115,7 +114,7 @@ namespace RandomX { void h_FADD_M(Instruction&); void h_FSUB_R(Instruction&); void h_FSUB_M(Instruction&); - void h_CFSUM_R(Instruction&); + void h_FNEG_R(Instruction&); void h_FMUL_R(Instruction&); void h_FMUL_M(Instruction&); void h_FDIV_R(Instruction&); diff --git a/src/LightClientAsyncWorker.cpp b/src/LightClientAsyncWorker.cpp index 32aa508..f79d03d 100644 --- a/src/LightClientAsyncWorker.cpp +++ b/src/LightClientAsyncWorker.cpp @@ -35,7 +35,7 @@ namespace RandomX { template void LightClientAsyncWorker::prepareBlock(addr_t addr) { #ifdef TRACE - std::cout << sw.getElapsed() << ": prepareBlock-enter " << addr << std::endl; + std::cout << sw.getElapsed() << ": 
prepareBlock-enter " << addr / CacheLineSize << std::endl; #endif { std::lock_guard lk(mutex); @@ -47,18 +47,24 @@ namespace RandomX { #ifdef TRACE std::cout << sw.getElapsed() << ": prepareBlock-notify " << startBlock << "/" << blockCount << std::endl; #endif - notifier.notify_all(); + notifier.notify_one(); } template const uint64_t* LightClientAsyncWorker::getBlock(addr_t addr) { +#ifdef TRACE + std::cout << sw.getElapsed() << ": getBlock-enter " << addr / CacheLineSize << std::endl; +#endif uint32_t currentBlock = addr / CacheLineSize; if (currentBlock != startBlock || output != currentLine.data()) { - initBlock(cache->getCache(), (uint8_t*)currentLine.data(), currentBlock, cache->getKeys()); + initBlock(cache->getCache(), (uint8_t*)currentLine.data(), currentBlock, cache->getKeys()); } else { sync(); } +#ifdef TRACE + std::cout << sw.getElapsed() << ": getBlock-return " << addr / CacheLineSize << std::endl; +#endif return currentLine.data(); } @@ -73,14 +79,14 @@ namespace RandomX { this->blockCount = blockCount; output = out; hasWork = true; + notifier.notify_one(); } - notifier.notify_all(); } template void LightClientAsyncWorker::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { for (uint32_t i = 0; i < blockCount; ++i) { - initBlock(cache->getCache(), (uint8_t*)out + CacheLineSize * i, startBlock + i, cache->getKeys()); + initBlock(cache->getCache(), (uint8_t*)out + CacheLineSize * i, startBlock + i, cache->getKeys()); } } @@ -98,10 +104,17 @@ namespace RandomX { for (;;) { std::unique_lock lk(mutex); notifier.wait(lk, [this] { return hasWork; }); - getBlocks(output, startBlock, blockCount); +#ifdef TRACE + std::cout << sw.getElapsed() << ": runWorker-getBlocks " << startBlock << "/" << blockCount << std::endl; +#endif + //getBlocks(output, startBlock, blockCount); + initBlock(cache->getCache(), (uint8_t*)output, startBlock, cache->getKeys()); hasWork = false; +#ifdef TRACE + std::cout << sw.getElapsed() << ": runWorker-finished " << 
startBlock << "/" << blockCount << std::endl; +#endif lk.unlock(); - notifier.notify_all(); + notifier.notify_one(); } } diff --git a/src/Pcg32.hpp b/src/Pcg32.hpp deleted file mode 100644 index 906800f..0000000 --- a/src/Pcg32.hpp +++ /dev/null @@ -1,72 +0,0 @@ -/* -Copyright (c) 2018 tevador - -This file is part of RandomX. - -RandomX is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -RandomX is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with RandomX. If not, see. -*/ - -// Based on: -// *Really* minimal PCG32 code / (c) 2014 M.E. O'Neill / pcg-random.org -// Licensed under Apache License 2.0 (NO WARRANTY, etc. 
see website) - -#pragma once -#include - -#if defined(_MSC_VER) -#pragma warning (disable : 4146) -#endif - -class Pcg32 { -public: - typedef uint32_t result_type; - static constexpr result_type min() { return 0U; } - static constexpr result_type max() { return UINT32_MAX; } - Pcg32(const void* seed) { - auto* u64seed = (const uint64_t*)seed; - state = *(u64seed + 0); - inc = *(u64seed + 1) | 1ull; - } - Pcg32(uint64_t state, uint64_t inc) : state(state), inc(inc | 1ull) { - } - result_type operator()() { - return next(); - } - result_type getUniform(result_type min, result_type max) { - const result_type range = max - min; - const result_type erange = range + 1; - result_type ret; - - for (;;) { - ret = next(); - if (ret / erange < UINT32_MAX / erange || UINT32_MAX % erange == range) { - ret %= erange; - break; - } - } - return ret + min; - } -private: - uint64_t state; - uint64_t inc; - result_type next() { - uint64_t oldstate = state; - // Advance internal state - state = oldstate * 6364136223846793005ULL + inc; - // Calculate output function (XSH RR), uses old state for max ILP - uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; - uint32_t rot = oldstate >> 59u; - return (xorshifted >> rot) | (xorshifted << (-rot & 31)); - } -}; diff --git a/src/Program.cpp b/src/Program.cpp index b78a5ee..bb4e086 100644 --- a/src/Program.cpp +++ b/src/Program.cpp @@ -18,15 +18,9 @@ along with RandomX. If not, see. 
*/ #include "Program.hpp" -#include "Pcg32.hpp" +#include "hashAes1Rx4.hpp" namespace RandomX { - void Program::initialize(Pcg32& gen) { - for (unsigned i = 0; i < sizeof(programBuffer) / sizeof(Pcg32::result_type); ++i) { - *(((uint32_t*)&programBuffer) + i) = gen(); - } - } - void Program::print(std::ostream& os) const { for (int i = 0; i < RandomX::ProgramLength; ++i) { auto instr = programBuffer[i]; diff --git a/src/Program.hpp b/src/Program.hpp index 35b45d2..1f695a0 100644 --- a/src/Program.hpp +++ b/src/Program.hpp @@ -24,22 +24,25 @@ along with RandomX. If not, see. #include "common.hpp" #include "Instruction.hpp" -class Pcg32; - namespace RandomX { class Program { public: - Instruction& operator()(uint64_t pc) { + Instruction& operator()(int pc) { return programBuffer[pc]; } - void initialize(Pcg32& gen); friend std::ostream& operator<<(std::ostream& os, const Program& p) { p.print(os); return os; } + uint64_t getEntropy(int i) { + return entropyBuffer[i]; + } private: void print(std::ostream&) const; + uint64_t entropyBuffer[16]; Instruction programBuffer[ProgramLength]; }; + + static_assert(sizeof(Program) % 64 == 0, "Invalid size of class Program"); } diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp index 01de3d9..2adf4e4 100644 --- a/src/VirtualMachine.cpp +++ b/src/VirtualMachine.cpp @@ -23,32 +23,72 @@ along with RandomX. If not, see. 
#include "blake2/blake2.h" #include #include +#include "intrinPortable.h" std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) { for (int i = 0; i < RandomX::RegistersCount; ++i) - os << std::hex << "r" << i << " = " << rf.r[i].u64 << std::endl << std::dec; + os << std::hex << "r" << i << " = " << rf.r[i] << std::endl << std::dec; for (int i = 0; i < RandomX::RegistersCount; ++i) - os << std::hex << "f" << i << " = " << rf.f[i].hi.u64 << " (" << rf.f[i].hi.f64 << ")" << std::endl - << " = " << rf.f[i].lo.u64 << " (" << rf.f[i].lo.f64 << ")" << std::endl << std::dec; + os << std::hex << "f" << i << " = " << *(uint64_t*)&rf.f[i].hi << " (" << rf.f[i].hi << ")" << std::endl + << " = " << *(uint64_t*)&rf.f[i].lo << " (" << rf.f[i].lo << ")" << std::endl << std::dec; return os; } namespace RandomX { + constexpr int mantissaSize = 52; + constexpr int exponentSize = 11; + constexpr uint64_t mantissaMask = (1ULL << mantissaSize) - 1; + constexpr uint64_t exponentMask = (1ULL << exponentSize) - 1; + constexpr int exponentBias = 1023; + + static inline uint64_t getSmallPositiveFloatBits(uint64_t entropy) { + auto exponent = entropy >> 59; //0..31 + auto mantissa = entropy & mantissaMask; + exponent += exponentBias; + exponent &= exponentMask; + exponent <<= mantissaSize; + return exponent | mantissa; + } + VirtualMachine::VirtualMachine() { mem.ds.dataset = nullptr; } - void VirtualMachine::getResult(void* scratchpad, size_t scratchpadSize, void* out) { - constexpr size_t smallStateLength = sizeof(RegisterFile) / sizeof(uint64_t) + 8; - alignas(16) uint64_t smallState[smallStateLength]; - memcpy(smallState, ®, sizeof(RegisterFile)); - if (scratchpadSize > 0) { - hashAes1Rx4(scratchpad, scratchpadSize, smallState + 24); - } - else { - memset(smallState + 24, 0, 64); - } - blake2b(out, ResultSize, smallState, sizeof(smallState), nullptr, 0); + void VirtualMachine::resetRoundingMode() { + initFpu(); } + + void VirtualMachine::initialize() { + 
store64(®.a[0].lo, getSmallPositiveFloatBits(program.getEntropy(0))); + store64(®.a[0].hi, getSmallPositiveFloatBits(program.getEntropy(1))); + store64(®.a[1].lo, getSmallPositiveFloatBits(program.getEntropy(2))); + store64(®.a[1].hi, getSmallPositiveFloatBits(program.getEntropy(3))); + store64(®.a[2].lo, getSmallPositiveFloatBits(program.getEntropy(4))); + store64(®.a[2].hi, getSmallPositiveFloatBits(program.getEntropy(5))); + store64(®.a[3].lo, getSmallPositiveFloatBits(program.getEntropy(6))); + store64(®.a[3].hi, getSmallPositiveFloatBits(program.getEntropy(7))); + mem.ma = program.getEntropy(8) & CacheLineAlignMask; + mem.mx = program.getEntropy(10); + auto addressRegisters = program.getEntropy(12); + readReg0 = 0 + (addressRegisters & 1); + addressRegisters >>= 1; + readReg1 = 2 + (addressRegisters & 1); + addressRegisters >>= 1; + readReg2 = 4 + (addressRegisters & 1); + addressRegisters >>= 1; + readReg3 = 6 + (addressRegisters & 1); + } + + template + void VirtualMachine::getResult(void* scratchpad, size_t scratchpadSize, void* outHash) { + if (scratchpadSize > 0) { + hashAes1Rx4(scratchpad, scratchpadSize, ®.a); + } + blake2b(outHash, ResultSize, ®, sizeof(RegisterFile), nullptr, 0); + } + + template void VirtualMachine::getResult(void* scratchpad, size_t scratchpadSize, void* outHash); + template void VirtualMachine::getResult(void* scratchpad, size_t scratchpadSize, void* outHash); + } \ No newline at end of file diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp index fe48e13..d1dbe26 100644 --- a/src/VirtualMachine.hpp +++ b/src/VirtualMachine.hpp @@ -20,28 +20,36 @@ along with RandomX. If not, see. 
#pragma once #include #include "common.hpp" +#include "Program.hpp" namespace RandomX { + + class VirtualMachine { public: VirtualMachine(); virtual ~VirtualMachine() {} virtual void setDataset(dataset_t ds) = 0; - virtual void initializeScratchpad(uint8_t* scratchpad, int32_t index) = 0; void setScratchpad(void* ptr) { - scratchpad = (convertible_t*)ptr; + scratchpad = (uint8_t*)ptr; } - virtual void initializeProgram(const void* seed) = 0; + void resetRoundingMode(); + virtual void initialize(); virtual void execute() = 0; - void getResult(void*, size_t, void*); + template + void getResult(void* scratchpad, size_t scratchpadSize, void* outHash); const RegisterFile& getRegisterFile() { return reg; } + Program* getProgramBuffer() { + return &program; + } protected: - DatasetReadFunc readDataset; + alignas(16) Program program; alignas(16) RegisterFile reg; MemoryRegisters mem; - convertible_t* scratchpad; + uint8_t* scratchpad; + uint32_t readReg0, readReg1, readReg2, readReg3; }; } \ No newline at end of file diff --git a/src/asm/program_loop_store.inc b/src/asm/program_loop_store.inc index bd2bbdd..a0acebc 100644 --- a/src/asm/program_loop_store.inc +++ b/src/asm/program_loop_store.inc @@ -12,10 +12,6 @@ mulpd xmm1, xmm5 mulpd xmm2, xmm6 mulpd xmm3, xmm7 - ;# xorpd xmm0, xmm15 - ;# xorpd xmm1, xmm15 - ;# xorpd xmm2, xmm15 - ;# xorpd xmm3, xmm15 movapd xmmword ptr [rcx+0], xmm0 movapd xmmword ptr [rcx+16], xmm1 movapd xmmword ptr [rcx+32], xmm2 diff --git a/src/asm/program_prologue_load.inc b/src/asm/program_prologue_load.inc index 74c2a08..757cf10 100644 --- a/src/asm/program_prologue_load.inc +++ b/src/asm/program_prologue_load.inc @@ -18,5 +18,4 @@ movapd xmm11, xmmword ptr [rcx+120] movapd xmm13, xmmword ptr [minDbl] movapd xmm14, xmmword ptr [absMask] - ;# xorpd xmm15, xmm15 - + movapd xmm15, xmmword ptr [signMask] diff --git a/src/blake2/blake2-impl.h b/src/blake2/blake2-impl.h index 60b26fe..f294ba6 100644 --- a/src/blake2/blake2-impl.h +++ 
b/src/blake2/blake2-impl.h @@ -27,105 +27,10 @@ along with RandomX. If not, see. #define PORTABLE_BLAKE2_IMPL_H #include -#include -#if defined(_MSC_VER) -#define BLAKE2_INLINE __inline -#elif defined(__GNUC__) || defined(__clang__) -#define BLAKE2_INLINE __inline__ -#else -#define BLAKE2_INLINE -#endif +#include "endian.h" - /* Argon2 Team - Begin Code */ - /* - Not an exhaustive list, but should cover the majority of modern platforms - Additionally, the code will always be correct---this is only a performance - tweak. - */ -#if (defined(__BYTE_ORDER__) && \ - (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ - defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \ - defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) || \ - defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || \ - defined(_M_ARM) -#define NATIVE_LITTLE_ENDIAN -#endif - /* Argon2 Team - End Code */ - -static BLAKE2_INLINE uint32_t load32(const void *src) { -#if defined(NATIVE_LITTLE_ENDIAN) - uint32_t w; - memcpy(&w, src, sizeof w); - return w; -#else - const uint8_t *p = (const uint8_t *)src; - uint32_t w = *p++; - w |= (uint32_t)(*p++) << 8; - w |= (uint32_t)(*p++) << 16; - w |= (uint32_t)(*p++) << 24; - return w; -#endif -} - -static BLAKE2_INLINE uint64_t load64(const void *src) { -#if defined(NATIVE_LITTLE_ENDIAN) - uint64_t w; - memcpy(&w, src, sizeof w); - return w; -#else - const uint8_t *p = (const uint8_t *)src; - uint64_t w = *p++; - w |= (uint64_t)(*p++) << 8; - w |= (uint64_t)(*p++) << 16; - w |= (uint64_t)(*p++) << 24; - w |= (uint64_t)(*p++) << 32; - w |= (uint64_t)(*p++) << 40; - w |= (uint64_t)(*p++) << 48; - w |= (uint64_t)(*p++) << 56; - return w; -#endif -} - -static BLAKE2_INLINE void store32(void *dst, uint32_t w) { -#if defined(NATIVE_LITTLE_ENDIAN) - memcpy(dst, &w, sizeof w); -#else - uint8_t *p = (uint8_t *)dst; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; 
-#endif -} - -static BLAKE2_INLINE void store64(void *dst, uint64_t w) { -#if defined(NATIVE_LITTLE_ENDIAN) - memcpy(dst, &w, sizeof w); -#else - uint8_t *p = (uint8_t *)dst; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; -#endif -} - -static BLAKE2_INLINE uint64_t load48(const void *src) { +static FORCE_INLINE uint64_t load48(const void *src) { const uint8_t *p = (const uint8_t *)src; uint64_t w = *p++; w |= (uint64_t)(*p++) << 8; @@ -136,7 +41,7 @@ static BLAKE2_INLINE uint64_t load48(const void *src) { return w; } -static BLAKE2_INLINE void store48(void *dst, uint64_t w) { +static FORCE_INLINE void store48(void *dst, uint64_t w) { uint8_t *p = (uint8_t *)dst; *p++ = (uint8_t)w; w >>= 8; @@ -151,11 +56,11 @@ static BLAKE2_INLINE void store48(void *dst, uint64_t w) { *p++ = (uint8_t)w; } -static BLAKE2_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) { +static FORCE_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) { return (w >> c) | (w << (32 - c)); } -static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) { +static FORCE_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) { return (w >> c) | (w << (64 - c)); } diff --git a/src/blake2/blake2b.c b/src/blake2/blake2b.c index e7569b4..329ed3c 100644 --- a/src/blake2/blake2b.c +++ b/src/blake2/blake2b.c @@ -51,29 +51,29 @@ static const unsigned int blake2b_sigma[12][16] = { {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, }; -static BLAKE2_INLINE void blake2b_set_lastnode(blake2b_state *S) { +static FORCE_INLINE void blake2b_set_lastnode(blake2b_state *S) { S->f[1] = (uint64_t)-1; } -static BLAKE2_INLINE void blake2b_set_lastblock(blake2b_state *S) { +static FORCE_INLINE void blake2b_set_lastblock(blake2b_state *S) { if (S->last_node) { blake2b_set_lastnode(S); } S->f[0] 
= (uint64_t)-1; } -static BLAKE2_INLINE void blake2b_increment_counter(blake2b_state *S, +static FORCE_INLINE void blake2b_increment_counter(blake2b_state *S, uint64_t inc) { S->t[0] += inc; S->t[1] += (S->t[0] < inc); } -static BLAKE2_INLINE void blake2b_invalidate_state(blake2b_state *S) { +static FORCE_INLINE void blake2b_invalidate_state(blake2b_state *S) { //clear_internal_memory(S, sizeof(*S)); /* wipe */ blake2b_set_lastblock(S); /* invalidate for further use */ } -static BLAKE2_INLINE void blake2b_init0(blake2b_state *S) { +static FORCE_INLINE void blake2b_init0(blake2b_state *S) { memset(S, 0, sizeof(*S)); memcpy(S->h, blake2b_IV, sizeof(S->h)); } diff --git a/src/blake2/blamka-round-ref.h b/src/blake2/blamka-round-ref.h index d7acd68..d087b72 100644 --- a/src/blake2/blamka-round-ref.h +++ b/src/blake2/blamka-round-ref.h @@ -30,7 +30,7 @@ along with RandomX. If not, see. #include "blake2-impl.h" /* designed by the Lyra PHC team */ -static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) { +static FORCE_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) { const uint64_t m = UINT64_C(0xFFFFFFFF); const uint64_t xy = (x & m) * (y & m); return x + y + 2 * xy; diff --git a/src/blake2/endian.h b/src/blake2/endian.h new file mode 100644 index 0000000..fab1eed --- /dev/null +++ b/src/blake2/endian.h @@ -0,0 +1,99 @@ +#pragma once +#include +#include + +#if defined(_MSC_VER) +#define FORCE_INLINE __inline +#elif defined(__GNUC__) || defined(__clang__) +#define FORCE_INLINE __inline__ +#else +#define FORCE_INLINE +#endif + + /* Argon2 Team - Begin Code */ + /* + Not an exhaustive list, but should cover the majority of modern platforms + Additionally, the code will always be correct---this is only a performance + tweak. 
+ */ +#if (defined(__BYTE_ORDER__) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ + defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \ + defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) || \ + defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || \ + defined(_M_ARM) +#define NATIVE_LITTLE_ENDIAN +#endif + /* Argon2 Team - End Code */ + +static FORCE_INLINE uint32_t load32(const void *src) { +#if defined(NATIVE_LITTLE_ENDIAN) + uint32_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = (const uint8_t *)src; + uint32_t w = *p++; + w |= (uint32_t)(*p++) << 8; + w |= (uint32_t)(*p++) << 16; + w |= (uint32_t)(*p++) << 24; + return w; +#endif +} + +static FORCE_INLINE uint64_t load64(const void *src) { +#if defined(NATIVE_LITTLE_ENDIAN) + uint64_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = (const uint8_t *)src; + uint64_t w = *p++; + w |= (uint64_t)(*p++) << 8; + w |= (uint64_t)(*p++) << 16; + w |= (uint64_t)(*p++) << 24; + w |= (uint64_t)(*p++) << 32; + w |= (uint64_t)(*p++) << 40; + w |= (uint64_t)(*p++) << 48; + w |= (uint64_t)(*p++) << 56; + return w; +#endif +} + +static FORCE_INLINE void store32(void *dst, uint32_t w) { +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +#endif +} + +static FORCE_INLINE void store64(void *dst, uint64_t w) { +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +#endif +} diff --git a/src/common.hpp b/src/common.hpp index ea67ff9..8c16825 100644 --- a/src/common.hpp +++ 
b/src/common.hpp @@ -21,13 +21,14 @@ along with RandomX. If not, see. #include #include +#include "blake2/endian.h" namespace RandomX { using addr_t = uint32_t; constexpr int SeedSize = 32; - constexpr int ResultSize = 32; + constexpr int ResultSize = 64; constexpr int ArgonIterations = 3; constexpr uint32_t ArgonMemorySize = 262144; //KiB @@ -36,12 +37,13 @@ namespace RandomX { constexpr int ArgonSaltSize = sizeof(ArgonSalt) - 1; constexpr int CacheLineSize = 64; + constexpr uint32_t CacheLineAlignMask = 0xFFFFFFFF & ~(CacheLineSize - 1); constexpr uint64_t DatasetSize = 4ULL * 1024 * 1024 * 1024; //4 GiB constexpr uint32_t CacheSize = ArgonMemorySize * 1024; constexpr int CacheBlockCount = CacheSize / CacheLineSize; constexpr int BlockExpansionRatio = DatasetSize / CacheSize; constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount; - constexpr int DatasetIterations = 10; + constexpr int DatasetIterations = 16; #ifdef TRACE @@ -50,35 +52,36 @@ namespace RandomX { constexpr bool trace = false; #endif - union convertible_t { - double f64; - int64_t i64; - uint64_t u64; - int32_t i32; - uint32_t u32; - struct { - int32_t i32lo; - int32_t i32hi; - }; - }; +#ifndef UNREACHABLE +#ifdef __GNUC__ +#define UNREACHABLE __builtin_unreachable() +#elif _MSC_VER +#define UNREACHABLE __assume(false) +#else +#define UNREACHABLE +#endif +#endif + + using int_reg_t = uint64_t; struct fpu_reg_t { - convertible_t lo; - convertible_t hi; + double lo; + double hi; }; constexpr int ProgramLength = 256; constexpr uint32_t InstructionCount = 2048; constexpr uint32_t ScratchpadSize = 2 * 1024 * 1024; - constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t); - constexpr uint32_t ScratchpadL1 = ScratchpadSize / 128 / sizeof(convertible_t); - constexpr uint32_t ScratchpadL2 = ScratchpadSize / 8 / sizeof(convertible_t); - constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(convertible_t); + constexpr uint32_t ScratchpadLength = ScratchpadSize / 
sizeof(int_reg_t); + constexpr uint32_t ScratchpadL1 = ScratchpadSize / 128 / sizeof(int_reg_t); + constexpr uint32_t ScratchpadL2 = ScratchpadSize / 8 / sizeof(int_reg_t); + constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(int_reg_t); constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8; constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8; constexpr int ScratchpadL1Mask16 = (ScratchpadL1 / 2 - 1) * 16; constexpr int ScratchpadL2Mask16 = (ScratchpadL2 / 2 - 1) * 16; constexpr int ScratchpadL3Mask = (ScratchpadLength - 1) * 8; + constexpr int ScratchpadL3Mask64 = (ScratchpadLength / 8 - 1) * 64; constexpr uint32_t TransformationCount = 90; constexpr int RegistersCount = 8; @@ -117,22 +120,20 @@ namespace RandomX { static_assert(sizeof(MemoryRegisters) == 2 * sizeof(addr_t) + sizeof(uintptr_t), "Invalid alignment of struct RandomX::MemoryRegisters"); struct RegisterFile { - convertible_t r[RegistersCount]; + int_reg_t r[RegistersCount]; fpu_reg_t f[RegistersCount / 2]; - fpu_reg_t g[RegistersCount / 2]; + fpu_reg_t e[RegistersCount / 2]; fpu_reg_t a[RegistersCount / 2]; }; static_assert(sizeof(RegisterFile) == 256, "Invalid alignment of struct RandomX::RegisterFile"); - typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, RegisterFile&); + typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, int_reg_t(®)[RegistersCount]); - typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t); - - typedef bool(*Condition)(convertible_t&, convertible_t&); + typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, uint8_t* /* scratchpad */, uint64_t); extern "C" { - void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t); + void executeProgram(RegisterFile&, MemoryRegisters&, uint8_t* /* scratchpad */, uint64_t); } } diff --git a/src/dataset.cpp b/src/dataset.cpp index b941a75..5b618f9 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -24,11 +24,11 @@ along with RandomX. If not, see. 
#include "common.hpp" #include "dataset.hpp" -#include "Pcg32.hpp" #include "Cache.hpp" #include "virtualMemory.hpp" #include "softAes.h" #include "squareHash.h" +#include "blake2/endian.h" #if defined(__SSE2__) #include @@ -39,56 +39,38 @@ along with RandomX. If not, see. namespace RandomX { - template - static inline void shuffle(T* buffer, size_t bytes, Pcg32& gen) { - auto count = bytes / sizeof(T); - for (auto i = count - 1; i >= 1; --i) { - int j = gen.getUniform(0, i); - std::swap(buffer[j], buffer[i]); - } - } - - template - void initBlock(const uint8_t* intermediate, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) { + void initBlock(const uint8_t* cache, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) { uint64_t r0, r1, r2, r3, r4, r5, r6, r7; r0 = 4ULL * blockNumber; r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0; - constexpr int mask = (CacheSize - 1) & -64; + constexpr uint32_t mask = (CacheSize - 1) & CacheLineAlignMask; for (auto i = 0; i < DatasetIterations; ++i) { - uint64_t* mix = (uint64_t*)(intermediate + (r0 & mask)); - PREFETCHNTA(mix); + const uint8_t* mixBlock = cache + (r0 & mask); + PREFETCHNTA(mixBlock); r0 = squareHash(r0); - r0 ^= mix[0]; - r1 ^= mix[1]; - r2 ^= mix[2]; - r3 ^= mix[3]; - r4 ^= mix[4]; - r5 ^= mix[5]; - r6 ^= mix[6]; - r7 ^= mix[7]; + r0 ^= load64(mixBlock + 0); + r1 ^= load64(mixBlock + 8); + r2 ^= load64(mixBlock + 16); + r3 ^= load64(mixBlock + 24); + r4 ^= load64(mixBlock + 32); + r5 ^= load64(mixBlock + 40); + r6 ^= load64(mixBlock + 48); + r7 ^= load64(mixBlock + 56); } - uint64_t* out64 = (uint64_t*)out; - - out64[0] = r0; - out64[1] = r1; - out64[2] = r2; - out64[3] = r3; - out64[4] = r4; - out64[5] = r5; - out64[6] = r6; - out64[7] = r7; + store64(out + 0, r0); + store64(out + 8, r1); + store64(out + 16, r2); + store64(out + 24, r3); + store64(out + 32, r4); + store64(out + 40, r5); + store64(out + 48, r6); + store64(out + 56, r7); } - template - void initBlock(const uint8_t*, uint8_t*, 
uint32_t, const KeysContainer&); - - template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); - void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { uint64_t* datasetLine = (uint64_t*)(memory.ds.dataset + memory.ma); memory.mx ^= addr; @@ -96,34 +78,27 @@ namespace RandomX { std::swap(memory.mx, memory.ma); PREFETCHNTA(memory.ds.dataset + memory.ma); for (int i = 0; i < RegistersCount; ++i) - reg.r[i].u64 ^= datasetLine[i]; + reg.r[i] ^= datasetLine[i]; } - template - void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { + void datasetReadLight(addr_t addr, MemoryRegisters& memory, int_reg_t (®)[RegistersCount]) { + memory.mx ^= addr; + memory.mx &= CacheLineAlignMask; //align to cache line Cache* cache = memory.ds.cache; uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; - initBlock(cache->getCache(), (uint8_t*)datasetLine, memory.ma / CacheLineSize, cache->getKeys()); + initBlock(cache->getCache(), (uint8_t*)datasetLine, memory.ma / CacheLineSize, cache->getKeys()); for (int i = 0; i < RegistersCount; ++i) - reg.r[i].u64 ^= datasetLine[i]; - memory.mx ^= addr; - memory.mx &= -64; //align to cache line + reg[i] ^= datasetLine[i]; std::swap(memory.mx, memory.ma); } - template - void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); - - template - void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); - - void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { + void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, int_reg_t(®)[RegistersCount]) { ILightClientAsyncWorker* aw = memory.ds.asyncWorker; const uint64_t* datasetLine = aw->getBlock(memory.ma); for (int i = 0; i < RegistersCount; ++i) - reg.r[i].u64 ^= datasetLine[i]; + reg[i] ^= datasetLine[i]; memory.mx ^= addr; - memory.mx &= -64; //align to cache line + memory.mx &= CacheLineAlignMask; //align to cache line std::swap(memory.mx, 
memory.ma); aw->prepareBlock(memory.ma); } @@ -145,7 +120,7 @@ namespace RandomX { template void datasetInit(Cache* cache, dataset_t ds, uint32_t startBlock, uint32_t blockCount) { for (uint32_t i = startBlock; i < startBlock + blockCount; ++i) { - initBlock(cache->getCache(), ds.dataset + i * CacheLineSize, i, cache->getKeys()); + initBlock(cache->getCache(), ds.dataset + i * CacheLineSize, i, cache->getKeys()); } } @@ -172,7 +147,7 @@ namespace RandomX { alignas(16) KeysContainer keys; alignas(16) uint8_t buffer[CacheLineSize]; for (uint32_t block = 0; block < blockCount; ++block) { - initBlock(buffer, buffer, 0, keys); + initBlock(buffer, buffer, 0, keys); } } diff --git a/src/dataset.hpp b/src/dataset.hpp index 312b924..77a477d 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -31,7 +31,6 @@ namespace RandomX { template void initBlock(const uint8_t* in, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys); - template void initBlock(const uint8_t* cache, uint8_t* block, uint32_t blockNumber, const KeysContainer& keys); void datasetAlloc(dataset_t& ds, bool largePages); @@ -44,10 +43,9 @@ namespace RandomX { template void datasetInitCache(const void* seed, dataset_t& dataset, bool largePages); - template - void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile&); + void datasetReadLight(addr_t addr, MemoryRegisters& memory, int_reg_t(®)[RegistersCount]); - void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); + void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, int_reg_t(®)[RegistersCount]); template void aesBench(uint32_t blockCount); diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index ff43578..ac49e50 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -54,7 +54,7 @@ executeProgram PROC ; xmm12 -> temporary ; xmm13 -> DBL_MIN ; xmm14 -> absolute value mask - ; xmm15 -> unused + ; xmm15 -> sign mask ; store callee-saved registers 
push rbx @@ -104,7 +104,7 @@ executeProgram PROC movapd xmm11, xmmword ptr [rcx+120] movapd xmm13, xmmword ptr [minDbl] movapd xmm14, xmmword ptr [absMask] - ;# xorps xmm15, xmm15 + movapd xmm15, xmmword ptr [signMask] jmp program_begin diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 3998d07..c336b29 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -37,8 +37,8 @@ along with RandomX. If not, see. #define WT_INEG_R 2 #define WT_IXOR_R 16 #define WT_IXOR_M 4 -#define WT_IROR_R 8 -#define WT_IROL_R 8 +#define WT_IROR_R 10 +#define WT_IROL_R 0 #define WT_ISWAP_R 4 //Common floating point @@ -49,6 +49,7 @@ along with RandomX. If not, see. #define WT_FADD_M 5 #define WT_FSUB_R 20 #define WT_FSUB_M 5 +#define WT_FNEG_R 6 //Floating point group E #define WT_FMUL_R 20 @@ -61,7 +62,6 @@ along with RandomX. If not, see. #define WT_COND_R 7 #define WT_COND_M 1 #define WT_CFROUND 1 -#define WT_CFSUM_R 0 //Store #define WT_ISTORE 16 @@ -74,7 +74,7 @@ WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \ WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \ WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \ WT_ISWAP_R + WT_FSWAP_R + WT_FADD_R + WT_FADD_M + WT_FSUB_R + WT_FSUB_M + \ -WT_CFSUM_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \ +WT_FNEG_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \ WT_FSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP; static_assert(wtSum == 256, diff --git a/src/instructions.hpp b/src/instructions.hpp deleted file mode 100644 index 6d9a98f..0000000 --- a/src/instructions.hpp +++ /dev/null @@ -1,57 +0,0 @@ -/* -Copyright (c) 2018 tevador - -This file is part of RandomX. - -RandomX is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. 
- -RandomX is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with RandomX. If not, see. -*/ - -#include -#include "common.hpp" - -namespace RandomX { - - extern "C" { - void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c); - void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c); - void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c); - void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c); - void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c); - void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c); - void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c); - void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c); - void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c); - void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c); - void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c); - void AND_64(convertible_t& a, convertible_t& b, convertible_t& c); - void AND_32(convertible_t& a, convertible_t& b, convertible_t& c); - void OR_64(convertible_t& a, convertible_t& b, convertible_t& c); - void OR_32(convertible_t& a, convertible_t& b, convertible_t& c); - void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c); - void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c); - void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c); - void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c); - void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c); - void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c); - void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c); - bool JMP_COND(uint8_t, convertible_t&, int32_t); - void 
FPINIT(); - void FPROUND(convertible_t, uint8_t); - void FADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - void FSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - void FMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - void FDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - void FSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - } -} \ No newline at end of file diff --git a/src/instructionsPortable.cpp b/src/instructionsPortable.cpp index ca85ffc..59d19c5 100644 --- a/src/instructionsPortable.cpp +++ b/src/instructionsPortable.cpp @@ -18,12 +18,14 @@ along with RandomX. If not, see. */ //#define DEBUG #include "intrinPortable.h" +#include "blake2/endian.h" #pragma STDC FENV_ACCESS on #include #include #ifdef DEBUG #include #endif +#include "common.hpp" #if defined(__SIZEOF_INT128__) typedef unsigned __int128 uint128_t; @@ -136,18 +138,18 @@ static inline int32_t safeSub(int32_t a, int32_t b) { #if defined(__has_builtin) #if __has_builtin(__builtin_sub_overflow) - static inline bool subOverflow__(int32_t a, int32_t b) { + static inline bool subOverflow__(uint32_t a, uint32_t b) { int32_t temp; - return __builtin_sub_overflow(a, b, &temp); + return __builtin_sub_overflow(unsigned32ToSigned2sCompl(a), unsigned32ToSigned2sCompl(b), &temp); } #define HAVE_SUB_OVERFLOW #endif #endif #ifndef HAVE_SUB_OVERFLOW - static inline bool subOverflow__(int32_t a, int32_t b) { - auto c = safeSub(a, b); - return (c < a) != (b > 0); + static inline bool subOverflow__(uint32_t a, uint32_t b) { + auto c = unsigned32ToSigned2sCompl(a - b); + return (c < unsigned32ToSigned2sCompl(a)) != (unsigned32ToSigned2sCompl(b) > 0); } #define HAVE_SUB_OVERFLOW #endif @@ -166,40 +168,44 @@ static inline double FlushNaN(double x) { void setRoundMode(uint32_t rcflag) { switch (rcflag & 3) { - case RoundDown: - setRoundMode__(FE_DOWNWARD); - break; - case RoundUp: - setRoundMode__(FE_UPWARD); - break; - case RoundToZero: - setRoundMode__(FE_TOWARDZERO); - break; - default: - 
setRoundMode__(FE_TONEAREST); - break; + case RoundDown: + setRoundMode__(FE_DOWNWARD); + break; + case RoundUp: + setRoundMode__(FE_UPWARD); + break; + case RoundToZero: + setRoundMode__(FE_TOWARDZERO); + break; + case RoundToNearest: + setRoundMode__(FE_TONEAREST); + break; + default: + UNREACHABLE; } } -bool condition(uint32_t type, int32_t value, int32_t imm32) { +bool condition(uint32_t type, uint32_t value, uint32_t imm32) { switch (type & 7) { - case 0: - return (uint32_t)value <= (uint32_t)imm32; - case 1: - return (uint32_t)value > (uint32_t)imm32; - case 2: - return safeSub(value, imm32) < 0; - case 3: - return safeSub(value, imm32) >= 0; - case 4: - return subOverflow__(value, imm32); - case 5: - return !subOverflow__(value, imm32); - case 6: - return value < imm32; - case 7: - return value >= imm32; + case 0: + return value <= imm32; + case 1: + return value > imm32; + case 2: + return unsigned32ToSigned2sCompl(value - imm32) < 0; + case 3: + return unsigned32ToSigned2sCompl(value - imm32) >= 0; + case 4: + return subOverflow__(value, imm32); + case 5: + return !subOverflow__(value, imm32); + case 6: + return unsigned32ToSigned2sCompl(value) < unsigned32ToSigned2sCompl(imm32); + case 7: + return unsigned32ToSigned2sCompl(value) >= unsigned32ToSigned2sCompl(imm32); + default: + UNREACHABLE; } } @@ -211,100 +217,13 @@ void initFpu() { #endif } -namespace RandomX { +union double_ser_t { + double f; + uint64_t i; +}; - extern "C" { - /*void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 / (b.u32 != 0 ? b.u32 : 1U); - } - - void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c) { - if (a.i64 == INT64_MIN && b.i32 == -1) - c.i64 = INT64_MIN; - else - c.i64 = a.i64 / (b.i32 != 0 ? 
b.i32 : 1); - } - - void FADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { -#ifdef __SSE2__ - __m128i ai = _mm_loadl_epi64((const __m128i*)&a); - __m128d ad = _mm_cvtepi32_pd(ai); - __m128d bd = _mm_load_pd(&b.lo.f64); - __m128d cd = _mm_add_pd(ad, bd); - _mm_store_pd(&c.lo.f64, cd); -#else - double alo = (double)a.i32lo; - double ahi = (double)a.i32hi; - c.lo.f64 = alo + b.lo.f64; - c.hi.f64 = ahi + b.hi.f64; -#endif - } - - void FSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { -#ifdef __SSE2__ - __m128i ai = _mm_loadl_epi64((const __m128i*)&a); - __m128d ad = _mm_cvtepi32_pd(ai); - __m128d bd = _mm_load_pd(&b.lo.f64); - __m128d cd = _mm_sub_pd(ad, bd); - _mm_store_pd(&c.lo.f64, cd); -#else - double alo = (double)a.i32lo; - double ahi = (double)a.i32hi; - c.lo.f64 = alo - b.lo.f64; - c.hi.f64 = ahi - b.hi.f64; -#endif - } - - void FMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { -#ifdef __SSE2__ - __m128i ai = _mm_loadl_epi64((const __m128i*)&a); - __m128d ad = _mm_cvtepi32_pd(ai); - __m128d bd = _mm_load_pd(&b.lo.f64); - __m128d cd = _mm_mul_pd(ad, bd); - __m128d mask = _mm_cmpeq_pd(cd, cd); - cd = _mm_and_pd(cd, mask); - _mm_store_pd(&c.lo.f64, cd); -#else - double alo = (double)a.i32lo; - double ahi = (double)a.i32hi; - c.lo.f64 = FlushNaN(alo * b.lo.f64); - c.hi.f64 = FlushNaN(ahi * b.hi.f64); -#endif - } - - void FDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { -#ifdef __SSE2__ - __m128i ai = _mm_loadl_epi64((const __m128i*)&a); - __m128d ad = _mm_cvtepi32_pd(ai); - __m128d bd = _mm_load_pd(&b.lo.f64); - __m128d cd = _mm_div_pd(ad, bd); - __m128d mask = _mm_cmpeq_pd(cd, cd); - cd = _mm_and_pd(cd, mask); - _mm_store_pd(&c.lo.f64, cd); -#else - double alo = (double)a.i32lo; - double ahi = (double)a.i32hi; - c.lo.f64 = FlushDenormalNaN(alo / b.lo.f64); - c.hi.f64 = FlushDenormalNaN(ahi / b.hi.f64); -#endif - } - - void FSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { -#ifdef __SSE2__ - __m128i ai = _mm_loadl_epi64((const __m128i*)&a); - 
__m128d ad = _mm_cvtepi32_pd(ai); - const __m128d absmask = _mm_castsi128_pd(_mm_set1_epi64x(~(1LL << 63))); - ad = _mm_and_pd(ad, absmask); - __m128d cd = _mm_sqrt_pd(ad); - _mm_store_pd(&c.lo.f64, cd); -#else - double alo = (double)a.i32lo; - double ahi = (double)a.i32hi; - c.lo.f64 = sqrt(std::abs(alo)); - c.hi.f64 = sqrt(std::abs(ahi)); -#endif - }*/ - - - } -} \ No newline at end of file +double loadDoublePortable(const void* addr) { + double_ser_t ds; + ds.i = load64(addr); + return ds.f; +} diff --git a/src/intrinPortable.h b/src/intrinPortable.h index 3d2136c..2c2e487 100644 --- a/src/intrinPortable.h +++ b/src/intrinPortable.h @@ -33,12 +33,21 @@ along with RandomX. If not, see. #else #include #endif + +inline __m128d _mm_abs(__m128d xd) { + const __m128d absmask = _mm_castsi128_pd(_mm_set1_epi64x(~(1LL << 63))); + return _mm_and_pd(xd, absmask); +} + +#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) + #else #include #include #define _mm_malloc(a,b) malloc(a) #define _mm_free(a) free(a) +#define PREFETCHNTA(x) typedef union { uint64_t u64[2]; @@ -152,10 +161,29 @@ constexpr int RoundDown = 1; constexpr int RoundUp = 2; constexpr int RoundToZero = 3; +constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) { + return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x); +} + +constexpr int64_t unsigned64ToSigned2sCompl(uint64_t x) { + return (-1 == ~0) ? (int64_t)x : (x > INT64_MAX ? (-(int64_t)(UINT64_MAX - x) - 1) : (int64_t)x); +} + +constexpr uint64_t signExtend2sCompl(uint32_t x) { + return (-1 == ~0) ? (int64_t)(int32_t)(x) : (x > INT32_MAX ? 
(x | 0xffffffff00000000ULL) : (uint64_t)x); +} + +inline __m128d load_cvt_i32x2(const void* addr) { + __m128i ix = _mm_load_si128((const __m128i*)addr); + return _mm_cvtepi32_pd(ix); +} + +double loadDoublePortable(const void* addr); + uint64_t mulh(uint64_t, uint64_t); int64_t smulh(int64_t, int64_t); uint64_t rotl(uint64_t, int); uint64_t rotr(uint64_t, int); void initFpu(); void setRoundMode(uint32_t); -bool condition(uint32_t, int32_t, int32_t); +bool condition(uint32_t, uint32_t, uint32_t); diff --git a/src/main.cpp b/src/main.cpp index 58b381c..b16b13b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -34,7 +34,6 @@ along with RandomX. If not, see. #include #include "dataset.hpp" #include "Cache.hpp" -#include "Pcg32.hpp" #include "hashAes1Rx4.hpp" const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; @@ -117,7 +116,7 @@ void printUsage(const char* executable) { } void generateAsm(int nonce) { - uint64_t hash[4]; + uint64_t hash[8]; unsigned char blockTemplate[] = { 0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14, 0x5a, 0xc5, 0xfa, 0xd3, 0xaa, 0x3a, 0xf6, 0xea, 0x44, 0xc1, 0x18, 0x69, 0xdc, 0x4f, 0x85, 0x3f, 0x00, 0x2b, 0x2e, @@ -128,7 +127,9 @@ void generateAsm(int nonce) { *noncePtr = nonce; blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); RandomX::AssemblyGeneratorX86 asmX86; - asmX86.generateProgram(hash); + RandomX::Program p; + fillAes1Rx4(hash, sizeof(p), &p); + asmX86.generateProgram(p); asmX86.printCode(std::cout); } @@ -143,9 +144,8 @@ void generateNative(int nonce) { int* noncePtr = (int*)(blockTemplate + 39); *noncePtr = nonce; blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); - RandomX::Program prog; - Pcg32 gen(hash); - prog.initialize(gen); + alignas(16) RandomX::Program prog; + 
fillAes1Rx4((void*)hash, sizeof(prog), &prog); for (int i = 0; i < RandomX::ProgramLength; ++i) { prog(i).dst %= 8; prog(i).src %= 8; @@ -173,12 +173,13 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash vm->setScratchpad(scratchpad); //dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt"); for (int chain = 0; chain < 8; ++chain) { - vm->initializeProgram(hash); + fillAes1Rx4((void*)hash, sizeof(RandomX::Program), vm->getProgramBuffer()); + vm->initialize(); vm->execute(); - vm->getResult(nullptr, 0, hash); + vm->getResult(nullptr, 0, hash); } //vm->initializeProgram(hash); - vm->getResult(scratchpad, RandomX::ScratchpadSize, hash); + vm->getResult(scratchpad, RandomX::ScratchpadSize, hash); result.xorWith(hash); if (RandomX::trace) { std::cout << "Nonce: " << nonce << " "; diff --git a/src/program.inc b/src/program.inc index ba4b937..5de4504 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,17 +1,14 @@ ; FMUL_R e0, a2 mulpd xmm4, xmm10 - ; IADD_RC r2, r5, -1621224194 + ; IADD_RC r2, r5, 2673743102 lea r10, [r10+r13-1621224194] ; ISTORE L2[r2], r7 mov eax, r10d and eax, 262136 mov qword ptr [rsi+rax], r15 - ; FSUB_M f2, L1[r2] - mov eax, r10d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm2, xmm12 - ; IMUL_9C r6, -1003503212 + ; FNEG_R f2 + xorps xmm2, xmm15 + ; IMUL_9C r6, 3291464084 lea r14, [r14+r14*8-1003503212] ; FSUB_R f1, a0 subpd xmm1, xmm8 @@ -19,11 +16,8 @@ mov eax, r11d and eax, 262136 xor r13, qword ptr [rsi+rax] - ; FSUB_M f2, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm2, xmm12 + ; FNEG_R f2 + xorps xmm2, xmm15 ; FSUB_R f3, a0 subpd xmm3, xmm8 ; ISDIV_C r0, 1400272688 @@ -38,35 +32,37 @@ mov eax, r15d and eax, 16376 imul r11, qword ptr [rsi+rax] - ; IROL_R r2, r3 - mov ecx, r11d - rol r10, cl + ; ISWAP_R r2, r3 + xchg r10, r11 ; IMULH_R r6, r0 mov rax, r14 mul r8 mov r14, rdx ; FMUL_R e0, a2 mulpd xmm4, 
xmm10 - ; IADD_RC r3, r4, -52260428 + ; IADD_RC r3, r4, 4242706868 lea r11, [r11+r12-52260428] - ; IADD_R r7, -1138617760 + ; IADD_R r7, 3156349536 add r15, -1138617760 ; IXOR_M r2, L1[r6] mov eax, r14d and eax, 16376 xor r10, qword ptr [rsi+rax] - ; FSUB_R f2, a1 - subpd xmm2, xmm9 + ; FSUB_M f2, L1[r5] + mov eax, r13d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 ; IXOR_R r7, r1 xor r15, r9 - ; COND_R r2, lt(r7, -41618808) + ; COND_R r2, lt(r7, 4253348488) xor ecx, ecx cmp r15d, -41618808 setl cl add r10, rcx ; FMUL_R e3, a0 mulpd xmm7, xmm8 - ; COND_R r4, sg(r1, -961190365) + ; COND_R r4, sg(r1, 3333776931) xor ecx, ecx cmp r9d, -961190365 sets cl @@ -122,19 +118,21 @@ addpd xmm1, xmm8 ; FMUL_R e3, a2 mulpd xmm7, xmm10 - ; FADD_R f0, a1 - addpd xmm0, xmm9 + ; FADD_M f0, L2[r5] + mov eax, r13d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm0, xmm12 ; IMUL_R r5, r6 imul r13, r14 - ; IADD_RC r1, r2, -1263285243 + ; IADD_RC r1, r2, 3031682053 lea r9, [r9+r10-1263285243] ; ISUB_M r4, L1[r6] mov eax, r14d and eax, 16376 sub r12, qword ptr [rsi+rax] - ; IROL_R r7, r2 - mov ecx, r10d - rol r15, cl + ; FSWAP_R e3 + shufpd xmm7, xmm7, 1 ; IMUL_R r0, r7 imul r8, r15 ; IXOR_R r1, r6 @@ -156,9 +154,8 @@ andps xmm12, xmm14 divpd xmm6, xmm12 maxpd xmm6, xmm13 - ; IROL_R r2, r0 - mov ecx, r8d - rol r10, cl + ; ISWAP_R r2, r0 + xchg r10, r8 ; IADD_R r7, r5 add r15, r13 ; FDIV_M e0, L1[r4] @@ -210,8 +207,11 @@ mov eax, r8d and eax, 16376 mov qword ptr [rsi+rax], r15 - ; FSUB_R f0, a1 - subpd xmm0, xmm9 + ; FSUB_M f0, L2[r1] + mov eax, r9d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm0, xmm12 ; FADD_R f3, a1 addpd xmm3, xmm9 ; IXOR_R r5, r4 @@ -220,8 +220,8 @@ mov eax, r15d and eax, 262136 mov qword ptr [rsi+rax], r10 - ; ISWAP_R r6, r7 - xchg r14, r15 + ; FSWAP_R e2 + shufpd xmm6, xmm6, 1 ; FADD_R f3, a2 addpd xmm3, xmm10 ; ISMULH_R r5, r0 @@ -232,7 +232,7 @@ mov eax, r12d and eax, 16376 add r8, qword ptr 
[rsi+rax] - ; COND_R r7, ge(r6, -1972898485) + ; COND_R r7, ge(r6, 2322068811) xor ecx, ecx cmp r14d, -1972898485 setge cl @@ -242,9 +242,9 @@ ; IROR_R r7, r6 mov ecx, r14d ror r15, cl - ; IADD_RC r2, r4, -117457973 + ; IADD_RC r2, r4, 4177509323 lea r10, [r10+r12-117457973] - ; IMUL_R r0, -1500893068 + ; IMUL_R r0, 2794074228 imul r8, -1500893068 ; IADD_R r2, r3 add r10, r11 @@ -265,19 +265,19 @@ lea r14, [r14+r14+540663146] ; IROR_R r5, 58 ror r13, 58 - ; FSWAP_R f2 - shufpd xmm2, xmm2, 1 - ; FSWAP_R f2 - shufpd xmm2, xmm2, 1 + ; FADD_R f2, a1 + addpd xmm2, xmm9 + ; FADD_R f2, a2 + addpd xmm2, xmm10 ; FMUL_R e1, a2 mulpd xmm5, xmm10 - ; ISWAP_R r5, r6 - xchg r13, r14 + ; FSWAP_R e1 + shufpd xmm5, xmm5, 1 ; IADD_R r5, r3 add r13, r11 - ; IADD_R r7, -1780268176 + ; IADD_R r7, 2514699120 add r15, -1780268176 - ; IADD_RC r7, r0, -1497756854 + ; IADD_RC r7, r0, 2797210442 lea r15, [r15+r8-1497756854] ; ISTORE L2[r0], r7 mov eax, r8d @@ -287,8 +287,11 @@ mov rax, r10 imul r12 mov r10, rdx - ; FSUB_R f0, a2 - subpd xmm0, xmm10 + ; FSUB_M f0, L1[r2] + mov eax, r10d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm0, xmm12 ; ISMULH_R r2, r3 mov rax, r10 imul r11 @@ -301,8 +304,11 @@ addpd xmm2, xmm8 ; FMUL_R e0, a2 mulpd xmm4, xmm10 - ; FADD_R f2, a3 - addpd xmm2, xmm11 + ; FADD_M f2, L1[r3] + mov eax, r11d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm2, xmm12 ; IMUL_R r1, r2 imul r9, r10 ; IMUL_M r7, L1[r5] @@ -313,11 +319,8 @@ imul r11, r10 ; IXOR_R r1, r0 xor r9, r8 - ; FSUB_M f0, L1[r5] - mov eax, r13d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm0, xmm12 + ; FNEG_R f0 + xorps xmm0, xmm15 ; IADD_RC r4, r4, 1456841848 lea r12, [r12+r12+1456841848] ; IXOR_R r3, r2 @@ -327,19 +330,16 @@ cmp r12d, 1678513610 seto cl add r8, rcx - ; ISMULH_R r4, -1620573087 - mov rax, -1620573087 + ; ISMULH_R r4, r4 + mov rax, r12 imul r12 - add r12, rdx + mov r12, rdx ; IMUL_R r4, r1 imul r12, r9 - ; FSWAP_R e1 - shufpd xmm5, 
xmm5, 1 - ; FADD_M f2, L1[r0] - mov eax, r8d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm2, xmm12 + ; FADD_R f1, a2 + addpd xmm1, xmm10 + ; FSUB_R f2, a0 + subpd xmm2, xmm8 ; FMUL_R e1, a2 mulpd xmm5, xmm10 ; FSUB_R f0, a3 @@ -362,29 +362,35 @@ sub r12, qword ptr [rsi+rax] ; FADD_R f2, a2 addpd xmm2, xmm10 - ; FSUB_R f3, a0 - subpd xmm3, xmm8 + ; FSUB_M f3, L2[r4] + mov eax, r12d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm3, xmm12 ; IXOR_R r7, r2 xor r15, r10 ; IXOR_R r0, r5 xor r8, r13 - ; FSWAP_R f1 - shufpd xmm1, xmm1, 1 + ; FADD_R f1, a2 + addpd xmm1, xmm10 ; FMUL_R e3, a2 mulpd xmm7, xmm10 - ; ISWAP_R r7, r1 - xchg r15, r9 - ; ISWAP_R r1, r4 - xchg r9, r12 - ; COND_R r2, ge(r2, -226330940) + ; FSWAP_R e3 + shufpd xmm7, xmm7, 1 + ; FSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; COND_R r2, ge(r2, 4068636356) xor ecx, ecx cmp r10d, -226330940 setge cl add r10, rcx ; FMUL_R e2, a3 mulpd xmm6, xmm11 - ; FSUB_R f2, a1 - subpd xmm2, xmm9 + ; FSUB_M f2, L2[r1] + mov eax, r9d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 ; FADD_R f1, a0 addpd xmm1, xmm8 ; ISUB_R r7, r5 @@ -395,12 +401,11 @@ sub r8, qword ptr [rsi+rax] ; FSUB_R f3, a1 subpd xmm3, xmm9 - ; IROL_R r3, r5 - mov ecx, r13d - rol r11, cl + ; ISWAP_R r3, r5 + xchg r11, r13 ; IADD_RC r5, r2, 795784298 lea r13, [r13+r10+795784298] - ; IADD_RC r0, r4, -2050178553 + ; IADD_RC r0, r4, 2244788743 lea r8, [r8+r12-2050178553] ; IMUL_9C r5, 1062534001 lea r13, [r13+r13*8+1062534001] @@ -436,16 +441,15 @@ mov rax, r12 imul r10 mov r12, rdx - ; IROL_R r3, r0 - mov ecx, r8d - rol r11, cl + ; ISWAP_R r3, r0 + xchg r11, r8 ; IXOR_R r2, r0 xor r10, r8 ; IXOR_M r0, L2[r1] mov eax, r9d and eax, 262136 xor r8, qword ptr [rsi+rax] - ; ISDIV_C r7, -935446980 + ; ISDIV_C r7, 3359520316 mov rax, 7859804860668271393 imul r15 xor eax, eax @@ -458,11 +462,8 @@ mov eax, r10d and eax, 16376 imul r14, qword ptr [rsi+rax] - ; FSUB_M f3, L1[r6] - mov eax, r14d - and eax, 
16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm3, xmm12 + ; FNEG_R f3 + xorps xmm3, xmm15 ; IADD_RC r4, r2, 1704868083 lea r12, [r12+r10+1704868083] ; FADD_R f2, a0 @@ -471,8 +472,11 @@ mov eax, r8d and eax, 16376 mov qword ptr [rsi+rax], r8 - ; FADD_R f0, a3 - addpd xmm0, xmm11 + ; FADD_M f0, L1[r7] + mov eax, r15d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm0, xmm12 ; FMUL_R e0, a3 mulpd xmm4, xmm11 ; FSUB_R f3, a2 @@ -481,8 +485,8 @@ lea r15, [r15+r15+1302457878] ; ISUB_R r1, 1330165941 sub r9, 1330165941 - ; FSUB_R f1, a3 - subpd xmm1, xmm11 + ; FNEG_R f1 + xorps xmm1, xmm15 ; IROR_R r0, r4 mov ecx, r12d ror r8, cl @@ -491,7 +495,7 @@ ; IROR_R r5, r6 mov ecx, r14d ror r13, cl - ; COND_R r0, ab(r1, -310933871) + ; COND_R r0, ab(r1, 3984033425) xor ecx, ecx cmp r9d, -310933871 seta cl @@ -516,22 +520,22 @@ andps xmm12, xmm14 divpd xmm5, xmm12 maxpd xmm5, xmm13 - ; IROL_R r1, 5 - rol r9, 5 - ; IADD_R r7, -1421188024 + ; FSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; IADD_R r7, 2873779272 add r15, -1421188024 - ; FSUB_R f3, a2 - subpd xmm3, xmm10 + ; FSUB_M f3, L2[r2] + mov eax, r10d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm3, xmm12 ; FSUB_R f2, a3 subpd xmm2, xmm11 - ; FADD_M f3, L1[r1] - mov eax, r9d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 + ; FSUB_R f3, a1 + subpd xmm3, xmm9 ; FMUL_R e1, a3 mulpd xmm5, xmm11 - ; IADD_RC r2, r4, -317832028 + ; IADD_RC r2, r4, 3977135268 lea r10, [r10+r12-317832028] ; IMUL_M r4, L1[r5] mov eax, r13d @@ -575,12 +579,11 @@ sub r12, r9 ; ISUB_R r3, r0 sub r11, r8 - ; IROL_R r7, r5 - mov ecx, r13d - rol r15, cl + ; ISWAP_R r7, r5 + xchg r15, r13 ; IMUL_R r2, r6 imul r10, r14 - ; COND_R r2, ge(r2, -1892157506) + ; COND_R r2, ge(r2, 2402809790) xor ecx, ecx cmp r10d, -1892157506 setge cl @@ -596,7 +599,7 @@ add r9, rdx ; FADD_R f0, a1 addpd xmm0, xmm9 - ; IADD_RC r5, r7, -477591118 + ; IADD_RC r5, r7, 3817376178 lea r13, [r13+r15-477591118] ; FSUB_R f0, a3 
subpd xmm0, xmm11 @@ -610,9 +613,12 @@ add r8, r12 ; FSUB_R f3, a1 subpd xmm3, xmm9 - ; FSUB_R f2, a0 - subpd xmm2, xmm8 - ; ISDIV_C r2, -396711688 + ; FSUB_M f2, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 + ; ISDIV_C r2, 3898255608 mov rax, 5964731804029407733 imul r10 xor eax, eax @@ -621,16 +627,19 @@ sets al add rdx, rax add r10, rdx - ; FSUB_R f2, a2 - subpd xmm2, xmm10 + ; FNEG_R f2 + xorps xmm2, xmm15 ; FSUB_R f3, a2 subpd xmm3, xmm10 ; FADD_R f1, a3 addpd xmm1, xmm11 ; IMUL_R r3, r2 imul r11, r10 - ; FADD_R f0, a3 - addpd xmm0, xmm11 + ; FADD_M f0, L1[r3] + mov eax, r11d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm0, xmm12 ; ISMULH_R r5, r2 mov rax, r13 imul r10 @@ -639,28 +648,30 @@ mov rax, r14 mul r10 mov r14, rdx - ; FADD_R f3, a3 - addpd xmm3, xmm11 + ; FADD_M f3, L1[r3] + mov eax, r11d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm3, xmm12 ; IMUL_R r6, r7 imul r14, r15 ; FSUB_R f0, a0 subpd xmm0, xmm8 - ; FSUB_R f2, a0 - subpd xmm2, xmm8 + ; FNEG_R f2 + xorps xmm2, xmm15 ; ISUB_R r6, r4 sub r14, r12 - ; FSWAP_R f1 - shufpd xmm1, xmm1, 1 + ; FADD_R f1, a1 + addpd xmm1, xmm9 ; IXOR_R r0, r5 xor r8, r13 ; FADD_R f2, a1 addpd xmm2, xmm9 - ; IROL_R r7, r5 - mov ecx, r13d - rol r15, cl + ; ISWAP_R r7, r5 + xchg r15, r13 ; FMUL_R e3, a2 mulpd xmm7, xmm10 - ; IADD_RC r3, r6, -1317630728 + ; IADD_RC r3, r6, 2977336568 lea r11, [r11+r14-1317630728] ; IMUL_R r2, r3 imul r10, r11 @@ -668,11 +679,8 @@ lea r9, [r9+r12+894105694] ; IMUL_9C r7, 504293473 lea r15, [r15+r15*8+504293473] - ; FADD_M f1, L2[r0] - mov eax, r8d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm1, xmm12 + ; FSUB_R f1, a0 + subpd xmm1, xmm8 ; IMUL_R r7, r1 imul r15, r9 ; IXOR_R r2, r4 @@ -713,19 +721,16 @@ mov eax, r9d and eax, 16376 mov qword ptr [rsi+rax], r13 - ; FSUB_M f0, L1[r1] - mov eax, r9d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm0, xmm12 + ; 
FNEG_R f0 + xorps xmm0, xmm15 ; FSQRT_R e2 sqrtpd xmm6, xmm6 ; FMUL_R e0, a3 mulpd xmm4, xmm11 ; FMUL_R e3, a2 mulpd xmm7, xmm10 - ; IROL_R r5, r2 + ; IROR_R r5, r2 mov ecx, r10d - rol r13, cl + ror r13, cl ; IADD_R r0, r4 add r8, r12 diff --git a/src/squareHash.h b/src/squareHash.h index f80b492..05939d7 100644 --- a/src/squareHash.h +++ b/src/squareHash.h @@ -17,6 +17,11 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. */ +/* + Based on the original idea by SChernykh: + https://github.com/SChernykh/xmr-stak-cpu/issues/1#issuecomment-414336613 +*/ + #include #if !defined(_M_X64) && !defined(__x86_64__) diff --git a/src/t1ha/t1ha.h b/src/t1ha/t1ha.h deleted file mode 100644 index 6b56e16..0000000 --- a/src/t1ha/t1ha.h +++ /dev/null @@ -1,723 +0,0 @@ -/* - * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, - * Fast Positive Hash. - * - * Portions Copyright (c) 2010-2018 Leonid Yuriev , - * The 1Hippeus project (t1h). - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgement in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
- */ - -/* - * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } - * by [Positive Technologies](https://www.ptsecurity.ru) - * - * Briefly, it is a 64-bit Hash Function: - * 1. Created for 64-bit little-endian platforms, in predominantly for x86_64, - * but portable and without penalties it can run on any 64-bit CPU. - * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash - * and all others portable hash-functions (which do not use specific - * hardware tricks). - * 3. Not suitable for cryptography. - * - * The Future will Positive. Всё будет хорошо. - * - * ACKNOWLEDGEMENT: - * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) - * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! - */ - -#pragma once - -/***************************************************************************** - * - * PLEASE PAY ATTENTION TO THE FOLLOWING NOTES - * about macros definitions which controls t1ha behaviour and/or performance. - * - * - * 1) T1HA_SYS_UNALIGNED_ACCESS = Defines the system/platform/CPU/architecture - * abilities for unaligned data access. - * - * By default, when the T1HA_SYS_UNALIGNED_ACCESS not defined, - * it will defined on the basis hardcoded knowledge about of capabilities - * of most common CPU architectures. But you could override this - * default behavior when build t1ha library itself: - * - * // To disable unaligned access at all. - * #define T1HA_SYS_UNALIGNED_ACCESS 0 - * - * // To enable unaligned access, but indicate that it significally slow. - * #define T1HA_SYS_UNALIGNED_ACCESS 1 - * - * // To enable unaligned access, and indicate that it effecient. - * #define T1HA_SYS_UNALIGNED_ACCESS 2 - * - * - * 2) T1HA_USE_FAST_ONESHOT_READ = Controls the data reads at the end of buffer. - * - * When defined to non-zero, t1ha will use 'one shot' method for reading - * up to 8 bytes at the end of data. 
In this case just the one 64-bit read - * will be performed even when the available less than 8 bytes. - * - * This is little bit faster that switching by length of data tail. - * Unfortunately this will triggering a false-positive alarms from Valgrind, - * AddressSanitizer and other similar tool. - * - * By default, t1ha defines it to 1, but you could override this - * default behavior when build t1ha library itself: - * - * // For little bit faster and small code. - * #define T1HA_USE_FAST_ONESHOT_READ 1 - * - * // For calmness if doubt. - * #define T1HA_USE_FAST_ONESHOT_READ 0 - * - * - * 3) T1HA0_RUNTIME_SELECT = Controls choice fastest function in runtime. - * - * t1ha library offers the t1ha0() function as the fastest for current CPU. - * But actual CPU's features/capabilities and may be significantly different, - * especially on x86 platform. Therefore, internally, t1ha0() may require - * dynamic dispatching for choice best implementation. - * - * By default, t1ha enables such runtime choice and (may be) corresponding - * indirect calls if it reasonable, but you could override this default - * behavior when build t1ha library itself: - * - * // To enable runtime choice of fastest implementation. - * #define T1HA0_RUNTIME_SELECT 1 - * - * // To disable runtime choice of fastest implementation. - * #define T1HA0_RUNTIME_SELECT 0 - * - * When T1HA0_RUNTIME_SELECT is nonzero the t1ha0_resolve() function could - * be used to get actual t1ha0() implementation address at runtime. This is - * useful for two cases: - * - calling by local pointer-to-function usually is little - * bit faster (less overhead) than via a PLT thru the DSO boundary. - * - GNU Indirect functions (see below) don't supported by environment - * and calling by t1ha0_funcptr is not available and/or expensive. - * - * 4) T1HA_USE_INDIRECT_FUNCTIONS = Controls usage of GNU Indirect functions. 
- * - * In continue of T1HA0_RUNTIME_SELECT the T1HA_USE_INDIRECT_FUNCTIONS - * controls usage of ELF indirect functions feature. In general, when - * available, this reduces overhead of indirect function's calls though - * a DSO-bundary (https://sourceware.org/glibc/wiki/GNU_IFUNC). - * - * By default, t1ha engage GNU Indirect functions when it available - * and useful, but you could override this default behavior when build - * t1ha library itself: - * - * // To enable use of GNU ELF Indirect functions. - * #define T1HA_USE_INDIRECT_FUNCTIONS 1 - * - * // To disable use of GNU ELF Indirect functions. This may be useful - * // if the actual toolchain or the system's loader don't support ones. - * #define T1HA_USE_INDIRECT_FUNCTIONS 0 - * - * 5) T1HA0_AESNI_AVAILABLE = Controls AES-NI detection and dispatching on x86. - * - * In continue of T1HA0_RUNTIME_SELECT the T1HA0_AESNI_AVAILABLE controls - * detection and usage of AES-NI CPU's feature. On the other hand, this - * requires compiling parts of t1ha library with certain properly options, - * and could be difficult or inconvenient in some cases. - * - * By default, t1ha engade AES-NI for t1ha0() on the x86 platform, but - * you could override this default behavior when build t1ha library itself: - * - * // To disable detection and usage of AES-NI instructions for t1ha0(). - * // This may be useful when you unable to build t1ha library properly - * // or known that AES-NI will be unavailable at the deploy. - * #define T1HA0_AESNI_AVAILABLE 0 - * - * // To force detection and usage of AES-NI instructions for t1ha0(), - * // but I don't known reasons to anybody would need this. - * #define T1HA0_AESNI_AVAILABLE 1 - * - * 6) T1HA0_DISABLED, T1HA1_DISABLED, T1HA2_DISABLED = Controls availability of - * t1ha functions. - * - * In some cases could be useful to import/use only few of t1ha functions - * or just the one. So, this definitions allows disable corresponding parts - * of t1ha library. 
- * - * // To disable t1ha0(), t1ha0_32le(), t1ha0_32be() and all AES-NI. - * #define T1HA0_DISABLED - * - * // To disable t1ha1_le() and t1ha1_be(). - * #define T1HA1_DISABLED - * - * // To disable t1ha2_atonce(), t1ha2_atonce128() and so on. - * #define T1HA2_DISABLED - * - *****************************************************************************/ - -#define T1HA_VERSION_MAJOR 2 -#define T1HA_VERSION_MINOR 1 -#define T1HA_VERSION_RELEASE 0 - -#ifndef __has_attribute -#define __has_attribute(x) (0) -#endif - -#ifndef __has_include -#define __has_include(x) (0) -#endif - -#ifndef __GNUC_PREREQ -#if defined(__GNUC__) && defined(__GNUC_MINOR__) -#define __GNUC_PREREQ(maj, min) \ - ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) -#else -#define __GNUC_PREREQ(maj, min) 0 -#endif -#endif /* __GNUC_PREREQ */ - -#ifndef __CLANG_PREREQ -#ifdef __clang__ -#define __CLANG_PREREQ(maj, min) \ - ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min)) -#else -#define __CLANG_PREREQ(maj, min) (0) -#endif -#endif /* __CLANG_PREREQ */ - -#ifndef __LCC_PREREQ -#ifdef __LCC__ -#define __LCC_PREREQ(maj, min) \ - ((__LCC__ << 16) + __LCC_MINOR__ >= ((maj) << 16) + (min)) -#else -#define __LCC_PREREQ(maj, min) (0) -#endif -#endif /* __LCC_PREREQ */ - -/*****************************************************************************/ - -#ifdef _MSC_VER -/* Avoid '16' bytes padding added after data member 't1ha_context::total' - * and other warnings from std-headers if warning-level > 3. 
*/ -#pragma warning(push, 3) -#endif - -#if defined(__cplusplus) && __cplusplus >= 201103L -#include -#include -#include -#else -#include -#include -#include -#endif - -/*****************************************************************************/ - -#if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ - defined(i486) || defined(__i486) || defined(__i486__) || \ - defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ - defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ - defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ - defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ - defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ - defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) -#ifndef __ia32__ -/* LY: define neutral __ia32__ for x86 and x86-64 archs */ -#define __ia32__ 1 -#endif /* __ia32__ */ -#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ - defined(__amd64) || defined(_M_X64)) -/* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ -#define __amd64__ 1 -#endif /* __amd64__ */ -#endif /* all x86 */ - -#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ - !defined(__ORDER_BIG_ENDIAN__) - -/* *INDENT-OFF* */ -/* clang-format off */ - -#if defined(__GLIBC__) || defined(__GNU_LIBRARY__) || defined(__ANDROID__) || \ - defined(HAVE_ENDIAN_H) || __has_include() -#include -#elif defined(__APPLE__) || defined(__MACH__) || defined(__OpenBSD__) || \ - defined(HAVE_MACHINE_ENDIAN_H) || __has_include() -#include -#elif defined(HAVE_SYS_ISA_DEFS_H) || __has_include() -#include -#elif (defined(HAVE_SYS_TYPES_H) && defined(HAVE_SYS_ENDIAN_H)) || \ - (__has_include() && __has_include()) -#include -#include -#elif defined(__bsdi__) || defined(__DragonFly__) || defined(__FreeBSD__) || \ - defined(__NETBSD__) || defined(__NetBSD__) || \ - defined(HAVE_SYS_PARAM_H) || __has_include() -#include -#endif /* OS */ - -/* 
*INDENT-ON* */ -/* clang-format on */ - -#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN) -#define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN -#define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN -#define __BYTE_ORDER__ __BYTE_ORDER -#elif defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) -#define __ORDER_LITTLE_ENDIAN__ _LITTLE_ENDIAN -#define __ORDER_BIG_ENDIAN__ _BIG_ENDIAN -#define __BYTE_ORDER__ _BYTE_ORDER -#else -#define __ORDER_LITTLE_ENDIAN__ 1234 -#define __ORDER_BIG_ENDIAN__ 4321 - -#if defined(__LITTLE_ENDIAN__) || \ - (defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)) || \ - defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || \ - defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) || \ - defined(_M_ARM) || defined(_M_ARM64) || defined(__e2k__) || \ - defined(__elbrus_4c__) || defined(__elbrus_8c__) || defined(__bfin__) || \ - defined(__BFIN__) || defined(__ia64__) || defined(_IA64) || \ - defined(__IA64__) || defined(__ia64) || defined(_M_IA64) || \ - defined(__itanium__) || defined(__ia32__) || defined(__CYGWIN__) || \ - defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ - defined(__WINDOWS__) -#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ - -#elif defined(__BIG_ENDIAN__) || \ - (defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)) || \ - defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) || \ - defined(__MIPSEB__) || defined(_MIPSEB) || defined(__MIPSEB) || \ - defined(__m68k__) || defined(M68000) || defined(__hppa__) || \ - defined(__hppa) || defined(__HPPA__) || defined(__sparc__) || \ - defined(__sparc) || defined(__370__) || defined(__THW_370__) || \ - defined(__s390__) || defined(__s390x__) || defined(__SYSC_ZARCH__) -#define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__ - -#else -#error __BYTE_ORDER__ should be defined. 
-#endif /* Arch */ - -#endif -#endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ - -/*****************************************************************************/ - -#ifndef __dll_export -#if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__) -#if defined(__GNUC__) || __has_attribute(dllexport) -#define __dll_export __attribute__((dllexport)) -#elif defined(_MSC_VER) -#define __dll_export __declspec(dllexport) -#else -#define __dll_export -#endif -#elif defined(__GNUC__) || __has_attribute(visibility) -#define __dll_export __attribute__((visibility("default"))) -#else -#define __dll_export -#endif -#endif /* __dll_export */ - -#ifndef __dll_import -#if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__) -#if defined(__GNUC__) || __has_attribute(dllimport) -#define __dll_import __attribute__((dllimport)) -#elif defined(_MSC_VER) -#define __dll_import __declspec(dllimport) -#else -#define __dll_import -#endif -#else -#define __dll_import -#endif -#endif /* __dll_import */ - -#ifndef __force_inline -#ifdef _MSC_VER -#define __force_inline __forceinline -#elif __GNUC_PREREQ(3, 2) || __has_attribute(always_inline) -#define __force_inline __inline __attribute__((always_inline)) -#else -#define __force_inline __inline -#endif -#endif /* __force_inline */ - -#ifndef T1HA_API -#if defined(t1ha_EXPORTS) -#define T1HA_API __dll_export -#elif defined(t1ha_IMPORTS) -#define T1HA_API __dll_import -#else -#define T1HA_API -#endif -#endif /* T1HA_API */ - -#if defined(_MSC_VER) && defined(__ia32__) -#define T1HA_ALIGN_PREFIX __declspec(align(32)) /* required only for SIMD */ -#else -#define T1HA_ALIGN_PREFIX -#endif /* _MSC_VER */ - -#if defined(__GNUC__) && defined(__ia32__) -#define T1HA_ALIGN_SUFFIX \ - __attribute__((aligned(32))) /* required only for SIMD */ -#else -#define T1HA_ALIGN_SUFFIX -#endif /* GCC x86 */ - -#ifndef T1HA_USE_INDIRECT_FUNCTIONS -/* GNU ELF indirect functions usage control. 
For more info please see - * https://en.wikipedia.org/wiki/Executable_and_Linkable_Format - * and https://sourceware.org/glibc/wiki/GNU_IFUNC */ -#if __has_attribute(ifunc) && \ - defined(__ELF__) /* ifunc is broken on Darwin/OSX */ -/* Use ifunc/gnu_indirect_function if corresponding attribute is available, - * Assuming compiler will generate properly code even when - * the -fstack-protector-all and/or the -fsanitize=address are enabled. */ -#define T1HA_USE_INDIRECT_FUNCTIONS 1 -#elif defined(__ELF__) && !defined(__SANITIZE_ADDRESS__) && \ - !defined(__SSP_ALL__) -/* ifunc/gnu_indirect_function will be used on ELF, but only if both - * -fstack-protector-all and -fsanitize=address are NOT enabled. */ -#define T1HA_USE_INDIRECT_FUNCTIONS 1 -#else -#define T1HA_USE_INDIRECT_FUNCTIONS 0 -#endif -#endif /* T1HA_USE_INDIRECT_FUNCTIONS */ - -#if __GNUC_PREREQ(4, 0) -#pragma GCC visibility push(hidden) -#endif /* __GNUC_PREREQ(4,0) */ - -#ifdef __cplusplus -extern "C" { -#endif - -typedef union T1HA_ALIGN_PREFIX t1ha_state256 { - uint8_t bytes[32]; - uint32_t u32[8]; - uint64_t u64[4]; - struct { - uint64_t a, b, c, d; - } n; -} t1ha_state256_t T1HA_ALIGN_SUFFIX; - -typedef struct t1ha_context { - t1ha_state256_t state; - t1ha_state256_t buffer; - size_t partial; - uint64_t total; -} t1ha_context_t; - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -/****************************************************************************** - * - * Self-testing API. - * - * Unfortunately, some compilers (exactly only Microsoft Visual C/C++) has - * a bugs which leads t1ha-functions to produce wrong results. This API allows - * check the correctness of the actual code in runtime. - * - * All check-functions returns 0 on success, or -1 in case the corresponding - * hash-function failed verification. PLEASE, always perform such checking at - * initialization of your code, if you using MSVC or other troubleful compilers. 
- */ - -T1HA_API int t1ha_selfcheck__all_enabled(void); - -#ifndef T1HA2_DISABLED -T1HA_API int t1ha_selfcheck__t1ha2_atonce(void); -T1HA_API int t1ha_selfcheck__t1ha2_atonce128(void); -T1HA_API int t1ha_selfcheck__t1ha2_stream(void); -T1HA_API int t1ha_selfcheck__t1ha2(void); -#endif /* T1HA2_DISABLED */ - -#ifndef T1HA1_DISABLED -T1HA_API int t1ha_selfcheck__t1ha1_le(void); -T1HA_API int t1ha_selfcheck__t1ha1_be(void); -T1HA_API int t1ha_selfcheck__t1ha1(void); -#endif /* T1HA1_DISABLED */ - -#ifndef T1HA0_DISABLED -T1HA_API int t1ha_selfcheck__t1ha0_32le(void); -T1HA_API int t1ha_selfcheck__t1ha0_32be(void); -T1HA_API int t1ha_selfcheck__t1ha0(void); - -/* Define T1HA0_AESNI_AVAILABLE to 0 for disable AES-NI support. */ -#ifndef T1HA0_AESNI_AVAILABLE -#if defined(__e2k__) || \ - (defined(__ia32__) && (!defined(_M_IX86) || _MSC_VER > 1800)) -#define T1HA0_AESNI_AVAILABLE 1 -#else -#define T1HA0_AESNI_AVAILABLE 0 -#endif -#endif /* ifndef T1HA0_AESNI_AVAILABLE */ - -#if T1HA0_AESNI_AVAILABLE -T1HA_API int t1ha_selfcheck__t1ha0_ia32aes_noavx(void); -T1HA_API int t1ha_selfcheck__t1ha0_ia32aes_avx(void); -#ifndef __e2k__ -T1HA_API int t1ha_selfcheck__t1ha0_ia32aes_avx2(void); -#endif -#endif /* if T1HA0_AESNI_AVAILABLE */ -#endif /* T1HA0_DISABLED */ - -/****************************************************************************** - * - * t1ha2 = 64 and 128-bit, SLIGHTLY MORE ATTENTION FOR QUALITY AND STRENGTH. - * - * - The recommended version of "Fast Positive Hash" with good quality - * for checksum, hash tables and fingerprinting. - * - Portable and extremely efficiency on modern 64-bit CPUs. - * Designed for 64-bit little-endian platforms, - * in other cases will runs slowly. - * - Great quality of hashing and still faster than other non-t1ha hashes. - * Provides streaming mode and 128-bit result. - * - * Note: Due performance reason 64- and 128-bit results are completely - * different each other, i.e. 64-bit result is NOT any part of 128-bit. 
- */ -#ifndef T1HA2_DISABLED - -/* The at-once variant with 64-bit result */ -T1HA_API uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed); - -/* The at-once variant with 128-bit result. - * Argument `extra_result` is NOT optional and MUST be valid. - * The high 64-bit part of 128-bit hash will be always unconditionally - * stored to the address given by `extra_result` argument. */ -T1HA_API uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result, - const void *__restrict data, size_t length, - uint64_t seed); - -/* The init/update/final trinity for streaming. - * Return 64 or 128-bit result depentently from `extra_result` argument. */ -T1HA_API void t1ha2_init(t1ha_context_t *ctx, uint64_t seed_x, uint64_t seed_y); -T1HA_API void t1ha2_update(t1ha_context_t *__restrict ctx, - const void *__restrict data, size_t length); - -/* Argument `extra_result` is optional and MAY be NULL. - * - If `extra_result` is NOT NULL then the 128-bit hash will be calculated, - * and high 64-bit part of it will be stored to the address given - * by `extra_result` argument. - * - Otherwise the 64-bit hash will be calculated - * and returned from function directly. - * - * Note: Due performance reason 64- and 128-bit results are completely - * different each other, i.e. 64-bit result is NOT any part of 128-bit. */ -T1HA_API uint64_t t1ha2_final(t1ha_context_t *__restrict ctx, - uint64_t *__restrict extra_result /* optional */); - -#endif /* T1HA2_DISABLED */ - -/****************************************************************************** - * - * t1ha1 = 64-bit, BASELINE FAST PORTABLE HASH: - * - * - Runs faster on 64-bit platforms in other cases may runs slowly. - * - Portable and stable, returns same 64-bit result - * on all architectures and CPUs. - * - Unfortunately it fails the "strict avalanche criteria", - * see test results at https://github.com/demerphq/smhasher. 
- * - * This flaw is insignificant for the t1ha1() purposes and imperceptible - * from a practical point of view. - * However, nowadays this issue has resolved in the next t1ha2(), - * that was initially planned to providing a bit more quality. - */ -#ifndef T1HA1_DISABLED - -/* The little-endian variant. */ -T1HA_API uint64_t t1ha1_le(const void *data, size_t length, uint64_t seed); - -/* The big-endian variant. */ -T1HA_API uint64_t t1ha1_be(const void *data, size_t length, uint64_t seed); - -#endif /* T1HA1_DISABLED */ - -/****************************************************************************** - * - * t1ha0 = 64-bit, JUST ONLY FASTER: - * - * - Provides fast-as-possible hashing for current CPU, including - * 32-bit systems and engaging the available hardware acceleration. - * - It is a facade that selects most quick-and-dirty hash - * for the current processor. For instance, on IA32 (x86) actual function - * will be selected in runtime, depending on current CPU capabilities - * - * BE CAREFUL!!! THIS IS MEANS: - * - * 1. The quality of hash is a subject for tradeoffs with performance. - * So, the quality and strength of t1ha0() may be lower than t1ha1(), - * especially on 32-bit targets, but then much faster. - * However, guaranteed that it passes all SMHasher tests. - * - * 2. No warranty that the hash result will be same for particular - * key on another machine or another version of libt1ha. - * - * Briefly, such hash-results and their derivatives, should be - * used only in runtime, but should not be persist or transferred - * over a network. - * - * - * When T1HA0_RUNTIME_SELECT is nonzero the t1ha0_resolve() function could - * be used to get actual t1ha0() implementation address at runtime. This is - * useful for two cases: - * - calling by local pointer-to-function usually is little - * bit faster (less overhead) than via a PLT thru the DSO boundary. 
- * - GNU Indirect functions (see below) don't supported by environment - * and calling by t1ha0_funcptr is not available and/or expensive. - */ - -#ifndef T1HA0_DISABLED - -/* The little-endian variant for 32-bit CPU. */ -uint64_t t1ha0_32le(const void *data, size_t length, uint64_t seed); -/* The big-endian variant for 32-bit CPU. */ -uint64_t t1ha0_32be(const void *data, size_t length, uint64_t seed); - -/* Define T1HA0_AESNI_AVAILABLE to 0 for disable AES-NI support. */ -#ifndef T1HA0_AESNI_AVAILABLE -#if defined(__e2k__) || \ - (defined(__ia32__) && (!defined(_M_IX86) || _MSC_VER > 1800)) -#define T1HA0_AESNI_AVAILABLE 1 -#else -#define T1HA0_AESNI_AVAILABLE 0 -#endif -#endif /* T1HA0_AESNI_AVAILABLE */ - -/* Define T1HA0_RUNTIME_SELECT to 0 for disable dispatching t1ha0 at runtime. */ -#ifndef T1HA0_RUNTIME_SELECT -#if T1HA0_AESNI_AVAILABLE && !defined(__e2k__) -#define T1HA0_RUNTIME_SELECT 1 -#else -#define T1HA0_RUNTIME_SELECT 0 -#endif -#endif /* T1HA0_RUNTIME_SELECT */ - -#if !T1HA0_RUNTIME_SELECT && !defined(T1HA0_USE_DEFINE) -#if defined(__LCC__) -#define T1HA0_USE_DEFINE 1 -#else -#define T1HA0_USE_DEFINE 0 -#endif -#endif /* T1HA0_USE_DEFINE */ - -#if T1HA0_AESNI_AVAILABLE -uint64_t t1ha0_ia32aes_noavx(const void *data, size_t length, uint64_t seed); -uint64_t t1ha0_ia32aes_avx(const void *data, size_t length, uint64_t seed); -#ifndef __e2k__ -uint64_t t1ha0_ia32aes_avx2(const void *data, size_t length, uint64_t seed); -#endif -#endif /* T1HA0_AESNI_AVAILABLE */ - -#if T1HA0_RUNTIME_SELECT -typedef uint64_t (*t1ha0_function_t)(const void *, size_t, uint64_t); -T1HA_API t1ha0_function_t t1ha0_resolve(void); -#if T1HA_USE_INDIRECT_FUNCTIONS -T1HA_API uint64_t t1ha0(const void *data, size_t length, uint64_t seed); -#else -/* Otherwise function pointer will be used. - * Unfortunately this may cause some overhead calling. 
*/ -T1HA_API extern uint64_t (*t1ha0_funcptr)(const void *data, size_t length, - uint64_t seed); -static __force_inline uint64_t t1ha0(const void *data, size_t length, - uint64_t seed) { - return t1ha0_funcptr(data, length, seed); -} -#endif /* T1HA_USE_INDIRECT_FUNCTIONS */ - -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - -#if T1HA0_USE_DEFINE - -#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) && \ - (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED)) -#if defined(T1HA1_DISABLED) -#define t1ha0 t1ha2_atonce -#else -#define t1ha0 t1ha1_be -#endif /* T1HA1_DISABLED */ -#else /* 32/64 */ -#define t1ha0 t1ha0_32be -#endif /* 32/64 */ - -#else /* T1HA0_USE_DEFINE */ - -static __force_inline uint64_t t1ha0(const void *data, size_t length, - uint64_t seed) { -#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) && \ - (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED)) -#if defined(T1HA1_DISABLED) - return t1ha2_atonce(data, length, seed); -#else - return t1ha1_be(data, length, seed); -#endif /* T1HA1_DISABLED */ -#else /* 32/64 */ - return t1ha0_32be(data, length, seed); -#endif /* 32/64 */ -} - -#endif /* !T1HA0_USE_DEFINE */ - -#else /* !T1HA0_RUNTIME_SELECT && __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */ - -#if T1HA0_USE_DEFINE - -#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) && \ - (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED)) -#if defined(T1HA1_DISABLED) -#define t1ha0 t1ha2_atonce -#else -#define t1ha0 t1ha1_le -#endif /* T1HA1_DISABLED */ -#else /* 32/64 */ -#define t1ha0 t1ha0_32le -#endif /* 32/64 */ - -#else - -static __force_inline uint64_t t1ha0(const void *data, size_t length, - uint64_t seed) { -#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) && \ - (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED)) -#if defined(T1HA1_DISABLED) - return t1ha2_atonce(data, length, seed); -#else - return t1ha1_le(data, length, seed); -#endif /* T1HA1_DISABLED */ -#else /* 32/64 */ - return t1ha0_32le(data, 
length, seed); -#endif /* 32/64 */ -} - -#endif /* !T1HA0_USE_DEFINE */ - -#endif /* !T1HA0_RUNTIME_SELECT */ - -#endif /* T1HA0_DISABLED */ - -#ifdef __cplusplus -} -#endif - -#if __GNUC_PREREQ(4, 0) -#pragma GCC visibility pop -#endif /* __GNUC_PREREQ(4,0) */ diff --git a/src/t1ha/t1ha2.c b/src/t1ha/t1ha2.c deleted file mode 100644 index b05d64c..0000000 --- a/src/t1ha/t1ha2.c +++ /dev/null @@ -1,329 +0,0 @@ -/* - * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, - * Fast Positive Hash. - * - * Portions Copyright (c) 2010-2018 Leonid Yuriev , - * The 1Hippeus project (t1h). - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgement in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* - * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } - * by [Positive Technologies](https://www.ptsecurity.ru) - * - * Briefly, it is a 64-bit Hash Function: - * 1. Created for 64-bit little-endian platforms, in predominantly for x86_64, - * but portable and without penalties it can run on any 64-bit CPU. - * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash - * and all others portable hash-functions (which do not use specific - * hardware tricks). - * 3. 
Not suitable for cryptography. - * - * The Future will Positive. Всё будет хорошо. - * - * ACKNOWLEDGEMENT: - * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) - * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! - */ - -#ifndef T1HA2_DISABLED -#include "t1ha_bits.h" -//#include "t1ha_selfcheck.h" - -static __always_inline void init_ab(t1ha_state256_t *s, uint64_t x, - uint64_t y) { - s->n.a = x; - s->n.b = y; -} - -static __always_inline void init_cd(t1ha_state256_t *s, uint64_t x, - uint64_t y) { - s->n.c = rot64(y, 23) + ~x; - s->n.d = ~y + rot64(x, 19); -} - -/* TODO: C++ template in the next version */ -#define T1HA2_UPDATE(ENDIANNES, ALIGNESS, state, v) \ - do { \ - t1ha_state256_t *const s = state; \ - const uint64_t w0 = fetch64_##ENDIANNES##_##ALIGNESS(v + 0); \ - const uint64_t w1 = fetch64_##ENDIANNES##_##ALIGNESS(v + 1); \ - const uint64_t w2 = fetch64_##ENDIANNES##_##ALIGNESS(v + 2); \ - const uint64_t w3 = fetch64_##ENDIANNES##_##ALIGNESS(v + 3); \ - \ - const uint64_t d02 = w0 + rot64(w2 + s->n.d, 56); \ - const uint64_t c13 = w1 + rot64(w3 + s->n.c, 19); \ - s->n.d ^= s->n.b + rot64(w1, 38); \ - s->n.c ^= s->n.a + rot64(w0, 57); \ - s->n.b ^= prime_6 * (c13 + w2); \ - s->n.a ^= prime_5 * (d02 + w3); \ - } while (0) - -static __always_inline void squash(t1ha_state256_t *s) { - s->n.a ^= prime_6 * (s->n.c + rot64(s->n.d, 23)); - s->n.b ^= prime_5 * (rot64(s->n.c, 19) + s->n.d); -} - -/* TODO: C++ template in the next version */ -#define T1HA2_LOOP(ENDIANNES, ALIGNESS, state, data, len) \ - do { \ - const void *detent = (const uint8_t *)data + len - 31; \ - do { \ - const uint64_t *v = (const uint64_t *)data; \ - data = (const uint64_t *)data + 4; \ - prefetch(data); \ - T1HA2_UPDATE(le, ALIGNESS, state, v); \ - } while (likely(data < detent)); \ - } while (0) - -/* TODO: C++ template in the next version */ -#define T1HA2_TAIL_AB(ENDIANNES, ALIGNESS, state, data, len) \ - do { \ - t1ha_state256_t *const s = 
state; \ - const uint64_t *v = (const uint64_t *)data; \ - switch (len) { \ - default: \ - mixup64(&s->n.a, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ - prime_4); \ - /* fall through */ \ - case 24: \ - case 23: \ - case 22: \ - case 21: \ - case 20: \ - case 19: \ - case 18: \ - case 17: \ - mixup64(&s->n.b, &s->n.a, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ - prime_3); \ - /* fall through */ \ - case 16: \ - case 15: \ - case 14: \ - case 13: \ - case 12: \ - case 11: \ - case 10: \ - case 9: \ - mixup64(&s->n.a, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ - prime_2); \ - /* fall through */ \ - case 8: \ - case 7: \ - case 6: \ - case 5: \ - case 4: \ - case 3: \ - case 2: \ - case 1: \ - mixup64(&s->n.b, &s->n.a, tail64_##ENDIANNES##_##ALIGNESS(v, len), \ - prime_1); \ - /* fall through */ \ - case 0: \ - return final64(s->n.a, s->n.b); \ - } \ - } while (0) - -/* TODO: C++ template in the next version */ -#define T1HA2_TAIL_ABCD(ENDIANNES, ALIGNESS, state, data, len) \ - do { \ - t1ha_state256_t *const s = state; \ - const uint64_t *v = (const uint64_t *)data; \ - switch (len) { \ - default: \ - mixup64(&s->n.a, &s->n.d, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ - prime_4); \ - /* fall through */ \ - case 24: \ - case 23: \ - case 22: \ - case 21: \ - case 20: \ - case 19: \ - case 18: \ - case 17: \ - mixup64(&s->n.b, &s->n.a, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ - prime_3); \ - /* fall through */ \ - case 16: \ - case 15: \ - case 14: \ - case 13: \ - case 12: \ - case 11: \ - case 10: \ - case 9: \ - mixup64(&s->n.c, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ - prime_2); \ - /* fall through */ \ - case 8: \ - case 7: \ - case 6: \ - case 5: \ - case 4: \ - case 3: \ - case 2: \ - case 1: \ - mixup64(&s->n.d, &s->n.c, tail64_##ENDIANNES##_##ALIGNESS(v, len), \ - prime_1); \ - /* fall through */ \ - case 0: \ - return final128(s->n.a, s->n.b, s->n.c, s->n.d, extra_result); \ - } \ - } while (0) - -static __always_inline uint64_t 
final128(uint64_t a, uint64_t b, uint64_t c, - uint64_t d, uint64_t *h) { - mixup64(&a, &b, rot64(c, 41) ^ d, prime_0); - mixup64(&b, &c, rot64(d, 23) ^ a, prime_6); - mixup64(&c, &d, rot64(a, 19) ^ b, prime_5); - mixup64(&d, &a, rot64(b, 31) ^ c, prime_4); - *h = c + d; - return a ^ b; -} - -//------------------------------------------------------------------------------ - -uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed) { - t1ha_state256_t state; - init_ab(&state, seed, length); - -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT - if (unlikely(length > 32)) { - init_cd(&state, seed, length); - T1HA2_LOOP(le, unaligned, &state, data, length); - squash(&state); - length &= 31; - } - T1HA2_TAIL_AB(le, unaligned, &state, data, length); -#else - const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0; - if (misaligned) { - if (unlikely(length > 32)) { - init_cd(&state, seed, length); - T1HA2_LOOP(le, unaligned, &state, data, length); - squash(&state); - length &= 31; - } - T1HA2_TAIL_AB(le, unaligned, &state, data, length); - } else { - if (unlikely(length > 32)) { - init_cd(&state, seed, length); - T1HA2_LOOP(le, aligned, &state, data, length); - squash(&state); - length &= 31; - } - T1HA2_TAIL_AB(le, aligned, &state, data, length); - } -#endif -} - -uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result, - const void *__restrict data, size_t length, - uint64_t seed) { - t1ha_state256_t state; - init_ab(&state, seed, length); - init_cd(&state, seed, length); - -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT - if (unlikely(length > 32)) { - T1HA2_LOOP(le, unaligned, &state, data, length); - length &= 31; - } - T1HA2_TAIL_ABCD(le, unaligned, &state, data, length); -#else - const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0; - if (misaligned) { - if (unlikely(length > 32)) { - T1HA2_LOOP(le, unaligned, &state, data, length); - length &= 31; - } - T1HA2_TAIL_ABCD(le, 
unaligned, &state, data, length); - } else { - if (unlikely(length > 32)) { - T1HA2_LOOP(le, aligned, &state, data, length); - length &= 31; - } - T1HA2_TAIL_ABCD(le, aligned, &state, data, length); - } -#endif -} - -//------------------------------------------------------------------------------ - -void t1ha2_init(t1ha_context_t *ctx, uint64_t seed_x, uint64_t seed_y) { - init_ab(&ctx->state, seed_x, seed_y); - init_cd(&ctx->state, seed_x, seed_y); - ctx->partial = 0; - ctx->total = 0; -} - -void t1ha2_update(t1ha_context_t *__restrict ctx, const void *__restrict data, - size_t length) { - ctx->total += length; - - if (ctx->partial) { - const size_t left = 32 - ctx->partial; - const size_t chunk = (length >= left) ? left : length; - memcpy(ctx->buffer.bytes + ctx->partial, data, chunk); - ctx->partial += chunk; - if (ctx->partial < 32) { - assert(left >= length); - return; - } - ctx->partial = 0; - data = (const uint8_t *)data + chunk; - length -= chunk; - T1HA2_UPDATE(le, aligned, &ctx->state, ctx->buffer.u64); - } - - if (length >= 32) { -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT - T1HA2_LOOP(le, unaligned, &ctx->state, data, length); -#else - const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0; - if (misaligned) { - T1HA2_LOOP(le, unaligned, &ctx->state, data, length); - } else { - T1HA2_LOOP(le, aligned, &ctx->state, data, length); - } -#endif - length &= 31; - } - - if (length) - memcpy(ctx->buffer.bytes, data, ctx->partial = length); -} - -uint64_t t1ha2_final(t1ha_context_t *__restrict ctx, - uint64_t *__restrict extra_result) { - uint64_t bits = (ctx->total << 3) ^ (UINT64_C(1) << 63); -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ - bits = bswap64(bits); -#endif - t1ha2_update(ctx, &bits, 8); - - if (likely(!extra_result)) { - squash(&ctx->state); - T1HA2_TAIL_AB(le, aligned, &ctx->state, ctx->buffer.u64, ctx->partial); - } - - T1HA2_TAIL_ABCD(le, aligned, &ctx->state, ctx->buffer.u64, ctx->partial); -} - -#endif 
/* T1HA2_DISABLED */ diff --git a/src/t1ha/t1ha_bits.h b/src/t1ha/t1ha_bits.h deleted file mode 100644 index 7c47851..0000000 --- a/src/t1ha/t1ha_bits.h +++ /dev/null @@ -1,1226 +0,0 @@ -/* - * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, - * Fast Positive Hash. - * - * Portions Copyright (c) 2010-2018 Leonid Yuriev , - * The 1Hippeus project (t1h). - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgement in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* - * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } - * by [Positive Technologies](https://www.ptsecurity.ru) - * - * Briefly, it is a 64-bit Hash Function: - * 1. Created for 64-bit little-endian platforms, in predominantly for x86_64, - * but portable and without penalties it can run on any 64-bit CPU. - * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash - * and all others portable hash-functions (which do not use specific - * hardware tricks). - * 3. Not suitable for cryptography. - * - * The Future will Positive. Всё будет хорошо. 
- * - * ACKNOWLEDGEMENT: - * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) - * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! - */ - -#pragma once - -#if defined(_MSC_VER) -#pragma warning(disable : 4201) /* nameless struct/union */ -#if _MSC_VER > 1800 -#pragma warning(disable : 4464) /* relative include path contains '..' */ -#endif /* 1800 */ -#endif /* MSVC */ -#include "t1ha.h" - -#ifndef T1HA_USE_FAST_ONESHOT_READ -/* Define it to 1 for little bit faster code. - * Unfortunately this may triggering a false-positive alarms from Valgrind, - * AddressSanitizer and other similar tool. - * So, define it to 0 for calmness if doubt. */ -#define T1HA_USE_FAST_ONESHOT_READ 1 -#endif /* T1HA_USE_FAST_ONESHOT_READ */ - -/*****************************************************************************/ - -#include /* for assert() */ -#include /* for bool */ -#include /* for memcpy() */ - -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ && \ - __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ -#error Unsupported byte order. 
-#endif - -#define T1HA_UNALIGNED_ACCESS__UNABLE 0 -#define T1HA_UNALIGNED_ACCESS__SLOW 1 -#define T1HA_UNALIGNED_ACCESS__EFFICIENT 2 - -#ifndef T1HA_SYS_UNALIGNED_ACCESS -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__EFFICIENT -#elif defined(__ia32__) -#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__EFFICIENT -#elif defined(__e2k__) -#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__SLOW -#elif defined(__ARM_FEATURE_UNALIGNED) -#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__EFFICIENT -#else -#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__UNABLE -#endif -#endif /* T1HA_SYS_UNALIGNED_ACCESS */ - -#define ALIGNMENT_16 2 -#define ALIGNMENT_32 4 -#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul -#define ALIGNMENT_64 8 -#else -#define ALIGNMENT_64 4 -#endif - -#ifndef PAGESIZE -#define PAGESIZE 4096 -#endif /* PAGESIZE */ - -/***************************************************************************/ - -#ifndef __has_builtin -#define __has_builtin(x) (0) -#endif - -#ifndef __has_warning -#define __has_warning(x) (0) -#endif - -#ifndef __has_feature -#define __has_feature(x) (0) -#endif - -#ifndef __has_extension -#define __has_extension(x) (0) -#endif - -#if __has_feature(address_sanitizer) -#define __SANITIZE_ADDRESS__ 1 -#endif - -#ifndef __optimize -#if defined(__clang__) && !__has_attribute(optimize) -#define __optimize(ops) -#elif defined(__GNUC__) || __has_attribute(optimize) -#define __optimize(ops) __attribute__((optimize(ops))) -#else -#define __optimize(ops) -#endif -#endif /* __optimize */ - -#ifndef __cold -#if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __cold __optimize(1) __attribute__((cold)) -#elif defined(__clang__) && !__has_attribute(cold) -/* just put infrequently used functions in separate section */ -#define __cold __attribute__((section("text.unlikely"))) __optimize("Os") -#elif defined(__GNUC__) || __has_attribute(cold) 
-#define __cold __attribute__((cold)) __optimize("Os") -#else -#define __cold __optimize("Os") -#endif -#else -#define __cold -#endif -#endif /* __cold */ - -#if __GNUC_PREREQ(4, 4) || defined(__clang__) - -#if defined(__ia32__) || defined(__e2k__) -#include -#endif - -#if defined(__ia32__) && !defined(__cpuid_count) -#include -#endif - -#if defined(__e2k__) -#include -#endif - -#ifndef likely -#define likely(cond) __builtin_expect(!!(cond), 1) -#endif - -#ifndef unlikely -#define unlikely(cond) __builtin_expect(!!(cond), 0) -#endif - -#if __GNUC_PREREQ(4, 5) || __has_builtin(__builtin_unreachable) -#define unreachable() __builtin_unreachable() -#endif - -#define bswap64(v) __builtin_bswap64(v) -#define bswap32(v) __builtin_bswap32(v) -#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) -#define bswap16(v) __builtin_bswap16(v) -#endif - -#if !defined(__maybe_unused) && (__GNUC_PREREQ(4, 3) || __has_attribute(unused)) -#define __maybe_unused __attribute__((unused)) -#endif - -#if !defined(__always_inline) && \ - (__GNUC_PREREQ(3, 2) || __has_attribute(always_inline)) -#define __always_inline __inline __attribute__((always_inline)) -#endif - -#if defined(__e2k__) - -#if __iset__ >= 3 -#define mul_64x64_high(a, b) __builtin_e2k_umulhd(a, b) -#endif /* __iset__ >= 3 */ - -#if __iset__ >= 5 -static __maybe_unused __always_inline unsigned -e2k_add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) { - *sum = base + addend; - return (unsigned)__builtin_e2k_addcd_c(base, addend, 0); -} -#define add64carry_first(base, addend, sum) \ - e2k_add64carry_first(base, addend, sum) - -static __maybe_unused __always_inline unsigned -e2k_add64carry_next(unsigned carry, uint64_t base, uint64_t addend, - uint64_t *sum) { - *sum = __builtin_e2k_addcd(base, addend, carry); - return (unsigned)__builtin_e2k_addcd_c(base, addend, carry); -} -#define add64carry_next(carry, base, addend, sum) \ - e2k_add64carry_next(carry, base, addend, sum) - -static __maybe_unused 
__always_inline void e2k_add64carry_last(unsigned carry, - uint64_t base, - uint64_t addend, - uint64_t *sum) { - *sum = __builtin_e2k_addcd(base, addend, carry); -} -#define add64carry_last(carry, base, addend, sum) \ - e2k_add64carry_last(carry, base, addend, sum) -#endif /* __iset__ >= 5 */ - -#define fetch64_be_aligned(ptr) ((uint64_t)__builtin_e2k_ld_64s_be(ptr)) -#define fetch32_be_aligned(ptr) ((uint32_t)__builtin_e2k_ld_32u_be(ptr)) - -#endif /* __e2k__ Elbrus */ - -#elif defined(_MSC_VER) - -#if _MSC_FULL_VER < 190024234 && defined(_M_IX86) -#pragma message( \ - "For AES-NI at least \"Microsoft C/C++ Compiler\" version 19.00.24234 (Visual Studio 2015 Update 3) is required.") -#endif -#if _MSC_FULL_VER < 191526730 -#pragma message( \ - "It is recommended to use \"Microsoft C/C++ Compiler\" version 19.15.26730 (Visual Studio 2017 15.8) or newer.") -#endif -#if _MSC_FULL_VER < 180040629 -#error At least "Microsoft C/C++ Compiler" version 18.00.40629 (Visual Studio 2013 Update 5) is required. 
-#endif - -#pragma warning(push, 1) - -#include -#include -#define likely(cond) (cond) -#define unlikely(cond) (cond) -#define unreachable() __assume(0) -#define bswap64(v) _byteswap_uint64(v) -#define bswap32(v) _byteswap_ulong(v) -#define bswap16(v) _byteswap_ushort(v) -#define rot64(v, s) _rotr64(v, s) -#define rot32(v, s) _rotr(v, s) -#define __always_inline __forceinline - -#if defined(_M_X64) || defined(_M_IA64) -#pragma intrinsic(_umul128) -#define mul_64x64_128(a, b, ph) _umul128(a, b, ph) -#pragma intrinsic(_addcarry_u64) -#define add64carry_first(base, addend, sum) _addcarry_u64(0, base, addend, sum) -#define add64carry_next(carry, base, addend, sum) \ - _addcarry_u64(carry, base, addend, sum) -#define add64carry_last(carry, base, addend, sum) \ - (void)_addcarry_u64(carry, base, addend, sum) -#endif - -#if defined(_M_ARM64) || defined(_M_X64) || defined(_M_IA64) -#pragma intrinsic(__umulh) -#define mul_64x64_high(a, b) __umulh(a, b) -#endif - -#if defined(_M_IX86) -#pragma intrinsic(__emulu) -#define mul_32x32_64(a, b) __emulu(a, b) - -#if _MSC_VER >= 1915 /* LY: workaround for SSA-optimizer bug */ -#pragma intrinsic(_addcarry_u32) -#define add32carry_first(base, addend, sum) _addcarry_u32(0, base, addend, sum) -#define add32carry_next(carry, base, addend, sum) \ - _addcarry_u32(carry, base, addend, sum) -#define add32carry_last(carry, base, addend, sum) \ - (void)_addcarry_u32(carry, base, addend, sum) - -static __forceinline char -msvc32_add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) { - uint32_t *const sum32 = (uint32_t *)sum; - const uint32_t base_32l = (uint32_t)base; - const uint32_t base_32h = (uint32_t)(base >> 32); - const uint32_t addend_32l = (uint32_t)addend; - const uint32_t addend_32h = (uint32_t)(addend >> 32); - return add32carry_next(add32carry_first(base_32l, addend_32l, sum32), - base_32h, addend_32h, sum32 + 1); -} -#define add64carry_first(base, addend, sum) \ - msvc32_add64carry_first(base, addend, sum) - -static 
__forceinline char msvc32_add64carry_next(char carry, uint64_t base, - uint64_t addend, - uint64_t *sum) { - uint32_t *const sum32 = (uint32_t *)sum; - const uint32_t base_32l = (uint32_t)base; - const uint32_t base_32h = (uint32_t)(base >> 32); - const uint32_t addend_32l = (uint32_t)addend; - const uint32_t addend_32h = (uint32_t)(addend >> 32); - return add32carry_next(add32carry_next(carry, base_32l, addend_32l, sum32), - base_32h, addend_32h, sum32 + 1); -} -#define add64carry_next(carry, base, addend, sum) \ - msvc32_add64carry_next(carry, base, addend, sum) - -static __forceinline void msvc32_add64carry_last(char carry, uint64_t base, - uint64_t addend, - uint64_t *sum) { - uint32_t *const sum32 = (uint32_t *)sum; - const uint32_t base_32l = (uint32_t)base; - const uint32_t base_32h = (uint32_t)(base >> 32); - const uint32_t addend_32l = (uint32_t)addend; - const uint32_t addend_32h = (uint32_t)(addend >> 32); - add32carry_last(add32carry_next(carry, base_32l, addend_32l, sum32), base_32h, - addend_32h, sum32 + 1); -} -#define add64carry_last(carry, base, addend, sum) \ - msvc32_add64carry_last(carry, base, addend, sum) -#endif /* _MSC_FULL_VER >= 190024231 */ - -#elif defined(_M_ARM) -#define mul_32x32_64(a, b) _arm_umull(a, b) -#endif - -#pragma warning(pop) -#pragma warning(disable : 4514) /* 'xyz': unreferenced inline function \ - has been removed */ -#pragma warning(disable : 4710) /* 'xyz': function not inlined */ -#pragma warning(disable : 4711) /* function 'xyz' selected for \ - automatic inline expansion */ -#pragma warning(disable : 4127) /* conditional expression is constant */ -#pragma warning(disable : 4702) /* unreachable code */ -#endif /* Compiler */ - -#ifndef likely -#define likely(cond) (cond) -#endif -#ifndef unlikely -#define unlikely(cond) (cond) -#endif -#ifndef __maybe_unused -#define __maybe_unused -#endif -#ifndef __always_inline -#define __always_inline __inline -#endif -#ifndef unreachable -#define unreachable() \ - do { \ - } 
while (1) -#endif - -#ifndef bswap64 -#if defined(bswap_64) -#define bswap64 bswap_64 -#elif defined(__bswap_64) -#define bswap64 __bswap_64 -#else -static __always_inline uint64_t bswap64(uint64_t v) { - return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | - ((v << 24) & UINT64_C(0x0000ff0000000000)) | - ((v << 8) & UINT64_C(0x000000ff00000000)) | - ((v >> 8) & UINT64_C(0x00000000ff000000)) | - ((v >> 24) & UINT64_C(0x0000000000ff0000)) | - ((v >> 40) & UINT64_C(0x000000000000ff00)); -} -#endif -#endif /* bswap64 */ - -#ifndef bswap32 -#if defined(bswap_32) -#define bswap32 bswap_32 -#elif defined(__bswap_32) -#define bswap32 __bswap_32 -#else -static __always_inline uint32_t bswap32(uint32_t v) { - return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | - ((v >> 8) & UINT32_C(0x0000ff00)); -} -#endif -#endif /* bswap32 */ - -#ifndef bswap16 -#if defined(bswap_16) -#define bswap16 bswap_16 -#elif defined(__bswap_16) -#define bswap16 __bswap_16 -#else -static __always_inline uint16_t bswap16(uint16_t v) { return v << 8 | v >> 8; } -#endif -#endif /* bswap16 */ - -#ifndef read_unaligned -#if defined(__GNUC__) || __has_attribute(packed) -typedef struct { - uint8_t unaligned_8; - uint16_t unaligned_16; - uint32_t unaligned_32; - uint64_t unaligned_64; -} __attribute__((packed)) t1ha_unaligned_proxy; -#define read_unaligned(ptr, bits) \ - (((const t1ha_unaligned_proxy *)((const uint8_t *)(ptr)-offsetof( \ - t1ha_unaligned_proxy, unaligned_##bits))) \ - ->unaligned_##bits) -#elif defined(_MSC_VER) -#pragma warning( \ - disable : 4235) /* nonstandard extension used: '__unaligned' \ - * keyword not supported on this architecture */ -#define read_unaligned(ptr, bits) (*(const __unaligned uint##bits##_t *)(ptr)) -#else -#pragma pack(push, 1) -typedef struct { - uint8_t unaligned_8; - uint16_t unaligned_16; - uint32_t unaligned_32; - uint64_t unaligned_64; -} t1ha_unaligned_proxy; -#pragma pack(pop) -#define read_unaligned(ptr, bits) \ - (((const 
t1ha_unaligned_proxy *)((const uint8_t *)(ptr)-offsetof( \ - t1ha_unaligned_proxy, unaligned_##bits))) \ - ->unaligned_##bits) -#endif -#endif /* read_unaligned */ - -#ifndef read_aligned -#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_assume_aligned) -#define read_aligned(ptr, bits) \ - (*(const uint##bits##_t *)__builtin_assume_aligned(ptr, ALIGNMENT_##bits)) -#elif (__GNUC_PREREQ(3, 3) || __has_attribute(aligned)) && !defined(__clang__) -#define read_aligned(ptr, bits) \ - (*(const uint##bits##_t __attribute__((aligned(ALIGNMENT_##bits))) *)(ptr)) -#elif __has_attribute(assume_aligned) - -static __always_inline const - uint16_t *__attribute__((assume_aligned(ALIGNMENT_16))) - cast_aligned_16(const void *ptr) { - return (const uint16_t *)ptr; -} -static __always_inline const - uint32_t *__attribute__((assume_aligned(ALIGNMENT_32))) - cast_aligned_32(const void *ptr) { - return (const uint32_t *)ptr; -} -static __always_inline const - uint64_t *__attribute__((assume_aligned(ALIGNMENT_64))) - cast_aligned_64(const void *ptr) { - return (const uint64_t *)ptr; -} - -#define read_aligned(ptr, bits) (*cast_aligned_##bits(ptr)) - -#elif defined(_MSC_VER) -#define read_aligned(ptr, bits) \ - (*(const __declspec(align(ALIGNMENT_##bits)) uint##bits##_t *)(ptr)) -#else -#define read_aligned(ptr, bits) (*(const uint##bits##_t *)(ptr)) -#endif -#endif /* read_aligned */ - -#ifndef prefetch -#if (__GNUC_PREREQ(4, 0) || __has_builtin(__builtin_prefetch)) && \ - !defined(__ia32__) -#define prefetch(ptr) __builtin_prefetch(ptr) -#elif defined(_M_ARM64) || defined(_M_ARM) -#define prefetch(ptr) __prefetch(ptr) -#else -#define prefetch(ptr) \ - do { \ - (void)(ptr); \ - } while (0) -#endif -#endif /* prefetch */ - -#if __has_warning("-Wconstant-logical-operand") -#if defined(__clang__) -#pragma clang diagnostic ignored "-Wconstant-logical-operand" -#elif defined(__GNUC__) -#pragma GCC diagnostic ignored "-Wconstant-logical-operand" -#else -#pragma warning disable 
"constant-logical-operand" -#endif -#endif /* -Wconstant-logical-operand */ - -#if __has_warning("-Wtautological-pointer-compare") -#if defined(__clang__) -#pragma clang diagnostic ignored "-Wtautological-pointer-compare" -#elif defined(__GNUC__) -#pragma GCC diagnostic ignored "-Wtautological-pointer-compare" -#else -#pragma warning disable "tautological-pointer-compare" -#endif -#endif /* -Wtautological-pointer-compare */ - -/***************************************************************************/ - -#if __GNUC_PREREQ(4, 0) -#pragma GCC visibility push(hidden) -#endif /* __GNUC_PREREQ(4,0) */ - -/*---------------------------------------------------------- Little Endian */ - -#ifndef fetch16_le_aligned -static __always_inline uint16_t fetch16_le_aligned(const void *v) { - assert(((uintptr_t)v) % ALIGNMENT_16 == 0); -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return read_aligned(v, 16); -#else - return bswap16(read_aligned(v, 16)); -#endif -} -#endif /* fetch16_le_aligned */ - -#ifndef fetch16_le_unaligned -static __always_inline uint16_t fetch16_le_unaligned(const void *v) { -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE - const uint8_t *p = (const uint8_t *)v; - return p[0] | (uint16_t)p[1] << 8; -#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return read_unaligned(v, 16); -#else - return bswap16(read_unaligned(v, 16)); -#endif -} -#endif /* fetch16_le_unaligned */ - -#ifndef fetch32_le_aligned -static __always_inline uint32_t fetch32_le_aligned(const void *v) { - assert(((uintptr_t)v) % ALIGNMENT_32 == 0); -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return read_aligned(v, 32); -#else - return bswap32(read_aligned(v, 32)); -#endif -} -#endif /* fetch32_le_aligned */ - -#ifndef fetch32_le_unaligned -static __always_inline uint32_t fetch32_le_unaligned(const void *v) { -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE - return fetch16_le_unaligned(v) | - (uint32_t)fetch16_le_unaligned((const uint8_t *)v + 2) << 16; 
-#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return read_unaligned(v, 32); -#else - return bswap32(read_unaligned(v, 32)); -#endif -} -#endif /* fetch32_le_unaligned */ - -#ifndef fetch64_le_aligned -static __always_inline uint64_t fetch64_le_aligned(const void *v) { - assert(((uintptr_t)v) % ALIGNMENT_64 == 0); -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return read_aligned(v, 64); -#else - return bswap64(read_aligned(v, 64)); -#endif -} -#endif /* fetch64_le_aligned */ - -#ifndef fetch64_le_unaligned -static __always_inline uint64_t fetch64_le_unaligned(const void *v) { -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE - return fetch32_le_unaligned(v) | - (uint64_t)fetch32_le_unaligned((const uint8_t *)v + 4) << 32; -#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return read_unaligned(v, 64); -#else - return bswap64(read_unaligned(v, 64)); -#endif -} -#endif /* fetch64_le_unaligned */ - -static __always_inline uint64_t tail64_le_aligned(const void *v, size_t tail) { - const uint8_t *const p = (const uint8_t *)v; -#if T1HA_USE_FAST_ONESHOT_READ && !defined(__SANITIZE_ADDRESS__) - /* We can perform a 'oneshot' read, which is little bit faster. */ - const unsigned shift = ((8 - tail) & 7) << 3; - return fetch64_le_aligned(p) & ((~UINT64_C(0)) >> shift); -#else - uint64_t r = 0; - switch (tail & 7) { - default: - unreachable(); -/* fall through */ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - /* For most CPUs this code is better when not needed byte reordering. 
*/ - case 0: - return fetch64_le_aligned(p); - case 7: - r = (uint64_t)p[6] << 8; - /* fall through */ - case 6: - r += p[5]; - r <<= 8; - /* fall through */ - case 5: - r += p[4]; - r <<= 32; - /* fall through */ - case 4: - return r + fetch32_le_aligned(p); - case 3: - r = (uint64_t)p[2] << 16; - /* fall through */ - case 2: - return r + fetch16_le_aligned(p); - case 1: - return p[0]; -#else - case 0: - r = p[7] << 8; - /* fall through */ - case 7: - r += p[6]; - r <<= 8; - /* fall through */ - case 6: - r += p[5]; - r <<= 8; - /* fall through */ - case 5: - r += p[4]; - r <<= 8; - /* fall through */ - case 4: - r += p[3]; - r <<= 8; - /* fall through */ - case 3: - r += p[2]; - r <<= 8; - /* fall through */ - case 2: - r += p[1]; - r <<= 8; - /* fall through */ - case 1: - return r + p[0]; -#endif - } -#endif /* T1HA_USE_FAST_ONESHOT_READ */ -} - -#if T1HA_USE_FAST_ONESHOT_READ && \ - T1HA_SYS_UNALIGNED_ACCESS != T1HA_UNALIGNED_ACCESS__UNABLE && \ - defined(PAGESIZE) && PAGESIZE > 42 && !defined(__SANITIZE_ADDRESS__) -#define can_read_underside(ptr, size) \ - (((PAGESIZE - (size)) & (uintptr_t)(ptr)) != 0) -#endif /* T1HA_USE_FAST_ONESHOT_READ */ - -static __always_inline uint64_t tail64_le_unaligned(const void *v, - size_t tail) { - const uint8_t *p = (const uint8_t *)v; -#if defined(can_read_underside) && \ - (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) - /* On some systems (e.g. x86_64) we can perform a 'oneshot' read, which - * is little bit faster. Thanks Marcin Żukowski - * for the reminder. 
*/ - const unsigned offset = (8 - tail) & 7; - const unsigned shift = offset << 3; - if (likely(can_read_underside(p, 8))) { - p -= offset; - return fetch64_le_unaligned(p) >> shift; - } - return fetch64_le_unaligned(p) & ((~UINT64_C(0)) >> shift); -#else - uint64_t r = 0; - switch (tail & 7) { - default: - unreachable(); -/* fall through */ -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT && \ - __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - /* For most CPUs this code is better when not needed - * copying for alignment or byte reordering. */ - case 0: - return fetch64_le_unaligned(p); - case 7: - r = (uint64_t)p[6] << 8; - /* fall through */ - case 6: - r += p[5]; - r <<= 8; - /* fall through */ - case 5: - r += p[4]; - r <<= 32; - /* fall through */ - case 4: - return r + fetch32_le_unaligned(p); - case 3: - r = (uint64_t)p[2] << 16; - /* fall through */ - case 2: - return r + fetch16_le_unaligned(p); - case 1: - return p[0]; -#else - /* For most CPUs this code is better than a - * copying for alignment and/or byte reordering. 
*/ - case 0: - r = p[7] << 8; - /* fall through */ - case 7: - r += p[6]; - r <<= 8; - /* fall through */ - case 6: - r += p[5]; - r <<= 8; - /* fall through */ - case 5: - r += p[4]; - r <<= 8; - /* fall through */ - case 4: - r += p[3]; - r <<= 8; - /* fall through */ - case 3: - r += p[2]; - r <<= 8; - /* fall through */ - case 2: - r += p[1]; - r <<= 8; - /* fall through */ - case 1: - return r + p[0]; -#endif - } -#endif /* can_read_underside */ -} - -/*------------------------------------------------------------- Big Endian */ - -#ifndef fetch16_be_aligned -static __maybe_unused __always_inline uint16_t -fetch16_be_aligned(const void *v) { - assert(((uintptr_t)v) % ALIGNMENT_16 == 0); -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return read_aligned(v, 16); -#else - return bswap16(read_aligned(v, 16)); -#endif -} -#endif /* fetch16_be_aligned */ - -#ifndef fetch16_be_unaligned -static __maybe_unused __always_inline uint16_t -fetch16_be_unaligned(const void *v) { -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE - const uint8_t *p = (const uint8_t *)v; - return (uint16_t)p[0] << 8 | p[1]; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return read_unaligned(v, 16); -#else - return bswap16(read_unaligned(v, 16)); -#endif -} -#endif /* fetch16_be_unaligned */ - -#ifndef fetch32_be_aligned -static __maybe_unused __always_inline uint32_t -fetch32_be_aligned(const void *v) { - assert(((uintptr_t)v) % ALIGNMENT_32 == 0); -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return read_aligned(v, 32); -#else - return bswap32(read_aligned(v, 32)); -#endif -} -#endif /* fetch32_be_aligned */ - -#ifndef fetch32_be_unaligned -static __maybe_unused __always_inline uint32_t -fetch32_be_unaligned(const void *v) { -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE - return (uint32_t)fetch16_be_unaligned(v) << 16 | - fetch16_be_unaligned((const uint8_t *)v + 2); -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return read_unaligned(v, 32); -#else - return 
bswap32(read_unaligned(v, 32)); -#endif -} -#endif /* fetch32_be_unaligned */ - -#ifndef fetch64_be_aligned -static __maybe_unused __always_inline uint64_t -fetch64_be_aligned(const void *v) { - assert(((uintptr_t)v) % ALIGNMENT_64 == 0); -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return read_aligned(v, 64); -#else - return bswap64(read_aligned(v, 64)); -#endif -} -#endif /* fetch64_be_aligned */ - -#ifndef fetch64_be_unaligned -static __maybe_unused __always_inline uint64_t -fetch64_be_unaligned(const void *v) { -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE - return (uint64_t)fetch32_be_unaligned(v) << 32 | - fetch32_be_unaligned((const uint8_t *)v + 4); -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return read_unaligned(v, 64); -#else - return bswap64(read_unaligned(v, 64)); -#endif -} -#endif /* fetch64_be_unaligned */ - -static __maybe_unused __always_inline uint64_t tail64_be_aligned(const void *v, - size_t tail) { - const uint8_t *const p = (const uint8_t *)v; -#if T1HA_USE_FAST_ONESHOT_READ && !defined(__SANITIZE_ADDRESS__) - /* We can perform a 'oneshot' read, which is little bit faster. */ - const unsigned shift = ((8 - tail) & 7) << 3; - return fetch64_be_aligned(p) >> shift; -#else - switch (tail & 7) { - default: - unreachable(); -/* fall through */ -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - /* For most CPUs this code is better when not byte reordering. 
*/ - case 1: - return p[0]; - case 2: - return fetch16_be_aligned(p); - case 3: - return (uint32_t)fetch16_be_aligned(p) << 8 | p[2]; - case 4: - return fetch32_be_aligned(p); - case 5: - return (uint64_t)fetch32_be_aligned(p) << 8 | p[4]; - case 6: - return (uint64_t)fetch32_be_aligned(p) << 16 | fetch16_be_aligned(p + 4); - case 7: - return (uint64_t)fetch32_be_aligned(p) << 24 | - (uint32_t)fetch16_be_aligned(p + 4) << 8 | p[6]; - case 0: - return fetch64_be_aligned(p); -#else - case 1: - return p[0]; - case 2: - return p[1] | (uint32_t)p[0] << 8; - case 3: - return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; - case 4: - return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | - (uint32_t)p[0] << 24; - case 5: - return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 | - (uint32_t)p[1] << 24 | (uint64_t)p[0] << 32; - case 6: - return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 | - (uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40; - case 7: - return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 | - (uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 | - (uint64_t)p[0] << 48; - case 0: - return p[7] | (uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 | - (uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 | - (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56; -#endif - } -#endif /* T1HA_USE_FAST_ONESHOT_READ */ -} - -static __maybe_unused __always_inline uint64_t -tail64_be_unaligned(const void *v, size_t tail) { - const uint8_t *p = (const uint8_t *)v; -#if defined(can_read_underside) && \ - (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) - /* On some systems (e.g. x86_64) we can perform a 'oneshot' read, which - * is little bit faster. Thanks Marcin Żukowski - * for the reminder. 
*/ - const unsigned offset = (8 - tail) & 7; - const unsigned shift = offset << 3; - if (likely(can_read_underside(p, 8))) { - p -= offset; - return fetch64_be_unaligned(p) & ((~UINT64_C(0)) >> shift); - } - return fetch64_be_unaligned(p) >> shift; -#else - switch (tail & 7) { - default: - unreachable(); -/* fall through */ -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT && \ - __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - /* For most CPUs this code is better when not needed - * copying for alignment or byte reordering. */ - case 1: - return p[0]; - case 2: - return fetch16_be_unaligned(p); - case 3: - return (uint32_t)fetch16_be_unaligned(p) << 8 | p[2]; - case 4: - return fetch32_be(p); - case 5: - return (uint64_t)fetch32_be_unaligned(p) << 8 | p[4]; - case 6: - return (uint64_t)fetch32_be_unaligned(p) << 16 | - fetch16_be_unaligned(p + 4); - case 7: - return (uint64_t)fetch32_be_unaligned(p) << 24 | - (uint32_t)fetch16_be_unaligned(p + 4) << 8 | p[6]; - case 0: - return fetch64_be_unaligned(p); -#else - /* For most CPUs this code is better than a - * copying for alignment and/or byte reordering. 
*/ - case 1: - return p[0]; - case 2: - return p[1] | (uint32_t)p[0] << 8; - case 3: - return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; - case 4: - return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | - (uint32_t)p[0] << 24; - case 5: - return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 | - (uint32_t)p[1] << 24 | (uint64_t)p[0] << 32; - case 6: - return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 | - (uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40; - case 7: - return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 | - (uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 | - (uint64_t)p[0] << 48; - case 0: - return p[7] | (uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 | - (uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 | - (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56; -#endif - } -#endif /* can_read_underside */ -} - -/***************************************************************************/ - -#ifndef rot64 -static __always_inline uint64_t rot64(uint64_t v, unsigned s) { - return (v >> s) | (v << (64 - s)); -} -#endif /* rot64 */ - -#ifndef mul_32x32_64 -static __always_inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) { - return a * (uint64_t)b; -} -#endif /* mul_32x32_64 */ - -#ifndef add64carry_first -static __maybe_unused __always_inline unsigned -add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) { -#if __has_builtin(__builtin_addcll) - unsigned long long carryout; - *sum = __builtin_addcll(base, addend, 0, &carryout); - return (unsigned)carryout; -#else - *sum = base + addend; - return *sum < addend; -#endif /* __has_builtin(__builtin_addcll) */ -} -#endif /* add64carry_fist */ - -#ifndef add64carry_next -static __maybe_unused __always_inline unsigned -add64carry_next(unsigned carry, uint64_t base, uint64_t addend, uint64_t *sum) { -#if __has_builtin(__builtin_addcll) - unsigned long long carryout; - *sum = __builtin_addcll(base, addend, carry, &carryout); - return 
(unsigned)carryout; -#else - *sum = base + addend + carry; - return *sum < addend || (carry && *sum == addend); -#endif /* __has_builtin(__builtin_addcll) */ -} -#endif /* add64carry_next */ - -#ifndef add64carry_last -static __maybe_unused __always_inline void -add64carry_last(unsigned carry, uint64_t base, uint64_t addend, uint64_t *sum) { -#if __has_builtin(__builtin_addcll) - unsigned long long carryout; - *sum = __builtin_addcll(base, addend, carry, &carryout); - (void)carryout; -#else - *sum = base + addend + carry; -#endif /* __has_builtin(__builtin_addcll) */ -} -#endif /* add64carry_last */ - -#ifndef mul_64x64_128 -static __maybe_unused __always_inline uint64_t mul_64x64_128(uint64_t a, - uint64_t b, - uint64_t *h) { -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - __uint128_t r = (__uint128_t)a * (__uint128_t)b; - /* modern GCC could nicely optimize this */ - *h = (uint64_t)(r >> 64); - return (uint64_t)r; -#elif defined(mul_64x64_high) - *h = mul_64x64_high(a, b); - return a * b; -#else - /* performs 64x64 to 128 bit multiplication */ - const uint64_t ll = mul_32x32_64((uint32_t)a, (uint32_t)b); - const uint64_t lh = mul_32x32_64(a >> 32, (uint32_t)b); - const uint64_t hl = mul_32x32_64((uint32_t)a, b >> 32); - const uint64_t hh = mul_32x32_64(a >> 32, b >> 32); - - /* Few simplification are possible here for 32-bit architectures, - * but thus we would lost compatibility with the original 64-bit - * version. Think is very bad idea, because then 32-bit t1ha will - * still (relatively) very slowly and well yet not compatible. 
*/ - uint64_t l; - add64carry_last(add64carry_first(ll, lh << 32, &l), hh, lh >> 32, h); - add64carry_last(add64carry_first(l, hl << 32, &l), *h, hl >> 32, h); - return l; -#endif -} -#endif /* mul_64x64_128() */ - -#ifndef mul_64x64_high -static __maybe_unused __always_inline uint64_t mul_64x64_high(uint64_t a, - uint64_t b) { - uint64_t h; - mul_64x64_128(a, b, &h); - return h; -} -#endif /* mul_64x64_high */ - -/***************************************************************************/ - -/* 'magic' primes */ -static const uint64_t prime_0 = UINT64_C(0xEC99BF0D8372CAAB); -static const uint64_t prime_1 = UINT64_C(0x82434FE90EDCEF39); -static const uint64_t prime_2 = UINT64_C(0xD4F06DB99D67BE4B); -static const uint64_t prime_3 = UINT64_C(0xBD9CACC22C6E9571); -static const uint64_t prime_4 = UINT64_C(0x9C06FAF4D023E3AB); -static const uint64_t prime_5 = UINT64_C(0xC060724A8424F345); -static const uint64_t prime_6 = UINT64_C(0xCB5AF53AE3AAAC31); - -/* xor high and low parts of full 128-bit product */ -static __maybe_unused __always_inline uint64_t mux64(uint64_t v, - uint64_t prime) { - uint64_t l, h; - l = mul_64x64_128(v, prime, &h); - return l ^ h; -} - -static __always_inline uint64_t final64(uint64_t a, uint64_t b) { - uint64_t x = (a + rot64(b, 41)) * prime_0; - uint64_t y = (rot64(a, 23) + b) * prime_6; - return mux64(x ^ y, prime_5); -} - -static __always_inline void mixup64(uint64_t *__restrict a, - uint64_t *__restrict b, uint64_t v, - uint64_t prime) { - uint64_t h; - *a ^= mul_64x64_128(*b + v, prime, &h); - *b += h; -} - -/***************************************************************************/ - -typedef union t1ha_uint128 { -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - __uint128_t v; -#endif - struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - uint64_t l, h; -#else - uint64_t h, l; -#endif - }; -} t1ha_uint128_t; - -static __always_inline t1ha_uint128_t not128(const t1ha_uint128_t v) { 
- t1ha_uint128_t r; -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - r.v = ~v.v; -#else - r.l = ~v.l; - r.h = ~v.h; -#endif - return r; -} - -static __always_inline t1ha_uint128_t left128(const t1ha_uint128_t v, - unsigned s) { - t1ha_uint128_t r; - assert(s < 128); -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - r.v = v.v << s; -#else - r.l = (s < 64) ? v.l << s : 0; - r.h = (s < 64) ? (v.h << s) | (s ? v.l >> (64 - s) : 0) : v.l << (s - 64); -#endif - return r; -} - -static __always_inline t1ha_uint128_t right128(const t1ha_uint128_t v, - unsigned s) { - t1ha_uint128_t r; - assert(s < 128); -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - r.v = v.v >> s; -#else - r.l = (s < 64) ? (s ? v.h << (64 - s) : 0) | (v.l >> s) : v.h >> (s - 64); - r.h = (s < 64) ? v.h >> s : 0; -#endif - return r; -} - -static __always_inline t1ha_uint128_t or128(t1ha_uint128_t x, - t1ha_uint128_t y) { - t1ha_uint128_t r; -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - r.v = x.v | y.v; -#else - r.l = x.l | y.l; - r.h = x.h | y.h; -#endif - return r; -} - -static __always_inline t1ha_uint128_t xor128(t1ha_uint128_t x, - t1ha_uint128_t y) { - t1ha_uint128_t r; -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - r.v = x.v ^ y.v; -#else - r.l = x.l ^ y.l; - r.h = x.h ^ y.h; -#endif - return r; -} - -static __always_inline t1ha_uint128_t rot128(t1ha_uint128_t v, unsigned s) { - s &= 127; -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - v.v = (v.v << (128 - s)) | (v.v >> s); - return v; -#else - return s ? 
or128(left128(v, 128 - s), right128(v, s)) : v; -#endif -} - -static __always_inline t1ha_uint128_t add128(t1ha_uint128_t x, - t1ha_uint128_t y) { - t1ha_uint128_t r; -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - r.v = x.v + y.v; -#else - add64carry_last(add64carry_first(x.l, y.l, &r.l), x.h, y.h, &r.h); -#endif - return r; -} - -static __always_inline t1ha_uint128_t mul128(t1ha_uint128_t x, - t1ha_uint128_t y) { - t1ha_uint128_t r; -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - r.v = x.v * y.v; -#else - r.l = mul_64x64_128(x.l, y.l, &r.h); - r.h += x.l * y.h + y.l * x.h; -#endif - return r; -} - -/***************************************************************************/ - -#if T1HA0_AESNI_AVAILABLE && defined(__ia32__) -uint64_t t1ha_ia32cpu_features(void); - -static __always_inline bool t1ha_ia32_AESNI_avail(uint64_t ia32cpu_features) { - /* check for AES-NI */ - return (ia32cpu_features & UINT32_C(0x02000000)) != 0; -} - -static __always_inline bool t1ha_ia32_AVX_avail(uint64_t ia32cpu_features) { - /* check for any AVX */ - return (ia32cpu_features & UINT32_C(0x1A000000)) == UINT32_C(0x1A000000); -} - -static __always_inline bool t1ha_ia32_AVX2_avail(uint64_t ia32cpu_features) { - /* check for 'Advanced Vector Extensions 2' */ - return ((ia32cpu_features >> 32) & 32) != 0; -} - -#endif /* T1HA0_AESNI_AVAILABLE && __ia32__ */ From 9af0cbf108df3b4f15d8d442863770bfc08650dc Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 9 Feb 2019 16:09:55 +0100 Subject: [PATCH 33/35] Documentation formatting --- README.md | 2 +- doc/dataset.md | 29 ++++++++++++++--------------- doc/isa-ops.md | 12 ++++++------ doc/isa.md | 14 +++++++------- 4 files changed, 28 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index ed5d594..2c9b876 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Notable parts of the RandomX VM are: The structure of the VM mimics 
the components that are found in a typical general purpose computer equipped with a CPU and a large amount of DRAM. The scratchpad is designed to fit into the CPU cache. The first 16 KiB and 256 KiB of the scratchpad are used more often take advantage of the faster L1 and L2 caches. The ratio of random reads from L1/L2/L3 is approximately 9:3:1, which matches the inverse latencies of typical CPU caches. -The VM executes programs in a special instruction set, which was designed in such way that any random 8-byte word is a valid instruction and any sequence of valid instructions is a valid program. For more details see [RandomX ISA documentation](doc/isa.md). Because there are no "syntax" rules, generating a random program is as easy as filling the program buffer with random data. A RandomX program consists of 256 instructions. See [program.inc](../src/program.inc) as an example of a RandomX program translated into x86-64 assembly. +The VM executes programs in a special instruction set, which was designed in such way that any random 8-byte word is a valid instruction and any sequence of valid instructions is a valid program. For more details see [RandomX ISA documentation](doc/isa.md). Because there are no "syntax" rules, generating a random program is as easy as filling the program buffer with random data. A RandomX program consists of 256 instructions. See [program.inc](src/program.inc) as an example of a RandomX program translated into x86-64 assembly. #### Hash calculation diff --git a/doc/dataset.md b/doc/dataset.md index bb562bd..48d62ed 100644 --- a/doc/dataset.md +++ b/doc/dataset.md @@ -1,15 +1,14 @@ +# Dataset -## Dataset +The dataset is randomly accessed 16384 times during each hash calculation, which significantly increases memory-hardness of RandomX. The size of the dataset is fixed at 4 GiB and it's divided into 67108864 blocks of 64 bytes. 
-The dataset is randomly accessed 16384 times during each hash calculation, which significantly increases memory-hardness of RandomX. The size of the dataset is fixed at 4 GiB and it's divided into 67108864 block of 64 bytes. - -In order to allow PoW verification with less than 4 GiB of memory, the dataset is constructed from a 256 MiB cache, which can be used to calculate dataset rows on the fly. +In order to allow PoW verification with less than 4 GiB of memory, the dataset is constructed from a 256 MiB cache, which can be used to calculate dataset blocks on the fly. Because the initialization of the dataset is computationally intensive, it is recalculated only every 1024 blocks (~34 hours). The following figure visualizes the construction of the dataset: ![Imgur](https://i.imgur.com/b9WHOwo.png) -### Seed block +## Seed block The whole dataset is constructed from a 256-bit hash of the last block whose height is divisible by 1024 **and** has at least 64 confirmations. |block|Seed block| @@ -19,7 +18,7 @@ The whole dataset is constructed from a 256-bit hash of the last block whose hei |2113-3136|2048| |...|... -### Cache construction +## Cache construction The 32-byte seed block hash is expanded into the 256 MiB cache using the "memory fill" function of Argon2d. [Argon2](https://github.com/P-H-C/phc-winner-argon2) is a memory-hard password hashing function, which is highly customizable. The variant with "d" suffix uses a data-dependent memory access pattern and provides the highest resistance against time-memory tradeoffs. @@ -42,19 +41,19 @@ The finalizer and output calculation steps of Argon2 are omitted. The output is The use of 3 iterations makes time-memory tradeoffs infeasible and thus 256 MiB is the minimum amount of memory required by RandomX. -### Dataset block generation -The full 4 GiB dataset can be generated from the 256 MiB cache. 
Each 64-byte block is generated independently by XORing 16 pseudorandom Cache blocks selected by the `SquareHash` function. +## Dataset block generation +The full 4 GiB dataset can be generated from the 256 MiB cache. Each 64-byte block is generated independently by XORing 16 pseudorandom cache blocks selected by the `SquareHash` function. -#### SquareHash +### SquareHash `SquareHash` is a custom hash function with 64-bit input and 64-bit output. It is calculated by repeatedly squaring the input, splitting the 128-bit result in to two 64-bit halves and subtracting the high half from the low half. This is repeated 42 times. It's available as a [portable C implementation](../src/squareHash.h) and [x86-64 assembly version](../src/asm/squareHash.inc). Properties of `SquareHash`: * It achieves full [Avalanche effect](https://en.wikipedia.org/wiki/Avalanche_effect). * Since the whole calculation is a long dependency chain, which uses only multiplication and subtraction, the performance gains by using custom hardware are very limited. -* A single `SquareHash` calculation takes 40-80 ns, which is about the same time as DRAM access latency. Devices using low-latency memory will be bottlenecked by `SquareHash`, while CPUs will finish the hash calculation in about the same time it takes to fetch data from RAM. +* A single `SquareHash` calculation takes 40-80 ns, which is about the same time as DRAM access latency. ASIC devices using low-latency memory will be bottlenecked by `SquareHash`, while CPUs will finish the hash calculation in about the same time it takes to fetch data from RAM. 
-The output of 16 chained SquareHash calculations is used to determine Cache blocks that are XORed together to produce a Dataset block: +The output of 16 chained SquareHash calculations is used to determine cache blocks that are XORed together to produce a dataset block: ```c++ void initBlock(const uint8_t* cache, uint8_t* out, uint32_t blockNumber) { @@ -92,14 +91,14 @@ void initBlock(const uint8_t* cache, uint8_t* out, uint32_t blockNumber) { *Note: `SquareHash` doesn't calculate squaring modulo $2^{64}+1$ because the subtraction is performed modulo $2^{64}$. Squaring modulo $2^{64}+1$ can be calculated by adding the carry bit in every iteration (i.e. the sequence in x86-64 assembly would have to be: `mul rax; sub rax, rdx; adc rax, 0`), but this would decrease ASIC-resistance of `SquareHash`.* -### Performance +## Performance The initial 256-MiB cache construction using Argon2d takes around 1 second using an older laptop with an Intel i5-3230M CPU (Ivy Bridge). Cache generation is strictly serial and cannot be parallelized. -On the same laptop, full Dataset initialization takes around 100 seconds using a single thread (1.5 µs per block). +On the same laptop, full dataset initialization takes around 100 seconds using a single thread (1.5 µs per block). -While the generation of a single block is strictly serial, multiple blocks can be easily generated in parallel, so the Dataset generation time decreases linearly with the number of threads. Using an 8-core AMD Ryzen CPU, the whole dataset can be generated in under 10 seconds. +While the generation of a single block is strictly serial, multiple blocks can be easily generated in parallel, so the dataset generation time decreases linearly with the number of threads. Using an 8-core AMD Ryzen CPU, the whole dataset can be generated in under 10 seconds.
Moreover, the seed block hash is known up to 64 blocks in advance, so miners can slowly precalculate the whole dataset by generating 524288 dataset blocks per minute (corresponds to about 1% utilization of a single CPU core). -### Light clients +## Light clients Light clients, who cannot or do not want to generate and keep the whole dataset in memory, can generate just the cache and then generate blocks on the fly during hash calculation. In this case, the hash calculation time will be increased by 16384 times the single block generation time. For the Intel Ivy Bridge laptop, this amounts to around 24.5 milliseconds per hash. \ No newline at end of file diff --git a/doc/isa-ops.md b/doc/isa-ops.md index 4a1cca5..1ab9591 100644 --- a/doc/isa-ops.md +++ b/doc/isa-ops.md @@ -6,7 +6,7 @@ For integer instructions, the destination is always an integer register (registe Memory operands are loaded as 8-byte values from the address indicated by `src`. This indirect addressing is marked with square brackets: `[src]`. |frequency|instruction|dst|src|`src == dst ?`|operation| -|-|-|-|-|-|-|-|-| +|-|-|-|-|-|-| |12/256|IADD_R|R|R|`src = imm32`|`dst = dst + src`| |7/256|IADD_M|R|mem|`src = imm32`|`dst = dst + [src]`| |16/256|IADD_RC|R|R|`src = dst`|`dst = dst + src + imm32`| @@ -42,7 +42,7 @@ For floating point instructions, the destination can be a group F or group E reg Memory operands are loaded as 8-byte values from the address indicated by `src`. The 8 byte value is interpreted as two 32-bit signed integers and implicitly converted to floating point format. The lower and upper memory operands are marked as `[src][0]` and `[src][1]`. |frequency|instruction|dst|src|operation| -|-|-|-|-|-|-|-| +|-|-|-|-|-| |8/256|FSWAP_R|F+E|-|`(dst0, dst1) = (dst1, dst0)`| |20/256|FADD_R|F|A|`(dst0, dst1) = (dst0 + src0, dst1 + src1)`| |5/256|FADD_M|F|mem|`(dst0, dst1) = (dst0 + [src][0], dst1 + [src][1])`| @@ -67,16 +67,16 @@ All floating point instructions give correctly rounded results. 
The rounding mod |2|roundTowardPositive| |3|roundTowardZero| -The rounding modes are defined by the IEEE-754 standard. +The rounding modes are defined by the IEEE 754 standard. ## Other instructions There are 4 special instructions that have more than one source operand or the destination operand is a memory value. |frequency|instruction|dst|src|operation| |-|-|-|-|-| -|7/256|COND_R|R|R, `imm32`|`if(condition(src, imm32)) dst = dst + 1` -|1/256|COND_M|R|mem, `imm32`|`if(condition([src], imm32)) dst = dst + 1` -|1/256|CFROUND|`fprc`|R, `imm32`|`fprc = src >>> imm32` +|7/256|COND_R|R|R|`if(condition(src, imm32)) dst = dst + 1` +|1/256|COND_M|R|mem|`if(condition([src], imm32)) dst = dst + 1` +|1/256|CFROUND|`fprc`|R|`fprc = src >>> imm32` |16/256|ISTORE|mem|R|`[dst] = src` #### COND diff --git a/doc/isa.md b/doc/isa.md index 36a2634..83d4436 100644 --- a/doc/isa.md +++ b/doc/isa.md @@ -1,6 +1,6 @@ # RandomX instruction set architecture -RandomX VM is a complex instruction set computer ([CISC](https://en.wikipedia.org/wiki/Complex_instruction_set_computer)). All data are loaded and stored in little-endian byte order. Signed integer numbers are represented using [two's complement](https://en.wikipedia.org/wiki/Two%27s_complement). Floating point numbers are represented using the [IEEE-754 double precision format](https://en.wikipedia.org/wiki/Double-precision_floating-point_format). +RandomX VM is a complex instruction set computer ([CISC](https://en.wikipedia.org/wiki/Complex_instruction_set_computer)). All data are loaded and stored in little-endian byte order. Signed integer numbers are represented using [two's complement](https://en.wikipedia.org/wiki/Two%27s_complement). Floating point numbers are represented using the [IEEE 754 double precision format](https://en.wikipedia.org/wiki/Double-precision_floating-point_format). 
## Registers @@ -36,16 +36,16 @@ Each instruction word is 64 bits long and has the following format: ![Imgur](https://i.imgur.com/FtkWRwe.png) ### opcode -There are 256 opcodes, which are distributed between 35 distinct instructions. Each instruction can be encoded using multiple opcodes (the number of opcodes specifies the frequency of the instruction in a random program). +There are 256 opcodes, which are distributed between 32 distinct instructions. Each instruction can be encoded using multiple opcodes (the number of opcodes specifies the frequency of the instruction in a random program). *Table 2: Instruction groups* |group|# instructions|# opcodes|| |---------|-----------------|----|-| -|integer |20|143|55.9%| -|floating point |11|88|34.4%| -|other |4|25|9.7%| -||**35**|**256**|**100%** +|integer |19|137|53.5%| +|floating point |9|94|36.7%| +|other |4|25|9.8%| +||**32**|**256**|**100%** Full description of all instructions: [isa-ops.md](isa-ops.md). @@ -88,4 +88,4 @@ The address for reading/writing is calculated by applying bitwise AND operation The `mod.cond` flag is used only by the `COND` instruction to select a condition to be tested. ### imm32 -A 32-bit immediate value that can be used as the source operand. The immediate value is sign-extended to 64 bits in most cases. +A 32-bit immediate value that can be used as the source operand. The immediate value is sign-extended to 64 bits unless specified otherwise. 
From 2798d78717ee61f7a5f92f2085ac4ae7f8507671 Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 9 Feb 2019 16:19:15 +0100 Subject: [PATCH 34/35] Render imm32 as signed in RandomX code --- src/AssemblyGeneratorX86.cpp | 1 - src/CompiledVirtualMachine.cpp | 1 - src/Instruction.cpp | 20 +- src/InterpretedVirtualMachine.cpp | 1 - src/main.cpp | 1 - src/program.inc | 1388 +++++++++++++++-------------- 6 files changed, 706 insertions(+), 706 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index a2d1b32..9f03da1 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -20,7 +20,6 @@ along with RandomX. If not, see. #define MAGIC_DIVISION #include "AssemblyGeneratorX86.hpp" #include "common.hpp" -#include "instructions.hpp" #ifdef MAGIC_DIVISION #include "divideByConstantCodegen.h" #endif diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index b3b5db8..8cfc364 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -19,7 +19,6 @@ along with RandomX. If not, see. 
#include "CompiledVirtualMachine.hpp" #include "common.hpp" -#include "instructions.hpp" #include namespace RandomX { diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 35cc737..2fefcf3 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -45,7 +45,7 @@ namespace RandomX { os << "r" << (int)dst << ", r" << (int)src << std::endl; } else { - os << "r" << (int)dst << ", " << imm32 << std::endl; + os << "r" << (int)dst << ", " << (int32_t)imm32 << std::endl; } } @@ -63,7 +63,7 @@ namespace RandomX { } void Instruction::h_IADD_RC(std::ostream& os) const { - os << "r" << (int)dst << ", r" << (int)src << ", " << imm32 << std::endl; + os << "r" << (int)dst << ", r" << (int)src << ", " << (int32_t)imm32 << std::endl; } //1 uOP @@ -72,7 +72,7 @@ namespace RandomX { os << "r" << (int)dst << ", r" << (int)src << std::endl; } else { - os << "r" << (int)dst << ", " << imm32 << std::endl; + os << "r" << (int)dst << ", " << (int32_t)imm32 << std::endl; } } @@ -90,7 +90,7 @@ namespace RandomX { } void Instruction::h_IMUL_9C(std::ostream& os) const { - os << "r" << (int)dst << ", " << imm32 << std::endl; + os << "r" << (int)dst << ", " << (int32_t)imm32 << std::endl; } void Instruction::h_IMUL_R(std::ostream& os) const { @@ -98,7 +98,7 @@ namespace RandomX { os << "r" << (int)dst << ", r" << (int)src << std::endl; } else { - os << "r" << (int)dst << ", " << imm32 << std::endl; + os << "r" << (int)dst << ", " << (int32_t)imm32 << std::endl; } } @@ -158,7 +158,7 @@ namespace RandomX { os << "r" << (int)dst << ", r" << (int)src << std::endl; } else { - os << "r" << (int)dst << ", " << imm32 << std::endl; + os << "r" << (int)dst << ", " << (int32_t)imm32 << std::endl; } } @@ -194,11 +194,11 @@ namespace RandomX { } void Instruction::h_IDIV_C(std::ostream& os) const { - os << "r" << (int)dst << ", " << (uint32_t)imm32 << std::endl; + os << "r" << (int)dst << ", " << imm32 << std::endl; } void Instruction::h_ISDIV_C(std::ostream& os) const { - os << "r" << (int)dst << 
", " << imm32 << std::endl; + os << "r" << (int)dst << ", " << (int32_t)imm32 << std::endl; } void Instruction::h_ISWAP_R(std::ostream& os) const { @@ -300,13 +300,13 @@ namespace RandomX { } void Instruction::h_COND_R(std::ostream& os) const { - os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << imm32 << ")" << std::endl; + os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << (int32_t)imm32 << ")" << std::endl; } void Instruction::h_COND_M(std::ostream& os) const { os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "("; genAddressReg(os); - os << ", " << imm32 << ")" << std::endl; + os << ", " << (int32_t)imm32 << ")" << std::endl; } void Instruction::h_ISTORE(std::ostream& os) const { diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 9e0d5e2..0757f43 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -19,7 +19,6 @@ along with RandomX. If not, see. //#define TRACE //#define FPUCHECK #include "InterpretedVirtualMachine.hpp" -#include "instructions.hpp" #include "dataset.hpp" #include "Cache.hpp" #include "LightClientAsyncWorker.hpp" diff --git a/src/main.cpp b/src/main.cpp index b16b13b..0a10d8f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -29,7 +29,6 @@ along with RandomX. If not, see. 
#include #include "Program.hpp" #include -#include "instructions.hpp" #include #include #include "dataset.hpp" diff --git a/src/program.inc b/src/program.inc index 5de4504..ac8957b 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,736 +1,740 @@ - ; FMUL_R e0, a2 - mulpd xmm4, xmm10 - ; IADD_RC r2, r5, 2673743102 - lea r10, [r10+r13-1621224194] - ; ISTORE L2[r2], r7 - mov eax, r10d - and eax, 262136 - mov qword ptr [rsi+rax], r15 - ; FNEG_R f2 - xorps xmm2, xmm15 - ; IMUL_9C r6, 3291464084 - lea r14, [r14+r14*8-1003503212] - ; FSUB_R f1, a0 - subpd xmm1, xmm8 - ; IXOR_M r5, L2[r3] + ; COND_M r1, sg(L1[r3], -2004237569) + xor ecx, ecx mov eax, r11d + and eax, 16376 + cmp dword ptr [rsi+rax], -2004237569 + sets cl + add r9, rcx + ; IXOR_R r7, -1379425991 + xor r15, -1379425991 + ; IXOR_R r2, r6 + xor r10, r14 + ; FSWAP_R f3 + shufpd xmm3, xmm3, 1 + ; FADD_R f1, a1 + addpd xmm1, xmm9 + ; IMUL_R r0, r5 + imul r8, r13 + ; FMUL_R e1, a3 + mulpd xmm5, xmm11 + ; IADD_R r3, r2 + add r11, r10 + ; COND_M r1, ab(L2[r6], -724006934) + xor ecx, ecx + mov eax, r14d and eax, 262136 - xor r13, qword ptr [rsi+rax] - ; FNEG_R f2 - xorps xmm2, xmm15 - ; FSUB_R f3, a0 - subpd xmm3, xmm8 - ; ISDIV_C r0, 1400272688 - mov rax, 7072565507528518045 + cmp dword ptr [rsi+rax], -724006934 + seta cl + add r9, rcx + ; IADD_RC r2, r7, -854121467 + lea r10, [r10+r15-854121467] + ; IADD_RC r5, r6, 1291744030 + lea r13, [r13+r14+1291744030] + ; ISTORE L2[r6], r4 + mov eax, r14d + and eax, 262136 + mov qword ptr [rsi+rax], r12 + ; IMUL_R r6, r7 + imul r14, r15 + ; FSUB_R f0, a3 + subpd xmm0, xmm11 + ; IADD_M r3, L1[r0] + mov eax, r8d + and eax, 16376 + add r11, qword ptr [rsi+rax] + ; ISDIV_C r4, -692911499 + mov rax, -893288710803585809 + imul r12 + xor eax, eax + sar rdx, 25 + sets al + add rdx, rax + add r12, rdx + ; FMUL_R e0, a0 + mulpd xmm4, xmm8 + ; FDIV_M e1, L1[r0] + mov eax, r8d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + andps xmm12, xmm14 + divpd xmm5, xmm12 + maxpd 
xmm5, xmm13 + ; FMUL_R e0, a1 + mulpd xmm4, xmm9 + ; COND_M r0, no(L1[r1], -540292380) + xor ecx, ecx + mov eax, r9d + and eax, 16376 + cmp dword ptr [rsi+rax], -540292380 + setno cl + add r8, rcx + ; FSUB_R f1, a1 + subpd xmm1, xmm9 + ; IADD_RC r0, r2, 310371682 + lea r8, [r8+r10+310371682] + ; COND_R r3, lt(r0, -1067603143) + xor ecx, ecx + cmp r8d, -1067603143 + setl cl + add r11, rcx + ; FMUL_R e0, a0 + mulpd xmm4, xmm8 + ; FADD_R f0, a3 + addpd xmm0, xmm11 + ; COND_R r4, sg(r3, -389806289) + xor ecx, ecx + cmp r11d, -389806289 + sets cl + add r12, rcx + ; FMUL_R e0, a3 + mulpd xmm4, xmm11 + ; ISTORE L2[r7], r4 + mov eax, r15d + and eax, 262136 + mov qword ptr [rsi+rax], r12 + ; IADD_RC r4, r2, 1888908452 + lea r12, [r12+r10+1888908452] + ; IADD_R r1, r2 + add r9, r10 + ; IXOR_R r6, r5 + xor r14, r13 + ; IADD_M r7, L1[r0] + mov eax, r8d + and eax, 16376 + add r15, qword ptr [rsi+rax] + ; IADD_R r5, r6 + add r13, r14 + ; FSUB_R f0, a1 + subpd xmm0, xmm9 + ; IMULH_R r5, r4 + mov rax, r13 + mul r12 + mov r13, rdx + ; IMUL_9C r7, 753606235 + lea r15, [r15+r15*8+753606235] + ; FSWAP_R e2 + shufpd xmm6, xmm6, 1 + ; IMUL_M r7, L1[r1] + mov eax, r9d + and eax, 16376 + imul r15, qword ptr [rsi+rax] + ; IMUL_R r5, 1431156245 + imul r13, 1431156245 + ; IADD_RC r4, r2, 1268508410 + lea r12, [r12+r10+1268508410] + ; FSWAP_R f2 + shufpd xmm2, xmm2, 1 + ; ISDIV_C r0, -845194077 + mov rax, -5858725577819591251 imul r8 xor eax, eax - sar rdx, 29 + sar rdx, 28 sets al add rdx, rax add r8, rdx - ; IMUL_M r3, L1[r7] - mov eax, r15d - and eax, 16376 - imul r11, qword ptr [rsi+rax] - ; ISWAP_R r2, r3 - xchg r10, r11 - ; IMULH_R r6, r0 - mov rax, r14 - mul r8 - mov r14, rdx - ; FMUL_R e0, a2 - mulpd xmm4, xmm10 - ; IADD_RC r3, r4, 4242706868 - lea r11, [r11+r12-52260428] - ; IADD_R r7, 3156349536 - add r15, -1138617760 - ; IXOR_M r2, L1[r6] - mov eax, r14d - and eax, 16376 - xor r10, qword ptr [rsi+rax] - ; FSUB_M f2, L1[r5] - mov eax, r13d - and eax, 16376 - cvtdq2pd xmm12, qword 
ptr [rsi+rax] - subpd xmm2, xmm12 - ; IXOR_R r7, r1 - xor r15, r9 - ; COND_R r2, lt(r7, 4253348488) + ; COND_R r0, ab(r5, 1644043355) xor ecx, ecx - cmp r15d, -41618808 + cmp r13d, 1644043355 + seta cl + add r8, rcx + ; COND_R r5, lt(r0, 1216385844) + xor ecx, ecx + cmp r8d, 1216385844 setl cl - add r10, rcx - ; FMUL_R e3, a0 - mulpd xmm7, xmm8 - ; COND_R r4, sg(r1, 3333776931) - xor ecx, ecx - cmp r9d, -961190365 - sets cl - add r12, rcx - ; FADD_R f2, a1 - addpd xmm2, xmm9 - ; FSUB_R f0, a3 - subpd xmm0, xmm11 - ; ISTORE L1[r6], r2 - mov eax, r14d - and eax, 16376 - mov qword ptr [rsi+rax], r10 - ; ISUB_R r6, r5 - sub r14, r13 - ; IADD_M r0, L1[r4] + add r13, rcx + ; IMUL_R r5, r2 + imul r13, r10 + ; ISTORE L1[r4], r6 mov eax, r12d and eax, 16376 - add r8, qword ptr [rsi+rax] - ; ISTORE L1[r4], r3 - mov eax, r12d - and eax, 16376 - mov qword ptr [rsi+rax], r11 - ; COND_M r6, sg(L1[r6], 1048782623) - xor ecx, ecx - mov eax, r14d - and eax, 16376 - cmp dword ptr [rsi+rax], 1048782623 - sets cl - add r14, rcx - ; FSQRT_R e0 - sqrtpd xmm4, xmm4 - ; INEG_R r2 - neg r10 + mov qword ptr [rsi+rax], r14 + ; IXOR_R r4, r3 + xor r12, r11 + ; IXOR_R r6, r2 + xor r14, r10 ; FSQRT_R e1 sqrtpd xmm5, xmm5 - ; FMUL_R e1, a3 - mulpd xmm5, xmm11 - ; IMUL_R r7, r6 - imul r15, r14 - ; IMULH_R r0, r4 - mov rax, r8 - mul r12 - mov r8, rdx - ; IMUL_R r5, r3 - imul r13, r11 - ; FSQRT_R e2 - sqrtpd xmm6, xmm6 - ; FADD_R f3, a0 - addpd xmm3, xmm8 - ; IADD_R r3, r2 - add r11, r10 - ; FADD_R f1, a0 - addpd xmm1, xmm8 - ; FMUL_R e3, a2 - mulpd xmm7, xmm10 - ; FADD_M f0, L2[r5] - mov eax, r13d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm0, xmm12 - ; IMUL_R r5, r6 - imul r13, r14 - ; IADD_RC r1, r2, 3031682053 - lea r9, [r9+r10-1263285243] - ; ISUB_M r4, L1[r6] - mov eax, r14d - and eax, 16376 - sub r12, qword ptr [rsi+rax] - ; FSWAP_R e3 - shufpd xmm7, xmm7, 1 - ; IMUL_R r0, r7 - imul r8, r15 - ; IXOR_R r1, r6 - xor r9, r14 - ; IXOR_M r2, L1[r4] - mov eax, r12d - and 
eax, 16376 - xor r10, qword ptr [rsi+rax] - ; FSUB_R f3, a1 - subpd xmm3, xmm9 - ; ISTORE L1[r0], r5 - mov eax, r8d - and eax, 16376 - mov qword ptr [rsi+rax], r13 - ; FDIV_M e2, L2[r3] + ; COND_R r5, be(r1, 1781435695) + xor ecx, ecx + cmp r9d, 1781435695 + setbe cl + add r13, rcx + ; ISDIV_C r0, 1367038890 + mov rax, 1811126293978922977 + imul r8 + xor eax, eax + sar rdx, 27 + sets al + add rdx, rax + add r8, rdx + ; FDIV_M e1, L1[r3] mov eax, r11d - and eax, 262136 + and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] andps xmm12, xmm14 - divpd xmm6, xmm12 - maxpd xmm6, xmm13 - ; ISWAP_R r2, r0 - xchg r10, r8 - ; IADD_R r7, r5 - add r15, r13 - ; FDIV_M e0, L1[r4] - mov eax, r12d + divpd xmm5, xmm12 + maxpd xmm5, xmm13 + ; FMUL_R e2, a0 + mulpd xmm6, xmm8 + ; ISTORE L1[r5], r4 + mov eax, r13d + and eax, 16376 + mov qword ptr [rsi+rax], r12 + ; IXOR_R r0, r4 + xor r8, r12 + ; IMUL_R r5, r1 + imul r13, r9 + ; FDIV_M e0, L1[r2] + mov eax, r10d and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] andps xmm12, xmm14 divpd xmm4, xmm12 maxpd xmm4, xmm13 - ; FADD_R f3, a1 - addpd xmm3, xmm9 - ; FADD_R f0, a3 - addpd xmm0, xmm11 - ; IADD_R r2, r0 - add r10, r8 - ; ISTORE L1[r3], r6 - mov eax, r11d + ; IMUL_R r6, r1 + imul r14, r9 + ; FSUB_M f1, L1[r0] + mov eax, r8d and eax, 16376 - mov qword ptr [rsi+rax], r14 - ; IXOR_R r1, r7 - xor r9, r15 - ; ISUB_M r5, L2[r7] - mov eax, r15d + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm1, xmm12 + ; COND_R r2, ns(r1, 392878356) + xor ecx, ecx + cmp r9d, 392878356 + setns cl + add r10, rcx + ; IADD_R r6, r5 + add r14, r13 + ; FMUL_R e2, a0 + mulpd xmm6, xmm8 + ; ISTORE L1[r0], r3 + mov eax, r8d + and eax, 16376 + mov qword ptr [rsi+rax], r11 + ; IMUL_R r1, r3 + imul r9, r11 + ; IMUL_R r5, r2 + imul r13, r10 + ; FADD_R f0, a0 + addpd xmm0, xmm8 + ; FADD_R f0, a1 + addpd xmm0, xmm9 + ; FSUB_R f0, a0 + subpd xmm0, xmm8 + ; IMUL_R r3, r5 + imul r11, r13 + ; IADD_R r1, r5 + add r9, r13 + ; IXOR_M r0, L1[r5] + mov eax, r13d + and eax, 16376 
+ xor r8, qword ptr [rsi+rax] + ; FNEG_R f2 + xorps xmm2, xmm15 + ; IDIV_C r5, 2577129788 + mov rax, 15371395512010654233 + mul r13 + shr rdx, 31 + add r13, rdx + ; COND_R r5, be(r5, -999219370) + xor ecx, ecx + cmp r13d, -999219370 + setbe cl + add r13, rcx + ; ISTORE L2[r0], r2 + mov eax, r8d and eax, 262136 - sub r13, qword ptr [rsi+rax] - ; ISDIV_C r7, 266992378 - mov rax, -9173520256920442565 - imul r15 + mov qword ptr [rsi+rax], r10 + ; FSUB_R f3, a3 + subpd xmm3, xmm11 + ; IROR_R r7, r6 + mov ecx, r14d + ror r15, cl + ; COND_R r6, ab(r4, 1309137534) + xor ecx, ecx + cmp r12d, 1309137534 + seta cl + add r14, rcx + ; FMUL_R e3, a0 + mulpd xmm7, xmm8 + ; COND_M r3, no(L2[r5], 483660199) + xor ecx, ecx + mov eax, r13d + and eax, 262136 + cmp dword ptr [rsi+rax], 483660199 + setno cl + add r11, rcx + ; IMUL_R r1, r6 + imul r9, r14 + ; IADD_RC r7, r2, -1340630490 + lea r15, [r15+r10-1340630490] + ; IADD_M r0, L3[1554088] + add r8, qword ptr [rsi+1554088] + ; FMUL_R e2, a3 + mulpd xmm6, xmm11 + ; IDIV_C r0, 1566192452 + mov rax, 12646619898641986559 + mul r8 + shr rdx, 30 + add r8, rdx + ; FADD_R f0, a1 + addpd xmm0, xmm9 + ; ISWAP_R r6, r0 + xchg r14, r8 + ; IMUL_9C r4, 1340891034 + lea r12, [r12+r12*8+1340891034] + ; IROR_R r7, r2 + mov ecx, r10d + ror r15, cl + ; FSQRT_R e2 + sqrtpd xmm6, xmm6 + ; FADD_R f2, a1 + addpd xmm2, xmm9 + ; IMUL_R r4, r3 + imul r12, r11 + ; IADD_RC r6, r3, -1584624397 + lea r14, [r14+r11-1584624397] + ; IROR_R r1, r7 + mov ecx, r15d + ror r9, cl + ; IXOR_R r4, r7 + xor r12, r15 + ; FSWAP_R f0 + shufpd xmm0, xmm0, 1 + ; FSWAP_R f3 + shufpd xmm3, xmm3, 1 + ; IROR_R r5, 3 + ror r13, 3 + ; FADD_R f3, a0 + addpd xmm3, xmm8 + ; FMUL_R e0, a0 + mulpd xmm4, xmm8 + ; IADD_R r4, r1 + add r12, r9 + ; COND_M r4, ge(L1[r6], -1612023931) + xor ecx, ecx + mov eax, r14d + and eax, 16376 + cmp dword ptr [rsi+rax], -1612023931 + setge cl + add r12, rcx + ; FSWAP_R e2 + shufpd xmm6, xmm6, 1 + ; IADD_R r3, r7 + add r11, r15 + ; COND_R r5, be(r2, 
-1083018923) + xor ecx, ecx + cmp r10d, -1083018923 + setbe cl + add r13, rcx + ; IADD_R r3, r7 + add r11, r15 + ; ISTORE L2[r6], r0 + mov eax, r14d + and eax, 262136 + mov qword ptr [rsi+rax], r8 + ; IXOR_R r2, r3 + xor r10, r11 + ; FMUL_R e2, a3 + mulpd xmm6, xmm11 + ; FMUL_R e3, a3 + mulpd xmm7, xmm11 + ; FADD_R f0, a2 + addpd xmm0, xmm10 + ; ISTORE L1[r5], r1 + mov eax, r13d + and eax, 16376 + mov qword ptr [rsi+rax], r9 + ; FMUL_R e3, a3 + mulpd xmm7, xmm11 + ; ISWAP_R r1, r2 + xchg r9, r10 + ; FSWAP_R e0 + shufpd xmm4, xmm4, 1 + ; FSUB_R f1, a2 + subpd xmm1, xmm10 + ; FSUB_R f0, a0 + subpd xmm0, xmm8 + ; IROR_R r7, r0 + mov ecx, r8d + ror r15, cl + ; IADD_RC r5, r4, 283260945 + lea r13, [r13+r12+283260945] + ; ISDIV_C r6, -340125851 + mov rax, -3639652898025032137 + imul r14 xor eax, eax - add rdx, r15 - sar rdx, 27 + sar rdx, 26 sets al add rdx, rax - add r15, rdx - ; FDIV_M e3, L1[r4] + add r14, rdx + ; ISTORE L2[r2], r3 + mov eax, r10d + and eax, 262136 + mov qword ptr [rsi+rax], r11 + ; IADD_RC r6, r6, -935765909 + lea r14, [r14+r14-935765909] + ; ISDIV_C r3, -701703430 + mov rax, -7056770631919985199 + imul r11 + xor eax, eax + sar rdx, 28 + sets al + add rdx, rax + add r11, rdx + ; IXOR_M r3, L2[r1] + mov eax, r9d + and eax, 262136 + xor r11, qword ptr [rsi+rax] + ; FADD_R f2, a1 + addpd xmm2, xmm9 + ; ISTORE L1[r5], r7 + mov eax, r13d + and eax, 16376 + mov qword ptr [rsi+rax], r15 + ; FSUB_R f2, a0 + subpd xmm2, xmm8 + ; FMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IADD_R r2, r5 + add r10, r13 + ; IADD_RC r2, r5, -1056770544 + lea r10, [r10+r13-1056770544] + ; ISTORE L2[r2], r3 + mov eax, r10d + and eax, 262136 + mov qword ptr [rsi+rax], r11 + ; ISMULH_R r7, r1 + mov rax, r15 + imul r9 + mov r15, rdx + ; IXOR_R r0, r5 + xor r8, r13 + ; ISTORE L1[r4], r0 mov eax, r12d and eax, 16376 + mov qword ptr [rsi+rax], r8 + ; INEG_R r5 + neg r13 + ; FSUB_R f0, a1 + subpd xmm0, xmm9 + ; IMUL_R r6, -244261682 + imul r14, -244261682 + ; IMUL_R r1, r0 + imul r9, r8 + ; 
IMUL_9C r3, -985744277 + lea r11, [r11+r11*8-985744277] + ; IROR_R r2, r1 + mov ecx, r9d + ror r10, cl + ; ISUB_R r4, -1079131550 + sub r12, -1079131550 + ; FNEG_R f3 + xorps xmm3, xmm15 + ; COND_R r4, ns(r5, -362284631) + xor ecx, ecx + cmp r13d, -362284631 + setns cl + add r12, rcx + ; FSUB_R f2, a0 + subpd xmm2, xmm8 + ; IXOR_R r4, r5 + xor r12, r13 + ; FNEG_R f1 + xorps xmm1, xmm15 + ; FADD_R f0, a0 + addpd xmm0, xmm8 + ; IADD_RC r3, r3, -173615832 + lea r11, [r11+r11-173615832] + ; IMUL_R r0, 928402279 + imul r8, 928402279 + ; ISUB_R r2, r0 + sub r10, r8 + ; IXOR_R r6, r3 + xor r14, r11 + ; ISUB_R r2, 2106401471 + sub r10, 2106401471 + ; FADD_R f0, a2 + addpd xmm0, xmm10 + ; IMUL_R r4, r6 + imul r12, r14 + ; IADD_RC r4, r0, -373491513 + lea r12, [r12+r8-373491513] + ; ISDIV_C r0, -1739042721 + mov rax, 7057121271817449967 + imul r8 + xor eax, eax + sub rdx, r8 + sar rdx, 30 + sets al + add rdx, rax + add r8, rdx + ; IADD_R r3, r1 + add r11, r9 + ; ISUB_M r7, L1[r5] + mov eax, r13d + and eax, 16376 + sub r15, qword ptr [rsi+rax] + ; IMUL_R r1, r2 + imul r9, r10 + ; ISUB_R r0, 722465116 + sub r8, 722465116 + ; IADD_RC r0, r0, -1919541169 + lea r8, [r8+r8-1919541169] + ; ISUB_M r2, L1[r3] + mov eax, r11d + and eax, 16376 + sub r10, qword ptr [rsi+rax] + ; IADD_R r7, -1183581468 + add r15, -1183581468 + ; FMUL_R e1, a3 + mulpd xmm5, xmm11 + ; FSUB_R f0, a0 + subpd xmm0, xmm8 + ; FADD_R f0, a3 + addpd xmm0, xmm11 + ; IMUL_9C r6, 1241113238 + lea r14, [r14+r14*8+1241113238] + ; FSUB_R f3, a3 + subpd xmm3, xmm11 + ; IADD_M r0, L1[r3] + mov eax, r11d + and eax, 16376 + add r8, qword ptr [rsi+rax] + ; IROR_R r3, r7 + mov ecx, r15d + ror r11, cl + ; FADD_R f2, a1 + addpd xmm2, xmm9 + ; IMUL_M r3, L1[r2] + mov eax, r10d + and eax, 16376 + imul r11, qword ptr [rsi+rax] + ; IMUL_9C r7, -2080412544 + lea r15, [r15+r15*8-2080412544] + ; IMUL_R r0, r3 + imul r8, r11 + ; FADD_R f1, a1 + addpd xmm1, xmm9 + ; IROR_R r6, 21 + ror r14, 21 + ; FDIV_M e3, L1[r1] + mov eax, r9d + and 
eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] andps xmm12, xmm14 divpd xmm7, xmm12 maxpd xmm7, xmm13 - ; IMUL_R r2, r0 - imul r10, r8 - ; FMUL_R e3, a2 - mulpd xmm7, xmm10 - ; IMUL_R r0, r6 - imul r8, r14 - ; ISTORE L1[r0], r7 - mov eax, r8d - and eax, 16376 - mov qword ptr [rsi+rax], r15 - ; FSUB_M f0, L2[r1] - mov eax, r9d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm0, xmm12 - ; FADD_R f3, a1 - addpd xmm3, xmm9 - ; IXOR_R r5, r4 - xor r13, r12 - ; ISTORE L2[r7], r2 - mov eax, r15d - and eax, 262136 - mov qword ptr [rsi+rax], r10 - ; FSWAP_R e2 - shufpd xmm6, xmm6, 1 - ; FADD_R f3, a2 - addpd xmm3, xmm10 - ; ISMULH_R r5, r0 - mov rax, r13 - imul r8 - mov r13, rdx - ; IADD_M r0, L1[r4] - mov eax, r12d - and eax, 16376 - add r8, qword ptr [rsi+rax] - ; COND_R r7, ge(r6, 2322068811) - xor ecx, ecx - cmp r14d, -1972898485 - setge cl - add r15, rcx - ; FADD_R f2, a2 - addpd xmm2, xmm10 - ; IROR_R r7, r6 - mov ecx, r14d - ror r15, cl - ; IADD_RC r2, r4, 4177509323 - lea r10, [r10+r12-117457973] - ; IMUL_R r0, 2794074228 - imul r8, -1500893068 - ; IADD_R r2, r3 - add r10, r11 - ; FSQRT_R e2 - sqrtpd xmm6, xmm6 - ; IROR_R r7, r4 - mov ecx, r12d - ror r15, cl - ; IMUL_9C r4, 381194890 - lea r12, [r12+r12*8+381194890] - ; IADD_RC r3, r7, 1050899263 - lea r11, [r11+r15+1050899263] - ; IADD_R r2, r7 - add r10, r15 - ; FMUL_R e3, a0 - mulpd xmm7, xmm8 - ; IADD_RC r6, r6, 540663146 - lea r14, [r14+r14+540663146] - ; IROR_R r5, 58 - ror r13, 58 - ; FADD_R f2, a1 - addpd xmm2, xmm9 - ; FADD_R f2, a2 - addpd xmm2, xmm10 - ; FMUL_R e1, a2 - mulpd xmm5, xmm10 + ; FSUB_R f0, a1 + subpd xmm0, xmm9 ; FSWAP_R e1 shufpd xmm5, xmm5, 1 - ; IADD_R r5, r3 - add r13, r11 - ; IADD_R r7, 2514699120 - add r15, -1780268176 - ; IADD_RC r7, r0, 2797210442 - lea r15, [r15+r8-1497756854] - ; ISTORE L2[r0], r7 - mov eax, r8d - and eax, 262136 - mov qword ptr [rsi+rax], r15 - ; ISMULH_R r2, r4 - mov rax, r10 - imul r12 - mov r10, rdx - ; FSUB_M f0, L1[r2] - mov eax, r10d - and 
eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm0, xmm12 - ; ISMULH_R r2, r3 - mov rax, r10 - imul r11 - mov r10, rdx - ; IADD_R r0, r3 - add r8, r11 - ; ISUB_R r7, r2 - sub r15, r10 - ; FADD_R f2, a0 - addpd xmm2, xmm8 - ; FMUL_R e0, a2 - mulpd xmm4, xmm10 - ; FADD_M f2, L1[r3] - mov eax, r11d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm2, xmm12 - ; IMUL_R r1, r2 - imul r9, r10 - ; IMUL_M r7, L1[r5] + ; COND_M r0, no(L1[r5], -1627153829) + xor ecx, ecx mov eax, r13d and eax, 16376 - imul r15, qword ptr [rsi+rax] + cmp dword ptr [rsi+rax], -1627153829 + setno cl + add r8, rcx + ; FADD_R f2, a3 + addpd xmm2, xmm11 + ; FSUB_R f1, a2 + subpd xmm1, xmm10 + ; FSUB_M f1, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm1, xmm12 + ; ISTORE L1[r5], r1 + mov eax, r13d + and eax, 16376 + mov qword ptr [rsi+rax], r9 + ; ISUB_M r2, L2[r7] + mov eax, r15d + and eax, 262136 + sub r10, qword ptr [rsi+rax] + ; ISTORE L1[r2], r3 + mov eax, r10d + and eax, 16376 + mov qword ptr [rsi+rax], r11 + ; FADD_R f0, a3 + addpd xmm0, xmm11 + ; ISUB_M r1, L1[r7] + mov eax, r15d + and eax, 16376 + sub r9, qword ptr [rsi+rax] + ; IDIV_C r5, 624165039 + mov rax, 15866829597104432181 + mul r13 + shr rdx, 29 + add r13, rdx + ; FMUL_R e3, a0 + mulpd xmm7, xmm8 + ; IMUL_R r5, r4 + imul r13, r12 + ; FMUL_R e3, a1 + mulpd xmm7, xmm9 + ; FMUL_R e3, a3 + mulpd xmm7, xmm11 + ; IXOR_R r0, -2064879200 + xor r8, -2064879200 + ; FADD_R f1, a3 + addpd xmm1, xmm11 + ; IADD_M r0, L1[r3] + mov eax, r11d + and eax, 16376 + add r8, qword ptr [rsi+rax] + ; ISMULH_R r7, r3 + mov rax, r15 + imul r11 + mov r15, rdx + ; IMUL_R r5, -1645503310 + imul r13, -1645503310 + ; IMUL_R r7, r3 + imul r15, r11 + ; FMUL_R e2, a2 + mulpd xmm6, xmm10 + ; IADD_R r6, 1769041191 + add r14, 1769041191 + ; FSUB_M f1, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm1, xmm12 + ; ISTORE L2[r1], r0 + mov eax, r9d + and eax, 
262136 + mov qword ptr [rsi+rax], r8 + ; FNEG_R f0 + xorps xmm0, xmm15 + ; FMUL_R e0, a3 + mulpd xmm4, xmm11 + ; IMUL_R r2, r7 + imul r10, r15 + ; IADD_R r5, r1 + add r13, r9 + ; IROR_R r3, r6 + mov ecx, r14d + ror r11, cl + ; FADD_R f0, a0 + addpd xmm0, xmm8 + ; FMUL_R e1, a2 + mulpd xmm5, xmm10 + ; FNEG_R f3 + xorps xmm3, xmm15 + ; FADD_R f1, a1 + addpd xmm1, xmm9 + ; IMULH_R r2, r5 + mov rax, r10 + mul r13 + mov r10, rdx + ; ISTORE L1[r4], r0 + mov eax, r12d + and eax, 16376 + mov qword ptr [rsi+rax], r8 + ; ISWAP_R r7, r0 + xchg r15, r8 + ; FSWAP_R f0 + shufpd xmm0, xmm0, 1 + ; ISUB_R r2, r0 + sub r10, r8 + ; FSUB_R f1, a3 + subpd xmm1, xmm11 + ; ISUB_M r5, L1[r3] + mov eax, r11d + and eax, 16376 + sub r13, qword ptr [rsi+rax] + ; IXOR_R r7, r0 + xor r15, r8 + ; IMUL_R r4, r1 + imul r12, r9 + ; IADD_RC r0, r2, -1102648763 + lea r8, [r8+r10-1102648763] + ; FMUL_R e3, a3 + mulpd xmm7, xmm11 + ; IXOR_R r4, r1 + xor r12, r9 + ; IXOR_R r6, r0 + xor r14, r8 + ; FSQRT_R e1 + sqrtpd xmm5, xmm5 + ; IMUL_M r6, L2[r1] + mov eax, r9d + and eax, 262136 + imul r14, qword ptr [rsi+rax] + ; ISMULH_M r5, L3[353552] + mov rax, r13 + imul qword ptr [rsi+353552] + mov r13, rdx + ; ISUB_M r1, L1[r6] + mov eax, r14d + and eax, 16376 + sub r9, qword ptr [rsi+rax] + ; FADD_R f0, a3 + addpd xmm0, xmm11 + ; FMUL_R e3, a3 + mulpd xmm7, xmm11 + ; FSUB_M f3, L2[r7] + mov eax, r15d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm3, xmm12 + ; IMUL_R r0, r2 + imul r8, r10 + ; FMUL_R e1, a0 + mulpd xmm5, xmm8 + ; COND_R r5, sg(r3, -1392293091) + xor ecx, ecx + cmp r11d, -1392293091 + sets cl + add r13, rcx + ; FSWAP_R e3 + shufpd xmm7, xmm7, 1 + ; IMUL_R r7, r4 + imul r15, r12 + ; IXOR_R r7, r5 + xor r15, r13 + ; FMUL_R e3, a3 + mulpd xmm7, xmm11 + ; IMUL_R r4, r3 + imul r12, r11 + ; FADD_M f1, L1[r1] + mov eax, r9d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm1, xmm12 + ; IMUL_R r5, r0 + imul r13, r8 + ; ISUB_R r7, r0 + sub r15, r8 + ; IADD_M r5, L1[r4] 
+ mov eax, r12d + and eax, 16376 + add r13, qword ptr [rsi+rax] + ; IADD_R r6, r2 + add r14, r10 + ; FMUL_R e1, a1 + mulpd xmm5, xmm9 + ; IADD_M r2, L3[1073640] + add r10, qword ptr [rsi+1073640] ; IMUL_R r3, r2 imul r11, r10 ; IXOR_R r1, r0 xor r9, r8 - ; FNEG_R f0 - xorps xmm0, xmm15 - ; IADD_RC r4, r4, 1456841848 - lea r12, [r12+r12+1456841848] - ; IXOR_R r3, r2 - xor r11, r10 - ; COND_R r0, of(r4, 1678513610) - xor ecx, ecx - cmp r12d, 1678513610 - seto cl - add r8, rcx - ; ISMULH_R r4, r4 - mov rax, r12 - imul r12 - mov r12, rdx - ; IMUL_R r4, r1 - imul r12, r9 - ; FADD_R f1, a2 - addpd xmm1, xmm10 - ; FSUB_R f2, a0 - subpd xmm2, xmm8 - ; FMUL_R e1, a2 - mulpd xmm5, xmm10 - ; FSUB_R f0, a3 - subpd xmm0, xmm11 - ; IXOR_R r0, r7 - xor r8, r15 - ; ISTORE L2[r1], r4 - mov eax, r9d - and eax, 262136 - mov qword ptr [rsi+rax], r12 - ; IXOR_M r7, L1[r6] - mov eax, r14d - and eax, 16376 - xor r15, qword ptr [rsi+rax] - ; ISUB_R r2, r4 - sub r10, r12 - ; ISUB_M r4, L1[r6] - mov eax, r14d - and eax, 16376 - sub r12, qword ptr [rsi+rax] - ; FADD_R f2, a2 - addpd xmm2, xmm10 - ; FSUB_M f3, L2[r4] + ; IROR_R r7, r4 + mov ecx, r12d + ror r15, cl + ; FSUB_R f1, a1 + subpd xmm1, xmm9 + ; IMUL_R r7, r5 + imul r15, r13 + ; ISUB_R r1, 866191482 + sub r9, 866191482 + ; IMUL_M r7, L1[r4] mov eax, r12d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm3, xmm12 - ; IXOR_R r7, r2 - xor r15, r10 - ; IXOR_R r0, r5 - xor r8, r13 - ; FADD_R f1, a2 - addpd xmm1, xmm10 - ; FMUL_R e3, a2 - mulpd xmm7, xmm10 - ; FSWAP_R e3 - shufpd xmm7, xmm7, 1 - ; FSWAP_R f1 - shufpd xmm1, xmm1, 1 - ; COND_R r2, ge(r2, 4068636356) - xor ecx, ecx - cmp r10d, -226330940 - setge cl - add r10, rcx - ; FMUL_R e2, a3 - mulpd xmm6, xmm11 - ; FSUB_M f2, L2[r1] - mov eax, r9d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm2, xmm12 - ; FADD_R f1, a0 - addpd xmm1, xmm8 - ; ISUB_R r7, r5 - sub r15, r13 - ; ISUB_M r0, L1[r1] - mov eax, r9d and eax, 16376 - sub r8, qword ptr 
[rsi+rax] - ; FSUB_R f3, a1 - subpd xmm3, xmm9 - ; ISWAP_R r3, r5 - xchg r11, r13 - ; IADD_RC r5, r2, 795784298 - lea r13, [r13+r10+795784298] - ; IADD_RC r0, r4, 2244788743 - lea r8, [r8+r12-2050178553] - ; IMUL_9C r5, 1062534001 - lea r13, [r13+r13*8+1062534001] - ; FADD_R f0, a2 - addpd xmm0, xmm10 - ; FMUL_R e3, a1 - mulpd xmm7, xmm9 - ; IDIV_C r3, 1662492575 - mov rax, 11914062610815620875 - mul r11 - shr rdx, 30 - add r11, rdx - ; IMUL_M r5, L1[r0] - mov eax, r8d - and eax, 16376 - imul r13, qword ptr [rsi+rax] - ; IDIV_C r4, 1963597892 - mov rax, r12 - shr rax, 2 - mov rcx, 1260889558222626443 - mul rcx - shr rdx, 25 - add r12, rdx - ; IMUL_9C r7, 1820045218 - lea r15, [r15+r15*8+1820045218] - ; IMUL_M r0, L1[r3] - mov eax, r11d - and eax, 16376 - imul r8, qword ptr [rsi+rax] - ; IXOR_R r3, r7 - xor r11, r15 - ; ISMULH_R r4, r2 - mov rax, r12 - imul r10 - mov r12, rdx - ; ISWAP_R r3, r0 - xchg r11, r8 - ; IXOR_R r2, r0 - xor r10, r8 - ; IXOR_M r0, L2[r1] - mov eax, r9d - and eax, 262136 - xor r8, qword ptr [rsi+rax] - ; ISDIV_C r7, 3359520316 - mov rax, 7859804860668271393 - imul r15 - xor eax, eax - sub rdx, r15 - sar rdx, 29 - sets al - add rdx, rax - add r15, rdx - ; IMUL_M r6, L1[r2] - mov eax, r10d - and eax, 16376 - imul r14, qword ptr [rsi+rax] - ; FNEG_R f3 - xorps xmm3, xmm15 - ; IADD_RC r4, r2, 1704868083 - lea r12, [r12+r10+1704868083] + imul r15, qword ptr [rsi+rax] ; FADD_R f2, a0 addpd xmm2, xmm8 - ; ISTORE L1[r0], r0 - mov eax, r8d - and eax, 16376 - mov qword ptr [rsi+rax], r8 - ; FADD_M f0, L1[r7] - mov eax, r15d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm0, xmm12 - ; FMUL_R e0, a3 - mulpd xmm4, xmm11 - ; FSUB_R f3, a2 - subpd xmm3, xmm10 - ; IADD_RC r7, r7, 1302457878 - lea r15, [r15+r15+1302457878] - ; ISUB_R r1, 1330165941 - sub r9, 1330165941 - ; FNEG_R f1 - xorps xmm1, xmm15 - ; IROR_R r0, r4 - mov ecx, r12d - ror r8, cl - ; FSUB_R f1, a0 - subpd xmm1, xmm8 - ; IROR_R r5, r6 - mov ecx, r14d - ror r13, cl - ; 
COND_R r0, ab(r1, 3984033425) - xor ecx, ecx - cmp r9d, -310933871 - seta cl - add r8, rcx - ; COND_R r4, ab(r7, 757929676) - xor ecx, ecx - cmp r15d, 757929676 - seta cl - add r12, rcx - ; FMUL_R e0, a1 - mulpd xmm4, xmm9 - ; IMUL_R r1, r3 - imul r9, r11 - ; ISUB_R r3, r2 - sub r11, r10 - ; FSUB_R f3, a2 - subpd xmm3, xmm10 - ; FDIV_M e1, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - andps xmm12, xmm14 - divpd xmm5, xmm12 - maxpd xmm5, xmm13 - ; FSWAP_R f1 - shufpd xmm1, xmm1, 1 - ; IADD_R r7, 2873779272 - add r15, -1421188024 - ; FSUB_M f3, L2[r2] - mov eax, r10d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm3, xmm12 - ; FSUB_R f2, a3 - subpd xmm2, xmm11 - ; FSUB_R f3, a1 - subpd xmm3, xmm9 - ; FMUL_R e1, a3 - mulpd xmm5, xmm11 - ; IADD_RC r2, r4, 3977135268 - lea r10, [r10+r12-317832028] - ; IMUL_M r4, L1[r5] - mov eax, r13d - and eax, 16376 - imul r12, qword ptr [rsi+rax] - ; FDIV_M e1, L1[r7] - mov eax, r15d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - andps xmm12, xmm14 - divpd xmm5, xmm12 - maxpd xmm5, xmm13 - ; IADD_R r5, r2 - add r13, r10 - ; ISUB_R r4, 401020510 - sub r12, 401020510 - ; IROR_R r3, r0 - mov ecx, r8d - ror r11, cl - ; ISTORE L1[r7], r0 - mov eax, r15d - and eax, 16376 - mov qword ptr [rsi+rax], r8 - ; FSUB_R f2, a1 - subpd xmm2, xmm9 - ; FMUL_R e3, a1 - mulpd xmm7, xmm9 - ; IMUL_9C r3, 720965215 - lea r11, [r11+r11*8+720965215] - ; IMUL_9C r6, 74948046 - lea r14, [r14+r14*8+74948046] - ; ISTORE L1[r7], r3 - mov eax, r15d - and eax, 16376 - mov qword ptr [rsi+rax], r11 - ; IXOR_R r2, r6 - xor r10, r14 - ; FMUL_R e3, a1 - mulpd xmm7, xmm9 - ; ISUB_R r4, r1 - sub r12, r9 - ; ISUB_R r3, r0 - sub r11, r8 - ; ISWAP_R r7, r5 - xchg r15, r13 - ; IMUL_R r2, r6 - imul r10, r14 - ; COND_R r2, ge(r2, 2402809790) - xor ecx, ecx - cmp r10d, -1892157506 - setge cl - add r10, rcx - ; FADD_R f1, a3 - addpd xmm1, xmm11 - ; IADD_R r7, r0 - add r15, r8 - ; IDIV_C r1, 624867857 - mov rax, 
15848983434401622933 - mul r9 - shr rdx, 29 - add r9, rdx - ; FADD_R f0, a1 - addpd xmm0, xmm9 - ; IADD_RC r5, r7, 3817376178 - lea r13, [r13+r15-477591118] - ; FSUB_R f0, a3 - subpd xmm0, xmm11 - ; ISUB_M r6, L1[r2] - mov eax, r10d - and eax, 16376 - sub r14, qword ptr [rsi+rax] - ; FMUL_R e3, a1 - mulpd xmm7, xmm9 - ; IADD_R r0, r4 - add r8, r12 - ; FSUB_R f3, a1 - subpd xmm3, xmm9 - ; FSUB_M f2, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm2, xmm12 - ; ISDIV_C r2, 3898255608 - mov rax, 5964731804029407733 - imul r10 - xor eax, eax - sub rdx, r10 - sar rdx, 28 - sets al - add rdx, rax - add r10, rdx - ; FNEG_R f2 - xorps xmm2, xmm15 - ; FSUB_R f3, a2 - subpd xmm3, xmm10 - ; FADD_R f1, a3 - addpd xmm1, xmm11 - ; IMUL_R r3, r2 - imul r11, r10 - ; FADD_M f0, L1[r3] - mov eax, r11d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm0, xmm12 - ; ISMULH_R r5, r2 - mov rax, r13 - imul r10 - mov r13, rdx - ; IMULH_R r6, r2 - mov rax, r14 - mul r10 - mov r14, rdx - ; FADD_M f3, L1[r3] - mov eax, r11d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 - ; IMUL_R r6, r7 - imul r14, r15 - ; FSUB_R f0, a0 - subpd xmm0, xmm8 - ; FNEG_R f2 - xorps xmm2, xmm15 - ; ISUB_R r6, r4 - sub r14, r12 - ; FADD_R f1, a1 - addpd xmm1, xmm9 - ; IXOR_R r0, r5 - xor r8, r13 - ; FADD_R f2, a1 - addpd xmm2, xmm9 - ; ISWAP_R r7, r5 - xchg r15, r13 - ; FMUL_R e3, a2 - mulpd xmm7, xmm10 - ; IADD_RC r3, r6, 2977336568 - lea r11, [r11+r14-1317630728] - ; IMUL_R r2, r3 - imul r10, r11 - ; IADD_RC r1, r4, 894105694 - lea r9, [r9+r12+894105694] - ; IMUL_9C r7, 504293473 - lea r15, [r15+r15*8+504293473] - ; FSUB_R f1, a0 - subpd xmm1, xmm8 - ; IMUL_R r7, r1 - imul r15, r9 - ; IXOR_R r2, r4 - xor r10, r12 - ; IADD_RC r0, r1, 392362094 - lea r8, [r8+r9+392362094] - ; IDIV_C r4, 1645771433 - mov rax, 376097195048767223 - mul r12 - shr rdx, 25 - add r12, rdx - ; ISUB_R r4, r3 - sub r12, r11 - ; ISUB_M r7, L1[r4] - mov eax, 
r12d - and eax, 16376 - sub r15, qword ptr [rsi+rax] - ; IMUL_M r5, L1[r7] - mov eax, r15d - and eax, 16376 - imul r13, qword ptr [rsi+rax] - ; IROR_R r1, r7 - mov ecx, r15d - ror r9, cl - ; INEG_R r4 - neg r12 - ; IMUL_R r3, 1863959234 - imul r11, 1863959234 - ; IROR_R r4, 59 - ror r12, 59 - ; IMUL_M r1, L3[363256] - imul r9, qword ptr [rsi+363256] - ; ISTORE L2[r6], r7 - mov eax, r14d - and eax, 262136 - mov qword ptr [rsi+rax], r15 - ; ISTORE L1[r1], r5 - mov eax, r9d - and eax, 16376 - mov qword ptr [rsi+rax], r13 - ; FNEG_R f0 - xorps xmm0, xmm15 - ; FSQRT_R e2 - sqrtpd xmm6, xmm6 - ; FMUL_R e0, a3 - mulpd xmm4, xmm11 - ; FMUL_R e3, a2 - mulpd xmm7, xmm10 - ; IROR_R r5, r2 - mov ecx, r10d - ror r13, cl - ; IADD_R r0, r4 - add r8, r12 + ; IADD_R r2, r1 + add r10, r9 From b8ce504be68771c900920d6fbbb89c458f892b62 Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 9 Feb 2019 19:32:53 +0100 Subject: [PATCH 35/35] Added comments to hashAes1Rx4 and fillAes1Rx4 Fixed gcc compilation Added performance numbers --- README.md | 23 ++- makefile | 28 ++- src/AddressTransform.cpp | 292 ------------------------------ src/AssemblyGeneratorX86.cpp | 2 + src/Instruction.cpp | 2 + src/InterpretedVirtualMachine.cpp | 3 +- src/JitCompilerX86.cpp | 2 + src/JitCompilerX86.hpp | 2 +- src/VirtualMachine.cpp | 8 +- src/common.hpp | 1 + src/hashAes1Rx4.cpp | 22 +++ src/main.cpp | 12 +- 12 files changed, 72 insertions(+), 325 deletions(-) delete mode 100644 src/AddressTransform.cpp diff --git a/README.md b/README.md index 2c9b876..fed319c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,3 @@ - - - # RandomX RandomX is a proof-of-work (PoW) algorithm that is optimized for general-purpose CPUs. 
RandomX uses random code execution (hence the name) together with several memory-hard techniques to achieve the following goals: @@ -26,7 +23,7 @@ The structure of the VM mimics the components that are found in a typical genera The VM executes programs in a special instruction set, which was designed in such way that any random 8-byte word is a valid instruction and any sequence of valid instructions is a valid program. For more details see [RandomX ISA documentation](doc/isa.md). Because there are no "syntax" rules, generating a random program is as easy as filling the program buffer with random data. A RandomX program consists of 256 instructions. See [program.inc](src/program.inc) as an example of a RandomX program translated into x86-64 assembly. -#### Hash calculation +### Hash calculation Calculating a RandomX hash consists of initializing the 2 MiB scratchpad with random data, executing 8 RandomX loops and calculating a hash of the scratchpad. @@ -40,15 +37,27 @@ Hash of the register state after 2048 interations is used to initialize the rand The loads from the dataset are fully prefetched, so they don't slow down the loop. -RandomX uses the [Blake2b](https://en.wikipedia.org/wiki/BLAKE_%28hash_function%29#BLAKE2) cryptographic hash function. Special hashing functions based on [AES](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) encryption are used to initialize and hash the scratchpad. +RandomX uses the [Blake2b](https://en.wikipedia.org/wiki/BLAKE_%28hash_function%29#BLAKE2) cryptographic hash function. Special hashing functions `fillAes1Rx4` and `hashAes1Rx4` based on [AES](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) encryption are used to initialize and hash the scratchpad ([hashAes1Rx4.cpp](src/hashAes1Rx4.cpp)). -#### Hash verification +### Hash verification RandomX is a symmetric PoW algorithm, so the verifying party has to repeat the same steps as when a hash is calculated. 
However, to allow hash verification on devices that cannot store the whole 4 GiB dataset, RandomX allows a time-memory tradeoff by using just 256 MiB of memory at the cost of 16 times more random memory accesses. See [Dataset initialization](doc/dataset.md) for more details. -#### Documentation +### Performance +Preliminary mining performance with the x86-64 JIT compiled VM: + +|CPU|RAM|threads|hashrate [H/s]|comment| +|-----|-----|----|----------|-----| +|AMD Ryzen 1700|DDR4-2933|8|4100| +|Intel i5-3230M|DDR3-1333|1|280|without large pages +|Intel i7-8550U|DDR4-2400|4|1200|limited by thermals +|Intel i5-2500K|DDR3-1333|3|1350| + +Hash verification is performed using the portable interpreter in "light-client mode" and takes 30-70 ms depending on RAM latency and CPU clock speed. Hash verification in "mining mode" takes 2-4 ms. + +### Documentation * [RandomX ISA](doc/isa.md) * [RandomX instruction listing](doc/isa-ops.md) * [Dataset initialization](doc/dataset.md) diff --git a/makefile b/makefile index 87fef86..77788dc 100644 --- a/makefile +++ b/makefile @@ -11,12 +11,12 @@ SRCDIR=src OBJDIR=obj LDFLAGS=-lpthread TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) -ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o AddressTransform.o hashAes1Rx4.o) +ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o hashAes1Rx4.o) ifeq ($(PLATFORM),x86_64) ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o 
endif -all: release test +all: release release: CXXFLAGS += -march=native -O3 -flto release: CCFLAGS += -march=native -O3 -flto @@ -41,11 +41,8 @@ $(BINDIR)/randomx: $(ROBJS) | $(BINDIR) $(BINDIR)/AluFpuTest: $(TOBJS) | $(BINDIR) $(CXX) $(TOBJS) $(LDFLAGS) -o $@ -$(OBJDIR)/TestAluFpu.o: $(addprefix $(SRCDIR)/,TestAluFpu.cpp instructions.hpp Pcg32.hpp) | $(OBJDIR) +$(OBJDIR)/TestAluFpu.o: $(addprefix $(SRCDIR)/,TestAluFpu.cpp instructions.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/TestAluFpu.cpp -o $@ - -$(OBJDIR)/AddressTransform.o: $(addprefix $(SRCDIR)/,AddressTransform.cpp InterpretedVirtualMachine.hpp common.hpp) | $(OBJDIR) - $(CXX) $(CXXFLAGS) -c $(SRCDIR)/AddressTransform.cpp -o $@ $(OBJDIR)/argon2_core.o: $(addprefix $(SRCDIR)/,argon2_core.c argon2_core.h blake2/blake2.h blake2/blake2-impl.h) | $(OBJDIR) $(CC) $(CCFLAGS) -c $(SRCDIR)/argon2_core.c -o $@ @@ -53,16 +50,16 @@ $(OBJDIR)/argon2_core.o: $(addprefix $(SRCDIR)/,argon2_core.c argon2_core.h blak $(OBJDIR)/argon2_ref.o: $(addprefix $(SRCDIR)/,argon2_ref.c argon2.h argon2_core.h blake2/blake2.h blake2/blake2-impl.h blake2/blamka-round-ref.h) | $(OBJDIR) $(CC) $(CCFLAGS) -c $(SRCDIR)/argon2_ref.c -o $@ -$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp Pcg32.hpp common.hpp instructions.hpp instructionWeights.hpp) | $(OBJDIR) +$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp common.hpp instructionWeights.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/AssemblyGeneratorX86.cpp -o $@ $(OBJDIR)/blake2b.o: $(addprefix $(SRCDIR)/blake2/,blake2b.c blake2.h blake2-impl.h) | $(OBJDIR) $(CC) $(CCFLAGS) -c $(SRCDIR)/blake2/blake2b.c -o $@ -$(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachine.cpp CompiledVirtualMachine.hpp Pcg32.hpp common.hpp instructions.hpp) | $(OBJDIR) +$(OBJDIR)/CompiledVirtualMachine.o: $(addprefix 
$(SRCDIR)/,CompiledVirtualMachine.cpp CompiledVirtualMachine.hpp common.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/CompiledVirtualMachine.cpp -o $@ -$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp) | $(OBJDIR) +$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/dataset.cpp -o $@ $(OBJDIR)/divideByConstantCodegen.o: $(addprefix $(SRCDIR)/,divideByConstantCodegen.c divideByConstantCodegen.h) | $(OBJDIR) @@ -74,19 +71,19 @@ $(OBJDIR)/hashAes1Rx4.o: $(addprefix $(SRCDIR)/,hashAes1Rx4.cpp softAes.h) | $(O $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@ -$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read.inc)) | $(OBJDIR) +$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_dataset.inc loop_load.inc loop_store.inc xmm_constants.inc)) | $(OBJDIR) $(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@ $(OBJDIR)/squareHash.o: $(addprefix $(SRCDIR)/,squareHash.S $(addprefix asm/, squareHash.inc)) | $(OBJDIR) $(CXX) -x assembler-with-cpp -c $(SRCDIR)/squareHash.S -o $@ -$(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR) +$(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp intrinPortable.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/instructionsPortable.cpp -o $@ $(OBJDIR)/Instruction.o: $(addprefix $(SRCDIR)/,Instruction.cpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/Instruction.cpp -o $@ 
-$(OBJDIR)/InterpretedVirtualMachine.o: $(addprefix $(SRCDIR)/,InterpretedVirtualMachine.cpp InterpretedVirtualMachine.hpp Pcg32.hpp instructions.hpp instructionWeights.hpp) | $(OBJDIR) +$(OBJDIR)/InterpretedVirtualMachine.o: $(addprefix $(SRCDIR)/,InterpretedVirtualMachine.cpp InterpretedVirtualMachine.hpp instructionWeights.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/InterpretedVirtualMachine.cpp -o $@ $(OBJDIR)/LightClientAsyncWorker.o: $(addprefix $(SRCDIR)/,LightClientAsyncWorker.cpp LightClientAsyncWorker.hpp common.hpp) | $(OBJDIR) @@ -95,10 +92,10 @@ $(OBJDIR)/LightClientAsyncWorker.o: $(addprefix $(SRCDIR)/,LightClientAsyncWorke $(OBJDIR)/main.o: $(addprefix $(SRCDIR)/,main.cpp InterpretedVirtualMachine.hpp Stopwatch.hpp blake2/blake2.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/main.cpp -o $@ -$(OBJDIR)/Program.o: $(addprefix $(SRCDIR)/,Program.cpp Program.hpp Pcg32.hpp) | $(OBJDIR) +$(OBJDIR)/Program.o: $(addprefix $(SRCDIR)/,Program.cpp Program.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/Program.cpp -o $@ -$(OBJDIR)/Cache.o: $(addprefix $(SRCDIR)/,Cache.cpp Cache.hpp Pcg32.hpp argon2_core.h) | $(OBJDIR) +$(OBJDIR)/Cache.o: $(addprefix $(SRCDIR)/,Cache.cpp Cache.hpp argon2_core.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/Cache.cpp -o $@ $(OBJDIR)/softAes.o: $(addprefix $(SRCDIR)/,softAes.cpp softAes.h) | $(OBJDIR) @@ -109,9 +106,6 @@ $(OBJDIR)/VirtualMachine.o: $(addprefix $(SRCDIR)/,VirtualMachine.cpp VirtualMac $(OBJDIR)/virtualMemory.o: $(addprefix $(SRCDIR)/,virtualMemory.cpp virtualMemory.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/virtualMemory.cpp -o $@ - -$(OBJDIR)/t1ha2.o: $(addprefix $(SRCDIR)/t1ha/,t1ha2.c t1ha.h t1ha_bits.h) | $(OBJDIR) - $(CC) $(CCFLAGS) -c $(SRCDIR)/t1ha/t1ha2.c -o $@ $(OBJDIR): mkdir $(OBJDIR) diff --git a/src/AddressTransform.cpp b/src/AddressTransform.cpp deleted file mode 100644 index b8070a0..0000000 --- a/src/AddressTransform.cpp +++ /dev/null @@ -1,292 +0,0 @@ -/* -Copyright (c) 2019 tevador - 
-This file is part of RandomX. - -RandomX is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -RandomX is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with RandomX. If not, see. -*/ - -#include "common.hpp" -#include "InterpretedVirtualMachine.hpp" - -#include -#include -#include - -namespace RandomX { - - class Mul9Transform : public ITransform { - public: - Mul9Transform(int32_t cc) : c(cc) { - std::ostringstream oss; - oss << "mul9_" << std::hex << (cc & 255); - name = oss.str(); - } - int32_t apply(int32_t x) const override { - return 9 * x + c; - } - const char* getName() const override { - return name.c_str(); - } - std::ostream& printAsm(std::ostream& os) const override { - os << "lea ecx, [rcx+rcx*8" << std::showpos << c << "]" << std::noshowpos << std::endl; - return os; - } - std::ostream& printCxx(std::ostream& os) const override { - os << "static const Mul9Transform " << name << "(" << c << ");" << std::endl; - return os; - } - private: - int32_t c; - std::string name; - }; - - class AddTransform : public ITransform { - public: - AddTransform(int32_t cc) : c(cc) { - std::ostringstream oss; - oss << "add_" << std::hex << (cc & 255); - name = oss.str(); - } - int32_t apply(int32_t x) const override { - return x + c; - } - const char* getName() const override { - return name.c_str(); - } - std::ostream& printAsm(std::ostream& os) const override { - os << "db 64" << std::endl; - os << "add ecx, " << c << std::endl; - return os; - } - std::ostream& printCxx(std::ostream& os) const override { - os << "static const AddTransform " << name 
<< "(" << c << ");" << std::endl; - return os; - } - private: - int32_t c; - std::string name; - }; - - class XorTransform : public ITransform { - public: - XorTransform(int32_t cc) : c(cc) { - std::ostringstream oss; - oss << "xor_" << std::hex << (cc & 255); - name = oss.str(); - } - int32_t apply(int32_t x) const override { - return x ^ c; - } - const char* getName() const override { - return name.c_str(); - } - std::ostream& printAsm(std::ostream& os) const override { - os << "db 64" << std::endl; - os << "xor ecx, " << c << std::endl; - return os; - } - std::ostream& printCxx(std::ostream& os) const override { - os << "static const XorTransform " << name << "(" << c << ");" << std::endl; - return os; - } - private: - int32_t c; - std::string name; - }; - - static const Mul9Transform mul9_6d(109); - static const XorTransform xor_60(96); - static const Mul9Transform mul9_ed(-19); - static const AddTransform add_9e(-98); - static const AddTransform add_eb(-21); - static const XorTransform xor_b0(-80); - static const Mul9Transform mul9_a4(-92); - static const AddTransform add_71(113); - static const Mul9Transform mul9_64(100); - static const AddTransform add_d9(-39); - static const XorTransform xor_78(120); - static const Mul9Transform mul9_89(-119); - static const AddTransform add_8f(-113); - static const AddTransform add_6f(111); - static const XorTransform xor_68(104); - static const Mul9Transform mul9_ad(-83); - static const Mul9Transform mul9_7f(127); - static const XorTransform xor_90(-112); - static const AddTransform add_59(89); - static const AddTransform add_e0(-32); - static const AddTransform add_68(104); - static const XorTransform xor_88(-120); - static const XorTransform xor_18(24); - static const Mul9Transform mul9_9(9); - static const AddTransform add_e1(-31); - static const XorTransform xor_f0(-16); - static const AddTransform add_44(68); - static const Mul9Transform mul9_92(-110); - static const XorTransform xor_40(64); - static const 
XorTransform xor_d8(-40); - static const XorTransform xor_f8(-8); - static const AddTransform add_f6(-10); - static const XorTransform xor_e0(-32); - static const AddTransform add_e(14); - static const Mul9Transform mul9_d2(-46); - static const XorTransform xor_98(-104); - static const Mul9Transform mul9_24(36); - static const AddTransform add_64(100); - static const Mul9Transform mul9_bf(-65); - static const Mul9Transform mul9_1b(27); - static const Mul9Transform mul9_5b(91); - static const AddTransform add_9b(-101); - static const AddTransform add_a2(-94); - static const Mul9Transform mul9_f6(-10); - static const XorTransform xor_50(80); - static const AddTransform add_94(-108); - static const AddTransform add_c6(-58); - static const XorTransform xor_30(48); - static const Mul9Transform mul9_49(73); - static const XorTransform xor_d0(-48); - static const XorTransform xor_20(32); - static const XorTransform xor_a0(-96); - static const AddTransform add_76(118); - static const AddTransform add_5b(91); - static const Mul9Transform mul9_12(18); - static const AddTransform add_f5(-11); - static const Mul9Transform mul9_3f(63); - static const AddTransform add_72(114); - static const Mul9Transform mul9_2d(45); - static const AddTransform add_bd(-67); - static const AddTransform add_35(53); - static const Mul9Transform mul9_9b(-101); - static const Mul9Transform mul9_ff(-1); - static const XorTransform xor_10(16); - static const Mul9Transform mul9_db(-37); - static const Mul9Transform mul9_e4(-28); - static const Mul9Transform mul9_c9(-55); - static const XorTransform xor_a8(-88); - static const XorTransform xor_b8(-72); - static const AddTransform add_24(36); - static const XorTransform xor_c8(-56); - static const AddTransform add_74(116); - static const XorTransform xor_58(88); - static const XorTransform xor_80(-128); - static const AddTransform add_32(50); - static const AddTransform add_69(105); - static const AddTransform add_db(-37); - static const XorTransform 
xor_70(112); - static const XorTransform xor_8(8); - static const XorTransform xor_e8(-24); - static const Mul9Transform mul9_76(118); - static const XorTransform xor_48(72); - static const XorTransform xor_c0(-64); - static const AddTransform add_28(40); - static const Mul9Transform mul9_b6(-74); - static const Mul9Transform mul9_52(82); - static const Mul9Transform mul9_36(54); - static const XorTransform xor_38(56); - static const XorTransform xor_28(40); - static const AddTransform add_57(87); - - const ITransform* InterpretedVirtualMachine::addressTransformations[TransformationCount] = { - (ITransform*)&mul9_6d, - (ITransform*)&xor_60, - (ITransform*)&mul9_ed, - (ITransform*)&add_9e, - (ITransform*)&add_eb, - (ITransform*)&xor_b0, - (ITransform*)&mul9_a4, - (ITransform*)&add_71, - (ITransform*)&mul9_64, - (ITransform*)&add_d9, - (ITransform*)&xor_78, - (ITransform*)&mul9_89, - (ITransform*)&add_8f, - (ITransform*)&add_6f, - (ITransform*)&xor_68, - (ITransform*)&mul9_ad, - (ITransform*)&mul9_7f, - (ITransform*)&xor_90, - (ITransform*)&add_59, - (ITransform*)&add_e0, - (ITransform*)&add_68, - (ITransform*)&xor_88, - (ITransform*)&xor_18, - (ITransform*)&mul9_9, - (ITransform*)&add_e1, - (ITransform*)&xor_f0, - (ITransform*)&add_44, - (ITransform*)&mul9_92, - (ITransform*)&xor_40, - (ITransform*)&xor_d8, - (ITransform*)&xor_f8, - (ITransform*)&add_f6, - (ITransform*)&xor_e0, - (ITransform*)&add_e, - (ITransform*)&mul9_d2, - (ITransform*)&xor_98, - (ITransform*)&mul9_24, - (ITransform*)&add_64, - (ITransform*)&mul9_bf, - (ITransform*)&mul9_1b, - (ITransform*)&mul9_5b, - (ITransform*)&add_9b, - (ITransform*)&add_a2, - (ITransform*)&mul9_f6, - (ITransform*)&xor_50, - (ITransform*)&add_94, - (ITransform*)&add_c6, - (ITransform*)&xor_30, - (ITransform*)&mul9_49, - (ITransform*)&xor_d0, - (ITransform*)&xor_20, - (ITransform*)&xor_a0, - (ITransform*)&add_76, - (ITransform*)&add_5b, - (ITransform*)&mul9_12, - (ITransform*)&add_f5, - (ITransform*)&mul9_3f, - 
(ITransform*)&add_72, - (ITransform*)&mul9_2d, - (ITransform*)&add_bd, - (ITransform*)&add_35, - (ITransform*)&mul9_9b, - (ITransform*)&mul9_ff, - (ITransform*)&xor_10, - (ITransform*)&mul9_db, - (ITransform*)&mul9_e4, - (ITransform*)&mul9_c9, - (ITransform*)&xor_a8, - (ITransform*)&xor_b8, - (ITransform*)&add_24, - (ITransform*)&xor_c8, - (ITransform*)&add_74, - (ITransform*)&xor_58, - (ITransform*)&xor_80, - (ITransform*)&add_32, - (ITransform*)&add_69, - (ITransform*)&add_db, - (ITransform*)&xor_70, - (ITransform*)&xor_8, - (ITransform*)&xor_e8, - (ITransform*)&mul9_76, - (ITransform*)&xor_48, - (ITransform*)&xor_c0, - (ITransform*)&add_28, - (ITransform*)&mul9_b6, - (ITransform*)&mul9_52, - (ITransform*)&mul9_36, - (ITransform*)&xor_38, - (ITransform*)&xor_28, - (ITransform*)&add_57, - }; -} \ No newline at end of file diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 9f03da1..bb50718 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -450,6 +450,8 @@ namespace RandomX { return "l"; case 7: return "ge"; + default: + UNREACHABLE; } } diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 2fefcf3..bdcaf39 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -296,6 +296,8 @@ namespace RandomX { return "lt"; case 7: return "ge"; + default: + UNREACHABLE; } } diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 0757f43..c5a6d53 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -27,6 +27,7 @@ along with RandomX. If not, see. 
#include #include #include +#include #include #include "intrinPortable.h" #ifdef STATS @@ -262,7 +263,7 @@ namespace RandomX { uint32_t spAddr0 = mem.mx; uint32_t spAddr1 = mem.ma; - for(int iter = 0; iter < InstructionCount; ++iter) { + for(unsigned iter = 0; iter < InstructionCount; ++iter) { //std::cout << "Iteration " << iter << std::endl; spAddr0 ^= r[readReg0]; spAddr0 &= ScratchpadL3Mask64; diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index b77da17..0c2fac0 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -705,6 +705,8 @@ namespace RandomX { return 0x9c; //setl case 7: return 0x9d; //setge + default: + UNREACHABLE; } } diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index e790cfe..fedcf20 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -83,7 +83,7 @@ namespace RandomX { template void emit(const uint8_t (&src)[N]) { - for (int i = 0; i < N; ++i) { + for (unsigned i = 0; i < N; ++i) { code[codePos + i] = src[i]; } codePos += N; diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp index 2adf4e4..057026c 100644 --- a/src/VirtualMachine.cpp +++ b/src/VirtualMachine.cpp @@ -28,9 +28,15 @@ along with RandomX. If not, see. 
std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) { for (int i = 0; i < RandomX::RegistersCount; ++i) os << std::hex << "r" << i << " = " << rf.r[i] << std::endl << std::dec; - for (int i = 0; i < RandomX::RegistersCount; ++i) + for (int i = 0; i < 4; ++i) os << std::hex << "f" << i << " = " << *(uint64_t*)&rf.f[i].hi << " (" << rf.f[i].hi << ")" << std::endl << " = " << *(uint64_t*)&rf.f[i].lo << " (" << rf.f[i].lo << ")" << std::endl << std::dec; + for (int i = 0; i < 4; ++i) + os << std::hex << "e" << i << " = " << *(uint64_t*)&rf.e[i].hi << " (" << rf.e[i].hi << ")" << std::endl + << " = " << *(uint64_t*)&rf.e[i].lo << " (" << rf.e[i].lo << ")" << std::endl << std::dec; + for (int i = 0; i < 4; ++i) + os << std::hex << "a" << i << " = " << *(uint64_t*)&rf.a[i].hi << " (" << rf.a[i].hi << ")" << std::endl + << " = " << *(uint64_t*)&rf.a[i].lo << " (" << rf.a[i].lo << ")" << std::endl << std::dec; return os; } diff --git a/src/common.hpp b/src/common.hpp index 8c16825..1d7f597 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -93,6 +93,7 @@ namespace RandomX { class ILightClientAsyncWorker { public: + virtual ~ILightClientAsyncWorker() {} virtual void prepareBlock(addr_t) = 0; virtual void prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) = 0; virtual const uint64_t* getBlock(addr_t) = 0; diff --git a/src/hashAes1Rx4.cpp b/src/hashAes1Rx4.cpp index 623d4b6..db1c6a2 100644 --- a/src/hashAes1Rx4.cpp +++ b/src/hashAes1Rx4.cpp @@ -19,6 +19,18 @@ along with RandomX. If not, see. #include "softAes.h" +/* + Calculate a 512-bit hash of 'input' using 4 lanes of AES. + The input is treated as a set of round keys for the encryption + of the initial state. + + 'inputSize' must be a multiple of 64. + + For a 2 MiB input, this has the same security as 32768-round + AES encryption. 
+ + Hashing throughput: >20 GiB/s per CPU core with hardware AES +*/ template void hashAes1Rx4(const void *input, size_t inputSize, void *hash) { const uint8_t* inptr = (uint8_t*)input; @@ -72,6 +84,16 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) { template void hashAes1Rx4(const void *input, size_t inputSize, void *hash); template void hashAes1Rx4(const void *input, size_t inputSize, void *hash); +/* + Fill 'buffer' with pseudorandom data based on 512-bit 'state'. + The state is encrypted using a single AES round per 16 bytes of output + in 4 lanes. + + 'outputSize' must be a multiple of 64. + + The modified state is written back to 'state' to allow multiple + calls to this function. +*/ template void fillAes1Rx4(void *state, size_t outputSize, void *buffer) { const uint8_t* outptr = (uint8_t*)buffer; diff --git a/src/main.cpp b/src/main.cpp index 0a10d8f..1229feb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -254,7 +254,7 @@ int main(int argc, char** argv) { } if (RandomX::trace) { std::cout << "Keys: " << std::endl; - for (int i = 0; i < dataset.cache->getKeys().size(); ++i) { + for (unsigned i = 0; i < dataset.cache->getKeys().size(); ++i) { outputHex(std::cout, (char*)&dataset.cache->getKeys()[i], sizeof(__m128i)); } std::cout << std::endl; @@ -280,7 +280,7 @@ int main(int argc, char** argv) { threads.push_back(std::thread(&RandomX::datasetInit, cache, dataset, i * perThread, count)); } } - for (int i = 0; i < threads.size(); ++i) { + for (unsigned i = 0; i < threads.size(); ++i) { threads[i].join(); } } @@ -318,10 +318,10 @@ int main(int argc, char** argv) { std::cout << "Running benchmark (" << programCount << " programs) ..." 
<< std::endl; sw.restart(); if (threadCount > 1) { - for (int i = 0; i < vms.size(); ++i) { + for (unsigned i = 0; i < vms.size(); ++i) { threads.push_back(std::thread(&mine, vms[i], std::ref(atomicNonce), std::ref(result), programCount, i, scratchpadMem + RandomX::ScratchpadSize * i)); } - for (int i = 0; i < threads.size(); ++i) { + for (unsigned i = 0; i < threads.size(); ++i) { threads[i].join(); } } @@ -336,10 +336,10 @@ int main(int argc, char** argv) { if(programCount == 1000) std::cout << "Reference result: 3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl; if (lightClient) { - std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per program" << std::endl; + std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per hash" << std::endl; } else { - std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl; + std::cout << "Performance: " << programCount / elapsed << " hashes per second" << std::endl; } } catch (std::exception& e) {