From f3b114af88bde166cf001668cf77f7764b986528 Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 22 Feb 2019 17:48:26 +0100 Subject: [PATCH] Replaced division instructions with IMUL_RCP --- doc/isa-ops.md | 9 +- makefile | 10 +- src/AssemblyGeneratorX86.cpp | 96 +- src/AssemblyGeneratorX86.hpp | 2 +- src/Instruction.cpp | 6 +- src/Instruction.hpp | 6 +- src/InterpretedVirtualMachine.cpp | 39 +- src/JitCompilerX86.cpp | 105 +-- src/JitCompilerX86.hpp | 2 +- src/instructionWeights.hpp | 6 +- src/main.cpp | 2 +- src/program.inc | 1378 ++++++++++++++--------------- src/reciprocal.c | 60 ++ src/reciprocal.h | 31 + 14 files changed, 814 insertions(+), 938 deletions(-) create mode 100644 src/reciprocal.c create mode 100644 src/reciprocal.h diff --git a/doc/isa-ops.md b/doc/isa-ops.md index 79ac307..d403bda 100644 --- a/doc/isa-ops.md +++ b/doc/isa-ops.md @@ -19,8 +19,7 @@ Memory operands are loaded as 8-byte values from the address indicated by `src`. |1/256|IMULH_M|R|mem|`src = imm32`|`dst = (dst * [src]) >> 64`| |4/256|ISMULH_R|R|R|`src = dst`|`dst = (dst * src) >> 64` (signed)| |1/256|ISMULH_M|R|mem|`src = imm32`|`dst = (dst * [src]) >> 64` (signed)| -|4/256|IDIV_C|R|-|-|`dst = dst + dst / imm32`| -|4/256|ISDIV_C|R|-|-|`dst = dst + dst / imm32` (signed)| +|8/256|IMUL_RCP|R|-|-|<code>dst = 2<sup>x</sup> / imm32 * dst</code>| |2/256|INEG_R|R|-|-|`dst = -dst`| |16/256|IXOR_R|R|R|`src = imm32`|`dst = dst ^ src`| |4/256|IXOR_M|R|mem|`src = imm32`|`dst = dst ^ [src]`| @@ -30,8 +29,8 @@ Memory operands are loaded as 8-byte values from the address indicated by `src`. #### IMULH and ISMULH These instructions output the high 64 bits of the whole 128-bit multiplication result. The result differs for signed and unsigned multiplication (`IMULH` is unsigned, `ISMULH` is signed). The variants with a register source operand do not use `imm32` (they perform a squaring operation if `dst` equals `src`). 
-#### IDIV_C and ISDIV_C -The division instructions use a constant divisor, so they can be optimized into a [multiplication by fixed-point reciprocal](https://en.wikipedia.org/wiki/Division_algorithm#Division_by_a_constant). `IDIV_C` performs unsigned division (`imm32` is zero-extended to 64 bits), while `ISDIV_C` performs signed division. In the case of division by zero, the instructions become a no-op. In the very rare case of signed overflow, the destination register is set to zero. +#### IMUL_RCP +This instruction multiplies the destination register by a reciprocal of `imm32`. The reciprocal is calculated as <code>rcp = 2<sup>x</sup> / imm32</code> by choosing the largest integer `x` such that <code>rcp &lt; 2<sup>64</sup></code>. If `imm32` equals 0, this instruction is a no-op. #### ISWAP_R This instruction swaps the values of two registers. If source and destination refer to the same register, the result is a no-op. @@ -54,7 +53,7 @@ Memory operands are loaded as 8-byte values from the address indicated by `src`. |6/256|FSQRT_R|E|-|`(dst0, dst1) = (√dst0, √dst1)`| #### FSCAL_R -This instruction negates the number and multiplies it by <code>2<sup>x</sup></code>. `x` is calculated by taking the 5 least significant digits of the biased exponent and interpreting them as a binary number using the digit set `{-1, +1}` as opposed to the traditional `{0, 1}`. The possible values of `x` are all odd numbers from -31 to +31. +This instruction negates the number and multiplies it by <code>2<sup>x</sup></code>. `x` is calculated by taking the 5 least significant digits of the biased exponent and interpreting them as a binary number using the digit set `{+1, -1}` as opposed to the traditional `{0, 1}`. The possible values of `x` are all odd numbers from -31 to +31. The mathematical operation described above is equivalent to a bitwise XOR of the binary representation with the value of `0x81F0000000000000`. 
diff --git a/makefile b/makefile index 7ad5231..159eb2a 100644 --- a/makefile +++ b/makefile @@ -9,7 +9,7 @@ OBJDIR=obj LDFLAGS=-lpthread CPPSRC=src/argon2_core.c src/Cache.cpp src/divideByConstantCodegen.c src/Instruction.cpp src/JitCompilerX86.cpp src/Program.cpp src/VirtualMachine.cpp src/argon2_ref.c src/CompiledVirtualMachine.cpp src/executeProgram-linux.cpp src/instructionsPortable.cpp src/LightClientAsyncWorker.cpp src/softAes.cpp src/virtualMemory.cpp src/AssemblyGeneratorX86.cpp src/dataset.cpp src/hashAes1Rx4.cpp src/InterpretedVirtualMachine.cpp src/main.cpp src/TestAluFpu.cpp src/blake2/blake2b.c TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) -ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o hashAes1Rx4.o) +ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o Cache.o virtualMemory.o reciprocal.o LightClientAsyncWorker.o hashAes1Rx4.o) ifeq ($(PLATFORM),amd64) ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o CXXFLAGS += -maes @@ -53,7 +53,7 @@ $(OBJDIR)/argon2_core.o: $(addprefix $(SRCDIR)/,argon2_core.c argon2_core.h blak $(OBJDIR)/argon2_ref.o: $(addprefix $(SRCDIR)/,argon2_ref.c argon2.h argon2_core.h blake2/blake2.h blake2/blake2-impl.h blake2/blamka-round-ref.h blake2/endian.h) | $(OBJDIR) $(CC) $(CCFLAGS) -c $(SRCDIR)/argon2_ref.c -o $@ -$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp common.hpp instructionWeights.hpp blake2/endian.h divideByConstantCodegen.h Program.hpp) | 
$(OBJDIR) +$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp common.hpp instructionWeights.hpp blake2/endian.h reciprocal.h Program.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/AssemblyGeneratorX86.cpp -o $@ $(OBJDIR)/blake2b.o: $(addprefix $(SRCDIR)/blake2/,blake2b.c blake2.h blake2-impl.h endian.h) | $(OBJDIR) @@ -65,13 +65,13 @@ $(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachin $(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp blake2/endian.h dataset.hpp intrinPortable.h Cache.hpp virtualMemory.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/dataset.cpp -o $@ -$(OBJDIR)/divideByConstantCodegen.o: $(addprefix $(SRCDIR)/,divideByConstantCodegen.c divideByConstantCodegen.h) | $(OBJDIR) - $(CC) $(CCFLAGS) -c $(SRCDIR)/divideByConstantCodegen.c -o $@ +$(OBJDIR)/reciprocal.o: $(addprefix $(SRCDIR)/,reciprocal.c reciprocal.h) | $(OBJDIR) + $(CC) $(CCFLAGS) -c $(SRCDIR)/reciprocal.c -o $@ $(OBJDIR)/hashAes1Rx4.o: $(addprefix $(SRCDIR)/,hashAes1Rx4.cpp softAes.h intrinPortable.h blake2/endian.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/hashAes1Rx4.cpp -o $@ -$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp common.hpp blake2/endian.h Program.hpp divideByConstantCodegen.h virtualMemory.hpp) | $(OBJDIR) +$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp common.hpp blake2/endian.h Program.hpp reciprocal.h virtualMemory.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@ $(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_dataset.inc loop_load.inc loop_store.inc xmm_constants.inc)) | $(OBJDIR) diff --git a/src/AssemblyGeneratorX86.cpp 
b/src/AssemblyGeneratorX86.cpp index c20138e..15a196b 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -17,12 +17,10 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. */ //#define TRACE -#define MAGIC_DIVISION + #include "AssemblyGeneratorX86.hpp" #include "common.hpp" -#ifdef MAGIC_DIVISION -#include "divideByConstantCodegen.h" -#endif +#include "reciprocal.h" #include "Program.hpp" namespace RandomX { @@ -276,38 +274,12 @@ namespace RandomX { traceint(instr); } - //~6 uOPs - void AssemblyGeneratorX86::h_IDIV_C(Instruction& instr, int i) { + //2 uOPs + void AssemblyGeneratorX86::h_IMUL_RCP(Instruction& instr, int i) { if (instr.imm32 != 0) { uint32_t divisor = instr.imm32; - if (divisor & (divisor - 1)) { - magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8); - if (mi.pre_shift == 0 && !mi.increment) { - asmCode << "\tmov rax, " << mi.multiplier << std::endl; - asmCode << "\tmul " << regR[instr.dst] << std::endl; - } - else { - asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; - if (mi.pre_shift > 0) - asmCode << "\tshr rax, " << mi.pre_shift << std::endl; - if (mi.increment) { - asmCode << "\tadd rax, 1" << std::endl; - asmCode << "\tsbb rax, 0" << std::endl; - } - asmCode << "\tmov rcx, " << mi.multiplier << std::endl; - asmCode << "\tmul rcx" << std::endl; - } - if (mi.post_shift > 0) - asmCode << "\tshr rdx, " << mi.post_shift << std::endl; - asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl; - } - else { //divisor is a power of two - int shift = 0; - while (divisor >>= 1) - ++shift; - if(shift > 0) - asmCode << "\tshr " << regR[instr.dst] << ", " << shift << std::endl; - } + asmCode << "\tmov rax, " << reciprocal(instr.imm32) << std::endl; + asmCode << "\timul " << regR[instr.dst] << ", rax" << std::endl; traceint(instr); } else { @@ -317,59 +289,7 @@ namespace RandomX { //~8.5 uOPs void AssemblyGeneratorX86::h_ISDIV_C(Instruction& 
instr, int i) { - int64_t divisor = (int32_t)instr.imm32; - if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { - asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; - // +/- power of two - bool negative = divisor < 0; - if (negative) - divisor = -divisor; - int shift = 0; - uint64_t unsignedDivisor = divisor; - while (unsignedDivisor >>= 1) - ++shift; - if (shift > 0) { - asmCode << "\tmov rcx, rax" << std::endl; - asmCode << "\tsar rcx, 63" << std::endl; - uint32_t mask = (1ULL << shift) + 0xFFFFFFFF; - asmCode << "\tand ecx, 0" << std::hex << mask << std::dec << "h" << std::endl; - asmCode << "\tadd rax, rcx" << std::endl; - asmCode << "\tsar rax, " << shift << std::endl; - } - if (negative) - asmCode << "\tneg rax" << std::endl; - asmCode << "\tadd " << regR[instr.dst] << ", rax" << std::endl; - traceint(instr); - } - else if (divisor != 0) { - magics_info mi = compute_signed_magic_info(divisor); - asmCode << "\tmov rax, " << mi.multiplier << std::endl; - asmCode << "\timul " << regR[instr.dst] << std::endl; - //asmCode << "\tmov rax, rdx" << std::endl; - asmCode << "\txor eax, eax" << std::endl; - bool haveSF = false; - if (divisor > 0 && mi.multiplier < 0) { - asmCode << "\tadd rdx, " << regR[instr.dst] << std::endl; - haveSF = true; - } - if (divisor < 0 && mi.multiplier > 0) { - asmCode << "\tsub rdx, " << regR[instr.dst] << std::endl; - haveSF = true; - } - if (mi.shift > 0) { - asmCode << "\tsar rdx, " << mi.shift << std::endl; - haveSF = true; - } - if (!haveSF) - asmCode << "\ttest rdx, rdx" << std::endl; - asmCode << "\tsets al" << std::endl; - asmCode << "\tadd rdx, rax" << std::endl; - asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl; - traceint(instr); - } - else { - tracenop(instr); - } + tracenop(instr); } //2 uOPs @@ -570,7 +490,7 @@ namespace RandomX { INST_HANDLE(IMULH_M) INST_HANDLE(ISMULH_R) INST_HANDLE(ISMULH_M) - INST_HANDLE(IDIV_C) + INST_HANDLE(IMUL_RCP) INST_HANDLE(ISDIV_C) INST_HANDLE(INEG_R) 
INST_HANDLE(IXOR_R) diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 9968ebe..216e492 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -61,7 +61,7 @@ namespace RandomX { void h_IMULH_M(Instruction&, int); void h_ISMULH_R(Instruction&, int); void h_ISMULH_M(Instruction&, int); - void h_IDIV_C(Instruction&, int); + void h_IMUL_RCP(Instruction&, int); void h_ISDIV_C(Instruction&, int); void h_INEG_R(Instruction&, int); void h_IXOR_R(Instruction&, int); diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 4296c88..205aaaa 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -193,7 +193,7 @@ namespace RandomX { } } - void Instruction::h_IDIV_C(std::ostream& os) const { + void Instruction::h_IMUL_RCP(std::ostream& os) const { os << "r" << (int)dst << ", " << imm32 << std::endl; } @@ -345,7 +345,7 @@ namespace RandomX { INST_NAME(IMULH_M) INST_NAME(ISMULH_R) INST_NAME(ISMULH_M) - INST_NAME(IDIV_C) + INST_NAME(IMUL_RCP) INST_NAME(ISDIV_C) INST_NAME(INEG_R) INST_NAME(IXOR_R) @@ -396,7 +396,7 @@ namespace RandomX { INST_HANDLE(IMULH_M) INST_HANDLE(ISMULH_R) INST_HANDLE(ISMULH_M) - INST_HANDLE(IDIV_C) + INST_HANDLE(IMUL_RCP) INST_HANDLE(ISDIV_C) INST_HANDLE(INEG_R) INST_HANDLE(IXOR_R) diff --git a/src/Instruction.hpp b/src/Instruction.hpp index a38e3e6..543dfbf 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -41,8 +41,8 @@ namespace RandomX { constexpr int IMULH_M = 9; constexpr int ISMULH_R = 10; constexpr int ISMULH_M = 11; - constexpr int IDIV_C = 12; - constexpr int ISDIV_C = 13; + constexpr int IMUL_RCP = 12; + //constexpr int ISDIV_C = 13; constexpr int INEG_R = 14; constexpr int IXOR_R = 15; constexpr int IXOR_M = 16; @@ -103,7 +103,7 @@ namespace RandomX { void h_IMULH_M(std::ostream&) const; void h_ISMULH_R(std::ostream&) const; void h_ISMULH_M(std::ostream&) const; - void h_IDIV_C(std::ostream&) const; + void h_IMUL_RCP(std::ostream&) const; void h_ISDIV_C(std::ostream&) 
const; void h_INEG_R(std::ostream&) const; void h_IXOR_R(std::ostream&) const; diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 4872213..6a97d7d 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -30,6 +30,7 @@ along with RandomX. If not, see. #include #include #include "intrinPortable.h" +#include "reciprocal.h" #ifdef STATS #include #endif @@ -136,7 +137,7 @@ namespace RandomX { *ibc.idst += 8 * *ibc.idst + ibc.imm; } break; - case InstructionType::IMUL_R: { + case InstructionType::IMUL_R: { //also handles IMUL_RCP *ibc.idst *= *ibc.isrc; } break; @@ -160,24 +161,6 @@ namespace RandomX { *ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(load64(scratchpad + (*ibc.isrc & ibc.memMask)))); } break; - case InstructionType::IDIV_C: { - uint64_t dividend = *ibc.idst; - uint64_t quotient = dividend / ibc.imm; - *ibc.idst += quotient; - } break; - - case InstructionType::ISDIV_C: { - if (ibc.simm != -1) { - int64_t dividend = unsigned64ToSigned2sCompl(*ibc.idst); - int64_t quotient = dividend / ibc.simm; - *ibc.idst += quotient; - } - else { - uint64_t quotient = ~(*ibc.idst) + 1; - *ibc.idst += quotient; - } - } break; - case InstructionType::INEG_R: { *ibc.idst = ~(*ibc.idst) + 1; //two's complement negative } break; @@ -568,13 +551,14 @@ namespace RandomX { } } break; - CASE_REP(IDIV_C) { + CASE_REP(IMUL_RCP) { uint32_t divisor = instr.imm32; if (divisor != 0) { auto dst = instr.dst % RegistersCount; - ibc.type = InstructionType::IDIV_C; + ibc.type = InstructionType::IMUL_R; ibc.idst = &r[dst]; - ibc.imm = divisor; + ibc.imm = reciprocal(divisor); + ibc.isrc = &ibc.imm; } else { ibc.type = InstructionType::NOP; @@ -582,16 +566,7 @@ namespace RandomX { } break; CASE_REP(ISDIV_C) { - int32_t divisor = unsigned32ToSigned2sCompl(instr.imm32); - if (divisor != 0) { - auto dst = instr.dst % RegistersCount; - ibc.type = InstructionType::ISDIV_C; - ibc.idst = &r[dst]; 
- ibc.simm = divisor; - } - else { - ibc.type = InstructionType::NOP; - } + ibc.type = InstructionType::NOP; } break; CASE_REP(INEG_R) { diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 543632e..5293b05 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -21,7 +21,7 @@ along with RandomX. If not, see. #include #include "JitCompilerX86.hpp" #include "Program.hpp" -#include "divideByConstantCodegen.h" +#include "reciprocal.h" #include "virtualMemory.hpp" namespace RandomX { @@ -395,106 +395,17 @@ namespace RandomX { emitByte(0xc2 + 8 * instr.dst); } - void JitCompilerX86::h_IDIV_C(Instruction& instr) { + void JitCompilerX86::h_IMUL_RCP(Instruction& instr) { if (instr.imm32 != 0) { - uint32_t divisor = instr.imm32; - if (divisor & (divisor - 1)) { - magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8); - if (mi.pre_shift == 0 && !mi.increment) { - emit(MOV_RAX_I); - emit64(mi.multiplier); - emit(REX_MUL_R); - emitByte(0xe0 + instr.dst); - } - else { - emit(REX_MOV_RR64); - emitByte(0xc0 + instr.dst); - if (mi.pre_shift > 0) { - emit(REX_SHR_RAX); - emitByte(mi.pre_shift); - } - if (mi.increment) { - emit(RAX_ADD_SBB_1); - } - emit(MOV_RCX_I); - emit64(mi.multiplier); - emit(MUL_RCX); - } - if (mi.post_shift > 0) { - emit(REX_SHR_RDX); - emitByte(mi.post_shift); - } - emit(REX_ADD_RM); - emitByte(0xc2 + 8 * instr.dst); - } - else { //divisor is a power of two - int shift = 0; - while (divisor >>= 1) - ++shift; - if (shift > 0) { - emit(REX_SH); - emitByte(0xe8 + instr.dst); - } - } + emit(MOV_RAX_I); + emit64(reciprocal(instr.imm32)); + emit(REX_IMUL_RM); + emitByte(0xc0 + 8 * instr.dst); } } void JitCompilerX86::h_ISDIV_C(Instruction& instr) { - int64_t divisor = (int32_t)instr.imm32; - if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { - emit(REX_MOV_RR64); - emitByte(0xc0 + instr.dst); - // +/- power of two - bool negative = divisor < 0; - if (negative) - divisor = -divisor; - int shift 
= 0; - uint64_t unsignedDivisor = divisor; - while (unsignedDivisor >>= 1) - ++shift; - if (shift > 0) { - emit(MOV_RCX_RAX_SAR_RCX_63); - uint32_t mask = (1ULL << shift) - 1; - emit(AND_ECX_I); - emit32(mask); - emit(ADD_RAX_RCX); - emit(SAR_RAX_I8); - emitByte(shift); - } - if (negative) - emit(NEG_RAX); - emit(ADD_R_RAX); - emitByte(0xc0 + instr.dst); - } - else if (divisor != 0) { - magics_info mi = compute_signed_magic_info(divisor); - emit(MOV_RAX_I); - emit64(mi.multiplier); - emit(REX_MUL_R); - emitByte(0xe8 + instr.dst); - emit(XOR_EAX_EAX); - bool haveSF = false; - if (divisor > 0 && mi.multiplier < 0) { - emit(ADD_RDX_R); - emitByte(0xc2 + 8 * instr.dst); - haveSF = true; - } - if (divisor < 0 && mi.multiplier > 0) { - emit(SUB_RDX_R); - emitByte(0xc2 + 8 * instr.dst); - haveSF = true; - } - if (mi.shift > 0) { - emit(SAR_RDX_I8); - emitByte(mi.shift); - haveSF = true; - } - if (!haveSF) - emit(TEST_RDX_RDX); - emit(SETS_AL_ADD_RDX_RAX); - emit(ADD_R_RAX); - emitByte(0xc2 + 8 * instr.dst); - } + } void JitCompilerX86::h_INEG_R(Instruction& instr) { @@ -748,7 +659,7 @@ namespace RandomX { INST_HANDLE(IMULH_M) INST_HANDLE(ISMULH_R) INST_HANDLE(ISMULH_M) - INST_HANDLE(IDIV_C) + INST_HANDLE(IMUL_RCP) INST_HANDLE(ISDIV_C) INST_HANDLE(INEG_R) INST_HANDLE(IXOR_R) diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index 5936dcf..fed3a8a 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -101,7 +101,7 @@ namespace RandomX { void h_IMULH_M(Instruction&); void h_ISMULH_R(Instruction&); void h_ISMULH_M(Instruction&); - void h_IDIV_C(Instruction&); + void h_IMUL_RCP(Instruction&); void h_ISDIV_C(Instruction&); void h_INEG_R(Instruction&); void h_IXOR_R(Instruction&); diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 74b6211..31f0c54 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -32,8 +32,8 @@ along with RandomX. If not, see. 
#define WT_IMULH_M 1 #define WT_ISMULH_R 4 #define WT_ISMULH_M 1 -#define WT_IDIV_C 4 -#define WT_ISDIV_C 4 +#define WT_IMUL_RCP 8 +#define WT_ISDIV_C 0 #define WT_INEG_R 2 #define WT_IXOR_R 16 #define WT_IXOR_M 4 @@ -71,7 +71,7 @@ along with RandomX. If not, see. constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \ WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \ -WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \ +WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IMUL_RCP + WT_ISDIV_C + \ WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \ WT_ISWAP_R + WT_FSWAP_R + WT_FADD_R + WT_FADD_M + WT_FSUB_R + WT_FSUB_M + \ WT_FSCAL_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \ diff --git a/src/main.cpp b/src/main.cpp index bb2a52c..0b6a0fa 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -341,7 +341,7 @@ int main(int argc, char** argv) { std::cout << "Calculated result: "; result.print(std::cout); if(programCount == 1000) - std::cout << "Reference result: fe31e8fd7ed1cec773e87c0684b66b38e58b23ab255e8f9c6b62745e43a26851" << std::endl; + std::cout << "Reference result: d3ae5a9365196ed48bb98ebfc3316498e29443ea7f056ecbd272f749c6af7730" << std::endl; if (!miningMode) { std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per hash" << std::endl; } diff --git a/src/program.inc b/src/program.inc index 3c73b24..8a18fe4 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,740 +1,720 @@ - ; COND_M r1, sg(L1[r3], -2004237569) - xor ecx, ecx - mov eax, r11d - and eax, 16376 - cmp dword ptr [rsi+rax], -2004237569 - sets cl - add r9, rcx - ; IXOR_R r7, -1379425991 - xor r15, -1379425991 - ; IXOR_R r2, r6 - xor r10, r14 - ; FSWAP_R f3 - shufpd xmm3, xmm3, 1 - ; FADD_R f1, a1 - addpd xmm1, xmm9 - ; IMUL_R r0, r5 - imul r8, r13 - ; FMUL_R e1, a3 - mulpd xmm5, xmm11 - ; IADD_R r3, r2 - add r11, r10 - ; COND_M r1, ab(L2[r6], -724006934) - xor ecx, ecx - mov eax, r14d - and eax, 262136 - cmp dword ptr 
[rsi+rax], -724006934 - seta cl - add r9, rcx - ; IADD_RC r2, r7, -854121467 - lea r10, [r10+r15-854121467] - ; IADD_RC r5, r6, 1291744030 - lea r13, [r13+r14+1291744030] - ; ISTORE L2[r6], r4 - mov eax, r14d - and eax, 262136 - mov qword ptr [rsi+rax], r12 - ; IMUL_R r6, r7 - imul r14, r15 - ; FSUB_R f0, a3 - subpd xmm0, xmm11 - ; IADD_M r3, L1[r0] - mov eax, r8d - and eax, 16376 - add r11, qword ptr [rsi+rax] - ; ISDIV_C r4, -692911499 - mov rax, -893288710803585809 - imul r12 - xor eax, eax - sar rdx, 25 - sets al - add rdx, rax - add r12, rdx - ; FMUL_R e0, a0 - mulpd xmm4, xmm8 - ; FDIV_M e1, L1[r0] - mov eax, r8d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - andps xmm12, xmm14 - divpd xmm5, xmm12 - maxpd xmm5, xmm13 + ; IMULH_R r1, r0 + mov rax, r9 + mul r8 + mov r9, rdx + ; IMULH_R r4, r5 + mov rax, r12 + mul r13 + mov r12, rdx ; FMUL_R e0, a1 mulpd xmm4, xmm9 - ; COND_M r0, no(L1[r1], -540292380) - xor ecx, ecx - mov eax, r9d - and eax, 16376 - cmp dword ptr [rsi+rax], -540292380 - setno cl - add r8, rcx - ; FSUB_R f1, a1 - subpd xmm1, xmm9 - ; IADD_RC r0, r2, 310371682 - lea r8, [r8+r10+310371682] - ; COND_R r3, lt(r0, -1067603143) - xor ecx, ecx - cmp r8d, -1067603143 - setl cl - add r11, rcx - ; FMUL_R e0, a0 - mulpd xmm4, xmm8 - ; FADD_R f0, a3 - addpd xmm0, xmm11 - ; COND_R r4, sg(r3, -389806289) - xor ecx, ecx - cmp r11d, -389806289 - sets cl - add r12, rcx - ; FMUL_R e0, a3 - mulpd xmm4, xmm11 - ; ISTORE L2[r7], r4 - mov eax, r15d - and eax, 262136 - mov qword ptr [rsi+rax], r12 - ; IADD_RC r4, r2, 1888908452 - lea r12, [r12+r10+1888908452] - ; IADD_R r1, r2 - add r9, r10 - ; IXOR_R r6, r5 - xor r14, r13 - ; IADD_M r7, L1[r0] - mov eax, r8d - and eax, 16376 - add r15, qword ptr [rsi+rax] - ; IADD_R r5, r6 - add r13, r14 - ; FSUB_R f0, a1 - subpd xmm0, xmm9 - ; IMULH_R r5, r4 - mov rax, r13 - mul r12 - mov r13, rdx - ; IMUL_9C r7, 753606235 - lea r15, [r15+r15*8+753606235] - ; FSWAP_R e2 - shufpd xmm6, xmm6, 1 - ; IMUL_M r7, L1[r1] - mov 
eax, r9d - and eax, 16376 - imul r15, qword ptr [rsi+rax] - ; IMUL_R r5, 1431156245 - imul r13, 1431156245 - ; IADD_RC r4, r2, 1268508410 - lea r12, [r12+r10+1268508410] - ; FSWAP_R f2 - shufpd xmm2, xmm2, 1 - ; ISDIV_C r0, -845194077 - mov rax, -5858725577819591251 - imul r8 - xor eax, eax - sar rdx, 28 - sets al - add rdx, rax - add r8, rdx - ; COND_R r0, ab(r5, 1644043355) - xor ecx, ecx - cmp r13d, 1644043355 - seta cl - add r8, rcx - ; COND_R r5, lt(r0, 1216385844) - xor ecx, ecx - cmp r8d, 1216385844 - setl cl - add r13, rcx - ; IMUL_R r5, r2 - imul r13, r10 - ; ISTORE L1[r4], r6 - mov eax, r12d - and eax, 16376 - mov qword ptr [rsi+rax], r14 - ; IXOR_R r4, r3 - xor r12, r11 - ; IXOR_R r6, r2 - xor r14, r10 + ; IMUL_9C r6, 933674225 + lea r14, [r14+r14*8+933674225] + ; IROR_R r7, r6 + mov ecx, r14d + ror r15, cl ; FSQRT_R e1 sqrtpd xmm5, xmm5 - ; COND_R r5, be(r1, 1781435695) - xor ecx, ecx - cmp r9d, 1781435695 - setbe cl - add r13, rcx - ; ISDIV_C r0, 1367038890 - mov rax, 1811126293978922977 - imul r8 - xor eax, eax - sar rdx, 27 - sets al - add rdx, rax - add r8, rdx - ; FDIV_M e1, L1[r3] + ; IADD_R r1, r0 + add r9, r8 + ; FSCAL_R f1 + xorps xmm1, xmm15 + ; IMUL_R r6, r5 + imul r14, r13 + ; FSCAL_R f3 + xorps xmm3, xmm15 + ; IADD_M r5, L1[r0] + mov eax, r8d + and eax, 16376 + add r13, qword ptr [rsi+rax] + ; IMUL_RCP r0, 3332750793 + mov rax, 11886301652177618669 + imul r8, rax + ; ISTORE L1[r3], r0 mov eax, r11d and eax, 16376 + mov qword ptr [rsi+rax], r8 + ; FSUB_R f3, a0 + subpd xmm3, xmm8 + ; ISUB_R r1, r3 + sub r9, r11 + ; ISMULH_R r4, r1 + mov rax, r12 + imul r9 + mov r12, rdx + ; IADD_RC r3, r0, 1262539428 + lea r11, [r11+r8+1262539428] + ; FSWAP_R e1 + shufpd xmm5, xmm5, 1 + ; FMUL_R e1, a3 + mulpd xmm5, xmm11 + ; FMUL_R e3, a3 + mulpd xmm7, xmm11 + ; ISWAP_R r0, r2 + xchg r8, r10 + ; COND_R r5, of(r4, 137305269) + xor ecx, ecx + cmp r12d, 137305269 + seto cl + add r13, rcx + ; IMUL_R r6, r4 + imul r14, r12 + ; FMUL_R e3, a0 + mulpd xmm7, xmm8 + 
; FSCAL_R f0 + xorps xmm0, xmm15 + ; FADD_R f1, a0 + addpd xmm1, xmm8 + ; IADD_R r6, r3 + add r14, r11 + ; ISMULH_M r1, L3[777112] + mov rax, r9 + imul qword ptr [rsi+777112] + mov r9, rdx + ; FADD_R f1, a1 + addpd xmm1, xmm9 + ; FSUB_M f2, L2[r3] + mov eax, r11d + and eax, 262136 cvtdq2pd xmm12, qword ptr [rsi+rax] - andps xmm12, xmm14 - divpd xmm5, xmm12 - maxpd xmm5, xmm13 - ; FMUL_R e2, a0 - mulpd xmm6, xmm8 - ; ISTORE L1[r5], r4 - mov eax, r13d + subpd xmm2, xmm12 + ; IMUL_R r5, r7 + imul r13, r15 + ; ISUB_M r1, L1[r3] + mov eax, r11d and eax, 16376 - mov qword ptr [rsi+rax], r12 - ; IXOR_R r0, r4 - xor r8, r12 - ; IMUL_R r5, r1 - imul r13, r9 - ; FDIV_M e0, L1[r2] + sub r9, qword ptr [rsi+rax] + ; IXOR_M r1, L1[r6] + mov eax, r14d + and eax, 16376 + xor r9, qword ptr [rsi+rax] + ; COND_R r2, ns(r3, 1727033430) + xor ecx, ecx + cmp r11d, 1727033430 + setns cl + add r10, rcx + ; FADD_R f3, a1 + addpd xmm3, xmm9 + ; FADD_R f2, a2 + addpd xmm2, xmm10 + ; IADD_R r5, -1048707993 + add r13, -1048707993 + ; COND_R r2, ge(r5, -1016934677) + xor ecx, ecx + cmp r13d, -1016934677 + setge cl + add r10, rcx + ; FSUB_R f2, a3 + subpd xmm2, xmm11 + ; ISUB_M r1, L2[r4] + mov eax, r12d + and eax, 262136 + sub r9, qword ptr [rsi+rax] + ; IMUL_R r5, r3 + imul r13, r11 + ; FSUB_R f1, a3 + subpd xmm1, xmm11 + ; IROR_R r1, r3 + mov ecx, r11d + ror r9, cl + ; FADD_R f3, a2 + addpd xmm3, xmm10 + ; ISUB_R r0, -28376526 + sub r8, -28376526 + ; IROR_R r6, r0 + mov ecx, r8d + ror r14, cl + ; FADD_R f1, a0 + addpd xmm1, xmm8 + ; FMUL_R e1, a0 + mulpd xmm5, xmm8 + ; IXOR_R r2, r4 + xor r10, r12 + ; FSUB_M f1, L1[r2] mov eax, r10d and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] - andps xmm12, xmm14 - divpd xmm4, xmm12 - maxpd xmm4, xmm13 - ; IMUL_R r6, r1 - imul r14, r9 - ; FSUB_M f1, L1[r0] - mov eax, r8d + subpd xmm1, xmm12 + ; FSWAP_R f3 + shufpd xmm3, xmm3, 1 + ; FSUB_R f3, a0 + subpd xmm3, xmm8 + ; ISUB_R r7, r6 + sub r15, r14 + ; FADD_R f3, a1 + addpd xmm3, xmm9 + ; ISUB_R r1, r7 + 
sub r9, r15 + ; IADD_M r5, L2[r7] + mov eax, r15d + and eax, 262136 + add r13, qword ptr [rsi+rax] + ; IADD_RC r1, r3, 145589392 + lea r9, [r9+r11+145589392] + ; FADD_R f2, a1 + addpd xmm2, xmm9 + ; FSUB_R f1, a1 + subpd xmm1, xmm9 + ; FADD_M f0, L1[r3] + mov eax, r11d and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; COND_R r2, ns(r1, 392878356) + addpd xmm0, xmm12 + ; FADD_R f3, a1 + addpd xmm3, xmm9 + ; FSUB_R f0, a3 + subpd xmm0, xmm11 + ; FMUL_R e2, a2 + mulpd xmm6, xmm10 + ; FADD_R f2, a1 + addpd xmm2, xmm9 + ; IXOR_R r7, r4 + xor r15, r12 + ; FSUB_R f1, a3 + subpd xmm1, xmm11 + ; IMUL_RCP r0, 3339947118 + mov rax, 11860691159940745144 + imul r8, rax + ; FSCAL_R f2 + xorps xmm2, xmm15 + ; IMUL_9C r0, 850304074 + lea r8, [r8+r8*8+850304074] + ; IADD_R r2, r4 + add r10, r12 + ; IADD_R r0, -1929760745 + add r8, -1929760745 + ; ISTORE L2[r4], r7 + mov eax, r12d + and eax, 262136 + mov qword ptr [rsi+rax], r15 + ; IROR_R r2, r7 + mov ecx, r15d + ror r10, cl + ; FMUL_R e1, a1 + mulpd xmm5, xmm9 + ; FSQRT_R e3 + sqrtpd xmm7, xmm7 + ; IXOR_R r0, -1150923249 + xor r8, -1150923249 + ; IMUL_9C r7, 586146619 + lea r15, [r15+r15*8+586146619] + ; FSWAP_R f2 + shufpd xmm2, xmm2, 1 + ; FSUB_M f3, L1[r6] + mov eax, r14d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm3, xmm12 + ; IXOR_R r0, 292938237 + xor r8, 292938237 + ; COND_R r6, no(r6, -2142285576) xor ecx, ecx - cmp r9d, 392878356 - setns cl + cmp r14d, -2142285576 + setno cl + add r14, rcx + ; IMUL_RCP r3, 670137279 + mov rax, 14778345608621248183 + imul r11, rax + ; ISTORE L1[r1], r5 + mov eax, r9d + and eax, 16376 + mov qword ptr [rsi+rax], r13 + ; COND_R r3, sg(r1, 1638220289) + xor ecx, ecx + cmp r9d, 1638220289 + sets cl + add r11, rcx + ; IXOR_R r4, r2 + xor r12, r10 + ; COND_R r2, be(r2, 1131588253) + xor ecx, ecx + cmp r10d, 1131588253 + setbe cl add r10, rcx - ; IADD_R r6, r5 - add r14, r13 - ; FMUL_R e2, a0 - mulpd xmm6, xmm8 + ; IMULH_R r3, r1 + mov rax, r11 + mul 
r9 + mov r11, rdx + ; COND_R r3, sg(r6, 1528901692) + xor ecx, ecx + cmp r14d, 1528901692 + sets cl + add r11, rcx + ; IMUL_M r6, L2[r4] + mov eax, r12d + and eax, 262136 + imul r14, qword ptr [rsi+rax] + ; ISMULH_M r1, L1[r2] + mov ecx, r10d + and ecx, 16376 + mov rax, r9 + imul qword ptr [rsi+rcx] + mov r9, rdx + ; ISUB_M r5, L1[r4] + mov eax, r12d + and eax, 16376 + sub r13, qword ptr [rsi+rax] + ; IMUL_RCP r1, 1612208358 + mov rax, 12285658072842024305 + imul r9, rax + ; COND_R r2, lt(r6, -1712049035) + xor ecx, ecx + cmp r14d, -1712049035 + setl cl + add r10, rcx + ; IMUL_RCP r2, 2888266520 + mov rax, 13715521397634789187 + imul r10, rax + ; IADD_M r1, L2[r6] + mov eax, r14d + and eax, 262136 + add r9, qword ptr [rsi+rax] + ; FMUL_R e0, a3 + mulpd xmm4, xmm11 + ; ISTORE L1[r7], r1 + mov eax, r15d + and eax, 16376 + mov qword ptr [rsi+rax], r9 ; ISTORE L1[r0], r3 mov eax, r8d and eax, 16376 mov qword ptr [rsi+rax], r11 - ; IMUL_R r1, r3 - imul r9, r11 - ; IMUL_R r5, r2 - imul r13, r10 - ; FADD_R f0, a0 - addpd xmm0, xmm8 - ; FADD_R f0, a1 - addpd xmm0, xmm9 - ; FSUB_R f0, a0 - subpd xmm0, xmm8 - ; IMUL_R r3, r5 - imul r11, r13 - ; IADD_R r1, r5 - add r9, r13 - ; IXOR_M r0, L1[r5] - mov eax, r13d - and eax, 16376 - xor r8, qword ptr [rsi+rax] - ; FSCAL_R f2 - xorps xmm2, xmm15 - ; IDIV_C r5, 2577129788 - mov rax, 15371395512010654233 - mul r13 - shr rdx, 31 - add r13, rdx - ; COND_R r5, be(r5, -999219370) - xor ecx, ecx - cmp r13d, -999219370 - setbe cl - add r13, rcx - ; ISTORE L2[r0], r2 - mov eax, r8d - and eax, 262136 - mov qword ptr [rsi+rax], r10 - ; FSUB_R f3, a3 - subpd xmm3, xmm11 - ; IROR_R r7, r6 - mov ecx, r14d - ror r15, cl - ; COND_R r6, ab(r4, 1309137534) - xor ecx, ecx - cmp r12d, 1309137534 - seta cl - add r14, rcx - ; FMUL_R e3, a0 - mulpd xmm7, xmm8 - ; COND_M r3, no(L2[r5], 483660199) - xor ecx, ecx - mov eax, r13d - and eax, 262136 - cmp dword ptr [rsi+rax], 483660199 - setno cl - add r11, rcx - ; IMUL_R r1, r6 - imul r9, r14 - ; IADD_RC r7, 
r2, -1340630490 - lea r15, [r15+r10-1340630490] - ; IADD_M r0, L3[1554088] - add r8, qword ptr [rsi+1554088] - ; FMUL_R e2, a3 - mulpd xmm6, xmm11 - ; IDIV_C r0, 1566192452 - mov rax, 12646619898641986559 - mul r8 - shr rdx, 30 - add r8, rdx - ; FADD_R f0, a1 - addpd xmm0, xmm9 - ; ISWAP_R r6, r0 - xchg r14, r8 - ; IMUL_9C r4, 1340891034 - lea r12, [r12+r12*8+1340891034] - ; IROR_R r7, r2 - mov ecx, r10d - ror r15, cl - ; FSQRT_R e2 - sqrtpd xmm6, xmm6 - ; FADD_R f2, a1 - addpd xmm2, xmm9 - ; IMUL_R r4, r3 - imul r12, r11 - ; IADD_RC r6, r3, -1584624397 - lea r14, [r14+r11-1584624397] - ; IROR_R r1, r7 - mov ecx, r15d - ror r9, cl - ; IXOR_R r4, r7 - xor r12, r15 - ; FSWAP_R f0 - shufpd xmm0, xmm0, 1 - ; FSWAP_R f3 - shufpd xmm3, xmm3, 1 - ; IROR_R r5, 3 - ror r13, 3 - ; FADD_R f3, a0 - addpd xmm3, xmm8 - ; FMUL_R e0, a0 - mulpd xmm4, xmm8 - ; IADD_R r4, r1 - add r12, r9 - ; COND_M r4, ge(L1[r6], -1612023931) - xor ecx, ecx - mov eax, r14d - and eax, 16376 - cmp dword ptr [rsi+rax], -1612023931 - setge cl - add r12, rcx - ; FSWAP_R e2 - shufpd xmm6, xmm6, 1 - ; IADD_R r3, r7 - add r11, r15 - ; COND_R r5, be(r2, -1083018923) - xor ecx, ecx - cmp r10d, -1083018923 - setbe cl - add r13, rcx - ; IADD_R r3, r7 - add r11, r15 - ; ISTORE L2[r6], r0 - mov eax, r14d - and eax, 262136 - mov qword ptr [rsi+rax], r8 - ; IXOR_R r2, r3 - xor r10, r11 - ; FMUL_R e2, a3 - mulpd xmm6, xmm11 - ; FMUL_R e3, a3 - mulpd xmm7, xmm11 - ; FADD_R f0, a2 - addpd xmm0, xmm10 - ; ISTORE L1[r5], r1 - mov eax, r13d - and eax, 16376 - mov qword ptr [rsi+rax], r9 - ; FMUL_R e3, a3 - mulpd xmm7, xmm11 - ; ISWAP_R r1, r2 - xchg r9, r10 - ; FSWAP_R e0 - shufpd xmm4, xmm4, 1 - ; FSUB_R f1, a2 - subpd xmm1, xmm10 - ; FSUB_R f0, a0 - subpd xmm0, xmm8 - ; IROR_R r7, r0 - mov ecx, r8d - ror r15, cl - ; IADD_RC r5, r4, 283260945 - lea r13, [r13+r12+283260945] - ; ISDIV_C r6, -340125851 - mov rax, -3639652898025032137 - imul r14 - xor eax, eax - sar rdx, 26 - sets al - add rdx, rax - add r14, rdx - ; 
ISTORE L2[r2], r3 - mov eax, r10d - and eax, 262136 - mov qword ptr [rsi+rax], r11 - ; IADD_RC r6, r6, -935765909 - lea r14, [r14+r14-935765909] - ; ISDIV_C r3, -701703430 - mov rax, -7056770631919985199 - imul r11 - xor eax, eax - sar rdx, 28 - sets al - add rdx, rax - add r11, rdx - ; IXOR_M r3, L2[r1] - mov eax, r9d - and eax, 262136 - xor r11, qword ptr [rsi+rax] - ; FADD_R f2, a1 - addpd xmm2, xmm9 - ; ISTORE L1[r5], r7 - mov eax, r13d - and eax, 16376 - mov qword ptr [rsi+rax], r15 - ; FSUB_R f2, a0 - subpd xmm2, xmm8 - ; FMUL_R e3, a2 - mulpd xmm7, xmm10 - ; IADD_R r2, r5 - add r10, r13 - ; IADD_RC r2, r5, -1056770544 - lea r10, [r10+r13-1056770544] - ; ISTORE L2[r2], r3 - mov eax, r10d - and eax, 262136 - mov qword ptr [rsi+rax], r11 - ; ISMULH_R r7, r1 - mov rax, r15 - imul r9 - mov r15, rdx - ; IXOR_R r0, r5 - xor r8, r13 - ; ISTORE L1[r4], r0 - mov eax, r12d - and eax, 16376 - mov qword ptr [rsi+rax], r8 - ; INEG_R r5 - neg r13 ; FSUB_R f0, a1 subpd xmm0, xmm9 - ; IMUL_R r6, -244261682 - imul r14, -244261682 - ; IMUL_R r1, r0 - imul r9, r8 - ; IMUL_9C r3, -985744277 - lea r11, [r11+r11*8-985744277] + ; FADD_R f2, a2 + addpd xmm2, xmm10 + ; FMUL_R e0, a1 + mulpd xmm4, xmm9 + ; FMUL_R e2, a0 + mulpd xmm6, xmm8 + ; FMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IROR_R r5, 21 + ror r13, 21 + ; FSQRT_R e1 + sqrtpd xmm5, xmm5 + ; ISTORE L1[r3], r1 + mov eax, r11d + and eax, 16376 + mov qword ptr [rsi+rax], r9 + ; IMUL_9C r2, -290275273 + lea r10, [r10+r10*8-290275273] + ; ISUB_M r7, L1[r3] + mov eax, r11d + and eax, 16376 + sub r15, qword ptr [rsi+rax] + ; IMUL_R r6, 1301522739 + imul r14, 1301522739 + ; ISWAP_R r2, r4 + xchg r10, r12 + ; FMUL_R e3, a2 + mulpd xmm7, xmm10 + ; IMUL_9C r2, 877307769 + lea r10, [r10+r10*8+877307769] + ; IMUL_R r0, r3 + imul r8, r11 + ; IMUL_9C r0, 1293318220 + lea r8, [r8+r8*8+1293318220] + ; FSQRT_R e0 + sqrtpd xmm4, xmm4 + ; ISTORE L1[r0], r2 + mov eax, r8d + and eax, 16376 + mov qword ptr [rsi+rax], r10 + ; IMUL_RCP r5, 2071364883 + 
mov rax, 9562313618003962461 + imul r13, rax + ; FMUL_R e1, a2 + mulpd xmm5, xmm10 + ; FSUB_R f1, a3 + subpd xmm1, xmm11 + ; FSUB_R f0, a1 + subpd xmm0, xmm9 + ; IMULH_R r6, r1 + mov rax, r14 + mul r9 + mov r14, rdx + ; ISTORE L1[r6], r5 + mov eax, r14d + and eax, 16376 + mov qword ptr [rsi+rax], r13 + ; ISTORE L2[r1], r2 + mov eax, r9d + and eax, 262136 + mov qword ptr [rsi+rax], r10 + ; ISUB_M r1, L2[r4] + mov eax, r12d + and eax, 262136 + sub r9, qword ptr [rsi+rax] + ; IADD_M r7, L1[r6] + mov eax, r14d + and eax, 16376 + add r15, qword ptr [rsi+rax] + ; IADD_RC r2, r0, -1705364403 + lea r10, [r10+r8-1705364403] + ; ISTORE L1[r6], r5 + mov eax, r14d + and eax, 16376 + mov qword ptr [rsi+rax], r13 + ; FSUB_M f0, L1[r5] + mov eax, r13d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm0, xmm12 + ; IXOR_R r1, r3 + xor r9, r11 + ; FADD_R f2, a0 + addpd xmm2, xmm8 + ; FSCAL_R f2 + xorps xmm2, xmm15 + ; ISUB_R r6, -789651909 + sub r14, -789651909 + ; COND_R r4, sg(r1, -1404926795) + xor ecx, ecx + cmp r9d, -1404926795 + sets cl + add r12, rcx + ; FSCAL_R f2 + xorps xmm2, xmm15 + ; ISUB_R r6, r7 + sub r14, r15 + ; IXOR_R r5, r2 + xor r13, r10 + ; IROR_R r6, r5 + mov ecx, r13d + ror r14, cl + ; FSUB_R f1, a2 + subpd xmm1, xmm10 + ; IMUL_M r4, L1[r5] + mov eax, r13d + and eax, 16376 + imul r12, qword ptr [rsi+rax] + ; FSUB_R f3, a0 + subpd xmm3, xmm8 + ; FSWAP_R e1 + shufpd xmm5, xmm5, 1 + ; IADD_RC r6, r5, 1744830258 + lea r14, [r14+r13+1744830258] + ; FSUB_R f3, a0 + subpd xmm3, xmm8 + ; ISUB_R r7, r0 + sub r15, r8 + ; FSUB_R f1, a3 + subpd xmm1, xmm11 + ; IMUL_9C r4, 241775739 + lea r12, [r12+r12*8+241775739] + ; FADD_R f0, a0 + addpd xmm0, xmm8 + ; IMUL_R r4, r3 + imul r12, r11 + ; IMUL_RCP r4, 2389176791 + mov rax, 16580640414036304271 + imul r12, rax + ; FSCAL_R f1 + xorps xmm1, xmm15 + ; FSUB_R f2, a1 + subpd xmm2, xmm9 + ; ISTORE L2[r2], r0 + mov eax, r10d + and eax, 262136 + mov qword ptr [rsi+rax], r8 + ; IXOR_M r5, L1[r7] + mov eax, r15d + and 
eax, 16376 + xor r13, qword ptr [rsi+rax] + ; IMULH_M r4, L1[r1] + mov ecx, r9d + and ecx, 16376 + mov rax, r12 + mul qword ptr [rsi+rcx] + mov r12, rdx + ; FMUL_R e2, a1 + mulpd xmm6, xmm9 + ; IXOR_R r0, r5 + xor r8, r13 + ; IROR_R r0, r7 + mov ecx, r15d + ror r8, cl + ; IADD_RC r6, r5, 472588845 + lea r14, [r14+r13+472588845] + ; FADD_R f0, a0 + addpd xmm0, xmm8 + ; FSCAL_R f0 + xorps xmm0, xmm15 ; IROR_R r2, r1 mov ecx, r9d ror r10, cl - ; ISUB_R r4, -1079131550 - sub r12, -1079131550 - ; FSCAL_R f3 - xorps xmm3, xmm15 - ; COND_R r4, ns(r5, -362284631) - xor ecx, ecx - cmp r13d, -362284631 - setns cl - add r12, rcx - ; FSUB_R f2, a0 - subpd xmm2, xmm8 - ; IXOR_R r4, r5 - xor r12, r13 - ; FSCAL_R f1 - xorps xmm1, xmm15 - ; FADD_R f0, a0 - addpd xmm0, xmm8 - ; IADD_RC r3, r3, -173615832 - lea r11, [r11+r11-173615832] - ; IMUL_R r0, 928402279 - imul r8, 928402279 - ; ISUB_R r2, r0 - sub r10, r8 - ; IXOR_R r6, r3 - xor r14, r11 - ; ISUB_R r2, 2106401471 - sub r10, 2106401471 - ; FADD_R f0, a2 - addpd xmm0, xmm10 - ; IMUL_R r4, r6 - imul r12, r14 - ; IADD_RC r4, r0, -373491513 - lea r12, [r12+r8-373491513] - ; ISDIV_C r0, -1739042721 - mov rax, 7057121271817449967 - imul r8 - xor eax, eax - sub rdx, r8 - sar rdx, 30 - sets al - add rdx, rax - add r8, rdx - ; IADD_R r3, r1 - add r11, r9 - ; ISUB_M r7, L1[r5] - mov eax, r13d - and eax, 16376 - sub r15, qword ptr [rsi+rax] - ; IMUL_R r1, r2 - imul r9, r10 - ; ISUB_R r0, 722465116 - sub r8, 722465116 - ; IADD_RC r0, r0, -1919541169 - lea r8, [r8+r8-1919541169] - ; ISUB_M r2, L1[r3] - mov eax, r11d - and eax, 16376 - sub r10, qword ptr [rsi+rax] - ; IADD_R r7, -1183581468 - add r15, -1183581468 - ; FMUL_R e1, a3 - mulpd xmm5, xmm11 + ; IADD_RC r2, r1, 1968510355 + lea r10, [r10+r9+1968510355] + ; FMUL_R e0, a0 + mulpd xmm4, xmm8 + ; ISUB_R r7, r1 + sub r15, r9 + ; IADD_RC r4, r7, 1111936914 + lea r12, [r12+r15+1111936914] + ; IADD_RC r7, r3, 373642756 + lea r15, [r15+r11+373642756] ; FSUB_R f0, a0 subpd xmm0, xmm8 + ; 
IMUL_RCP r6, 3388328460 + mov rax, 11691334451422153092 + imul r14, rax + ; FSWAP_R e1 + shufpd xmm5, xmm5, 1 + ; IADD_RC r7, r5, -644292398 + lea r15, [r15+r13-644292398] + ; IMUL_9C r7, -1398596563 + lea r15, [r15+r15*8-1398596563] ; FADD_R f0, a3 addpd xmm0, xmm11 - ; IMUL_9C r6, 1241113238 - lea r14, [r14+r14*8+1241113238] - ; FSUB_R f3, a3 - subpd xmm3, xmm11 - ; IADD_M r0, L1[r3] - mov eax, r11d + ; FDIV_M e1, L1[r5] + mov eax, r13d and eax, 16376 - add r8, qword ptr [rsi+rax] - ; IROR_R r3, r7 - mov ecx, r15d - ror r11, cl - ; FADD_R f2, a1 - addpd xmm2, xmm9 - ; IMUL_M r3, L1[r2] + cvtdq2pd xmm12, qword ptr [rsi+rax] + andps xmm12, xmm14 + divpd xmm5, xmm12 + maxpd xmm5, xmm13 + ; IXOR_M r2, L1[r5] + mov eax, r13d + and eax, 16376 + xor r10, qword ptr [rsi+rax] + ; IADD_R r5, r6 + add r13, r14 + ; IROR_R r4, r0 + mov ecx, r8d + ror r12, cl + ; IXOR_R r0, r6 + xor r8, r14 + ; IMUL_RCP r1, 1035942442 + mov rax, 9559913671615977868 + imul r9, rax + ; IMUL_9C r1, 105267179 + lea r9, [r9+r9*8+105267179] + ; IMUL_M r1, L1[r2] mov eax, r10d and eax, 16376 - imul r11, qword ptr [rsi+rax] - ; IMUL_9C r7, -2080412544 - lea r15, [r15+r15*8-2080412544] - ; IMUL_R r0, r3 - imul r8, r11 - ; FADD_R f1, a1 - addpd xmm1, xmm9 - ; IROR_R r6, 21 - ror r14, 21 - ; FDIV_M e3, L1[r1] + imul r9, qword ptr [rsi+rax] + ; COND_R r6, be(r7, 1344676209) + xor ecx, ecx + cmp r15d, 1344676209 + setbe cl + add r14, rcx + ; IADD_R r6, r1 + add r14, r9 + ; IROR_R r5, r1 + mov ecx, r9d + ror r13, cl + ; ISMULH_R r0, r6 + mov rax, r8 + imul r14 + mov r8, rdx + ; IXOR_R r6, r7 + xor r14, r15 + ; FSUB_R f1, a3 + subpd xmm1, xmm11 + ; IMUL_9C r1, 1991866007 + lea r9, [r9+r9*8+1991866007] + ; IMUL_RCP r2, 4139294400 + mov rax, 9570249764581173254 + imul r10, rax + ; FSWAP_R f0 + shufpd xmm0, xmm0, 1 + ; ISUB_R r5, r2 + sub r13, r10 + ; COND_R r6, lt(r1, -834783176) + xor ecx, ecx + cmp r9d, -834783176 + setl cl + add r14, rcx + ; ISTORE L2[r7], r3 + mov eax, r15d + and eax, 262136 + mov qword 
ptr [rsi+rax], r11 + ; FADD_R f2, a2 + addpd xmm2, xmm10 + ; FSCAL_R f1 + xorps xmm1, xmm15 + ; IMUL_R r7, r4 + imul r15, r12 + ; IMUL_RCP r4, 3027698566 + mov rax, 13083892069700893994 + imul r12, rax + ; IMULH_M r2, L1[r3] + mov ecx, r11d + and ecx, 16376 + mov rax, r10 + mul qword ptr [rsi+rcx] + mov r10, rdx + ; IADD_M r6, L1[r1] mov eax, r9d and eax, 16376 + add r14, qword ptr [rsi+rax] + ; IMUL_M r3, L1[r1] + mov eax, r9d + and eax, 16376 + imul r11, qword ptr [rsi+rax] + ; ISTORE L1[r7], r5 + mov eax, r15d + and eax, 16376 + mov qword ptr [rsi+rax], r13 + ; IADD_RC r3, r1, -183791073 + lea r11, [r11+r9-183791073] + ; IMUL_9C r6, 1353963989 + lea r14, [r14+r14*8+1353963989] + ; ISUB_R r2, r3 + sub r10, r11 + ; IMUL_R r2, r1 + imul r10, r9 + ; IMULH_R r6, r4 + mov rax, r14 + mul r12 + mov r14, rdx + ; ISMULH_R r6, r4 + mov rax, r14 + imul r12 + mov r14, rdx + ; IADD_R r7, r4 + add r15, r12 + ; FMUL_R e3, a1 + mulpd xmm7, xmm9 + ; FADD_R f1, a2 + addpd xmm1, xmm10 + ; IADD_R r5, r6 + add r13, r14 + ; IADD_RC r4, r0, -1810659257 + lea r12, [r12+r8-1810659257] + ; IROR_R r2, r5 + mov ecx, r13d + ror r10, cl + ; FADD_R f2, a2 + addpd xmm2, xmm10 + ; FSWAP_R e2 + shufpd xmm6, xmm6, 1 + ; FADD_M f0, L1[r2] + mov eax, r10d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm0, xmm12 + ; IADD_R r0, 52817665 + add r8, 52817665 + ; IMUL_RCP r6, 3388141601 + mov rax, 11691979238837063231 + imul r14, rax + ; IMUL_RCP r3, 1356467790 + mov rax, 14601924774465956466 + imul r11, rax + ; IADD_RC r7, r4, -2056421852 + lea r15, [r15+r12-2056421852] + ; FSUB_M f1, L2[r4] + mov eax, r12d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm1, xmm12 + ; ISWAP_R r1, r5 + xchg r9, r13 + ; ISTORE L2[r3], r5 + mov eax, r11d + and eax, 262136 + mov qword ptr [rsi+rax], r13 + ; FMUL_R e0, a3 + mulpd xmm4, xmm11 + ; IADD_RC r1, r4, -129008866 + lea r9, [r9+r12-129008866] + ; COND_R r6, no(r4, 311828213) + xor ecx, ecx + cmp r12d, 311828213 + setno cl + add r14, 
rcx + ; FSWAP_R e2 + shufpd xmm6, xmm6, 1 + ; IADD_RC r2, r2, 498744396 + lea r10, [r10+r10+498744396] + ; IADD_RC r2, r3, 1515945097 + lea r10, [r10+r11+1515945097] + ; FMUL_R e0, a2 + mulpd xmm4, xmm10 + ; ISTORE L2[r5], r7 + mov eax, r13d + and eax, 262136 + mov qword ptr [rsi+rax], r15 + ; IMUL_M r7, L2[r0] + mov eax, r8d + and eax, 262136 + imul r15, qword ptr [rsi+rax] + ; IADD_R r0, r2 + add r8, r10 + ; IADD_RC r7, r3, 1081450346 + lea r15, [r15+r11+1081450346] + ; FADD_R f1, a3 + addpd xmm1, xmm11 + ; FSCAL_R f3 + xorps xmm3, xmm15 + ; FADD_M f3, L2[r7] + mov eax, r15d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm3, xmm12 + ; FSUB_R f3, a0 + subpd xmm3, xmm8 + ; COND_M r2, of(L1[r5], -255033167) + xor ecx, ecx + mov eax, r13d + and eax, 16376 + cmp dword ptr [rsi+rax], -255033167 + seto cl + add r10, rcx + ; FSUB_R f1, a1 + subpd xmm1, xmm9 + ; IADD_R r2, r5 + add r10, r13 + ; FSQRT_R e2 + sqrtpd xmm6, xmm6 + ; IMUL_9C r2, 1521722302 + lea r10, [r10+r10*8+1521722302] + ; FADD_R f0, a3 + addpd xmm0, xmm11 + ; ISUB_R r0, r5 + sub r8, r13 + ; FADD_R f2, a0 + addpd xmm2, xmm8 + ; ISWAP_R r6, r0 + xchg r14, r8 + ; IADD_RC r1, r4, -693164762 + lea r9, [r9+r12-693164762] + ; FDIV_M e0, L2[r2] + mov eax, r10d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + andps xmm12, xmm14 + divpd xmm4, xmm12 + maxpd xmm4, xmm13 + ; IMUL_9C r4, -1849458799 + lea r12, [r12+r12*8-1849458799] + ; IADD_RC r1, r4, -651820510 + lea r9, [r9+r12-651820510] + ; IMULH_R r6, r6 + mov rax, r14 + mul r14 + mov r14, rdx + ; FSUB_M f3, L2[r0] + mov eax, r8d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm3, xmm12 + ; FSUB_R f0, a2 + subpd xmm0, xmm10 + ; FDIV_M e3, L1[r0] + mov eax, r8d + and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] andps xmm12, xmm14 divpd xmm7, xmm12 maxpd xmm7, xmm13 - ; FSUB_R f0, a1 - subpd xmm0, xmm9 - ; FSWAP_R e1 - shufpd xmm5, xmm5, 1 - ; COND_M r0, no(L1[r5], -1627153829) - xor ecx, ecx - mov eax, r13d - and 
eax, 16376 - cmp dword ptr [rsi+rax], -1627153829 - setno cl - add r8, rcx - ; FADD_R f2, a3 - addpd xmm2, xmm11 - ; FSUB_R f1, a2 - subpd xmm1, xmm10 - ; FSUB_M f1, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; ISTORE L1[r5], r1 - mov eax, r13d - and eax, 16376 - mov qword ptr [rsi+rax], r9 - ; ISUB_M r2, L2[r7] - mov eax, r15d - and eax, 262136 - sub r10, qword ptr [rsi+rax] - ; ISTORE L1[r2], r3 - mov eax, r10d - and eax, 16376 - mov qword ptr [rsi+rax], r11 - ; FADD_R f0, a3 - addpd xmm0, xmm11 - ; ISUB_M r1, L1[r7] + ; IADD_M r3, L1[r7] mov eax, r15d and eax, 16376 - sub r9, qword ptr [rsi+rax] - ; IDIV_C r5, 624165039 - mov rax, 15866829597104432181 - mul r13 - shr rdx, 29 - add r13, rdx - ; FMUL_R e3, a0 - mulpd xmm7, xmm8 - ; IMUL_R r5, r4 - imul r13, r12 - ; FMUL_R e3, a1 - mulpd xmm7, xmm9 - ; FMUL_R e3, a3 - mulpd xmm7, xmm11 - ; IXOR_R r0, -2064879200 - xor r8, -2064879200 - ; FADD_R f1, a3 - addpd xmm1, xmm11 - ; IADD_M r0, L1[r3] - mov eax, r11d - and eax, 16376 - add r8, qword ptr [rsi+rax] - ; ISMULH_R r7, r3 - mov rax, r15 - imul r11 - mov r15, rdx - ; IMUL_R r5, -1645503310 - imul r13, -1645503310 - ; IMUL_R r7, r3 - imul r15, r11 - ; FMUL_R e2, a2 - mulpd xmm6, xmm10 - ; IADD_R r6, 1769041191 - add r14, 1769041191 - ; FSUB_M f1, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm1, xmm12 - ; ISTORE L2[r1], r0 - mov eax, r9d - and eax, 262136 - mov qword ptr [rsi+rax], r8 - ; FSCAL_R f0 - xorps xmm0, xmm15 - ; FMUL_R e0, a3 - mulpd xmm4, xmm11 - ; IMUL_R r2, r7 - imul r10, r15 - ; IADD_R r5, r1 - add r13, r9 - ; IROR_R r3, r6 - mov ecx, r14d - ror r11, cl - ; FADD_R f0, a0 - addpd xmm0, xmm8 - ; FMUL_R e1, a2 - mulpd xmm5, xmm10 - ; FSCAL_R f3 - xorps xmm3, xmm15 - ; FADD_R f1, a1 - addpd xmm1, xmm9 - ; IMULH_R r2, r5 - mov rax, r10 - mul r13 - mov r10, rdx - ; ISTORE L1[r4], r0 - mov eax, r12d - and eax, 16376 - mov qword ptr [rsi+rax], r8 - ; ISWAP_R 
r7, r0 - xchg r15, r8 - ; FSWAP_R f0 - shufpd xmm0, xmm0, 1 - ; ISUB_R r2, r0 - sub r10, r8 - ; FSUB_R f1, a3 - subpd xmm1, xmm11 - ; ISUB_M r5, L1[r3] - mov eax, r11d - and eax, 16376 - sub r13, qword ptr [rsi+rax] - ; IXOR_R r7, r0 - xor r15, r8 - ; IMUL_R r4, r1 - imul r12, r9 - ; IADD_RC r0, r2, -1102648763 - lea r8, [r8+r10-1102648763] - ; FMUL_R e3, a3 - mulpd xmm7, xmm11 - ; IXOR_R r4, r1 - xor r12, r9 - ; IXOR_R r6, r0 - xor r14, r8 - ; FSQRT_R e1 - sqrtpd xmm5, xmm5 - ; IMUL_M r6, L2[r1] - mov eax, r9d - and eax, 262136 - imul r14, qword ptr [rsi+rax] - ; ISMULH_M r5, L3[353552] - mov rax, r13 - imul qword ptr [rsi+353552] - mov r13, rdx - ; ISUB_M r1, L1[r6] + add r11, qword ptr [rsi+rax] + ; IXOR_M r2, L2[r6] mov eax, r14d - and eax, 16376 - sub r9, qword ptr [rsi+rax] - ; FADD_R f0, a3 - addpd xmm0, xmm11 - ; FMUL_R e3, a3 - mulpd xmm7, xmm11 - ; FSUB_M f3, L2[r7] - mov eax, r15d and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm3, xmm12 - ; IMUL_R r0, r2 - imul r8, r10 - ; FMUL_R e1, a0 - mulpd xmm5, xmm8 - ; COND_R r5, sg(r3, -1392293091) - xor ecx, ecx - cmp r11d, -1392293091 - sets cl - add r13, rcx - ; FSWAP_R e3 - shufpd xmm7, xmm7, 1 - ; IMUL_R r7, r4 - imul r15, r12 - ; IXOR_R r7, r5 - xor r15, r13 - ; FMUL_R e3, a3 - mulpd xmm7, xmm11 - ; IMUL_R r4, r3 - imul r12, r11 - ; FADD_M f1, L1[r1] - mov eax, r9d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm1, xmm12 - ; IMUL_R r5, r0 - imul r13, r8 - ; ISUB_R r7, r0 - sub r15, r8 - ; IADD_M r5, L1[r4] - mov eax, r12d - and eax, 16376 - add r13, qword ptr [rsi+rax] - ; IADD_R r6, r2 - add r14, r10 - ; FMUL_R e1, a1 - mulpd xmm5, xmm9 - ; IADD_M r2, L3[1073640] - add r10, qword ptr [rsi+1073640] - ; IMUL_R r3, r2 - imul r11, r10 - ; IXOR_R r1, r0 - xor r9, r8 - ; IROR_R r7, r4 - mov ecx, r12d - ror r15, cl - ; FSUB_R f1, a1 - subpd xmm1, xmm9 - ; IMUL_R r7, r5 - imul r15, r13 - ; ISUB_R r1, 866191482 - sub r9, 866191482 - ; IMUL_M r7, L1[r4] - mov eax, r12d - and eax, 
/*
	Calculates rcp = 2**x / divisor for the highest integer x such that rcp < 2**64.

	For a divisor that is not a power of two this is floor(2**(63 + n) / divisor),
	where n is the bit length of the divisor, so the result lies in (2**63, 2**64).
	For a power-of-two divisor the result is exactly 2**63.

	A zero divisor has no reciprocal; 0 is returned instead of dividing by zero.
	Callers are expected to treat a zero divisor as a no-op before getting here.

	Equivalent x86 assembly (divisor in rcx). It is only usable when the divisor
	is neither zero nor a power of two, because "div" raises a divide error when
	the quotient does not fit into 64 bits:

		mov edx, 1
		mov r8, rcx
		xor eax, eax
		bsr rcx, rcx
		shl rdx, cl
		div r8
		ret

*/
uint64_t reciprocal(uint64_t divisor) {

	if (divisor == 0)
		return 0;

	const uint64_t p2exp63 = 1ULL << 63;

	/* Start from 2**63 = quotient * divisor + remainder. */
	uint64_t quotient = p2exp63 / divisor;
	uint64_t remainder = p2exp63 % divisor;

	/*
		Number of times the dividend can still be doubled while the quotient
		stays below 2**64: ceil(log2(divisor)), i.e. the bit length of
		(divisor - 1). Using the bit length of the divisor itself would be one
		step too many for powers of two and wrap the quotient around to 0.
	*/
	unsigned doublings = 0;

	for (uint64_t bits = divisor - 1; bits > 0; bits >>= 1)
		doublings++;

	/* Binary long division: each step doubles the dividend. */
	for (unsigned step = 0; step < doublings; step++) {
		/* Same test as 2 * remainder >= divisor, written so it cannot overflow. */
		if (remainder >= divisor - remainder) {
			quotient = quotient * 2 + 1;
			/* May wrap modulo 2**64 midway; the final value is < divisor and exact. */
			remainder = remainder * 2 - divisor;
		}
		else {
			quotient = quotient * 2;
			remainder = remainder * 2;
		}
	}

	return quotient;
}
+ +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once +#include <stdint.h> + +#if defined(__cplusplus) +extern "C" { +#endif + + uint64_t reciprocal(uint64_t); + +#if defined(__cplusplus) +} +#endif