Added magic division to JIT compiler

New B operand selection rules
2024-08-15 00:23:14 +00:00 · 2019-01-11 16:53:52 +01:00 · 2019-01-11 16:53:52 +01:00 · 2756bcdcfe
commit 2756bcdcfe
parent 451dfc5730
9 changed files with 1237 additions and 1136 deletions
--- a/doc/isa.md
+++ b/doc/isa.md
@ -83,10 +83,10 @@ The `B.LOC.L` flag determines the B operand. It can be either a register or imme

 |`B.LOC.L`|IA/DIV|IA/SHIFT|IA/MATH|FP|CL|
 |----|--------|----|------|----|---|
-|0|register|register|register|register|register|
+|0|register|`imm8`|`imm32`|register|register|
 |1|`imm32`|register|register|register|register|
 |2|`imm32`|`imm8`|register|register|register|
-|3|`imm32`|`imm8`|`imm32`|register|register|
+|3|`imm32`|register|register|register|register|

 Integer instructions are split into 3 classes: integer division (IA/DIV), shift and rotate (IA/SHIFT) and other (IA/MATH). Floating point (FP) and control (CL) instructions always use a register operand.

--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@ -17,7 +17,7 @@ You should have received a copy of the GNU General Public License
 along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */
 //#define TRACE
-//#define MAGIC_DIVISION
+#define MAGIC_DIVISION
 #include "AssemblyGeneratorX86.hpp"
 #include "Pcg32.hpp"
 #include "common.hpp"
@ -64,108 +64,61 @@ namespace RandomX {
 		(this->*generator)(instr, i);
 	}

-	void AssemblyGeneratorX86::genar(Instruction& instr, int i) {
+	void AssemblyGeneratorX86::gena(Instruction& instr, int i) {
 		asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
 		asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl;
 		asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
 		asmCode << "\tjnz short rx_body_" << i << std::endl;
-		switch (instr.loca & 3)
-		{
-			case 0:
-			case 1:
-			case 2:
-				asmCode << "\tcall rx_read_l1" << std::endl;
-				asmCode << "rx_body_" << i << ":" << std::endl;
-				if ((instr.loca & 192) == 0)
-					asmCode << "\txor " << regMx << ", rcx" << std::endl;
-				asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl;
-				break;
-			default: //3
-				asmCode << "\tcall rx_read_l2" << std::endl;
-				asmCode << "rx_body_" << i << ":" << std::endl;
-				if ((instr.loca & 192) == 0)
-					asmCode << "\txor " << regMx << ", rcx" << std::endl;
-				asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl;
-				break;
+		if (instr.loca & 3) {
+			asmCode << "\tcall rx_read_l1" << std::endl;
+			asmCode << "rx_body_" << i << ":" << std::endl;
+			if ((instr.loca & 192) == 0)
+				asmCode << "\txor " << regMx << ", rcx" << std::endl;
+			asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl;
 		}
+		else {
+			asmCode << "\tcall rx_read_l2" << std::endl;
+			asmCode << "rx_body_" << i << ":" << std::endl;
+			if ((instr.loca & 192) == 0)
+				asmCode << "\txor " << regMx << ", rcx" << std::endl;
+			asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl;
+		}
+	}
+
+	void AssemblyGeneratorX86::genar(Instruction& instr, int i) {
+		gena(instr, i);
 		asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl;
 	}


 	void AssemblyGeneratorX86::genaf(Instruction& instr, int i) {
-		asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
-		asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl;
-		asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
-		asmCode << "\tjnz short rx_body_" << i << std::endl;
-		switch (instr.loca & 3)
-		{
-			case 0:
-			case 1:
-			case 2:
-				asmCode << "\tcall rx_read_l1" << std::endl;
-				asmCode << "rx_body_" << i << ":" << std::endl;
-				if((instr.loca & 192) == 0)
-					asmCode << "\txor " << regMx << ", rcx" << std::endl;
-				asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl;
-				break;
-			default: //3
-				asmCode << "\tcall rx_read_l2" << std::endl;
-				asmCode << "rx_body_" << i << ":" << std::endl;
-				if ((instr.loca & 192) == 0)
-					asmCode << "\txor " << regMx << ", rcx" << std::endl;
-				asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl;
-				break;
-		}
+		gena(instr, i);
 		asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl;
 	}

-	void AssemblyGeneratorX86::genbr0(Instruction& instr, const char* instrx86) {
-		switch (instr.locb & 7)
-		{
-		case 0:
-		case 1:
-		case 2:
-		case 3:
+	void AssemblyGeneratorX86::genbiashift(Instruction& instr, const char* instrx86) {
+		if (instr.locb & 1)	{
 			asmCode << "\tmov rcx, " << regR[instr.regb % RegistersCount] << std::endl;
 			asmCode << "\t" << instrx86 << " rax, cl" << std::endl;
-			return;
-		default:
+		} else {
 			asmCode << "\t" << instrx86 << " rax, " << (instr.imm8 & 63) << std::endl;;
-			return;
 		}
 	}

-	void AssemblyGeneratorX86::genbr1(Instruction& instr) {
-		switch (instr.locb & 7)
-		{
-		case 0:
-		case 1:
-		case 2:
-		case 3:
-		case 4:
-		case 5:
+	void AssemblyGeneratorX86::genbia(Instruction& instr) {
+		if (instr.locb & 3)	{
 			asmCode << regR[instr.regb % RegistersCount] << std::endl;
-			return;
-		default:
+		} else {
 			asmCode  << instr.imm32 << std::endl;;
-			return;
 		}
 	}

-	void AssemblyGeneratorX86::genbr132(Instruction& instr) {
-		switch (instr.locb & 7)
-		{
-		case 0:
-		case 1:
-		case 2:
-		case 3:
-		case 4:
-		case 5:
+	void AssemblyGeneratorX86::genbia32(Instruction& instr) {
+		if (instr.locb & 3)	{
 			asmCode << regR32[instr.regb % RegistersCount] << std::endl;
-			return;
-		default:
+		}
+		else {
 			asmCode << instr.imm32 << std::endl;;
-			return;
 		}
 	}

@ -241,28 +194,28 @@ namespace RandomX {
 	void AssemblyGeneratorX86::h_ADD_64(Instruction& instr, int i) {
 		genar(instr, i);
 		asmCode << "\tadd rax, ";
-		genbr1(instr);
+		genbia(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_ADD_32(Instruction& instr, int i) {
 		genar(instr, i);
 		asmCode << "\tadd eax, ";
-		genbr132(instr);
+		genbia32(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_SUB_64(Instruction& instr, int i) {
 		genar(instr, i);
 		asmCode << "\tsub rax, ";
-		genbr1(instr);
+		genbia(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_SUB_32(Instruction& instr, int i) {
 		genar(instr, i);
 		asmCode << "\tsub eax, ";
-		genbr132(instr);
+		genbia32(instr);
 		gencr(instr);
 	}

@ -272,14 +225,14 @@ namespace RandomX {
 		if ((instr.locb & 7) >= 6) {
 			asmCode << "rax, ";
 		}
-		genbr1(instr);
+		genbia(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_MULH_64(Instruction& instr, int i) {
 		genar(instr, i);
 		asmCode << "\tmov rcx, ";
-		genbr1(instr);
+		genbia(instr);
 		asmCode << "\tmul rcx" << std::endl;
 		asmCode << "\tmov rax, rdx" << std::endl;
 		gencr(instr);
@ -289,7 +242,7 @@ namespace RandomX {
 		genar(instr, i);
 		asmCode << "\tmov ecx, eax" << std::endl;
 		asmCode << "\tmov eax, ";
-		genbr132(instr);
+		genbia32(instr);
 		asmCode << "\timul rax, rcx" << std::endl;
 		gencr(instr);
 	}
@ -310,7 +263,7 @@ namespace RandomX {
 	void AssemblyGeneratorX86::h_IMULH_64(Instruction& instr, int i) {
 		genar(instr, i);
 		asmCode << "\tmov rcx, ";
-		genbr1(instr);
+		genbia(instr);
 		asmCode << "\timul rcx" << std::endl;
 		asmCode << "\tmov rax, rdx" << std::endl;
 		gencr(instr);
@ -318,7 +271,7 @@ namespace RandomX {

 	void AssemblyGeneratorX86::h_DIV_64(Instruction& instr, int i) {
 		genar(instr, i);
-		if ((instr.locb & 7) >= 6) {
+		if (instr.locb & 3) {
 #ifdef MAGIC_DIVISION
 			if (instr.imm32 != 0) {
 				uint32_t divisor = instr.imm32;
@ -373,8 +326,8 @@ namespace RandomX {

 	void AssemblyGeneratorX86::h_IDIV_64(Instruction& instr, int i) {
 		genar(instr, i);
+		if (instr.locb & 3) {
 #ifdef MAGIC_DIVISION
-		if ((instr.locb & 7) >= 6) {
 			int64_t divisor = instr.imm32;
 			asmCode << "\t; magic divide by " << divisor << std::endl;
 			if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) {
@ -394,9 +347,10 @@ namespace RandomX {
 					asmCode << "\tadd rax, rcx" << std::endl;
 					asmCode << "\tsar rax, " << shift << std::endl;
 				}
-				if(negative)
+				if (negative)
 					asmCode << "\tneg rax" << std::endl;
-			} else if(divisor != 0) {
+			}
+			else if (divisor != 0) {
 				magics_info mi = compute_signed_magic_info(divisor);
 				if ((divisor >= 0) != (mi.multiplier >= 0))
 					asmCode << "\tmov rcx, rax" << std::endl;
@ -422,25 +376,29 @@ namespace RandomX {
 				asmCode << "\tsets dl" << std::endl;
 				asmCode << "\tadd rax, rdx" << std::endl;
 			}
+#else
+			asmCode << "\tmov edx, " << instr.imm32 << std::endl;
+#endif
 		}
 		else {
-#endif
-		asmCode << "\tmov edx, ";
-		genbr132(instr);
-		asmCode << "\tcmp edx, -1" << std::endl;
-		asmCode << "\tjne short safe_idiv_" << i << std::endl;
-		asmCode << "\tneg rax" << std::endl;
-		asmCode << "\tjmp short result_idiv_" << i << std::endl;
-		asmCode << "safe_idiv_" << i << ":" << std::endl;
-		asmCode << "\tmov ecx, 1" << std::endl;
-		asmCode << "\ttest edx, edx" << std::endl;
-		asmCode << "\tcmovne ecx, edx" << std::endl;
-		asmCode << "\tmovsxd rcx, ecx" << std::endl;
-		asmCode << "\tcqo" << std::endl;
-		asmCode << "\tidiv rcx" << std::endl;
-		asmCode << "result_idiv_" << i << ":" << std::endl;
-#ifdef MAGIC_DIVISION
+			asmCode << "\tmov edx, " << regR32[instr.regb % RegistersCount] << std::endl;
+#ifndef MAGIC_DIVISION
 		}
+#endif
+			asmCode << "\tcmp edx, -1" << std::endl;
+			asmCode << "\tjne short body_idiv_" << i << std::endl;
+			asmCode << "\tneg rax" << std::endl;
+			asmCode << "\tjmp short result_idiv_" << i << std::endl;
+			asmCode << "body_idiv_" << i << ":" << std::endl;
+			asmCode << "\tmov ecx, 1" << std::endl;
+			asmCode << "\ttest edx, edx" << std::endl;
+			asmCode << "\tcmovne ecx, edx" << std::endl;
+			asmCode << "\tmovsxd rcx, ecx" << std::endl;
+			asmCode << "\tcqo" << std::endl;
+			asmCode << "\tidiv rcx" << std::endl;
+			asmCode << "result_idiv_" << i << ":" << std::endl;
+#ifdef MAGIC_DIVISION
+	}
 #endif
 		gencr(instr);
 	}
@ -448,72 +406,72 @@ namespace RandomX {
 	void AssemblyGeneratorX86::h_AND_64(Instruction& instr, int i) {
 		genar(instr, i);
 		asmCode << "\tand rax, ";
-		genbr1(instr);
+		genbia(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_AND_32(Instruction& instr, int i) {
 		genar(instr, i);
 		asmCode << "\tand eax, ";
-		genbr132(instr);
+		genbia32(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_OR_64(Instruction& instr, int i) {
 		genar(instr, i);
 		asmCode << "\tor rax, ";
-		genbr1(instr);
+		genbia(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_OR_32(Instruction& instr, int i) {
 		genar(instr, i);
 		asmCode << "\tor eax, ";
-		genbr132(instr);
+		genbia32(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_XOR_64(Instruction& instr, int i) {
 		genar(instr, i);
 		asmCode << "\txor rax, ";
-		genbr1(instr);
+		genbia(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_XOR_32(Instruction& instr, int i) {
 		genar(instr, i);
 		asmCode << "\txor eax, ";
-		genbr132(instr);
+		genbia32(instr);
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_SHL_64(Instruction& instr, int i) {
 		genar(instr, i);
-		genbr0(instr, "shl");
+		genbiashift(instr, "shl");
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_SHR_64(Instruction& instr, int i) {
 		genar(instr, i);
-		genbr0(instr, "shr");
+		genbiashift(instr, "shr");
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_SAR_64(Instruction& instr, int i) {
 		genar(instr, i);
-		genbr0(instr, "sar");
+		genbiashift(instr, "sar");
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_ROL_64(Instruction& instr, int i) {
 		genar(instr, i);
-		genbr0(instr, "rol");
+		genbiashift(instr, "rol");
 		gencr(instr);
 	}

 	void AssemblyGeneratorX86::h_ROR_64(Instruction& instr, int i) {
 		genar(instr, i);
-		genbr0(instr, "ror");
+		genbiashift(instr, "ror");
 		gencr(instr);
 	}

--- a/src/AssemblyGeneratorX86.hpp
+++ b/src/AssemblyGeneratorX86.hpp
@ -38,11 +38,12 @@ namespace RandomX {
 		static InstructionGenerator engine[256];
 		std::stringstream asmCode;

+		void gena(Instruction&, int);
 		void genar(Instruction&, int);
 		void genaf(Instruction&, int);
-		void genbr0(Instruction&, const char*);
-		void genbr1(Instruction&);
-		void genbr132(Instruction&);
+		void genbiashift(Instruction&, const char*);
+		void genbia(Instruction&);
+		void genbia32(Instruction&);
 		void genbf(Instruction&, const char*);
 		void gencr(Instruction&, bool);
 		void gencf(Instruction&, bool);
--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
@ -17,10 +17,14 @@ You should have received a copy of the GNU General Public License
 along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */

+//#define MAGIC_DIVISION
 #include "JitCompilerX86.hpp"
 #include "Pcg32.hpp"
 #include <cstring>
 #include <stdexcept>
+#ifdef MAGIC_DIVISION
+#include "divideByConstantCodegen.h"
+#endif

 #ifdef _WIN32
 #include <windows.h>
@ -152,6 +156,17 @@ namespace RandomX {
 		instructionOffsets.push_back(codePos);
 		emit(0x840fcbff); //dec ebx; jz <epilogue>
 		emit(epilogueOffset - (codePos + 4)); //jump offset (RIP-relative)
+		auto generator = engine[instr.opcode];
+		(this->*generator)(instr, i);
+	}
+
+	void JitCompilerX86::fixCallOffsets() {
+		for (CallOffset& co : callOffsets) {
+			*reinterpret_cast<int32_t*>(code + co.pos) = instructionOffsets[co.index] - (co.pos + 4);
+		}
+	}
+
+	void JitCompilerX86::gena(Instruction& instr) {
 		emit(uint16_t(0x8149)); //xor
 		emitByte(0xf0 + (instr.rega % RegistersCount));
 		emit(instr.addra);
@ -169,41 +184,28 @@ namespace RandomX {
 			emit(uint16_t(0x3348));
 			emitByte(0xe9); //xor rbp, rcx
 		}
-		auto generator = engine[instr.opcode];
-		(this->*generator)(instr, i);
-	}
-
-	void JitCompilerX86::fixCallOffsets() {
-		for (CallOffset& co : callOffsets) {
-			*reinterpret_cast<int32_t*>(code + co.pos) = instructionOffsets[co.index] - (co.pos + 4);
+		emit(uint16_t(0xe181)); //and ecx,
+		if (instr.loca & 3) {
+			emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
+		}
+		else {
+			emit(ScratchpadL2 - 1); //whole scratchpad
 		}
 	}

 	void JitCompilerX86::genar(Instruction& instr) {
-		emit(uint16_t(0xe181)); //and ecx,
-		if (instr.loca & 3) {
-			emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
-		}
-		else {
-			emit(ScratchpadL2 - 1); //whole scratchpad
-		}
+		gena(instr);
 		emit(0xce048b48); //mov rax,QWORD PTR [rsi+rcx*8]
 	}

 	void JitCompilerX86::genaf(Instruction& instr) {
-		emit(uint16_t(0xe181)); //and ecx,
-		if (instr.loca & 3) {
-			emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
-		}
-		else {
-			emit(ScratchpadL2 - 1); //whole scratchpad
-		}
+		gena(instr);
 		emitByte(0xf3);
 		emit(0xce04e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rcx*8]
 	}

-	void JitCompilerX86::genbr0(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
-		if ((instr.locb & 7) <= 3) {
+	void JitCompilerX86::genbiashift(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
+		if (instr.locb & 1)	{
 			emit(uint16_t(0x8b49)); //mov
 			emitByte(0xc8 + (instr.regb % RegistersCount)); //rcx, regb
 			emitByte(0x48); //REX.W
@ -216,8 +218,8 @@ namespace RandomX {
 		}
 	}

-	void JitCompilerX86::genbr1(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
-		if ((instr.locb & 7) <= 5) {
+	void JitCompilerX86::genbia(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
+		if (instr.locb & 3) {
 			emit(opcodeReg); // xxx rax, r64
 			emitByte(0xc0 + (instr.regb % RegistersCount));
 		}
@ -227,8 +229,8 @@ namespace RandomX {
 		}
 	}

-	void JitCompilerX86::genbr132(Instruction& instr, uint16_t opcodeReg, uint8_t opcodeImm) {
-		if ((instr.locb & 7) <= 5) {
+	void JitCompilerX86::genbia32(Instruction& instr, uint16_t opcodeReg, uint8_t opcodeImm) {
+		if (instr.locb & 3) {
 			emit(opcodeReg); // xxx eax, r32
 			emitByte(0xc0 + (instr.regb % RegistersCount));
 		}
@ -328,25 +330,25 @@ namespace RandomX {

 	void JitCompilerX86::h_ADD_64(Instruction& instr, int i) {
 		genar(instr);
-		genbr1(instr, 0x0349, 0x0548);
+		genbia(instr, 0x0349, 0x0548);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_ADD_32(Instruction& instr, int i) {
 		genar(instr);
-		genbr132(instr, 0x0341, 0x05);
+		genbia32(instr, 0x0341, 0x05);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_SUB_64(Instruction& instr, int i) {
 		genar(instr);
-		genbr1(instr, 0x2b49, 0x2d48);
+		genbia(instr, 0x2b49, 0x2d48);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_SUB_32(Instruction& instr, int i) {
 		genar(instr);
-		genbr132(instr, 0x2b41, 0x2d);
+		genbia32(instr, 0x2b41, 0x2d);
 		gencr(instr);
 	}

@ -435,104 +437,209 @@ namespace RandomX {

 	void JitCompilerX86::h_DIV_64(Instruction& instr, int i) {
 		genar(instr);
-		if ((instr.locb & 7) <= 5) {
+		if (instr.locb & 3) {
+#ifdef MAGIC_DIVISION
+			if (instr.imm32 != 0) {
+				uint32_t divisor = instr.imm32;
+				if (divisor & (divisor - 1)) {
+					magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8);
+					if (mi.pre_shift > 0) {
+						if (mi.pre_shift == 1) {
+							emitByte(0x48);
+							emit(uint16_t(0xe8d1)); //shr rax,1
+						}
+						else {
+							emit(0x00e8c148 | (mi.pre_shift << 24)); //shr rax, pre_shift
+						}
+					}
+					if (mi.increment) {
+						emit(0x00d8834801c08348); //add rax,1; sbb rax,0
+					}
+					emit(uint16_t(0xb948)); //movabs rcx, multiplier
+					emit(mi.multiplier);
+					emit(0x48e1f748); //mul rcx; REX
+					emit(uint16_t(0xc28b)); //mov rax,rdx
+					if (mi.post_shift > 0)
+						emit(0x00e8c148 | (mi.post_shift << 24)); //shr rax, post_shift
+			}
+				else { //divisor is a power of two
+					int shift = 0;
+					while (divisor >>= 1)
+						++shift;
+					if (shift > 0)
+						emit(0x00e8c148 | (shift << 24)); //shr rax, shift
+				}
+		}
+#else
+			emitByte(0xb9); //mov ecx, imm32
+			emit(instr.imm32 != 0 ? instr.imm32 : 1);
+#endif
+		}
+		else {
 			emitByte(0xb9); //mov ecx, 1
 			emit(1);
 			emit(uint16_t(0x8b41)); //mov edx, r32
 			emitByte(0xd0 + (instr.regb % RegistersCount));
 			emit(0x450fd285); //test edx, edx; cmovne ecx,edx
 			emitByte(0xca);
+#ifdef MAGIC_DIVISION
+			emit(0xf748d233); //xor edx,edx; div rcx
+			emitByte(0xf1);
+#endif
 		}
-		else {
-			emitByte(0xb9); //mov ecx, imm32
-			emit(instr.imm32 != 0 ? instr.imm32 : 1);
-		}
+#ifndef MAGIC_DIVISION
 		emit(0xf748d233); //xor edx,edx; div rcx
 		emitByte(0xf1);
+#endif
 		gencr(instr);
 	}

 	void JitCompilerX86::h_IDIV_64(Instruction& instr, int i) {
 		genar(instr);
-		if ((instr.locb & 7) <= 5) {
-			emit(uint16_t(0x8b41)); //mov edx, r32
-			emitByte(0xd0 + (instr.regb % RegistersCount));
+		if (instr.locb & 3) {
+#ifdef MAGIC_DIVISION
+			int64_t divisor = instr.imm32;
+			if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) {
+				// +/- power of two
+				bool negative = divisor < 0;
+				if (negative)
+					divisor = -divisor;
+				int shift = 0;
+				uint64_t unsignedDivisor = divisor;
+				while (unsignedDivisor >>= 1)
+					++shift;
+				if (shift > 0) {
+					emitByte(0x48);
+					emit(uint16_t(0xc88b)); //mov rcx, rax
+					emit(0x3ff9c148); //sar rcx, 63
+					uint32_t mask = (1ULL << shift) - 1;
+					emit(uint16_t(0xe181)); //and ecx, mask
+					emit(mask);
+					emitByte(0x48);
+					emit(uint16_t(0xc103)); //add rax, rcx
+					emit(0x00f8c148 | (shift << 24)); //sar rax, shift
+				}
+				if (negative) {
+					emitByte(0x48);
+					emit(uint16_t(0xd8f7)); //neg rax
+				}
+			}
+			else if (divisor != 0) {
+				magics_info mi = compute_signed_magic_info(divisor);
+				if ((divisor >= 0) != (mi.multiplier >= 0)) {
+					emitByte(0x48);
+					emit(uint16_t(0xc88b)); //mov rcx, rax
+				}
+				emit(uint16_t(0xba48)); //movabs rdx, multiplier
+				emit(mi.multiplier);
+				emit(0xd233c28b48eaf748); //imul rdx; mov rax,rdx; xor edx,edx
+				bool haveSF = false;
+				if (divisor > 0 && mi.multiplier < 0) {
+					emitByte(0x48);
+					emit(uint16_t(0xc103)); //add rax, rcx
+					haveSF = true;
+				}
+				if (divisor < 0 && mi.multiplier > 0) {
+					emitByte(0x48);
+					emit(uint16_t(0xc12b)); //sub rax, rcx
+					haveSF = true;
+				}
+				if (mi.shift > 0) {
+					emit(0x00f8c148 | (mi.shift << 24)); //sar rax, shift
+					haveSF = true;
+				}
+				if (!haveSF) {
+					emitByte(0x48);
+					emit(uint16_t(0xc085)); //test rax, rax (bytes 85 C0; sets SF for the following sets dl)
+				}
+				emit(0x48c2980f); //sets dl; add rax, rdx
+				emit(uint16_t(0xc203));
+			}
+#else
+			emitByte(0xba); // mov edx, imm32
+			emit(instr.imm32);
+#endif
 		}
 		else {
-			emitByte(0xba); // xxx edx, imm32
-			emit(instr.imm32);
+			emit(uint16_t(0x8b41)); //mov edx, r32
+			emitByte(0xd0 + (instr.regb % RegistersCount));
+#ifndef MAGIC_DIVISION
 		}
+#endif
 		emit(0xc88b480b75fffa83);
 		emit(0x1274c9ff48c1d148);
 		emit(0x0fd28500000001b9);
 		emit(0x489948c96348ca45);
 		emit(uint16_t(0xf9f7)); //idiv rcx
+#ifdef MAGIC_DIVISION
+	}
+#endif
 		gencr(instr);
 	}

 	void JitCompilerX86::h_AND_64(Instruction& instr, int i) {
 		genar(instr);
-		genbr1(instr, 0x2349, 0x2548);
+		genbia(instr, 0x2349, 0x2548);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_AND_32(Instruction& instr, int i) {
 		genar(instr);
-		genbr132(instr, 0x2341, 0x25);
+		genbia32(instr, 0x2341, 0x25);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_OR_64(Instruction& instr, int i) {
 		genar(instr);
-		genbr1(instr, 0x0b49, 0x0d48);
+		genbia(instr, 0x0b49, 0x0d48);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_OR_32(Instruction& instr, int i) {
 		genar(instr);
-		genbr132(instr, 0x0b41, 0x0d);
+		genbia32(instr, 0x0b41, 0x0d);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_XOR_64(Instruction& instr, int i) {
 		genar(instr);
-		genbr1(instr, 0x3349, 0x3548);
+		genbia(instr, 0x3349, 0x3548);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_XOR_32(Instruction& instr, int i) {
 		genar(instr);
-		genbr132(instr, 0x3341, 0x35);
+		genbia32(instr, 0x3341, 0x35);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_SHL_64(Instruction& instr, int i) {
 		genar(instr);
-		genbr0(instr, 0xe0d3, 0xe0c1);
+		genbiashift(instr, 0xe0d3, 0xe0c1);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_SHR_64(Instruction& instr, int i) {
 		genar(instr);
-		genbr0(instr, 0xe8d3, 0xe8c1);
+		genbiashift(instr, 0xe8d3, 0xe8c1);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_SAR_64(Instruction& instr, int i) {
 		genar(instr);
-		genbr0(instr, 0xf8d3, 0xf8c1);
+		genbiashift(instr, 0xf8d3, 0xf8c1);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_ROL_64(Instruction& instr, int i) {
 		genar(instr);
-		genbr0(instr, 0xc0d3, 0xc0c1);
+		genbiashift(instr, 0xc0d3, 0xc0c1);
 		gencr(instr);
 	}

 	void JitCompilerX86::h_ROR_64(Instruction& instr, int i) {
 		genar(instr);
-		genbr0(instr, 0xc8d3, 0xc8c1);
+		genbiashift(instr, 0xc8d3, 0xc8c1);
 		gencr(instr);
 	}

--- a/src/JitCompilerX86.hpp
+++ b/src/JitCompilerX86.hpp
@ -58,11 +58,12 @@ namespace RandomX {
 		std::vector<int32_t> instructionOffsets;
 		std::vector<CallOffset> callOffsets;

+		void gena(Instruction&);
 		void genar(Instruction&);
 		void genaf(Instruction&);
-		void genbr0(Instruction&, uint16_t, uint16_t);
-		void genbr1(Instruction&, uint16_t, uint16_t);
-		void genbr132(Instruction&, uint16_t, uint8_t);
+		void genbiashift(Instruction&, uint16_t, uint16_t);
+		void genbia(Instruction&, uint16_t, uint16_t);
+		void genbia32(Instruction&, uint16_t, uint8_t);
 		void genbf(Instruction&, uint8_t);
 		void scratchpadStoreR(Instruction&, uint32_t, bool);
 		void scratchpadStoreF(Instruction&, int, uint32_t, bool);
--- a/src/divideByConstantCodegen.c
+++ b/src/divideByConstantCodegen.c
@ -11,10 +11,10 @@

 #include "divideByConstantCodegen.h"

-struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits) {
+struct magicu_info compute_unsigned_magic_info(unsigned_type D, unsigned num_bits) {

-	//The numerator must fit in a uint
-	assert(num_bits > 0 && num_bits <= sizeof(uint) * CHAR_BIT);
+	//The numerator must fit in an unsigned_type
+	assert(num_bits > 0 && num_bits <= sizeof(unsigned_type) * CHAR_BIT);

 	// D must be larger than zero and not a power of 2
 	assert(D & (D - 1));
@ -22,29 +22,29 @@ struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits) {
 	// The eventual result
 	struct magicu_info result;

-	// Bits in a uint
-	const unsigned UINT_BITS = sizeof(uint) * CHAR_BIT;
+	// Bits in an unsigned_type
+	const unsigned UINT_BITS = sizeof(unsigned_type) * CHAR_BIT;

 	// The extra shift implicit in the difference between UINT_BITS and num_bits
 	const unsigned extra_shift = UINT_BITS - num_bits;

 	// The initial power of 2 is one less than the first one that can possibly work
-	const uint initial_power_of_2 = (uint)1 << (UINT_BITS - 1);
+	const unsigned_type initial_power_of_2 = (unsigned_type)1 << (UINT_BITS - 1);

 	// The remainder and quotient of our power of 2 divided by d
-	uint quotient = initial_power_of_2 / D, remainder = initial_power_of_2 % D;
+	unsigned_type quotient = initial_power_of_2 / D, remainder = initial_power_of_2 % D;

 	// ceil(log_2 D)
 	unsigned ceil_log_2_D;

 	// The magic info for the variant "round down" algorithm
-	uint down_multiplier = 0;
+	unsigned_type down_multiplier = 0;
 	unsigned down_exponent = 0;
 	int has_magic_down = 0;

 	// Compute ceil(log_2 D)
 	ceil_log_2_D = 0;
-	uint tmp;
+	unsigned_type tmp;
 	for (tmp = D; tmp > 0; tmp >>= 1)
 		ceil_log_2_D += 1;

@ -67,11 +67,11 @@ struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits) {
 		// We're done if this exponent works for the round_up algorithm.
 		// Note that exponent may be larger than the maximum shift supported,
 		// so the check for >= ceil_log_2_D is critical.
-		if ((exponent + extra_shift >= ceil_log_2_D) || (D - remainder) <= ((uint)1 << (exponent + extra_shift)))
+		if ((exponent + extra_shift >= ceil_log_2_D) || (D - remainder) <= ((unsigned_type)1 << (exponent + extra_shift)))
 			break;

 		// Set magic_down if we have not set it yet and this exponent works for the round_down algorithm
-		if (!has_magic_down && remainder <= ((uint)1 << (exponent + extra_shift))) {
+		if (!has_magic_down && remainder <= ((unsigned_type)1 << (exponent + extra_shift))) {
 			has_magic_down = 1;
 			down_multiplier = quotient;
 			down_exponent = exponent;
@ -96,7 +96,7 @@ struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits) {
 	else {
 		// Even divisor, so use a prefix-shifted dividend
 		unsigned pre_shift = 0;
-		uint shifted_D = D;
+		unsigned_type shifted_D = D;
 		while ((shifted_D & 1) == 0) {
 			shifted_D >>= 1;
 			pre_shift += 1;
@ -108,34 +108,34 @@ struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits) {
 	return result;
 }

-struct magics_info compute_signed_magic_info(sint D) {
+struct magics_info compute_signed_magic_info(signed_type D) {
 	// D must not be zero and must not be a power of 2 (or its negative)
 	assert(D != 0 && (D & -D) != D && (D & -D) != -D);

 	// Our result
 	struct magics_info result;

-	// Bits in an sint
-	const unsigned SINT_BITS = sizeof(sint) * CHAR_BIT;
+	// Bits in a signed_type
+	const unsigned SINT_BITS = sizeof(signed_type) * CHAR_BIT;

 	// Absolute value of D (we know D is not the most negative value since that's a power of 2)
-	const uint abs_d = (D < 0 ? -D : D);
+	const unsigned_type abs_d = (D < 0 ? -D : D);

 	// The initial power of 2 is one less than the first one that can possibly work
 	// "two31" in Warren
 	unsigned exponent = SINT_BITS - 1;
-	const uint initial_power_of_2 = (uint)1 << exponent;
+	const unsigned_type initial_power_of_2 = (unsigned_type)1 << exponent;

 	// Compute the absolute value of our "test numerator,"
 	// which is the largest dividend whose remainder with d is d-1.
 	// This is called anc in Warren.
-	const uint tmp = initial_power_of_2 + (D < 0);
-	const uint abs_test_numer = tmp - 1 - tmp % abs_d;
+	const unsigned_type tmp = initial_power_of_2 + (D < 0);
+	const unsigned_type abs_test_numer = tmp - 1 - tmp % abs_d;

 	// Initialize our quotients and remainders (q1, r1, q2, r2 in Warren)
-	uint quotient1 = initial_power_of_2 / abs_test_numer, remainder1 = initial_power_of_2 % abs_test_numer;
-	uint quotient2 = initial_power_of_2 / abs_d, remainder2 = initial_power_of_2 % abs_d;
-	uint delta;
+	unsigned_type quotient1 = initial_power_of_2 / abs_test_numer, remainder1 = initial_power_of_2 % abs_test_numer;
+	unsigned_type quotient2 = initial_power_of_2 / abs_d, remainder2 = initial_power_of_2 % abs_d;
+	unsigned_type delta;

 	// Begin our loop
 	do {
--- a/src/divideByConstantCodegen.h
+++ b/src/divideByConstantCodegen.h
@ -24,11 +24,11 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 extern "C" {
 #endif

-	typedef uint64_t uint;
-	typedef int64_t sint;
+	typedef uint64_t unsigned_type;
+	typedef int64_t signed_type;

 	/* Computes "magic info" for performing signed division by a fixed integer D.
-	   The type 'sint' is assumed to be defined as a signed integer type large enough
+	   The type 'signed_type' is assumed to be defined as a signed integer type large enough
 	   to hold both the dividend and the divisor.
 	   Here >> is arithmetic (signed) shift, and >>> is logical shift.

@ -55,17 +55,17 @@ extern "C" {
 	 */

 	struct magics_info {
-		sint multiplier; // the "magic number" multiplier
+		signed_type multiplier; // the "magic number" multiplier
 		unsigned shift; // shift for the dividend after multiplying
 	};
-	struct magics_info compute_signed_magic_info(sint D);
+	struct magics_info compute_signed_magic_info(signed_type D);


 	/* Computes "magic info" for performing unsigned division by a fixed positive integer D.
-	   The type 'uint' is assumed to be defined as an unsigned integer type large enough
+	   The type 'unsigned_type' is assumed to be defined as an unsigned integer type large enough
 	   to hold both the dividend and the divisor. num_bits can be set appropriately if n is
-	   known to be smaller than the largest uint; if this is not known then pass
-	   (sizeof(uint) * CHAR_BIT) for num_bits.
+	   known to be smaller than the largest unsigned_type; if this is not known then pass
+	   (sizeof(unsigned_type) * CHAR_BIT) for num_bits.

 	   Assume we have a hardware register of width UINT_BITS, a known constant D which is
 	   not zero and not a power of 2, and a variable n of width num_bits (which may be
@ -105,12 +105,12 @@ extern "C" {
 	 */

 	struct magicu_info {
-		uint multiplier; // the "magic number" multiplier
+		unsigned_type multiplier; // the "magic number" multiplier
 		unsigned pre_shift; // shift for the dividend before multiplying
 		unsigned post_shift; //shift for the dividend after multiplying
 		int increment; // 0 or 1; if set then increment the numerator, using one of the two strategies
 	};
-	struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits);
+	struct magicu_info compute_unsigned_magic_info(unsigned_type D, unsigned num_bits);

 #if defined(__cplusplus)
 }
--- a/src/instructionWeights.hpp
+++ b/src/instructionWeights.hpp
@ -19,17 +19,17 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.

 #pragma once

-#define WT_ADD_64 15
+#define WT_ADD_64 12
 #define WT_ADD_32 2
-#define WT_SUB_64 15
+#define WT_SUB_64 12
 #define WT_SUB_32 2
 #define WT_MUL_64 23
 #define WT_MULH_64 10
 #define WT_MUL_32 15
 #define WT_IMUL_32 15
 #define WT_IMULH_64 6
-#define WT_DIV_64 1
-#define WT_IDIV_64 1
+#define WT_DIV_64 4
+#define WT_IDIV_64 4
 #define WT_AND_64 4
 #define WT_AND_32 2
 #define WT_OR_64 4
--- a/src/program.inc
+++ b/src/program.inc