mirror of
				https://git.wownero.com/wownero/RandomWOW.git
				synced 2024-08-15 00:23:14 +00:00 
			
		
		
		
	SuperscalarHash JIT compiler
(unfinished)
This commit is contained in:
		
							parent
							
								
									690707ef49
								
							
						
					
					
						commit
						77dbe14658
					
				
					 18 changed files with 453 additions and 135 deletions
				
			
		|  | @ -97,14 +97,12 @@ namespace RandomX { | |||
| 	} | ||||
| 
 | ||||
| 	//1 uOP
 | ||||
| 	void AssemblyGeneratorX86::h_IADD_R(Instruction& instr, int i) { | ||||
| 	void AssemblyGeneratorX86::h_IADD_RS(Instruction& instr, int i) { | ||||
| 		registerUsage[instr.dst] = i; | ||||
| 		if (instr.src != instr.dst) { | ||||
| 			asmCode << "\tadd " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; | ||||
| 		} | ||||
| 		else { | ||||
| 			asmCode << "\tadd " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; | ||||
| 		} | ||||
| 		if(instr.dst == 5) | ||||
| 			asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << std::showpos << (int32_t)instr.getImm32() << std::noshowpos << "]" << std::endl; | ||||
| 		else | ||||
| 			asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << "]" << std::endl; | ||||
| 		traceint(instr); | ||||
| 	} | ||||
| 
 | ||||
|  | @ -517,7 +515,7 @@ namespace RandomX { | |||
| 
 | ||||
| 	InstructionGenerator AssemblyGeneratorX86::engine[256] = { | ||||
| 		//Integer
 | ||||
| 		INST_HANDLE(IADD_R) | ||||
| 		INST_HANDLE(IADD_RS) | ||||
| 		INST_HANDLE(IADD_M) | ||||
| 		INST_HANDLE(IADD_RC) | ||||
| 		INST_HANDLE(ISUB_R) | ||||
|  |  | |||
|  | @ -68,7 +68,7 @@ namespace RandomX { | |||
| 		void traceflt(Instruction&); | ||||
| 		void tracenop(Instruction&); | ||||
| 
 | ||||
| 		void  h_IADD_R(Instruction&, int); | ||||
| 		void  h_IADD_RS(Instruction&, int); | ||||
| 		void  h_IADD_M(Instruction&, int); | ||||
| 		void  h_IADD_RC(Instruction&, int); | ||||
| 		void  h_ISUB_R(Instruction&, int); | ||||
|  |  | |||
|  | @ -40,9 +40,9 @@ namespace RandomX { | |||
| 		os << "L3" << "[" << (getImm32() & ScratchpadL3Mask) << "]"; | ||||
| 	} | ||||
| 
 | ||||
| 	void Instruction::h_IADD_R(std::ostream& os) const { | ||||
| 	void Instruction::h_IADD_RS(std::ostream& os) const { | ||||
| 		if (src != dst) { | ||||
| 			os << "r" << (int)dst << ", r" << (int)src << std::endl; | ||||
| 			os << "r" << (int)dst << ", r" << (int)src << ", LSH " << (int)(mod % 4) << std::endl; | ||||
| 		} | ||||
| 		else { | ||||
| 			os << "r" << (int)dst << ", " << (int32_t)getImm32() << std::endl; | ||||
|  | @ -302,13 +302,13 @@ namespace RandomX { | |||
| 	} | ||||
| 
 | ||||
| 	void Instruction::h_COND_R(std::ostream& os) const { | ||||
| 		os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << (int32_t)getImm32() << "), " << (int)(mod >> 5) << std::endl; | ||||
| 		os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << (int32_t)getImm32() << "), LSH " << (int)(mod >> 5) << std::endl; | ||||
| 	} | ||||
| 
 | ||||
| 	void Instruction::h_COND_M(std::ostream& os) const { | ||||
| 		os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "("; | ||||
| 		genAddressReg(os); | ||||
| 		os << ", " << (int32_t)getImm32() << "), " << (int)(mod >> 5) << std::endl; | ||||
| 		os << ", " << (int32_t)getImm32() << "), LSH " << (int)(mod >> 5) << std::endl; | ||||
| 	} | ||||
| 
 | ||||
| 	void  Instruction::h_ISTORE(std::ostream& os) const { | ||||
|  | @ -333,7 +333,7 @@ namespace RandomX { | |||
| 
 | ||||
| 	const char* Instruction::names[256] = { | ||||
| 		//Integer
 | ||||
| 		INST_NAME(IADD_R) | ||||
| 		INST_NAME(IADD_RS) | ||||
| 		INST_NAME(IADD_M) | ||||
| 		INST_NAME(IADD_RC) | ||||
| 		INST_NAME(ISUB_R) | ||||
|  | @ -379,7 +379,7 @@ namespace RandomX { | |||
| 
 | ||||
| 	InstructionVisualizer Instruction::engine[256] = { | ||||
| 		//Integer
 | ||||
| 		INST_HANDLE(IADD_R) | ||||
| 		INST_HANDLE(IADD_RS) | ||||
| 		INST_HANDLE(IADD_M) | ||||
| 		INST_HANDLE(IADD_RC) | ||||
| 		INST_HANDLE(ISUB_R) | ||||
|  |  | |||
|  | @ -98,7 +98,7 @@ namespace RandomX { | |||
| 		void genAddressImm(std::ostream& os) const; | ||||
| 		void genAddressRegDst(std::ostream&) const; | ||||
| 
 | ||||
| 		void  h_IADD_R(std::ostream&) const; | ||||
| 		void  h_IADD_RS(std::ostream&) const; | ||||
| 		void  h_IADD_M(std::ostream&) const; | ||||
| 		void  h_IADD_RC(std::ostream&) const; | ||||
| 		void  h_ISUB_R(std::ostream&) const; | ||||
|  |  | |||
|  | @ -442,7 +442,7 @@ namespace RandomX { | |||
| 			auto& instr = program(i); | ||||
| 			auto& ibc = byteCode[i]; | ||||
| 			switch (instr.opcode) { | ||||
| 				CASE_REP(IADD_R) { | ||||
| 				CASE_REP(IADD_RS) { | ||||
| 					auto dst = instr.dst % RegistersCount; | ||||
| 					auto src = instr.src % RegistersCount; | ||||
| 					ibc.type = InstructionType::IADD_R; | ||||
|  |  | |||
|  | @ -26,9 +26,14 @@ PUBLIC randomx_program_start | |||
| PUBLIC randomx_program_read_dataset | ||||
| PUBLIC randomx_program_read_dataset_light | ||||
| PUBLIC randomx_program_read_dataset_light_sub | ||||
| PUBLIC randomx_dataset_init | ||||
| PUBLIC randomx_program_loop_store | ||||
| PUBLIC randomx_program_loop_end | ||||
| PUBLIC randomx_program_epilogue | ||||
| PUBLIC randomx_sshash_load | ||||
| PUBLIC randomx_sshash_prefetch | ||||
| PUBLIC randomx_sshash_end | ||||
| PUBLIC randomx_sshash_init | ||||
| PUBLIC randomx_program_end | ||||
| 
 | ||||
| ALIGN 64 | ||||
|  | @ -75,11 +80,93 @@ randomx_program_read_dataset_light_sub PROC | |||
| 		include asm/squareHash.inc | ||||
| randomx_program_read_dataset_light_sub ENDP | ||||
| 
 | ||||
| ALIGN 64 | ||||
| randomx_dataset_init PROC | ||||
| 	push rbx | ||||
| 	push rbp | ||||
| 	push rdi | ||||
| 	push rsi | ||||
| 	push r12 | ||||
| 	push r13 | ||||
| 	push r14 | ||||
| 	push r15 | ||||
| 	mov rdi, rcx ;# cache | ||||
| 	mov rsi, rdx ;# dataset | ||||
| 	mov rbp, r8  ;# block index | ||||
| 	push r9      ;# max. block index | ||||
| init_block_loop: | ||||
| 	prefetchw byte ptr [rsi] | ||||
| 	mov rbx, rbp | ||||
| 	db 232 ;# 0xE8 = call | ||||
| 	dd 32768 - distance | ||||
| 	distance equ $ - offset randomx_dataset_init | ||||
| 	mov qword ptr [rsi+0], r8 | ||||
| 	mov qword ptr [rsi+8], r9 | ||||
| 	mov qword ptr [rsi+16], r10 | ||||
| 	mov qword ptr [rsi+24], r11 | ||||
| 	mov qword ptr [rsi+32], r12 | ||||
| 	mov qword ptr [rsi+40], r13 | ||||
| 	mov qword ptr [rsi+48], r14 | ||||
| 	mov qword ptr [rsi+56], r15 | ||||
| 	add rbp, 1 | ||||
| 	add rsi, 64 | ||||
| 	cmp rbp, qword ptr [rsp] | ||||
| 	jb init_block_loop | ||||
| 	pop r9 | ||||
| 	pop r15 | ||||
| 	pop r14 | ||||
| 	pop r13 | ||||
| 	pop r12 | ||||
| 	pop rsi | ||||
| 	pop rdi | ||||
| 	pop rbp | ||||
| 	pop rbx | ||||
| 	ret | ||||
| randomx_dataset_init ENDP | ||||
| 
 | ||||
| ALIGN 64 | ||||
| randomx_program_epilogue PROC | ||||
| 	include asm/program_epilogue_win64.inc | ||||
| randomx_program_epilogue ENDP | ||||
| 
 | ||||
| ALIGN 64 | ||||
| randomx_sshash_load PROC | ||||
| 	include asm/program_sshash_load.inc | ||||
| randomx_sshash_load ENDP | ||||
| 
 | ||||
| randomx_sshash_prefetch PROC | ||||
| 	include asm/program_sshash_prefetch.inc | ||||
| randomx_sshash_prefetch ENDP | ||||
| 
 | ||||
| randomx_sshash_end PROC | ||||
| 	nop | ||||
| randomx_sshash_end ENDP | ||||
| 
 | ||||
| ALIGN 64 | ||||
| randomx_sshash_init PROC | ||||
| 	lea r8, [rbx+1] | ||||
| 	include asm/program_sshash_prefetch.inc | ||||
| 	imul r8, qword ptr [r0_mul] | ||||
| 	mov r9, qword ptr [r1_add] | ||||
| 	xor r9, r8 | ||||
| 	mov r10, qword ptr [r2_add] | ||||
| 	xor r10, r8 | ||||
| 	mov r11, qword ptr [r3_add] | ||||
| 	xor r11, r8 | ||||
| 	mov r12, qword ptr [r4_add] | ||||
| 	xor r12, r8 | ||||
| 	mov r13, qword ptr [r5_add] | ||||
| 	xor r13, r8 | ||||
| 	mov r14, qword ptr [r6_add] | ||||
| 	xor r14, r8 | ||||
| 	mov r15, qword ptr [r7_add] | ||||
| 	xor r15, r8 | ||||
| 	jmp randomx_program_end | ||||
| randomx_sshash_init ENDP | ||||
| 
 | ||||
| ALIGN 64 | ||||
| 	include asm/program_sshash_constants.inc | ||||
| 
 | ||||
| ALIGN 64 | ||||
| randomx_program_end PROC | ||||
| 	nop | ||||
|  |  | |||
|  | @ -27,6 +27,11 @@ extern "C" { | |||
| 	void randomx_program_loop_store(); | ||||
| 	void randomx_program_loop_end(); | ||||
| 	void randomx_program_read_dataset_light_sub(); | ||||
| 	void randomx_dataset_init(); | ||||
| 	void randomx_program_epilogue(); | ||||
| 	void randomx_sshash_load(); | ||||
| 	void randomx_sshash_prefetch(); | ||||
| 	void randomx_sshash_end(); | ||||
| 	void randomx_sshash_init(); | ||||
| 	void randomx_program_end(); | ||||
| } | ||||
|  | @ -88,29 +88,40 @@ namespace RandomX { | |||
| 
 | ||||
| #include "JitCompilerX86-static.hpp" | ||||
| 
 | ||||
| #define NOP_TEST true | ||||
| 
 | ||||
| 	const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; | ||||
| 	const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin; | ||||
| 	const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load; | ||||
| 	const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; | ||||
| 	const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset; | ||||
| 	const uint8_t* codeReadDatasetLight = (uint8_t*)&randomx_program_read_dataset_light; | ||||
| 	const uint8_t* codeDatasetInit = (uint8_t*)&randomx_dataset_init; | ||||
| 	const uint8_t* codeLoopStore = (uint8_t*)&randomx_program_loop_store; | ||||
| 	const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end; | ||||
| 	const uint8_t* codeReadDatasetLightSub = (uint8_t*)&randomx_program_read_dataset_light_sub; | ||||
| 	const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue; | ||||
| 	const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; | ||||
| 	const uint8_t* codeShhLoad = (uint8_t*)&randomx_sshash_load; | ||||
| 	const uint8_t* codeShhPrefetch = (uint8_t*)&randomx_sshash_prefetch; | ||||
| 	const uint8_t* codeShhEnd = (uint8_t*)&randomx_sshash_end; | ||||
| 	const uint8_t* codeShhInit = (uint8_t*)&randomx_sshash_init; | ||||
| 
 | ||||
| 	const int32_t prologueSize = codeLoopBegin - codePrologue; | ||||
| 	const int32_t epilogueSize = codeProgramEnd - codeEpilogue; | ||||
| 
 | ||||
| 	const int32_t loopLoadSize = codeProgamStart - codeLoopLoad; | ||||
| 	const int32_t readDatasetSize = codeReadDatasetLight - codeReadDataset; | ||||
| 	const int32_t readDatasetLightSize = codeLoopStore - codeReadDatasetLight; | ||||
| 	const int32_t loopStoreSize = codeLoopEnd - codeLoopStore; | ||||
| 	const int32_t readDatasetLightSubSize = codeEpilogue - codeReadDatasetLightSub; | ||||
| 	const int32_t readDatasetLightSubSize = codeDatasetInit - codeReadDatasetLightSub; | ||||
| 	const int32_t datasetInitSize = codeEpilogue - codeDatasetInit; | ||||
| 	const int32_t epilogueSize = codeShhLoad - codeEpilogue; | ||||
| 	const int32_t codeSshLoadSize = codeShhPrefetch - codeShhLoad; | ||||
| 	const int32_t codeSshPrefetchSize = codeShhEnd - codeShhPrefetch; | ||||
| 	const int32_t codeSshInitSize = codeProgramEnd - codeShhInit; | ||||
| 
 | ||||
| 	const int32_t epilogueOffset = CodeSize - epilogueSize; | ||||
| 	const int32_t readDatasetLightSubOffset = epilogueOffset - readDatasetLightSubSize; | ||||
| 	constexpr int32_t superScalarHashOffset = 32768; | ||||
| 
 | ||||
| 	static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 }; | ||||
| 	static const uint8_t REX_ADD_RM[] = { 0x4c, 0x03 }; | ||||
|  | @ -166,7 +177,7 @@ namespace RandomX { | |||
| 	static const uint8_t SQRTPD[] = { 0x66, 0x0f, 0x51 }; | ||||
| 	static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x89, 0x44, 0x24, 0xF8, 0x0F, 0xAE, 0x54, 0x24, 0xF8 }; | ||||
| 	static const uint8_t ROL_RAX[] = { 0x48, 0xc1, 0xc0 }; | ||||
| 	static const uint8_t XOR_ECX_ECX[] = { 0x33, 0xC9 }; | ||||
| 	static const uint8_t XOR_RCX_RCX[] = { 0x48, 0x33, 0xC9 }; | ||||
| 	static const uint8_t REX_CMP_R32I[] = { 0x41, 0x81 }; | ||||
| 	static const uint8_t REX_CMP_M32I[] = { 0x81, 0x3c, 0x06 }; | ||||
| 	static const uint8_t MOVAPD[] = { 0x66, 0x0f, 0x29 }; | ||||
|  | @ -184,6 +195,18 @@ namespace RandomX { | |||
| 	static const uint8_t REX_ADD_I[] = { 0x49, 0x81 }; | ||||
| 	static const uint8_t REX_TEST[] = { 0x49, 0xF7 }; | ||||
| 	static const uint8_t JZ[] = { 0x0f, 0x84 }; | ||||
| 	static const uint8_t RET = 0xc3; | ||||
| 
 | ||||
| 	static const uint8_t NOP1[] = { 0x90 }; | ||||
| 	static const uint8_t NOP2[] = { 0x66, 0x90 }; | ||||
| 	static const uint8_t NOP3[] = { 0x0F, 0x1F, 0x00 }; | ||||
| 	static const uint8_t NOP4[] = { 0x0F, 0x1F, 0x40, 0x00 }; | ||||
| 	static const uint8_t NOP5[] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 }; | ||||
| 	static const uint8_t NOP6[] = { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 }; | ||||
| 	static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 }; | ||||
| 	static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }; | ||||
| 
 | ||||
| 	static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 }; | ||||
| 
 | ||||
| 	size_t JitCompilerX86::getCodeSize() { | ||||
| 		return codePos - prologueSize; | ||||
|  | @ -196,6 +219,10 @@ namespace RandomX { | |||
| 		memcpy(code + readDatasetLightSubOffset, codeReadDatasetLightSub, readDatasetLightSubSize); | ||||
| 	} | ||||
| 
 | ||||
| 	JitCompilerX86::~JitCompilerX86() { | ||||
| 		freePagedMemory(code, CodeSize); | ||||
| 	} | ||||
| 
 | ||||
| 	void JitCompilerX86::generateProgram(Program& prog) { | ||||
| 		generateProgramPrologue(prog); | ||||
| 		memcpy(code + codePos, codeReadDataset, readDatasetSize); | ||||
|  | @ -216,6 +243,42 @@ namespace RandomX { | |||
| 		generateProgramEpilogue(prog); | ||||
| 	} | ||||
| 
 | ||||
| 	template<size_t N> | ||||
| 	void JitCompilerX86::generateSuperScalarHash(LightProgram(&programs)[N]) { | ||||
| 		memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize); | ||||
| 		codePos = superScalarHashOffset + codeSshInitSize; | ||||
| 		for (unsigned j = 0; j < N; ++j) { | ||||
| 			LightProgram& prog = programs[j]; | ||||
| 			for (unsigned i = 0; i < prog.getSize(); ++i) { | ||||
| 				Instruction& instr = prog(i); | ||||
| 				instr.src %= RegistersCount; | ||||
| 				instr.dst %= RegistersCount; | ||||
| 				generateCode(instr, i); | ||||
| 			} | ||||
| 			emit(codeShhLoad, codeSshLoadSize); | ||||
| 			if (j < N - 1) { | ||||
| 				emit(REX_MOV_RR64); | ||||
| 				emitByte(0xd8 + prog.getAddressRegister()); | ||||
| 				emit(codeShhPrefetch, codeSshPrefetchSize); | ||||
| 				int align = (codePos % 16); | ||||
| 				while (align != 0) { | ||||
| 					int nopSize = 16 - align; | ||||
| 					if (nopSize > 8) nopSize = 8; | ||||
| 					emit(NOPX[nopSize - 1], nopSize); | ||||
| 					align = (codePos % 16); | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 		emitByte(RET); | ||||
| 	} | ||||
| 
 | ||||
| 	template | ||||
| 	void JitCompilerX86::generateSuperScalarHash(LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]); | ||||
| 
 | ||||
| 	void JitCompilerX86::generateDatasetInitCode() { | ||||
| 		memcpy(code, codeDatasetInit, datasetInitSize); | ||||
| 	} | ||||
| 
 | ||||
| 	void JitCompilerX86::generateProgramPrologue(Program& prog) { | ||||
| #ifdef RANDOMX_JUMP | ||||
| 		instructionOffsets.clear(); | ||||
|  | @ -253,7 +316,6 @@ namespace RandomX { | |||
| 		emit32(prologueSize - codePos - 4); | ||||
| 		emitByte(JMP); | ||||
| 		emit32(epilogueOffset - codePos - 4); | ||||
| 		emitByte(0x90); | ||||
| 	} | ||||
| 
 | ||||
| 	void JitCompilerX86::generateCode(Instruction& instr, int i) { | ||||
|  | @ -287,9 +349,9 @@ namespace RandomX { | |||
| 		emit32(instr.getImm32() & ScratchpadL3Mask); | ||||
| 	} | ||||
| 
 | ||||
| 	void JitCompilerX86::h_IADD_R(Instruction& instr, int i) { | ||||
| 	void JitCompilerX86::h_IADD_RS(Instruction& instr, int i) { | ||||
| 		registerUsage[instr.dst] = i; | ||||
| 		if (instr.src != instr.dst) { | ||||
| 		/*if (instr.src != instr.dst) {
 | ||||
| 			emit(REX_ADD_RR); | ||||
| 			emitByte(0xc0 + 8 * instr.dst + instr.src); | ||||
| 		} | ||||
|  | @ -297,7 +359,19 @@ namespace RandomX { | |||
| 			emit(REX_81); | ||||
| 			emitByte(0xc0 + instr.dst); | ||||
| 			emit32(instr.getImm32()); | ||||
| 		}*/ | ||||
| 		if (false && NOP_TEST) { | ||||
| 			emit(NOP4); | ||||
| 			return; | ||||
| 		} | ||||
| 		emit(REX_LEA); | ||||
| 		if (instr.dst == 5) //rbp,r13 cannot be the base register without offset
 | ||||
| 			emitByte(0xac); | ||||
| 		else | ||||
| 			emitByte(0x04 + 8 * instr.dst); | ||||
| 		genSIB(instr.mod % 4, instr.src, instr.dst); | ||||
| 		if (instr.dst == 5) | ||||
| 			emit32(instr.getImm32()); | ||||
| 	} | ||||
| 
 | ||||
| 	void JitCompilerX86::h_IADD_M(Instruction& instr, int i) { | ||||
|  | @ -330,10 +404,18 @@ namespace RandomX { | |||
| 	void JitCompilerX86::h_ISUB_R(Instruction& instr, int i) { | ||||
| 		registerUsage[instr.dst] = i; | ||||
| 		if (instr.src != instr.dst) { | ||||
| 			if (false && NOP_TEST) { | ||||
| 				emit(NOP3); | ||||
| 				return; | ||||
| 			} | ||||
| 			emit(REX_SUB_RR); | ||||
| 			emitByte(0xc0 + 8 * instr.dst + instr.src); | ||||
| 		} | ||||
| 		else { | ||||
| 			if (false && NOP_TEST) { | ||||
| 				emit(NOP7); | ||||
| 				return; | ||||
| 			} | ||||
| 			emit(REX_81); | ||||
| 			emitByte(0xe8 + instr.dst); | ||||
| 			emit32(instr.getImm32()); | ||||
|  | @ -366,10 +448,18 @@ namespace RandomX { | |||
| 	void JitCompilerX86::h_IMUL_R(Instruction& instr, int i) { | ||||
| 		registerUsage[instr.dst] = i; | ||||
| 		if (instr.src != instr.dst) { | ||||
| 			if (false && NOP_TEST) { | ||||
| 				emit(NOP4); | ||||
| 				return; | ||||
| 			} | ||||
| 			emit(REX_IMUL_RR); | ||||
| 			emitByte(0xc0 + 8 * instr.dst + instr.src); | ||||
| 		} | ||||
| 		else { | ||||
| 			if (false && NOP_TEST) { | ||||
| 				emit(NOP7); | ||||
| 				return; | ||||
| 			} | ||||
| 			emit(REX_IMUL_RRI); | ||||
| 			emitByte(0xc0 + 9 * instr.dst); | ||||
| 			emit32(instr.getImm32()); | ||||
|  | @ -393,6 +483,12 @@ namespace RandomX { | |||
| 
 | ||||
| 	void JitCompilerX86::h_IMULH_R(Instruction& instr, int i) { | ||||
| 		registerUsage[instr.dst] = i; | ||||
| 		if (false && NOP_TEST) { | ||||
| 			emit(NOP3); | ||||
| 			emit(NOP3); | ||||
| 			emit(NOP3); | ||||
| 			return; | ||||
| 		} | ||||
| 		emit(REX_MOV_RR64); | ||||
| 		emitByte(0xc0 + instr.dst); | ||||
| 		emit(REX_MUL_R); | ||||
|  | @ -422,6 +518,12 @@ namespace RandomX { | |||
| 
 | ||||
| 	void JitCompilerX86::h_ISMULH_R(Instruction& instr, int i) { | ||||
| 		registerUsage[instr.dst] = i; | ||||
| 		if (false && NOP_TEST) { | ||||
| 			emit(NOP3); | ||||
| 			emit(NOP3); | ||||
| 			emit(NOP3); | ||||
| 			return; | ||||
| 		} | ||||
| 		emit(REX_MOV_RR64); | ||||
| 		emitByte(0xc0 + instr.dst); | ||||
| 		emit(REX_MUL_R); | ||||
|  | @ -451,6 +553,13 @@ namespace RandomX { | |||
| 
 | ||||
| 	void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { | ||||
| 		if (instr.getImm32() != 0) { | ||||
| 			if (false && NOP_TEST) { | ||||
| 				emitByte(0x66); | ||||
| 				emitByte(0x66); | ||||
| 				emit(NOP8); | ||||
| 				emit(NOP4); | ||||
| 				return; | ||||
| 			} | ||||
| 			registerUsage[instr.dst] = i; | ||||
| 			emit(MOV_RAX_I); | ||||
| 			emit64(reciprocal(instr.getImm32())); | ||||
|  | @ -472,10 +581,18 @@ namespace RandomX { | |||
| 	void JitCompilerX86::h_IXOR_R(Instruction& instr, int i) { | ||||
| 		registerUsage[instr.dst] = i; | ||||
| 		if (instr.src != instr.dst) { | ||||
| 			if (false && NOP_TEST) { | ||||
| 				emit(NOP3); | ||||
| 				return; | ||||
| 			} | ||||
| 			emit(REX_XOR_RR); | ||||
| 			emitByte(0xc0 + 8 * instr.dst + instr.src); | ||||
| 		} | ||||
| 		else { | ||||
| 			if (false && NOP_TEST) { | ||||
| 				emit(NOP7); | ||||
| 				return; | ||||
| 			} | ||||
| 			emit(REX_XOR_RI); | ||||
| 			emitByte(0xf0 + instr.dst); | ||||
| 			emit32(instr.getImm32()); | ||||
|  | @ -500,12 +617,21 @@ namespace RandomX { | |||
| 	void JitCompilerX86::h_IROR_R(Instruction& instr, int i) { | ||||
| 		registerUsage[instr.dst] = i; | ||||
| 		if (instr.src != instr.dst) { | ||||
| 			if (false && NOP_TEST) { | ||||
| 				emit(NOP3); | ||||
| 				emit(NOP3); | ||||
| 				return; | ||||
| 			} | ||||
| 			emit(REX_MOV_RR); | ||||
| 			emitByte(0xc8 + instr.src); | ||||
| 			emit(REX_ROT_CL); | ||||
| 			emitByte(0xc8 + instr.dst); | ||||
| 		} | ||||
| 		else { | ||||
| 			if (NOP_TEST) { | ||||
| 				emit(NOP4); | ||||
| 				return; | ||||
| 			} | ||||
| 			emit(REX_ROT_I8); | ||||
| 			emitByte(0xc8 + instr.dst); | ||||
| 			emitByte(instr.getImm32() & 63); | ||||
|  | @ -700,14 +826,21 @@ namespace RandomX { | |||
| 		const int conditionMask = ((1 << RANDOMX_CONDITION_BITS) - 1) << shift; | ||||
| 		int reg = getConditionRegister(); | ||||
| 		int target = registerUsage[reg] + 1; | ||||
| 		emit(REX_ADD_I); | ||||
| 		emitByte(0xc0 + reg); | ||||
| 		emit32(1 << shift); | ||||
| 		emit(REX_TEST); | ||||
| 		emitByte(0xc0 + reg); | ||||
| 		emit32(conditionMask); | ||||
| 		emit(JZ); | ||||
| 		emit32(instructionOffsets[target] - (codePos + 4)); | ||||
| 		if (false && NOP_TEST) { | ||||
| 			emit(NOP7); | ||||
| 			emit(NOP7); | ||||
| 			emit(NOP6); | ||||
| 		} | ||||
| 		else { | ||||
| 			emit(REX_ADD_I); | ||||
| 			emitByte(0xc0 + reg); | ||||
| 			emit32(1 << shift); | ||||
| 			emit(REX_TEST); | ||||
| 			emitByte(0xc0 + reg); | ||||
| 			emit32(conditionMask); | ||||
| 			emit(JZ); | ||||
| 			emit32(instructionOffsets[target] - (codePos + 4)); | ||||
| 		} | ||||
| 		for (unsigned j = 0; j < 8; ++j) { //mark all registers as used
 | ||||
| 			registerUsage[j] = i; | ||||
| 		} | ||||
|  | @ -717,7 +850,14 @@ namespace RandomX { | |||
| #ifdef RANDOMX_JUMP | ||||
| 		handleCondition(instr, i); | ||||
| #endif | ||||
| 		emit(XOR_ECX_ECX); | ||||
| 		if (false && NOP_TEST) { | ||||
| 			emit(NOP3); | ||||
| 			emit(NOP7); | ||||
| 			emit(NOP3); | ||||
| 			emit(NOP3); | ||||
| 			return; | ||||
| 		} | ||||
| 		emit(XOR_RCX_RCX); | ||||
| 		emit(REX_CMP_R32I); | ||||
| 		emitByte(0xf8 + instr.src); | ||||
| 		emit32(instr.getImm32()); | ||||
|  | @ -732,7 +872,7 @@ namespace RandomX { | |||
| #ifdef RANDOMX_JUMP | ||||
| 		handleCondition(instr, i); | ||||
| #endif | ||||
| 		emit(XOR_ECX_ECX); | ||||
| 		emit(XOR_RCX_RCX); | ||||
| 		genAddressReg(instr); | ||||
| 		emit(REX_CMP_M32I); | ||||
| 		emit32(instr.getImm32()); | ||||
|  | @ -765,7 +905,7 @@ namespace RandomX { | |||
| #define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x)) | ||||
| 
 | ||||
| 	InstructionGeneratorX86 JitCompilerX86::engine[256] = { | ||||
| 		INST_HANDLE(IADD_R) | ||||
| 		INST_HANDLE(IADD_RS) | ||||
| 		INST_HANDLE(IADD_M) | ||||
| 		INST_HANDLE(IADD_RC) | ||||
| 		INST_HANDLE(ISUB_R) | ||||
|  |  | |||
|  | @ -27,6 +27,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>. | |||
| namespace RandomX { | ||||
| 
 | ||||
| 	class Program; | ||||
| 	class LightProgram; | ||||
| 	class JitCompilerX86; | ||||
| 
 | ||||
| 	typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int); | ||||
|  | @ -36,11 +37,18 @@ namespace RandomX { | |||
| 	class JitCompilerX86 { | ||||
| 	public: | ||||
| 		JitCompilerX86(); | ||||
| 		~JitCompilerX86(); | ||||
| 		void generateProgram(Program&); | ||||
| 		void generateProgramLight(Program&); | ||||
| 		template<size_t N> | ||||
| 		void generateSuperScalarHash(LightProgram (&programs)[N]); | ||||
| 		ProgramFunc getProgramFunc() { | ||||
| 			return (ProgramFunc)code; | ||||
| 		} | ||||
| 		DatasetInitFunc getDatasetInitFunc() { | ||||
| 			generateDatasetInitCode(); | ||||
| 			return (DatasetInitFunc)code; | ||||
| 		} | ||||
| 		uint8_t* getCode() { | ||||
| 			return code; | ||||
| 		} | ||||
|  | @ -62,6 +70,8 @@ namespace RandomX { | |||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		void generateDatasetInitCode(); | ||||
| 
 | ||||
| 		void generateProgramPrologue(Program&); | ||||
| 		void generateProgramEpilogue(Program&); | ||||
| 		int getConditionRegister(); | ||||
|  | @ -100,13 +110,15 @@ namespace RandomX { | |||
| 
 | ||||
| 		template<size_t N> | ||||
| 		void emit(const uint8_t (&src)[N]) { | ||||
| 			for (unsigned i = 0; i < N; ++i) { | ||||
| 				code[codePos + i] = src[i]; | ||||
| 			} | ||||
| 			codePos += N; | ||||
| 			emit(src, N); | ||||
| 		} | ||||
| 
 | ||||
| 		void  h_IADD_R(Instruction&, int); | ||||
| 		void emit(const uint8_t* src, size_t count) { | ||||
| 			memcpy(code + codePos, src, count); | ||||
| 			codePos += count; | ||||
| 		} | ||||
| 
 | ||||
| 		void  h_IADD_RS(Instruction&, int); | ||||
| 		void  h_IADD_M(Instruction&, int); | ||||
| 		void  h_IADD_RC(Instruction&, int); | ||||
| 		void  h_ISUB_R(Instruction&, int); | ||||
|  |  | |||
|  | @ -26,6 +26,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>. | |||
| #include <algorithm> | ||||
| #include <stdexcept> | ||||
| #include <iomanip> | ||||
| #include "LightProgramGenerator.hpp" | ||||
| 
 | ||||
| namespace RandomX { | ||||
|                                             //                             Intel Ivy Bridge reference
 | ||||
|  | @ -47,8 +48,8 @@ namespace RandomX { | |||
| 	} | ||||
| 
 | ||||
| 	namespace LightInstructionOpcode { | ||||
| 		constexpr int IADD_R = 0; | ||||
| 		constexpr int IADD_RC = RANDOMX_FREQ_IADD_R + RANDOMX_FREQ_IADD_M; | ||||
| 		constexpr int IADD_RS = 0; | ||||
| 		constexpr int IADD_RC = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M; | ||||
| 		constexpr int ISUB_R = IADD_RC + RANDOMX_FREQ_IADD_RC; | ||||
| 		constexpr int IMUL_9C = ISUB_R + RANDOMX_FREQ_ISUB_R + RANDOMX_FREQ_ISUB_M; | ||||
| 		constexpr int IMUL_R = IMUL_9C + RANDOMX_FREQ_IMUL_9C; | ||||
|  | @ -65,20 +66,18 @@ namespace RandomX { | |||
| 	} | ||||
| 
 | ||||
| 	const int lightInstructionOpcode[] = { | ||||
| 		LightInstructionOpcode::IADD_R, | ||||
| 		LightInstructionOpcode::IADD_R, | ||||
| 		LightInstructionOpcode::IADD_RC, | ||||
| 		LightInstructionOpcode::ISUB_R, | ||||
| 		LightInstructionOpcode::IMUL_9C, | ||||
| 		LightInstructionOpcode::IMUL_R, | ||||
| 		LightInstructionOpcode::IMUL_R, | ||||
| 		LightInstructionOpcode::IADD_RS, | ||||
| 		LightInstructionOpcode::ISUB_R,    //ISUB_R
 | ||||
| 		LightInstructionOpcode::ISUB_R,    //ISUB_R
 | ||||
| 		LightInstructionOpcode::IMUL_R,    //IMUL_R
 | ||||
| 		LightInstructionOpcode::IMUL_R,    //IMUL_C
 | ||||
| 		LightInstructionOpcode::IMULH_R, | ||||
| 		LightInstructionOpcode::ISMULH_R, | ||||
| 		LightInstructionOpcode::IMUL_RCP, | ||||
| 		LightInstructionOpcode::IXOR_R, | ||||
| 		LightInstructionOpcode::IXOR_R, | ||||
| 		LightInstructionOpcode::IROR_R, | ||||
| 		LightInstructionOpcode::IROR_R, | ||||
| 		LightInstructionOpcode::IXOR_R,    //IXOR_R
 | ||||
| 		LightInstructionOpcode::IXOR_R,    //IXOR_C
 | ||||
| 		LightInstructionOpcode::IROR_R,    //IROR_R
 | ||||
| 		LightInstructionOpcode::IROR_R,    //IROR_C
 | ||||
| 		LightInstructionOpcode::COND_R | ||||
| 	}; | ||||
| 
 | ||||
|  | @ -93,37 +92,30 @@ namespace RandomX { | |||
| 		constexpr type P015 = 6; | ||||
| 	} | ||||
| 
 | ||||
| 	class Blake2Generator { | ||||
| 	public: | ||||
| 		Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) { | ||||
| 			memset(data, 0, sizeof(data)); | ||||
| 			memcpy(data, seed, SeedSize); | ||||
| 			store32(&data[60], nonce); | ||||
| 		} | ||||
| 	Blake2Generator::Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) { | ||||
| 		memset(data, 0, sizeof(data)); | ||||
| 		memcpy(data, seed, SeedSize); | ||||
| 		store32(&data[60], nonce); | ||||
| 	} | ||||
| 
 | ||||
| 		uint8_t getByte() { | ||||
| 			checkData(1); | ||||
| 			return data[dataIndex++]; | ||||
| 		} | ||||
| 	uint8_t Blake2Generator::getByte() { | ||||
| 		checkData(1); | ||||
| 		return data[dataIndex++]; | ||||
| 	} | ||||
| 
 | ||||
| 		uint32_t getInt32() { | ||||
| 			checkData(4); | ||||
| 			auto ret = load32(&data[dataIndex]); | ||||
| 			dataIndex += 4; | ||||
| 			return ret; | ||||
| 		} | ||||
| 	uint32_t Blake2Generator::getInt32() { | ||||
| 		checkData(4); | ||||
| 		auto ret = load32(&data[dataIndex]); | ||||
| 		dataIndex += 4; | ||||
| 		return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	private: | ||||
| 		uint8_t data[64]; | ||||
| 		size_t dataIndex; | ||||
| 
 | ||||
| 		void checkData(const size_t bytesNeeded) { | ||||
| 			if (dataIndex + bytesNeeded > sizeof(data))	{ | ||||
| 				blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); | ||||
| 				dataIndex = 0; | ||||
| 			} | ||||
| 	void Blake2Generator::checkData(const size_t bytesNeeded) { | ||||
| 		if (dataIndex + bytesNeeded > sizeof(data))	{ | ||||
| 			blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); | ||||
| 			dataIndex = 0; | ||||
| 		} | ||||
| 	}; | ||||
| 	} | ||||
| 
 | ||||
| 	class RegisterInfo { | ||||
| 	public: | ||||
|  | @ -201,7 +193,7 @@ namespace RandomX { | |||
| 		static const MacroOp Xor_ri; | ||||
| 		static const MacroOp Ror_rcl; | ||||
| 		static const MacroOp Ror_ri; | ||||
| 		static const MacroOp TestJmp_fused; | ||||
| 		static const MacroOp TestJz_fused; | ||||
| 		static const MacroOp Xor_self; | ||||
| 		static const MacroOp Cmp_ri; | ||||
| 		static const MacroOp Setcc_r; | ||||
|  | @ -235,13 +227,13 @@ namespace RandomX { | |||
| 	const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3); | ||||
| 	const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015); | ||||
| 	const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05); | ||||
| 	const MacroOp MacroOp::TestJmp_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5); | ||||
| 	const MacroOp MacroOp::TestJz_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5); | ||||
| 
 | ||||
| 	const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr }; | ||||
| 	const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr }; | ||||
| 	const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) }; | ||||
| 	const MacroOp IROR_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Ror_rcl }; | ||||
| 	const MacroOp COND_R_ops_array[] = { MacroOp::Add_ri, MacroOp(MacroOp::TestJmp_fused, true), MacroOp::Xor_self, MacroOp::Cmp_ri, MacroOp(MacroOp::Setcc_r, true), MacroOp(MacroOp::Add_rr, true) }; | ||||
| 	const MacroOp COND_R_ops_array[] = { MacroOp::Add_ri, MacroOp(MacroOp::TestJz_fused, true), MacroOp::Xor_self, MacroOp::Cmp_ri, MacroOp(MacroOp::Setcc_r, true), MacroOp(MacroOp::Add_rr, true) }; | ||||
| 
 | ||||
| 
 | ||||
| 	class LightInstructionInfo { | ||||
|  | @ -349,7 +341,7 @@ namespace RandomX { | |||
| 
 | ||||
| 	class DecoderBuffer { | ||||
| 	public: | ||||
| 		static DecoderBuffer Default; | ||||
| 		static const DecoderBuffer Default; | ||||
| 		template <size_t N> | ||||
| 		DecoderBuffer(const char* name, int index, const int(&arr)[N]) | ||||
| 			: name_(name), index_(index), counts_(arr), opsCount_(N) {} | ||||
|  | @ -365,17 +357,17 @@ namespace RandomX { | |||
| 		const char* getName() const { | ||||
| 			return name_; | ||||
| 		} | ||||
| 		const DecoderBuffer& fetchNext(int prevType, Blake2Generator& gen) { | ||||
| 		const DecoderBuffer* fetchNext(int prevType, Blake2Generator& gen) const { | ||||
| 			if (prevType == LightInstructionType::IMULH_R || prevType == LightInstructionType::ISMULH_R) | ||||
| 				return decodeBuffer3310; //2-1-1 decode
 | ||||
| 				return &decodeBuffer3310; //2-1-1 decode
 | ||||
| 			if (index_ == 0) { | ||||
| 				return decodeBuffer4444; //IMUL_RCP end
 | ||||
| 			} | ||||
| 			if (index_ == 2) { | ||||
| 				return decodeBuffer133; //COND_R middle
 | ||||
| 				return &decodeBuffer4444; //IMUL_RCP end
 | ||||
| 			} | ||||
| 			/*if (index_ == 2) {
 | ||||
| 				return &decodeBuffer133; //COND_R middle
 | ||||
| 			}*/ | ||||
| 			if (index_ == 7) { | ||||
| 				return decodeBuffer7333; //COND_R end
 | ||||
| 				return &decodeBuffer7333; //COND_R end
 | ||||
| 			} | ||||
| 			return fetchNextDefault(gen); | ||||
| 		} | ||||
|  | @ -393,12 +385,12 @@ namespace RandomX { | |||
| 		static const DecoderBuffer decodeBuffer3373; | ||||
| 		static const DecoderBuffer decodeBuffer133; | ||||
| 		static const DecoderBuffer* decodeBuffers[7]; | ||||
| 		const DecoderBuffer& fetchNextDefault(Blake2Generator& gen) { | ||||
| 		const DecoderBuffer* fetchNextDefault(Blake2Generator& gen) const { | ||||
| 			int select; | ||||
| 			do { | ||||
| 				select = gen.getByte() & 7; | ||||
| 			} while (select == 7); | ||||
| 			return *decodeBuffers[select]; | ||||
| 			return decodeBuffers[select]; | ||||
| 		} | ||||
| 	}; | ||||
| 
 | ||||
|  | @ -420,7 +412,7 @@ namespace RandomX { | |||
| 			&DecoderBuffer::decodeBuffer3373, | ||||
| 	}; | ||||
| 
 | ||||
| 	DecoderBuffer DecoderBuffer::Default = DecoderBuffer(); | ||||
| 	const DecoderBuffer DecoderBuffer::Default = DecoderBuffer(); | ||||
| 
 | ||||
| 	const LightInstructionInfo* slot_3[]  = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R }; | ||||
| 	const LightInstructionInfo* slot_3L[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R }; | ||||
|  | @ -472,7 +464,7 @@ namespace RandomX { | |||
| 			case 4: | ||||
| 				return create(slot_4[gen.getByte() & 3], gen); | ||||
| 			case 7: | ||||
| 				if (isLast) { | ||||
| 				if (false && isLast) { | ||||
| 					return create(slot_7L, gen); | ||||
| 				} | ||||
| 				else { | ||||
|  | @ -595,7 +587,7 @@ namespace RandomX { | |||
| 		bool selectDestination(int cycle, RegisterInfo (®isters)[8], Blake2Generator& gen) { | ||||
| 			std::vector<int> availableRegisters; | ||||
| 			for (unsigned i = 0; i < 8; ++i) { | ||||
| 				if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_)) | ||||
| 				if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_.getType() != LightInstructionType::IADD_RS || i != 5)) | ||||
| 					availableRegisters.push_back(i); | ||||
| 			} | ||||
| 			return selectRegister(availableRegisters, gen, dst_); | ||||
|  | @ -607,6 +599,12 @@ namespace RandomX { | |||
| 				if (registers[i].latency <= cycle) | ||||
| 					availableRegisters.push_back(i); | ||||
| 			} | ||||
| 			if (availableRegisters.size() == 2 && info_.getType() == LightInstructionType::IADD_RS) { | ||||
| 				if (availableRegisters[0] == 5 || availableRegisters[1] == 5) { | ||||
| 					opGroupPar_ = src_ = 5; | ||||
| 					return true; | ||||
| 				} | ||||
| 			} | ||||
| 			if (selectRegister(availableRegisters, gen, src_)) { | ||||
| 				if (groupParIsSource_) | ||||
| 					opGroupPar_ = src_; | ||||
|  | @ -666,7 +664,7 @@ namespace RandomX { | |||
| 	constexpr int V4_SRC_INDEX_BITS = 3; | ||||
| 	constexpr int V4_DST_INDEX_BITS = 3; | ||||
| 	constexpr int CYCLE_MAP_SIZE = RANDOMX_LPROG_LATENCY + 3; | ||||
| 	constexpr bool TRACE = true; | ||||
| 	constexpr bool TRACE = false; | ||||
| 
 | ||||
| 	static int blakeCounter = 0; | ||||
| 
 | ||||
|  | @ -782,15 +780,14 @@ namespace RandomX { | |||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister, int nonce) { | ||||
| 	double generateLightProg2(LightProgram& prog, Blake2Generator& gen) { | ||||
| 
 | ||||
| 		ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3]; | ||||
| 		memset(portBusy, 0, sizeof(portBusy)); | ||||
| 		RegisterInfo registers[8]; | ||||
| 		Blake2Generator gen(seed, nonce); | ||||
| 		std::vector<LightInstruction> instructions; | ||||
| 
 | ||||
| 		DecoderBuffer& fetchLine = DecoderBuffer::Default; | ||||
| 		const DecoderBuffer* fetchLine = &DecoderBuffer::Default; | ||||
| 		LightInstruction currentInstruction = LightInstruction::Null; | ||||
| 		int instrIndex = 0; | ||||
| 		int codeSize = 0; | ||||
|  | @ -806,24 +803,24 @@ namespace RandomX { | |||
| 		constexpr int MAX_ATTEMPTS = 4; | ||||
| 
 | ||||
| 		while(!portsSaturated) { | ||||
| 			fetchLine = fetchLine.fetchNext(currentInstruction.getType(), gen); | ||||
| 			if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine.getName() << ")" << std::endl; | ||||
| 			fetchLine = fetchLine->fetchNext(currentInstruction.getType(), gen); | ||||
| 			if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine->getName() << ")" << std::endl; | ||||
| 
 | ||||
| 			mopIndex = 0; | ||||
| 			 | ||||
| 			while (mopIndex < fetchLine.getSize()) { | ||||
| 			while (mopIndex < fetchLine->getSize()) { | ||||
| 				int topCycle = cycle; | ||||
| 				if (instrIndex >= currentInstruction.getInfo().getSize()) { | ||||
| 					if (portsSaturated) | ||||
| 						break; | ||||
| 					currentInstruction = LightInstruction::createForSlot(gen, fetchLine.getCounts()[mopIndex], fetchLine.getSize() == mopIndex + 1, fetchLine.getIndex() == 0 && mopIndex == 0); | ||||
| 					currentInstruction = LightInstruction::createForSlot(gen, fetchLine->getCounts()[mopIndex], fetchLine->getSize() == mopIndex + 1, fetchLine->getIndex() == 0 && mopIndex == 0); | ||||
| 					instrIndex = 0; | ||||
| 					if (TRACE) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; | ||||
| 				} | ||||
| 				MacroOp& mop = currentInstruction.getInfo().getOp(instrIndex); | ||||
| 				if (fetchLine.getCounts()[mopIndex] != mop.getSize()) { | ||||
| 					if (TRACE) std::cout << "ERROR instruction " << mop.getName() << " doesn't fit into slot of size " << fetchLine.getCounts()[mopIndex] << std::endl; | ||||
| 					return; | ||||
| 				if (fetchLine->getCounts()[mopIndex] != mop.getSize()) { | ||||
| 					if (TRACE) std::cout << "ERROR instruction " << mop.getName() << " doesn't fit into slot of size " << fetchLine->getCounts()[mopIndex] << std::endl; | ||||
| 					return DBL_MIN; | ||||
| 				} | ||||
| 				 | ||||
| 				if (TRACE) std::cout << mop.getName() << " "; | ||||
|  | @ -831,7 +828,7 @@ namespace RandomX { | |||
| 				mop.setCycle(scheduleCycle); | ||||
| 				if (scheduleCycle < 0) { | ||||
| 					if (TRACE) std::cout << "; Failed at cycle " << cycle << std::endl; | ||||
| 					return; | ||||
| 					return DBL_MIN; | ||||
| 				} | ||||
| 
 | ||||
| 				if (instrIndex == currentInstruction.getInfo().getSrcOp()) { | ||||
|  | @ -893,25 +890,29 @@ namespace RandomX { | |||
| 		std::cout << "; (* = in use, _ = idle)" << std::endl; | ||||
| 
 | ||||
| 		int portCycles = 0; | ||||
| 		for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { | ||||
| 		/*for (int i = 0; i < CYCLE_MAP_SIZE; ++i) {
 | ||||
| 			std::cout << "; " << std::setw(3) << i << " "; | ||||
| 			for (int j = 0; j < 3; ++j) { | ||||
| 				std::cout << (portBusy[i][j] ? '*' : '_'); | ||||
| 				portCycles += !!portBusy[i][j]; | ||||
| 			} | ||||
| 			std::cout << std::endl; | ||||
| 		} | ||||
| 		}*/ | ||||
| 
 | ||||
| 		double ipc = (macroOpCount / (double)retireCycle); | ||||
| 
 | ||||
| 		std::cout << "; code size " << codeSize << " bytes" << std::endl; | ||||
| 		std::cout << "; x86 macro-ops: " << macroOpCount << std::endl; | ||||
| 		std::cout << "; RandomX instructions: " << outIndex << std::endl; | ||||
| 		std::cout << "; Execution time: " << retireCycle << " cycles" << std::endl; | ||||
| 		std::cout << "; IPC = " << (macroOpCount / (double)retireCycle) << std::endl; | ||||
| 		std::cout << "; IPC = " << ipc << std::endl; | ||||
| 		std::cout << "; Port-cycles: " << portCycles << std::endl; | ||||
| 		std::cout << "; Multiplications: " << mulCount << std::endl; | ||||
| 
 | ||||
| 		int asicLatency[8]; | ||||
| 		memset(asicLatency, 0, sizeof(asicLatency)); | ||||
| 
 | ||||
| 		 | ||||
| 		for (int i = 0; i < outIndex; ++i) { | ||||
| 			Instruction& instr = prog(i); | ||||
| 			int latDst = asicLatency[instr.dst] + 1; | ||||
|  | @ -919,7 +920,16 @@ namespace RandomX { | |||
| 			asicLatency[instr.dst] = std::max(latDst, latSrc); | ||||
| 		} | ||||
| 
 | ||||
| 		std::cout << "; Multiplications: " << mulCount << std::endl; | ||||
| 		int asicLatencyFinal = 0; | ||||
| 		int addressReg = 0; | ||||
| 		for (int i = 0; i < 8; ++i) { | ||||
| 			if (asicLatency[i] > asicLatencyFinal) { | ||||
| 				asicLatencyFinal = asicLatency[i]; | ||||
| 				addressReg = i; | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		std::cout << "; ASIC latency: " << asicLatencyFinal << std::endl; | ||||
| 
 | ||||
| 		std::cout << "; ASIC latency:" << std::endl; | ||||
| 		for (int i = 0; i < 8; ++i) { | ||||
|  | @ -931,5 +941,7 @@ namespace RandomX { | |||
| 		} | ||||
| 
 | ||||
| 		prog.setSize(outIndex); | ||||
| 		prog.setAddressRegister(addressReg); | ||||
| 		return addressReg; | ||||
| 	} | ||||
| } | ||||
|  | @ -20,6 +20,18 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>. | |||
| #include "Program.hpp" | ||||
| 
 | ||||
| namespace RandomX { | ||||
| 	void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister, int nonce); | ||||
| 	void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister, int nonce); | ||||
| 
 | ||||
| 	class Blake2Generator { | ||||
| 	public: | ||||
| 		Blake2Generator(const void* seed, int nonce); | ||||
| 		uint8_t getByte(); | ||||
| 		uint32_t getInt32(); | ||||
| 	private: | ||||
| 		uint8_t data[64]; | ||||
| 		size_t dataIndex; | ||||
| 
 | ||||
| 		void checkData(const size_t); | ||||
| 	}; | ||||
| 
 | ||||
| 	double generateLightProg2(LightProgram& prog, Blake2Generator& gen); | ||||
| } | ||||
|  | @ -68,6 +68,12 @@ namespace RandomX { | |||
| 		void setSize(uint32_t val) { | ||||
| 			size = val; | ||||
| 		} | ||||
| 		int getAddressRegister() { | ||||
| 			return addrReg; | ||||
| 		} | ||||
| 		void setAddressRegister(uint32_t val) { | ||||
| 			addrReg = val; | ||||
| 		} | ||||
| 	private: | ||||
| 		void print(std::ostream& os) const { | ||||
| 			for (unsigned i = 0; i < size; ++i) { | ||||
|  | @ -77,6 +83,7 @@ namespace RandomX { | |||
| 		} | ||||
| 		Instruction programBuffer[RANDOMX_LPROG_MAX_SIZE]; | ||||
| 		uint32_t size; | ||||
| 		int addrReg; | ||||
| 	}; | ||||
| 
 | ||||
| 	static_assert(sizeof(Program) % 64 == 0, "Invalid size of class Program"); | ||||
|  |  | |||
							
								
								
									
										16
									
								
								src/asm/program_sshash_constants.inc
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								src/asm/program_sshash_constants.inc
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,16 @@ | |||
| r0_mul: ;# 6364136223846793005
 | ||||
| 	db 45, 127, 149, 76, 45, 244, 81, 88 | ||||
| r1_add: ;# 9298410992540426048
 | ||||
| 	db 64, 159, 245, 89, 136, 151, 10, 129 | ||||
| r2_add: ;# 12065312585734608966
 | ||||
| 	db 70, 216, 194, 56, 223, 153, 112, 167 | ||||
| r3_add: ;# 9306329213124610396
 | ||||
| 	db 92, 9, 34, 191, 28, 185, 38, 129 | ||||
| r4_add: ;# 5281919268842080866
 | ||||
| 	db 98, 138, 159, 23, 151, 37, 77, 73 | ||||
| r5_add: ;# 10536153434571861004
 | ||||
| 	db 12, 236, 170, 206, 185, 239, 55, 146 | ||||
| r6_add: ;# 3398623926847679864
 | ||||
| 	db 120, 45, 230, 108, 116, 86, 42, 47 | ||||
| r7_add: ;# 9549104520008361294
 | ||||
| 	db 78, 229, 44, 182, 247, 59, 133, 132 | ||||
							
								
								
									
										8
									
								
								src/asm/program_sshash_load.inc
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								src/asm/program_sshash_load.inc
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,8 @@ | |||
| 	;xor r8, qword ptr [rbx+0] | ||||
| 	;xor r9, qword ptr [rbx+8] | ||||
| 	;xor r10, qword ptr [rbx+16] | ||||
| 	;xor r11, qword ptr [rbx+24] | ||||
| 	;xor r12, qword ptr [rbx+32] | ||||
| 	;xor r13, qword ptr [rbx+40] | ||||
| 	;xor r14, qword ptr [rbx+48] | ||||
| 	;xor r15, qword ptr [rbx+56] | ||||
							
								
								
									
										4
									
								
								src/asm/program_sshash_prefetch.inc
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								src/asm/program_sshash_prefetch.inc
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,4 @@ | |||
| 	and rbx, 4194303 | ||||
| 	shl rbx, 6 | ||||
| 	add rbx, rdi | ||||
| 	; prefetchnta byte ptr [rbx] | ||||
|  | @ -41,7 +41,7 @@ namespace RandomX { | |||
| 	static_assert((RANDOMX_SCRATCHPAD_L1 & (RANDOMX_SCRATCHPAD_L1 - 1)) == 0, "RANDOMX_SCRATCHPAD_L1 must be a power of 2."); | ||||
| 	static_assert(RANDOMX_CACHE_ACCESSES > 1, "RANDOMX_CACHE_ACCESSES must be greater than 1"); | ||||
| 
 | ||||
| 	constexpr int wtSum = RANDOMX_FREQ_IADD_R + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_IADD_RC + RANDOMX_FREQ_ISUB_R + \ | ||||
| 	constexpr int wtSum = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_IADD_RC + RANDOMX_FREQ_ISUB_R + \ | ||||
| 		RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_9C + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + \ | ||||
| 		RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M + RANDOMX_FREQ_IMUL_RCP + \ | ||||
| 		RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_ISWAP_R + \ | ||||
|  | @ -141,6 +141,7 @@ namespace RandomX { | |||
| 	typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, int_reg_t(®)[RegistersCount]); | ||||
| 
 | ||||
| 	typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, uint8_t* /* scratchpad */, uint64_t); | ||||
| 	typedef void(*DatasetInitFunc)(uint8_t* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock); | ||||
| } | ||||
| 
 | ||||
| std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf); | ||||
|  |  | |||
|  | @ -37,7 +37,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>. | |||
| //Number of random Cache accesses per Dataset block. Minimum is 2.
 | ||||
| #define RANDOMX_CACHE_ACCESSES     8 | ||||
| 
 | ||||
| #define RANDOMX_LPROG_LATENCY      168 | ||||
| #define RANDOMX_LPROG_LATENCY      130 | ||||
| #define RANDOMX_LPROG_ASIC_LATENCY 84 | ||||
| #define RANDOMX_LPROG_MIN_SIZE     225 | ||||
| #define RANDOMX_LPROG_MAX_SIZE     512 | ||||
|  | @ -80,12 +80,12 @@ Instruction frequencies (per 256 opcodes) | |||
| Total sum of frequencies must be 256 | ||||
| */ | ||||
| 
 | ||||
| #define RANDOMX_FREQ_IADD_R        12 | ||||
| #define RANDOMX_FREQ_IADD_RS       32 | ||||
| #define RANDOMX_FREQ_IADD_M         7 | ||||
| #define RANDOMX_FREQ_IADD_RC       16 | ||||
| #define RANDOMX_FREQ_ISUB_R        12 | ||||
| #define RANDOMX_FREQ_IADD_RC        0 | ||||
| #define RANDOMX_FREQ_ISUB_R        17 | ||||
| #define RANDOMX_FREQ_ISUB_M         7 | ||||
| #define RANDOMX_FREQ_IMUL_9C        9 | ||||
| #define RANDOMX_FREQ_IMUL_9C        0 | ||||
| #define RANDOMX_FREQ_IMUL_R        16 | ||||
| #define RANDOMX_FREQ_IMUL_M         4 | ||||
| #define RANDOMX_FREQ_IMULH_R        4 | ||||
|  |  | |||
							
								
								
									
										46
									
								
								src/main.cpp
									
										
									
									
									
								
							
							
						
						
									
										46
									
								
								src/main.cpp
									
										
									
									
									
								
							|  | @ -37,6 +37,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>. | |||
| #include "Cache.hpp" | ||||
| #include "hashAes1Rx4.hpp" | ||||
| #include "LightProgramGenerator.hpp" | ||||
| #include "JitCompilerX86.hpp" | ||||
| 
 | ||||
| const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; | ||||
| 
 | ||||
|  | @ -204,7 +205,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic<uint32_t>& atomicNonce, Atomi | |||
| } | ||||
| 
 | ||||
| int main(int argc, char** argv) { | ||||
| 	bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit, genLight; | ||||
| 	bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit, genLight, useSuperscalar; | ||||
| 	int programCount, threadCount, initThreadCount, epoch; | ||||
| 
 | ||||
| 	readOption("--softAes", argc, argv, softAes); | ||||
|  | @ -220,14 +221,16 @@ int main(int argc, char** argv) { | |||
| 	readOption("--genNative", argc, argv, genNative); | ||||
| 	readOption("--help", argc, argv, help); | ||||
| 	readOption("--genLight", argc, argv, genLight); | ||||
| 	readOption("--useSuperscalar", argc, argv, useSuperscalar); | ||||
| 
 | ||||
| 	if (genLight) { | ||||
| 		RandomX::LightProgram p; | ||||
| 		RandomX::generateLightProg2(p, seed, 0, programCount); | ||||
| 		//RandomX::AssemblyGeneratorX86 asmX86;
 | ||||
| 		//asmX86.generateProgram(p);
 | ||||
| 		RandomX::Blake2Generator gen(seed, programCount); | ||||
| 		RandomX::generateLightProg2(p, gen); | ||||
| 		RandomX::AssemblyGeneratorX86 asmX86; | ||||
| 		asmX86.generateProgram(p); | ||||
| 		//std::ofstream file("lightProg2.asm");
 | ||||
| 		//asmX86.printCode(std::cout);
 | ||||
| 		asmX86.printCode(std::cout); | ||||
| 		return 0; | ||||
| 	} | ||||
| 
 | ||||
|  | @ -287,24 +290,37 @@ int main(int argc, char** argv) { | |||
| 			dataset.dataset.size = datasetSize; | ||||
| 			RandomX::datasetAlloc(dataset, largePages); | ||||
| 			const uint64_t datasetBlockCount = datasetSize / RandomX::CacheLineSize; | ||||
| 			if (initThreadCount > 1) { | ||||
| 				auto perThread = datasetBlockCount / initThreadCount; | ||||
| 				auto remainder = datasetBlockCount % initThreadCount; | ||||
| 				for (int i = 0; i < initThreadCount; ++i) { | ||||
| 					auto count = perThread + (i == initThreadCount - 1 ? remainder : 0); | ||||
| 					threads.push_back(std::thread(&RandomX::datasetInit, std::ref(cache), std::ref(dataset.dataset), i * perThread, count)); | ||||
| 				} | ||||
| 				for (unsigned i = 0; i < threads.size(); ++i) { | ||||
| 					threads[i].join(); | ||||
| 			if (useSuperscalar) { | ||||
| 				RandomX::Blake2Generator gen(seed, programCount); | ||||
| 				RandomX::LightProgram programs[RANDOMX_CACHE_ACCESSES]; | ||||
| 				for (int i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { | ||||
| 					RandomX::generateLightProg2(programs[i], gen); | ||||
| 				} | ||||
| 				RandomX::JitCompilerX86 jit86; | ||||
| 				jit86.generateSuperScalarHash(programs); | ||||
| 				jit86.getDatasetInitFunc()(cache.memory, dataset.dataset.memory, 0, datasetBlockCount); | ||||
| 			} | ||||
| 			else { | ||||
| 				RandomX::datasetInit(cache, dataset.dataset, 0, datasetBlockCount); | ||||
| 				if (initThreadCount > 1) { | ||||
| 					auto perThread = datasetBlockCount / initThreadCount; | ||||
| 					auto remainder = datasetBlockCount % initThreadCount; | ||||
| 					for (int i = 0; i < initThreadCount; ++i) { | ||||
| 						auto count = perThread + (i == initThreadCount - 1 ? remainder : 0); | ||||
| 						threads.push_back(std::thread(&RandomX::datasetInit, std::ref(cache), std::ref(dataset.dataset), i * perThread, count)); | ||||
| 					} | ||||
| 					for (unsigned i = 0; i < threads.size(); ++i) { | ||||
| 						threads[i].join(); | ||||
| 					} | ||||
| 				} | ||||
| 				else { | ||||
| 					RandomX::datasetInit(cache, dataset.dataset, 0, datasetBlockCount); | ||||
| 				} | ||||
| 			} | ||||
| 			RandomX::deallocCache(cache, largePages); | ||||
| 			threads.clear(); | ||||
| 			std::cout << "Dataset (" << datasetSize << " bytes) initialized in " << sw.getElapsed() << " s" << std::endl; | ||||
| 		} | ||||
| 		return 0; | ||||
| 		std::cout << "Initializing " << threadCount << " virtual machine(s) ..." << std::endl; | ||||
| 		for (int i = 0; i < threadCount; ++i) { | ||||
| 			RandomX::VirtualMachine* vm; | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue