diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index dc812f2..543632e 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -130,13 +130,13 @@ namespace RandomX { static const uint8_t ADD_RAX_RCX[] = { 0x48, 0x01, 0xC8 }; static const uint8_t SAR_RAX_I8[] = { 0x48, 0xC1, 0xF8 }; static const uint8_t NEG_RAX[] = { 0x48, 0xF7, 0xD8 }; - static const uint8_t ADD_R_RAX[] = { 0x49, 0x01 }; - static const uint8_t XOR_EAX_EAX[] = { 0x31, 0xC0 }; + static const uint8_t ADD_R_RAX[] = { 0x4C, 0x03 }; + static const uint8_t XOR_EAX_EAX[] = { 0x33, 0xC0 }; static const uint8_t ADD_RDX_R[] = { 0x4c, 0x01 }; static const uint8_t SUB_RDX_R[] = { 0x4c, 0x29 }; static const uint8_t SAR_RDX_I8[] = { 0x48, 0xC1, 0xFA }; static const uint8_t TEST_RDX_RDX[] = { 0x48, 0x85, 0xD2 }; - static const uint8_t SETS_AL_ADD_RDX_RAX[] = { 0x0F, 0x98, 0xC0, 0x48, 0x01, 0xC2 }; + static const uint8_t SETS_AL_ADD_RDX_RAX[] = { 0x0F, 0x98, 0xC0, 0x48, 0x03, 0xD0 }; static const uint8_t REX_NEG[] = { 0x49, 0xF7 }; static const uint8_t REX_XOR_RR[] = { 0x4D, 0x33 }; static const uint8_t REX_XOR_RI[] = { 0x49, 0x81 }; @@ -272,7 +272,7 @@ namespace RandomX { } void JitCompilerX86::genSIB(int scale, int index, int base) { - emitByte((scale << 5) | (index << 3) | base); + emitByte((scale << 6) | (index << 3) | base); } void JitCompilerX86::h_IADD_RC(Instruction& instr) { @@ -290,7 +290,7 @@ namespace RandomX { else { emit(REX_81); emitByte(0xe8 + instr.dst); - genAddressImm(instr); + emit32(instr.imm32); } } @@ -311,7 +311,7 @@ namespace RandomX { void JitCompilerX86::h_IMUL_9C(Instruction& instr) { emit(REX_LEA); emitByte(0x84 + 8 * instr.dst); - genSIB(3, instr.src, instr.dst); + genSIB(3, instr.dst, instr.dst); emit32(instr.imm32); } @@ -323,7 +323,7 @@ namespace RandomX { else { emit(REX_IMUL_RRI); emitByte(0xc0 + 9 * instr.dst); - genAddressImm(instr); + emit32(instr.imm32); } } @@ -424,7 +424,7 @@ namespace RandomX { emit(REX_SHR_RDX); emitByte(mi.post_shift); } - emit(REX_ADD_RR); + emit(REX_ADD_RM); emitByte(0xc2 + 8 * instr.dst); } else { //divisor is a power of two @@ -440,7 +440,7 @@ namespace RandomX { } void JitCompilerX86::h_ISDIV_C(Instruction& instr) { - int64_t divisor = instr.imm32; + int64_t divisor = (int32_t)instr.imm32; if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { emit(REX_MOV_RR64); emitByte(0xc0 + instr.dst); @@ -493,7 +493,7 @@ namespace RandomX { emit(TEST_RDX_RDX); emit(SETS_AL_ADD_RDX_RAX); emit(ADD_R_RAX); - emitByte(0xd0 + instr.dst); + emitByte(0xc2 + 8 * instr.dst); } } @@ -559,7 +559,7 @@ namespace RandomX { void JitCompilerX86::h_ISWAP_R(Instruction& instr) { if (instr.src != instr.dst) { emit(REX_XCHG); - emitByte(0xc0 + instr.dst + 8 * instr.src); + emitByte(0xc0 + instr.src + 8 * instr.dst); } } diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index ac49e50..b3528f2 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -78,13 +78,14 @@ executeProgram PROC movdqu xmmword ptr [rsp+16], xmm14 movdqu xmmword ptr [rsp+0], xmm15 - ; function arguments - push rcx ; RegisterFile& registerFile - mov rbp, qword ptr [rdx] ; "mx", "ma" - mov eax, ebp ; "mx" - mov rdi, qword ptr [rdx+8] ; uint8_t* dataset - mov rsi, r8 ; convertible_t* scratchpad - mov rbx, r9 ; loop counter + ;# function arguments + push rcx ;# RegisterFile& registerFile + mov rbp, qword ptr [rdx] ;# "mx", "ma" + mov rdi, qword ptr [rdx+8] ;# uint8_t* dataset + mov rsi, r8 ;# uint8_t* scratchpad + mov rbx, r9 ;# loop counter + + mov rax, rbp ;# zero integer registers xor r8, r8 @@ -114,16 +115,16 @@ minDbl: absMask: db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127 signMask: - db 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 128 + db 0, 0, 0, 0, 0, 0, 240, 129, 0, 0, 0, 0, 0, 0, 240, 129 ALIGN 64 program_begin: xor rax, r8 ;# read address register 1 - xor rax, r9 + xor rax, r10 mov rdx, rax - and eax, 1048512 - push rax + and eax, 2097088 lea rcx, [rsi+rax] + push rcx xor r8, qword ptr [rcx+0] xor r9, qword ptr [rcx+8] xor r10, qword ptr [rcx+16] @@ -133,9 +134,9 @@ program_begin: xor r14, qword ptr [rcx+48] xor r15, qword ptr [rcx+56] ror rdx, 32 - and edx, 1048512 - push rdx + and edx, 2097088 lea rcx, [rsi+rdx] + push rcx cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm1, qword ptr [rcx+8] cvtdq2pd xmm2, qword ptr [rcx+16] @@ -152,9 +153,10 @@ program_begin: ;# 256 instructions include program.inc - mov eax, r8d ;# read address register 1 - xor eax, r9d ;# read address register 2 + mov eax, r12d ;# read address register 1 + xor eax, r15d ;# read address register 2 xor rbp, rax ;# modify "mx" + xor eax, eax and rbp, -64 ;# align "mx" to the start of a cache line mov edx, ebp ;# edx = mx prefetchnta byte ptr [rdi+rdx] @@ -169,8 +171,7 @@ program_begin: xor r13, qword ptr [rcx+40] xor r14, qword ptr [rcx+48] xor r15, qword ptr [rcx+56] - pop rax - lea rcx, [rsi+rax] + pop rcx mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 mov qword ptr [rcx+16], r10 @@ -179,8 +180,7 @@ program_begin: mov qword ptr [rcx+40], r13 mov qword ptr [rcx+48], r14 mov qword ptr [rcx+56], r15 - pop rax - lea rcx, [rsi+rax] + pop rcx mulpd xmm0, xmm4 mulpd xmm1, xmm5 mulpd xmm2, xmm6 @@ -189,8 +189,7 @@ program_begin: movapd xmmword ptr [rcx+16], xmm1 movapd xmmword ptr [rcx+32], xmm2 movapd xmmword ptr [rcx+48], xmm3 - xor eax, eax - dec ebx + sub ebx, 1 jnz program_begin rx_finish: diff --git a/src/main.cpp b/src/main.cpp index 51df7f6..57bafe7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -123,29 +123,35 @@ void printUsage(const char* executable) { std::cout << " --genNative generate RandomX code for nonce N" << std::endl; } +template void generateAsm(int nonce) { - uint64_t hash[8]; + alignas(16) uint64_t hash[8]; uint8_t blockTemplate[sizeof(blockTemplate__)]; memcpy(blockTemplate, blockTemplate__, sizeof(blockTemplate)); int* noncePtr = (int*)(blockTemplate + 39); *noncePtr = nonce; blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); + uint8_t scratchpad[RandomX::ScratchpadSize]; + fillAes1Rx4((void*)hash, RandomX::ScratchpadSize, scratchpad); RandomX::AssemblyGeneratorX86 asmX86; RandomX::Program p; - fillAes1Rx4(hash, sizeof(p), &p); + fillAes1Rx4(hash, sizeof(p), &p); asmX86.generateProgram(p); asmX86.printCode(std::cout); } +template void generateNative(int nonce) { - uint64_t hash[4]; + alignas(16) uint64_t hash[8]; uint8_t blockTemplate[sizeof(blockTemplate__)]; memcpy(blockTemplate, blockTemplate__, sizeof(blockTemplate)); int* noncePtr = (int*)(blockTemplate + 39); *noncePtr = nonce; blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); + uint8_t scratchpad[RandomX::ScratchpadSize]; + fillAes1Rx4((void*)hash, RandomX::ScratchpadSize, scratchpad); alignas(16) RandomX::Program prog; - fillAes1Rx4((void*)hash, sizeof(prog), &prog); + fillAes1Rx4((void*)hash, sizeof(prog), &prog); for (int i = 0; i < RandomX::ProgramLength; ++i) { prog(i).dst %= 8; prog(i).src %= 8; @@ -181,7 +187,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash result.xorWith(hash); if (RandomX::trace) { std::cout << "Nonce: " << nonce << " "; - outputHex(std::cout, (char*)hash, sizeof(hash)); + outputHex(std::cout, (char*)hash, 16); std::cout << std::endl; } nonce = atomicNonce.fetch_add(1); @@ -208,12 +214,18 @@ int main(int argc, char** argv) { readOption("--genNative", argc, argv, genNative); if (genAsm) { - generateAsm(programCount); + if (softAes) + generateAsm(programCount); + else + generateAsm(programCount); return 0; } if (genNative) { - generateNative(programCount); + if (softAes) + generateNative(programCount); + else + generateNative(programCount); return 0; }