From 20eb549725979519154986440a8502660a7c6535 Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 27 Jan 2019 19:33:55 +0100 Subject: [PATCH] Merged load/store of integer and FP registers --- src/JitCompilerX86-static.S | 24 ++++------ src/JitCompilerX86-static.asm | 32 +++++-------- src/JitCompilerX86-static.hpp | 8 ++-- src/JitCompilerX86.cpp | 45 +++++++------------ src/asm/program_load_int.inc | 10 ----- ...ram_load_flt.inc => program_loop_load.inc} | 14 ++++++ src/asm/program_loop_store.inc | 18 ++++++++ src/asm/program_prologue_linux.inc | 3 +- src/asm/program_prologue_load.inc | 2 + src/asm/program_prologue_win64.inc | 3 +- src/asm/program_read_dataset.inc | 1 + src/asm/program_store_flt.inc | 11 ----- src/asm/program_store_int.inc | 10 ----- src/executeProgram-win64.asm | 21 +++++---- 14 files changed, 88 insertions(+), 114 deletions(-) delete mode 100644 src/asm/program_load_int.inc rename src/asm/{program_load_flt.inc => program_loop_load.inc} (55%) create mode 100644 src/asm/program_loop_store.inc delete mode 100644 src/asm/program_store_flt.inc delete mode 100644 src/asm/program_store_int.inc diff --git a/src/JitCompilerX86-static.S b/src/JitCompilerX86-static.S index a799e11..9bf06ba 100644 --- a/src/JitCompilerX86-static.S +++ b/src/JitCompilerX86-static.S @@ -27,13 +27,11 @@ #define DECL(x) x #endif .global DECL(randomx_program_prologue) -.global DECL(randomx_loop_begin) -.global DECL(randomx_program_load_int) -.global DECL(randomx_program_load_flt) +.global DECL(randomx_program_loop_begin) +.global DECL(randomx_program_loop_load) .global DECL(randomx_program_start) .global DECL(randomx_program_read_dataset) -.global DECL(randomx_program_store_int) -.global DECL(randomx_program_store_flt) +.global DECL(randomx_program_loop_store) .global DECL(randomx_program_loop_end) .global DECL(randomx_program_epilogue) .global DECL(randomx_program_end) @@ -48,14 +46,11 @@ DECL(randomx_program_prologue): #include "asm/program_xmm_constants.inc" .align 64 -DECL(randomx_loop_begin): +DECL(randomx_program_loop_begin): nop -DECL(randomx_program_load_int): - #include "asm/program_load_int.inc" - -DECL(randomx_program_load_flt): - #include "asm/program_load_flt.inc" +DECL(randomx_program_loop_load): + #include "asm/program_loop_load.inc" DECL(randomx_program_start): nop @@ -63,11 +58,8 @@ DECL(randomx_program_start): DECL(randomx_program_read_dataset): #include "asm/program_read_dataset.inc" -DECL(randomx_program_store_int): - #include "asm/program_store_int.inc" - -DECL(randomx_program_store_flt): - #include "asm/program_store_flt.inc" +DECL(randomx_program_loop_store): + #include "asm/program_loop_store.inc" DECL(randomx_program_loop_end): nop diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index 8d5a4fe..5b2d387 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -20,13 +20,11 @@ IFDEF RAX _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE PUBLIC randomx_program_prologue -PUBLIC randomx_loop_begin -PUBLIC randomx_program_load_int -PUBLIC randomx_program_load_flt +PUBLIC randomx_program_loop_begin +PUBLIC randomx_program_loop_load PUBLIC randomx_program_start PUBLIC randomx_program_read_dataset -PUBLIC randomx_program_store_int -PUBLIC randomx_program_store_flt +PUBLIC randomx_program_loop_store PUBLIC randomx_program_loop_end PUBLIC randomx_program_epilogue PUBLIC randomx_program_end @@ -40,17 +38,13 @@ ALIGN 64 include asm/program_xmm_constants.inc ALIGN 64 -randomx_loop_begin PROC +randomx_program_loop_begin PROC nop -randomx_loop_begin ENDP +randomx_program_loop_begin ENDP -randomx_program_load_int PROC - include asm/program_load_int.inc -randomx_program_load_int ENDP - -randomx_program_load_flt PROC - include asm/program_load_flt.inc -randomx_program_load_flt ENDP +randomx_program_loop_load PROC + include asm/program_loop_load.inc +randomx_program_loop_load ENDP randomx_program_start PROC nop @@ -60,13 +54,9 @@ randomx_program_read_dataset PROC include asm/program_read_dataset.inc randomx_program_read_dataset ENDP -randomx_program_store_int PROC - include asm/program_store_int.inc -randomx_program_store_int ENDP - -randomx_program_store_flt PROC - include asm/program_store_flt.inc -randomx_program_store_flt ENDP +randomx_program_loop_store PROC + include asm/program_loop_store.inc +randomx_program_loop_store ENDP randomx_program_loop_end PROC nop diff --git a/src/JitCompilerX86-static.hpp b/src/JitCompilerX86-static.hpp index df5cd28..64abfa3 100644 --- a/src/JitCompilerX86-static.hpp +++ b/src/JitCompilerX86-static.hpp @@ -19,13 +19,11 @@ along with RandomX. If not, see. extern "C" { void randomx_program_prologue(); - void randomx_loop_begin(); - void randomx_program_load_int(); - void randomx_program_load_flt(); + void randomx_program_loop_begin(); + void randomx_program_loop_load(); void randomx_program_start(); void randomx_program_read_dataset(); - void randomx_program_store_int(); - void randomx_program_store_flt(); + void randomx_program_loop_store(); void randomx_program_loop_end(); void randomx_program_epilogue(); void randomx_program_end(); diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 30c6f73..cf50582 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -94,13 +94,11 @@ namespace RandomX { #include "JitCompilerX86-static.hpp" const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; - const uint8_t* codeLoopBegin = (uint8_t*)&randomx_loop_begin; - const uint8_t* codeLoadInt = (uint8_t*)&randomx_program_load_int; - const uint8_t* codeLoadFlt = (uint8_t*)&randomx_program_load_flt; + const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin; + const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load; const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset; - const uint8_t* codeStoreInt = (uint8_t*)&randomx_program_store_int; - const uint8_t* codeStoreFlt = (uint8_t*)&randomx_program_store_flt; + const uint8_t* codeLoopStore = (uint8_t*)&randomx_program_loop_store; const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end; const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue; const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; @@ -108,11 +106,9 @@ namespace RandomX { const int32_t prologueSize = codeLoopBegin - codePrologue; const int32_t epilogueSize = codeProgramEnd - codeEpilogue; - const int32_t loadIntSize = codeLoadFlt - codeLoadInt; - const int32_t loadFltSize = codeProgamStart - codeLoadFlt; - const int32_t readDatasetSize = codeStoreInt - codeReadDataset; - const int32_t storeIntSize = codeStoreFlt - codeStoreInt; - const int32_t storeFltSize = codeLoopEnd - codeStoreFlt; + const int32_t loopLoadSize = codeProgamStart - codeLoopLoad; + const int32_t readDatasetSize = codeLoopStore - codeReadDataset; + const int32_t loopStoreSize = codeLoopEnd - codeLoopStore; const int32_t epilogueOffset = CodeSize - epilogueSize; @@ -179,6 +175,7 @@ namespace RandomX { static const uint8_t SUB_EBX[] = { 0x83, 0xEB, 0x01 }; static const uint8_t JNZ[] = { 0x0f, 0x85 }; static const uint8_t JMP = 0xe9; + static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 }; size_t JitCompilerX86::getCodeSize() { return codePos - prologueSize; @@ -204,18 +201,16 @@ namespace RandomX { addressRegisters >>= 1; int readReg2 = 2 + (addressRegisters & 1); addressRegisters >>= 1; - int writeReg1 = 4 + (addressRegisters & 1); + int readReg3 = 4 + (addressRegisters & 1); addressRegisters >>= 1; - int writeReg2 = 6 + (addressRegisters & 1); + int readReg4 = 6 + (addressRegisters & 1); codePos = prologueSize; - emit(REX_XOR_EAX); + emit(REX_XOR_RAX_R64); emitByte(0xc0 + readReg1); - memcpy(code + codePos, codeLoadInt, loadIntSize); - codePos += loadIntSize; - emit(REX_XOR_EAX); + emit(REX_XOR_RAX_R64); emitByte(0xc0 + readReg2); - memcpy(code + codePos, codeLoadFlt, loadFltSize); - codePos += loadFltSize; + memcpy(code + codePos, codeLoopLoad, loopLoadSize); + codePos += loopLoadSize; Instruction instr; for (unsigned i = 0; i < ProgramLength; ++i) { for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) { @@ -226,19 +221,13 @@ namespace RandomX { generateCode(instr); } emit(REX_MOV_RR); - emitByte(0xc0 + readReg1); + emitByte(0xc0 + readReg3); emit(REX_XOR_EAX); - emitByte(0xc0 + readReg2); + emitByte(0xc0 + readReg4); memcpy(code + codePos, codeReadDataset, readDatasetSize); codePos += readDatasetSize; - emit(REX_MOV_RR); - emitByte(0xc0 + writeReg1); - memcpy(code + codePos, codeStoreInt, storeIntSize); - codePos += storeIntSize; - emit(REX_XOR_EAX); - emitByte(0xc0 + writeReg2); - memcpy(code + codePos, codeStoreFlt, storeFltSize); - codePos += storeFltSize; + memcpy(code + codePos, codeLoopStore, loopStoreSize); + codePos += loopStoreSize; emit(SUB_EBX); emit(JNZ); emit32(prologueSize - codePos - 4); diff --git a/src/asm/program_load_int.inc b/src/asm/program_load_int.inc deleted file mode 100644 index d9277ed..0000000 --- a/src/asm/program_load_int.inc +++ /dev/null @@ -1,10 +0,0 @@ - and eax, 1048512 - lea rcx, [rsi+rax] - xor r8, qword ptr [rcx+0] - xor r9, qword ptr [rcx+8] - xor r10, qword ptr [rcx+16] - xor r11, qword ptr [rcx+24] - xor r12, qword ptr [rcx+32] - xor r13, qword ptr [rcx+40] - xor r14, qword ptr [rcx+48] - xor r15, qword ptr [rcx+56] diff --git a/src/asm/program_load_flt.inc b/src/asm/program_loop_load.inc similarity index 55% rename from src/asm/program_load_flt.inc rename to src/asm/program_loop_load.inc index 2c631ce..c4c1fed 100644 --- a/src/asm/program_load_flt.inc +++ b/src/asm/program_loop_load.inc @@ -1,5 +1,19 @@ + mov rdx, rax and eax, 1048512 lea rcx, [rsi+rax] + push rcx + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] + ror rdx, 32 + and edx, 1048512 + lea rcx, [rsi+rdx] + push rcx cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm1, qword ptr [rcx+8] cvtdq2pd xmm2, qword ptr [rcx+16] diff --git a/src/asm/program_loop_store.inc b/src/asm/program_loop_store.inc new file mode 100644 index 0000000..a0acebc --- /dev/null +++ b/src/asm/program_loop_store.inc @@ -0,0 +1,18 @@ + pop rcx + mov qword ptr [rcx+0], r8 + mov qword ptr [rcx+8], r9 + mov qword ptr [rcx+16], r10 + mov qword ptr [rcx+24], r11 + mov qword ptr [rcx+32], r12 + mov qword ptr [rcx+40], r13 + mov qword ptr [rcx+48], r14 + mov qword ptr [rcx+56], r15 + pop rcx + mulpd xmm0, xmm4 + mulpd xmm1, xmm5 + mulpd xmm2, xmm6 + mulpd xmm3, xmm7 + movapd xmmword ptr [rcx+0], xmm0 + movapd xmmword ptr [rcx+16], xmm1 + movapd xmmword ptr [rcx+32], xmm2 + movapd xmmword ptr [rcx+48], xmm3 diff --git a/src/asm/program_prologue_linux.inc b/src/asm/program_prologue_linux.inc index 67a967d..bdde664 100644 --- a/src/asm/program_prologue_linux.inc +++ b/src/asm/program_prologue_linux.inc @@ -11,10 +11,9 @@ push rdi ;# RegisterFile& registerFile mov rcx, rdi mov rbp, qword ptr [rsi] ;# "mx", "ma" - mov eax, ebp ;# "mx" mov rdi, qword ptr [rsi+8] ;# uint8_t* dataset mov rsi, rdx ;# convertible_t* scratchpad #include "program_prologue_load.inc" - jmp DECL(randomx_loop_begin) \ No newline at end of file + jmp DECL(randomx_program_loop_begin) \ No newline at end of file diff --git a/src/asm/program_prologue_load.inc b/src/asm/program_prologue_load.inc index ecdd4f9..3a994ab 100644 --- a/src/asm/program_prologue_load.inc +++ b/src/asm/program_prologue_load.inc @@ -1,3 +1,5 @@ + mov rax, rbp + ;# zero integer registers xor r8, r8 xor r9, r9 diff --git a/src/asm/program_prologue_win64.inc b/src/asm/program_prologue_win64.inc index 83ae2a5..b1da4d7 100644 --- a/src/asm/program_prologue_win64.inc +++ b/src/asm/program_prologue_win64.inc @@ -23,11 +23,10 @@ ; function arguments push rcx ; RegisterFile& registerFile mov rbp, qword ptr [rdx] ; "mx", "ma" - mov eax, ebp ; "mx" mov rdi, qword ptr [rdx+8] ; uint8_t* dataset mov rsi, r8 ; convertible_t* scratchpad mov rbx, r9 ; loop counter include program_prologue_load.inc - jmp randomx_loop_begin \ No newline at end of file + jmp randomx_program_loop_begin \ No newline at end of file diff --git a/src/asm/program_read_dataset.inc b/src/asm/program_read_dataset.inc index bae4817..061d32c 100644 --- a/src/asm/program_read_dataset.inc +++ b/src/asm/program_read_dataset.inc @@ -1,4 +1,5 @@ xor rbp, rax ;# modify "mx" + xor eax, eax and rbp, -64 ;# align "mx" to the start of a cache line mov edx, ebp ;# edx = mx prefetchnta byte ptr [rdi+rdx] diff --git a/src/asm/program_store_flt.inc b/src/asm/program_store_flt.inc deleted file mode 100644 index 4bbab9f..0000000 --- a/src/asm/program_store_flt.inc +++ /dev/null @@ -1,11 +0,0 @@ - and eax, 1048512 - lea rcx, [rsi+rax] - mulpd xmm0, xmm4 - mulpd xmm1, xmm5 - mulpd xmm2, xmm6 - mulpd xmm3, xmm7 - movapd xmmword ptr [rcx+0], xmm0 - movapd xmmword ptr [rcx+16], xmm1 - movapd xmmword ptr [rcx+32], xmm2 - movapd xmmword ptr [rcx+48], xmm3 - diff --git a/src/asm/program_store_int.inc b/src/asm/program_store_int.inc deleted file mode 100644 index 03dd31a..0000000 --- a/src/asm/program_store_int.inc +++ /dev/null @@ -1,10 +0,0 @@ - and eax, 1048512 - lea rcx, [rsi+rax] - mov qword ptr [rcx+0], r8 - mov qword ptr [rcx+8], r9 - mov qword ptr [rcx+16], r10 - mov qword ptr [rcx+24], r11 - mov qword ptr [rcx+32], r12 - mov qword ptr [rcx+40], r13 - mov qword ptr [rcx+48], r14 - mov qword ptr [rcx+56], r15 diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index e9bc30a..ac49e50 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -118,8 +118,11 @@ signMask: ALIGN 64 program_begin: - xor eax, r8d ;# read address register 1 + xor rax, r8 ;# read address register 1 + xor rax, r9 + mov rdx, rax and eax, 1048512 + push rax lea rcx, [rsi+rax] xor r8, qword ptr [rcx+0] xor r9, qword ptr [rcx+8] @@ -129,9 +132,10 @@ program_begin: xor r13, qword ptr [rcx+40] xor r14, qword ptr [rcx+48] xor r15, qword ptr [rcx+56] - xor eax, r9d ;# read address register 2 - and eax, 1048512 - lea rcx, [rsi+rax] + ror rdx, 32 + and edx, 1048512 + push rdx + lea rcx, [rsi+rdx] cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm1, qword ptr [rcx+8] cvtdq2pd xmm2, qword ptr [rcx+16] @@ -164,9 +168,8 @@ program_begin: xor r12, qword ptr [rcx+32] xor r13, qword ptr [rcx+40] xor r14, qword ptr [rcx+48] - xor r15, qword ptr [rcx+56] - mov eax, r12d ;# write address register 1 - and eax, 1048512 + xor r15, qword ptr [rcx+56] + pop rax lea rcx, [rsi+rax] mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 @@ -176,8 +179,7 @@ program_begin: mov qword ptr [rcx+40], r13 mov qword ptr [rcx+48], r14 mov qword ptr [rcx+56], r15 - xor eax, r13d ;# write address register 2 - and eax, 1048512 + pop rax lea rcx, [rsi+rax] mulpd xmm0, xmm4 mulpd xmm1, xmm5 @@ -187,6 +189,7 @@ program_begin: movapd xmmword ptr [rcx+16], xmm1 movapd xmmword ptr [rcx+32], xmm2 movapd xmmword ptr [rcx+48], xmm3 + xor eax, eax dec ebx jnz program_begin