From 93fec18991ddcb4419cd052e7dc375702650a62c Mon Sep 17 00:00:00 2001 From: SChernykh Date: Wed, 11 Sep 2019 11:48:22 +0200 Subject: [PATCH] Optimized loading from scratchpad --- src/asm/program_loop_load.inc | 4 ---- src/asm/program_loop_store.inc | 1 - src/jit_compiler_x86.cpp | 20 +++++++++++++------- src/jit_compiler_x86.hpp | 2 +- src/jit_compiler_x86_static.S | 21 +++++++++++++++++++++ src/jit_compiler_x86_static.asm | 26 +++++++++++++++++++++++++- src/jit_compiler_x86_static.hpp | 3 +++ 7 files changed, 63 insertions(+), 14 deletions(-) diff --git a/src/asm/program_loop_load.inc b/src/asm/program_loop_load.inc index 374af66..c293323 100644 --- a/src/asm/program_loop_load.inc +++ b/src/asm/program_loop_load.inc @@ -1,5 +1,3 @@ - mov rdx, rax - and eax, RANDOMX_SCRATCHPAD_MASK lea rcx, [rsi+rax] push rcx xor r8, qword ptr [rcx+0] @@ -10,8 +8,6 @@ xor r13, qword ptr [rcx+40] xor r14, qword ptr [rcx+48] xor r15, qword ptr [rcx+56] - ror rdx, 32 - and edx, RANDOMX_SCRATCHPAD_MASK lea rcx, [rsi+rdx] push rcx cvtdq2pd xmm0, qword ptr [rcx+0] diff --git a/src/asm/program_loop_store.inc b/src/asm/program_loop_store.inc index 53164cb..1ba1635 100644 --- a/src/asm/program_loop_store.inc +++ b/src/asm/program_loop_store.inc @@ -1,4 +1,3 @@ - xor eax, eax pop rcx mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 diff --git a/src/jit_compiler_x86.cpp b/src/jit_compiler_x86.cpp index aa51d81..fe0ff29 100644 --- a/src/jit_compiler_x86.cpp +++ b/src/jit_compiler_x86.cpp @@ -243,7 +243,7 @@ namespace randomx { generateProgramPrologue(prog, pcfg); memcpy(code + codePos, codeReadDataset, readDatasetSize); codePos += readDatasetSize; - generateProgramEpilogue(prog); + generateProgramEpilogue(prog, pcfg); } void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { @@ -254,7 +254,7 @@ namespace randomx { emitByte(CALL); emit32(superScalarHashOffset - (codePos + 4)); emit(codeReadDatasetLightSshFin, readDatasetLightFinSize); - generateProgramEpilogue(prog); + generateProgramEpilogue(prog, pcfg); } template @@ -298,12 +298,13 @@ namespace randomx { for (unsigned i = 0; i < 8; ++i) { registerUsage[i] = -1; } + + codePos = ((uint8_t*)randomx_program_prologue_first_load) - ((uint8_t*)randomx_program_prologue); + code[codePos + sizeof(REX_XOR_RAX_R64)] = 0xc0 + pcfg.readReg0; + code[codePos + sizeof(REX_XOR_RAX_R64) * 2 + 1] = 0xc0 + pcfg.readReg1; + codePos = prologueSize; memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask)); - emit(REX_XOR_RAX_R64); - emitByte(0xc0 + pcfg.readReg0); - emit(REX_XOR_RAX_R64); - emitByte(0xc0 + pcfg.readReg1); memcpy(code + codePos, codeLoopLoad, loopLoadSize); codePos += loopLoadSize; for (unsigned i = 0; i < prog.getSize(); ++i) { @@ -318,7 +319,12 @@ namespace randomx { emitByte(0xc0 + pcfg.readReg3); } - void JitCompilerX86::generateProgramEpilogue(Program& prog) { + void JitCompilerX86::generateProgramEpilogue(Program& prog, ProgramConfiguration& pcfg) { + emit(REX_MOV_RR64); + emitByte(0xc0 + pcfg.readReg0); + emit(REX_XOR_RAX_R64); + emitByte(0xc0 + pcfg.readReg1); + emit((const uint8_t*)&randomx_prefetch_scratchpad, ((uint8_t*)&randomx_prefetch_scratchpad_end) - ((uint8_t*)&randomx_prefetch_scratchpad)); memcpy(code + codePos, codeLoopStore, loopStoreSize); codePos += loopStoreSize; emit(SUB_EBX); diff --git a/src/jit_compiler_x86.hpp b/src/jit_compiler_x86.hpp index b4f1dfa..7829fca 100644 --- a/src/jit_compiler_x86.hpp +++ b/src/jit_compiler_x86.hpp @@ -73,7 +73,7 @@ namespace randomx { int32_t codePos; void generateProgramPrologue(Program&, ProgramConfiguration&); - void generateProgramEpilogue(Program&); + void generateProgramEpilogue(Program&, ProgramConfiguration&); void genAddressReg(Instruction&, bool); void genAddressRegDst(Instruction&); void genAddressImm(Instruction&); diff --git a/src/jit_compiler_x86_static.S b/src/jit_compiler_x86_static.S index 6a33392..0b02278 100644 --- a/src/jit_compiler_x86_static.S +++ b/src/jit_compiler_x86_static.S @@ -37,7 +37,10 @@ #define WINABI #endif +.global DECL(randomx_prefetch_scratchpad) +.global DECL(randomx_prefetch_scratchpad_end) .global DECL(randomx_program_prologue) +.global DECL(randomx_program_prologue_first_load) .global DECL(randomx_program_loop_begin) .global DECL(randomx_program_loop_load) .global DECL(randomx_program_start) @@ -65,6 +68,16 @@ #define db .byte +DECL(randomx_prefetch_scratchpad): + mov rdx, rax + and eax, RANDOMX_SCRATCHPAD_MASK + prefetcht0 [rsi+rax] + ror rdx, 32 + and edx, RANDOMX_SCRATCHPAD_MASK + prefetcht0 [rsi+rdx] + +DECL(randomx_prefetch_scratchpad_end): + .balign 64 DECL(randomx_program_prologue): #if defined(WINABI) @@ -75,6 +88,14 @@ DECL(randomx_program_prologue): movapd xmm13, xmmword ptr [mantissaMask+rip] movapd xmm14, xmmword ptr [exp240+rip] movapd xmm15, xmmword ptr [scaleMask+rip] + +DECL(randomx_program_prologue_first_load): + xor rax, r8 + xor rax, r8 + mov rdx, rax + and eax, RANDOMX_SCRATCHPAD_MASK + ror rdx, 32 + and edx, RANDOMX_SCRATCHPAD_MASK jmp DECL(randomx_program_loop_begin) .balign 64 diff --git a/src/jit_compiler_x86_static.asm b/src/jit_compiler_x86_static.asm index f1d2f95..0f97183 100644 --- a/src/jit_compiler_x86_static.asm +++ b/src/jit_compiler_x86_static.asm @@ -28,7 +28,10 @@ IFDEF RAX _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE +PUBLIC randomx_prefetch_scratchpad +PUBLIC randomx_prefetch_scratchpad_end PUBLIC randomx_program_prologue +PUBLIC randomx_program_prologue_first_load PUBLIC randomx_program_loop_begin PUBLIC randomx_program_loop_load PUBLIC randomx_program_start @@ -54,15 +57,36 @@ RANDOMX_CACHE_MASK EQU (RANDOMX_ARGON_MEMORY*16-1) RANDOMX_ALIGN EQU 4096 SUPERSCALAR_OFFSET EQU ((((RANDOMX_ALIGN + 32 * RANDOMX_PROGRAM_SIZE) - 1) / (RANDOMX_ALIGN) + 1) * (RANDOMX_ALIGN)) +randomx_prefetch_scratchpad PROC + mov rdx, rax + and eax, RANDOMX_SCRATCHPAD_MASK + prefetcht0 [rsi+rax] + ror rdx, 32 + and edx, RANDOMX_SCRATCHPAD_MASK + prefetcht0 [rsi+rdx] +randomx_prefetch_scratchpad ENDP + +randomx_prefetch_scratchpad_end PROC +randomx_prefetch_scratchpad_end ENDP + ALIGN 64 randomx_program_prologue PROC include asm/program_prologue_win64.inc movapd xmm13, xmmword ptr [mantissaMask] movapd xmm14, xmmword ptr [exp240] movapd xmm15, xmmword ptr [scaleMask] - jmp randomx_program_loop_begin randomx_program_prologue ENDP +randomx_program_prologue_first_load PROC + xor rax, r8 + xor rax, r8 + mov rdx, rax + and eax, RANDOMX_SCRATCHPAD_MASK + ror rdx, 32 + and edx, RANDOMX_SCRATCHPAD_MASK + jmp randomx_program_loop_begin +randomx_program_prologue_first_load ENDP + ALIGN 64 include asm/program_xmm_constants.inc diff --git a/src/jit_compiler_x86_static.hpp b/src/jit_compiler_x86_static.hpp index ba19686..0a62c98 100644 --- a/src/jit_compiler_x86_static.hpp +++ b/src/jit_compiler_x86_static.hpp @@ -29,7 +29,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once extern "C" { + void randomx_prefetch_scratchpad(); + void randomx_prefetch_scratchpad_end(); void randomx_program_prologue(); + void randomx_program_prologue_first_load(); void randomx_program_loop_begin(); void randomx_program_loop_load(); void randomx_program_start();