Merged load/store of integer and FP registers

This commit is contained in:
tevador 2019-01-27 19:33:55 +01:00
parent 8f2abd6c05
commit 20eb549725
14 changed files with 88 additions and 114 deletions

View file

@ -27,13 +27,11 @@
#define DECL(x) x #define DECL(x) x
#endif #endif
.global DECL(randomx_program_prologue) .global DECL(randomx_program_prologue)
.global DECL(randomx_loop_begin) .global DECL(randomx_program_loop_begin)
.global DECL(randomx_program_load_int) .global DECL(randomx_program_loop_load)
.global DECL(randomx_program_load_flt)
.global DECL(randomx_program_start) .global DECL(randomx_program_start)
.global DECL(randomx_program_read_dataset) .global DECL(randomx_program_read_dataset)
.global DECL(randomx_program_store_int) .global DECL(randomx_program_loop_store)
.global DECL(randomx_program_store_flt)
.global DECL(randomx_program_loop_end) .global DECL(randomx_program_loop_end)
.global DECL(randomx_program_epilogue) .global DECL(randomx_program_epilogue)
.global DECL(randomx_program_end) .global DECL(randomx_program_end)
@ -48,14 +46,11 @@ DECL(randomx_program_prologue):
#include "asm/program_xmm_constants.inc" #include "asm/program_xmm_constants.inc"
.align 64 .align 64
DECL(randomx_loop_begin): DECL(randomx_program_loop_begin):
nop nop
DECL(randomx_program_load_int): DECL(randomx_program_loop_load):
#include "asm/program_load_int.inc" #include "asm/program_loop_load.inc"
DECL(randomx_program_load_flt):
#include "asm/program_load_flt.inc"
DECL(randomx_program_start): DECL(randomx_program_start):
nop nop
@ -63,11 +58,8 @@ DECL(randomx_program_start):
DECL(randomx_program_read_dataset): DECL(randomx_program_read_dataset):
#include "asm/program_read_dataset.inc" #include "asm/program_read_dataset.inc"
DECL(randomx_program_store_int): DECL(randomx_program_loop_store):
#include "asm/program_store_int.inc" #include "asm/program_loop_store.inc"
DECL(randomx_program_store_flt):
#include "asm/program_store_flt.inc"
DECL(randomx_program_loop_end): DECL(randomx_program_loop_end):
nop nop

View file

@ -20,13 +20,11 @@ IFDEF RAX
_RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
PUBLIC randomx_program_prologue PUBLIC randomx_program_prologue
PUBLIC randomx_loop_begin PUBLIC randomx_program_loop_begin
PUBLIC randomx_program_load_int PUBLIC randomx_program_loop_load
PUBLIC randomx_program_load_flt
PUBLIC randomx_program_start PUBLIC randomx_program_start
PUBLIC randomx_program_read_dataset PUBLIC randomx_program_read_dataset
PUBLIC randomx_program_store_int PUBLIC randomx_program_loop_store
PUBLIC randomx_program_store_flt
PUBLIC randomx_program_loop_end PUBLIC randomx_program_loop_end
PUBLIC randomx_program_epilogue PUBLIC randomx_program_epilogue
PUBLIC randomx_program_end PUBLIC randomx_program_end
@ -40,17 +38,13 @@ ALIGN 64
include asm/program_xmm_constants.inc include asm/program_xmm_constants.inc
ALIGN 64 ALIGN 64
randomx_loop_begin PROC randomx_program_loop_begin PROC
nop nop
randomx_loop_begin ENDP randomx_program_loop_begin ENDP
randomx_program_load_int PROC randomx_program_loop_load PROC
include asm/program_load_int.inc include asm/program_loop_load.inc
randomx_program_load_int ENDP randomx_program_loop_load ENDP
randomx_program_load_flt PROC
include asm/program_load_flt.inc
randomx_program_load_flt ENDP
randomx_program_start PROC randomx_program_start PROC
nop nop
@ -60,13 +54,9 @@ randomx_program_read_dataset PROC
include asm/program_read_dataset.inc include asm/program_read_dataset.inc
randomx_program_read_dataset ENDP randomx_program_read_dataset ENDP
randomx_program_store_int PROC randomx_program_loop_store PROC
include asm/program_store_int.inc include asm/program_loop_store.inc
randomx_program_store_int ENDP randomx_program_loop_store ENDP
randomx_program_store_flt PROC
include asm/program_store_flt.inc
randomx_program_store_flt ENDP
randomx_program_loop_end PROC randomx_program_loop_end PROC
nop nop

View file

@ -19,13 +19,11 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
extern "C" { extern "C" {
void randomx_program_prologue(); void randomx_program_prologue();
void randomx_loop_begin(); void randomx_program_loop_begin();
void randomx_program_load_int(); void randomx_program_loop_load();
void randomx_program_load_flt();
void randomx_program_start(); void randomx_program_start();
void randomx_program_read_dataset(); void randomx_program_read_dataset();
void randomx_program_store_int(); void randomx_program_loop_store();
void randomx_program_store_flt();
void randomx_program_loop_end(); void randomx_program_loop_end();
void randomx_program_epilogue(); void randomx_program_epilogue();
void randomx_program_end(); void randomx_program_end();

View file

@ -94,13 +94,11 @@ namespace RandomX {
#include "JitCompilerX86-static.hpp" #include "JitCompilerX86-static.hpp"
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
const uint8_t* codeLoopBegin = (uint8_t*)&randomx_loop_begin; const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin;
const uint8_t* codeLoadInt = (uint8_t*)&randomx_program_load_int; const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load;
const uint8_t* codeLoadFlt = (uint8_t*)&randomx_program_load_flt;
const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start;
const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset; const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset;
const uint8_t* codeStoreInt = (uint8_t*)&randomx_program_store_int; const uint8_t* codeLoopStore = (uint8_t*)&randomx_program_loop_store;
const uint8_t* codeStoreFlt = (uint8_t*)&randomx_program_store_flt;
const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end; const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end;
const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue; const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
@ -108,11 +106,9 @@ namespace RandomX {
const int32_t prologueSize = codeLoopBegin - codePrologue; const int32_t prologueSize = codeLoopBegin - codePrologue;
const int32_t epilogueSize = codeProgramEnd - codeEpilogue; const int32_t epilogueSize = codeProgramEnd - codeEpilogue;
const int32_t loadIntSize = codeLoadFlt - codeLoadInt; const int32_t loopLoadSize = codeProgamStart - codeLoopLoad;
const int32_t loadFltSize = codeProgamStart - codeLoadFlt; const int32_t readDatasetSize = codeLoopStore - codeReadDataset;
const int32_t readDatasetSize = codeStoreInt - codeReadDataset; const int32_t loopStoreSize = codeLoopEnd - codeLoopStore;
const int32_t storeIntSize = codeStoreFlt - codeStoreInt;
const int32_t storeFltSize = codeLoopEnd - codeStoreFlt;
const int32_t epilogueOffset = CodeSize - epilogueSize; const int32_t epilogueOffset = CodeSize - epilogueSize;
@ -179,6 +175,7 @@ namespace RandomX {
static const uint8_t SUB_EBX[] = { 0x83, 0xEB, 0x01 }; static const uint8_t SUB_EBX[] = { 0x83, 0xEB, 0x01 };
static const uint8_t JNZ[] = { 0x0f, 0x85 }; static const uint8_t JNZ[] = { 0x0f, 0x85 };
static const uint8_t JMP = 0xe9; static const uint8_t JMP = 0xe9;
static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
size_t JitCompilerX86::getCodeSize() { size_t JitCompilerX86::getCodeSize() {
return codePos - prologueSize; return codePos - prologueSize;
@ -204,18 +201,16 @@ namespace RandomX {
addressRegisters >>= 1; addressRegisters >>= 1;
int readReg2 = 2 + (addressRegisters & 1); int readReg2 = 2 + (addressRegisters & 1);
addressRegisters >>= 1; addressRegisters >>= 1;
int writeReg1 = 4 + (addressRegisters & 1); int readReg3 = 4 + (addressRegisters & 1);
addressRegisters >>= 1; addressRegisters >>= 1;
int writeReg2 = 6 + (addressRegisters & 1); int readReg4 = 6 + (addressRegisters & 1);
codePos = prologueSize; codePos = prologueSize;
emit(REX_XOR_EAX); emit(REX_XOR_RAX_R64);
emitByte(0xc0 + readReg1); emitByte(0xc0 + readReg1);
memcpy(code + codePos, codeLoadInt, loadIntSize); emit(REX_XOR_RAX_R64);
codePos += loadIntSize;
emit(REX_XOR_EAX);
emitByte(0xc0 + readReg2); emitByte(0xc0 + readReg2);
memcpy(code + codePos, codeLoadFlt, loadFltSize); memcpy(code + codePos, codeLoopLoad, loopLoadSize);
codePos += loadFltSize; codePos += loopLoadSize;
Instruction instr; Instruction instr;
for (unsigned i = 0; i < ProgramLength; ++i) { for (unsigned i = 0; i < ProgramLength; ++i) {
for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) { for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) {
@ -226,19 +221,13 @@ namespace RandomX {
generateCode(instr); generateCode(instr);
} }
emit(REX_MOV_RR); emit(REX_MOV_RR);
emitByte(0xc0 + readReg1); emitByte(0xc0 + readReg3);
emit(REX_XOR_EAX); emit(REX_XOR_EAX);
emitByte(0xc0 + readReg2); emitByte(0xc0 + readReg4);
memcpy(code + codePos, codeReadDataset, readDatasetSize); memcpy(code + codePos, codeReadDataset, readDatasetSize);
codePos += readDatasetSize; codePos += readDatasetSize;
emit(REX_MOV_RR); memcpy(code + codePos, codeLoopStore, loopStoreSize);
emitByte(0xc0 + writeReg1); codePos += loopStoreSize;
memcpy(code + codePos, codeStoreInt, storeIntSize);
codePos += storeIntSize;
emit(REX_XOR_EAX);
emitByte(0xc0 + writeReg2);
memcpy(code + codePos, codeStoreFlt, storeFltSize);
codePos += storeFltSize;
emit(SUB_EBX); emit(SUB_EBX);
emit(JNZ); emit(JNZ);
emit32(prologueSize - codePos - 4); emit32(prologueSize - codePos - 4);

View file

@ -1,10 +0,0 @@
and eax, 1048512
lea rcx, [rsi+rax]
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]

View file

@ -1,5 +1,19 @@
mov rdx, rax
and eax, 1048512 and eax, 1048512
lea rcx, [rsi+rax] lea rcx, [rsi+rax]
push rcx
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
ror rdx, 32
and edx, 1048512
lea rcx, [rsi+rdx]
push rcx
cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm0, qword ptr [rcx+0]
cvtdq2pd xmm1, qword ptr [rcx+8] cvtdq2pd xmm1, qword ptr [rcx+8]
cvtdq2pd xmm2, qword ptr [rcx+16] cvtdq2pd xmm2, qword ptr [rcx+16]

View file

@ -0,0 +1,18 @@
;# Loop store fragment: writes r8-r15 and xmm0-xmm3 to two 64-byte memory
;# regions whose addresses are taken from the stack.
;# NOTE(review): presumably these are the two addresses pushed by the matching
;# loop load fragment; pops return them in reverse push order - confirm pairing.
;# Clobbers rcx and xmm0-xmm3; consumes two 8-byte stack slots.
pop rcx ;# rcx = first destination address (the value pushed last)
;# store the eight integer registers, 8 bytes each (64 bytes total)
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10
mov qword ptr [rcx+24], r11
mov qword ptr [rcx+32], r12
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
pop rcx ;# rcx = second destination address
;# multiply xmm0-xmm3 by xmm4-xmm7 (packed doubles) before they are stored
mulpd xmm0, xmm4
mulpd xmm1, xmm5
mulpd xmm2, xmm6
mulpd xmm3, xmm7
;# aligned 16-byte stores (64 bytes total); movapd needs rcx 16-byte aligned
movapd xmmword ptr [rcx+0], xmm0
movapd xmmword ptr [rcx+16], xmm1
movapd xmmword ptr [rcx+32], xmm2
movapd xmmword ptr [rcx+48], xmm3

View file

@ -11,10 +11,9 @@
push rdi ;# RegisterFile& registerFile push rdi ;# RegisterFile& registerFile
mov rcx, rdi mov rcx, rdi
mov rbp, qword ptr [rsi] ;# "mx", "ma" mov rbp, qword ptr [rsi] ;# "mx", "ma"
mov eax, ebp ;# "mx"
mov rdi, qword ptr [rsi+8] ;# uint8_t* dataset mov rdi, qword ptr [rsi+8] ;# uint8_t* dataset
mov rsi, rdx ;# convertible_t* scratchpad mov rsi, rdx ;# convertible_t* scratchpad
#include "program_prologue_load.inc" #include "program_prologue_load.inc"
jmp DECL(randomx_loop_begin) jmp DECL(randomx_program_loop_begin)

View file

@ -1,3 +1,5 @@
mov rax, rbp
;# zero integer registers ;# zero integer registers
xor r8, r8 xor r8, r8
xor r9, r9 xor r9, r9

View file

@ -23,11 +23,10 @@
; function arguments ; function arguments
push rcx ; RegisterFile& registerFile push rcx ; RegisterFile& registerFile
mov rbp, qword ptr [rdx] ; "mx", "ma" mov rbp, qword ptr [rdx] ; "mx", "ma"
mov eax, ebp ; "mx"
mov rdi, qword ptr [rdx+8] ; uint8_t* dataset mov rdi, qword ptr [rdx+8] ; uint8_t* dataset
mov rsi, r8 ; convertible_t* scratchpad mov rsi, r8 ; convertible_t* scratchpad
mov rbx, r9 ; loop counter mov rbx, r9 ; loop counter
include program_prologue_load.inc include program_prologue_load.inc
jmp randomx_loop_begin jmp randomx_program_loop_begin

View file

@ -1,4 +1,5 @@
xor rbp, rax ;# modify "mx" xor rbp, rax ;# modify "mx"
xor eax, eax
and rbp, -64 ;# align "mx" to the start of a cache line and rbp, -64 ;# align "mx" to the start of a cache line
mov edx, ebp ;# edx = mx mov edx, ebp ;# edx = mx
prefetchnta byte ptr [rdi+rdx] prefetchnta byte ptr [rdi+rdx]

View file

@ -1,11 +0,0 @@
and eax, 1048512
lea rcx, [rsi+rax]
mulpd xmm0, xmm4
mulpd xmm1, xmm5
mulpd xmm2, xmm6
mulpd xmm3, xmm7
movapd xmmword ptr [rcx+0], xmm0
movapd xmmword ptr [rcx+16], xmm1
movapd xmmword ptr [rcx+32], xmm2
movapd xmmword ptr [rcx+48], xmm3

View file

@ -1,10 +0,0 @@
and eax, 1048512
lea rcx, [rsi+rax]
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10
mov qword ptr [rcx+24], r11
mov qword ptr [rcx+32], r12
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15

View file

@ -118,8 +118,11 @@ signMask:
ALIGN 64 ALIGN 64
program_begin: program_begin:
xor eax, r8d ;# read address register 1 xor rax, r8 ;# read address register 1
xor rax, r9
mov rdx, rax
and eax, 1048512 and eax, 1048512
push rax
lea rcx, [rsi+rax] lea rcx, [rsi+rax]
xor r8, qword ptr [rcx+0] xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8] xor r9, qword ptr [rcx+8]
@ -129,9 +132,10 @@ program_begin:
xor r13, qword ptr [rcx+40] xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48] xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56] xor r15, qword ptr [rcx+56]
xor eax, r9d ;# read address register 2 ror rdx, 32
and eax, 1048512 and edx, 1048512
lea rcx, [rsi+rax] push rdx
lea rcx, [rsi+rdx]
cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm0, qword ptr [rcx+0]
cvtdq2pd xmm1, qword ptr [rcx+8] cvtdq2pd xmm1, qword ptr [rcx+8]
cvtdq2pd xmm2, qword ptr [rcx+16] cvtdq2pd xmm2, qword ptr [rcx+16]
@ -165,8 +169,7 @@ program_begin:
xor r13, qword ptr [rcx+40] xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48] xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56] xor r15, qword ptr [rcx+56]
mov eax, r12d ;# write address register 1 pop rax
and eax, 1048512
lea rcx, [rsi+rax] lea rcx, [rsi+rax]
mov qword ptr [rcx+0], r8 mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9 mov qword ptr [rcx+8], r9
@ -176,8 +179,7 @@ program_begin:
mov qword ptr [rcx+40], r13 mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14 mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15 mov qword ptr [rcx+56], r15
xor eax, r13d ;# write address register 2 pop rax
and eax, 1048512
lea rcx, [rsi+rax] lea rcx, [rsi+rax]
mulpd xmm0, xmm4 mulpd xmm0, xmm4
mulpd xmm1, xmm5 mulpd xmm1, xmm5
@ -187,6 +189,7 @@ program_begin:
movapd xmmword ptr [rcx+16], xmm1 movapd xmmword ptr [rcx+16], xmm1
movapd xmmword ptr [rcx+32], xmm2 movapd xmmword ptr [rcx+32], xmm2
movapd xmmword ptr [rcx+48], xmm3 movapd xmmword ptr [rcx+48], xmm3
xor eax, eax
dec ebx dec ebx
jnz program_begin jnz program_begin