Merged load/store of integer and FP registers

This commit is contained in:
tevador 2019-01-27 19:33:55 +01:00
parent 8f2abd6c05
commit 20eb549725
14 changed files with 88 additions and 114 deletions

View File

@ -27,13 +27,11 @@
#define DECL(x) x
#endif
.global DECL(randomx_program_prologue)
.global DECL(randomx_loop_begin)
.global DECL(randomx_program_load_int)
.global DECL(randomx_program_load_flt)
.global DECL(randomx_program_loop_begin)
.global DECL(randomx_program_loop_load)
.global DECL(randomx_program_start)
.global DECL(randomx_program_read_dataset)
.global DECL(randomx_program_store_int)
.global DECL(randomx_program_store_flt)
.global DECL(randomx_program_loop_store)
.global DECL(randomx_program_loop_end)
.global DECL(randomx_program_epilogue)
.global DECL(randomx_program_end)
@ -48,14 +46,11 @@ DECL(randomx_program_prologue):
#include "asm/program_xmm_constants.inc"
.align 64
DECL(randomx_loop_begin):
DECL(randomx_program_loop_begin):
nop
DECL(randomx_program_load_int):
#include "asm/program_load_int.inc"
DECL(randomx_program_load_flt):
#include "asm/program_load_flt.inc"
DECL(randomx_program_loop_load):
#include "asm/program_loop_load.inc"
DECL(randomx_program_start):
nop
@ -63,11 +58,8 @@ DECL(randomx_program_start):
DECL(randomx_program_read_dataset):
#include "asm/program_read_dataset.inc"
DECL(randomx_program_store_int):
#include "asm/program_store_int.inc"
DECL(randomx_program_store_flt):
#include "asm/program_store_flt.inc"
DECL(randomx_program_loop_store):
#include "asm/program_loop_store.inc"
DECL(randomx_program_loop_end):
nop

View File

@ -20,13 +20,11 @@ IFDEF RAX
_RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
PUBLIC randomx_program_prologue
PUBLIC randomx_loop_begin
PUBLIC randomx_program_load_int
PUBLIC randomx_program_load_flt
PUBLIC randomx_program_loop_begin
PUBLIC randomx_program_loop_load
PUBLIC randomx_program_start
PUBLIC randomx_program_read_dataset
PUBLIC randomx_program_store_int
PUBLIC randomx_program_store_flt
PUBLIC randomx_program_loop_store
PUBLIC randomx_program_loop_end
PUBLIC randomx_program_epilogue
PUBLIC randomx_program_end
@ -40,17 +38,13 @@ ALIGN 64
include asm/program_xmm_constants.inc
ALIGN 64
randomx_loop_begin PROC
randomx_program_loop_begin PROC
nop
randomx_loop_begin ENDP
randomx_program_loop_begin ENDP
randomx_program_load_int PROC
include asm/program_load_int.inc
randomx_program_load_int ENDP
randomx_program_load_flt PROC
include asm/program_load_flt.inc
randomx_program_load_flt ENDP
randomx_program_loop_load PROC
include asm/program_loop_load.inc
randomx_program_loop_load ENDP
randomx_program_start PROC
nop
@ -60,13 +54,9 @@ randomx_program_read_dataset PROC
include asm/program_read_dataset.inc
randomx_program_read_dataset ENDP
randomx_program_store_int PROC
include asm/program_store_int.inc
randomx_program_store_int ENDP
randomx_program_store_flt PROC
include asm/program_store_flt.inc
randomx_program_store_flt ENDP
randomx_program_loop_store PROC
include asm/program_loop_store.inc
randomx_program_loop_store ENDP
randomx_program_loop_end PROC
nop

View File

@ -19,13 +19,11 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
extern "C" {
void randomx_program_prologue();
void randomx_loop_begin();
void randomx_program_load_int();
void randomx_program_load_flt();
void randomx_program_loop_begin();
void randomx_program_loop_load();
void randomx_program_start();
void randomx_program_read_dataset();
void randomx_program_store_int();
void randomx_program_store_flt();
void randomx_program_loop_store();
void randomx_program_loop_end();
void randomx_program_epilogue();
void randomx_program_end();

View File

@ -94,13 +94,11 @@ namespace RandomX {
#include "JitCompilerX86-static.hpp"
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
const uint8_t* codeLoopBegin = (uint8_t*)&randomx_loop_begin;
const uint8_t* codeLoadInt = (uint8_t*)&randomx_program_load_int;
const uint8_t* codeLoadFlt = (uint8_t*)&randomx_program_load_flt;
const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin;
const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load;
const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start;
const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset;
const uint8_t* codeStoreInt = (uint8_t*)&randomx_program_store_int;
const uint8_t* codeStoreFlt = (uint8_t*)&randomx_program_store_flt;
const uint8_t* codeLoopStore = (uint8_t*)&randomx_program_loop_store;
const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end;
const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
@ -108,11 +106,9 @@ namespace RandomX {
const int32_t prologueSize = codeLoopBegin - codePrologue;
const int32_t epilogueSize = codeProgramEnd - codeEpilogue;
const int32_t loadIntSize = codeLoadFlt - codeLoadInt;
const int32_t loadFltSize = codeProgamStart - codeLoadFlt;
const int32_t readDatasetSize = codeStoreInt - codeReadDataset;
const int32_t storeIntSize = codeStoreFlt - codeStoreInt;
const int32_t storeFltSize = codeLoopEnd - codeStoreFlt;
const int32_t loopLoadSize = codeProgamStart - codeLoopLoad;
const int32_t readDatasetSize = codeLoopStore - codeReadDataset;
const int32_t loopStoreSize = codeLoopEnd - codeLoopStore;
const int32_t epilogueOffset = CodeSize - epilogueSize;
@ -179,6 +175,7 @@ namespace RandomX {
static const uint8_t SUB_EBX[] = { 0x83, 0xEB, 0x01 };
static const uint8_t JNZ[] = { 0x0f, 0x85 };
static const uint8_t JMP = 0xe9;
static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
size_t JitCompilerX86::getCodeSize() {
return codePos - prologueSize;
@ -204,18 +201,16 @@ namespace RandomX {
addressRegisters >>= 1;
int readReg2 = 2 + (addressRegisters & 1);
addressRegisters >>= 1;
int writeReg1 = 4 + (addressRegisters & 1);
int readReg3 = 4 + (addressRegisters & 1);
addressRegisters >>= 1;
int writeReg2 = 6 + (addressRegisters & 1);
int readReg4 = 6 + (addressRegisters & 1);
codePos = prologueSize;
emit(REX_XOR_EAX);
emit(REX_XOR_RAX_R64);
emitByte(0xc0 + readReg1);
memcpy(code + codePos, codeLoadInt, loadIntSize);
codePos += loadIntSize;
emit(REX_XOR_EAX);
emit(REX_XOR_RAX_R64);
emitByte(0xc0 + readReg2);
memcpy(code + codePos, codeLoadFlt, loadFltSize);
codePos += loadFltSize;
memcpy(code + codePos, codeLoopLoad, loopLoadSize);
codePos += loopLoadSize;
Instruction instr;
for (unsigned i = 0; i < ProgramLength; ++i) {
for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) {
@ -226,19 +221,13 @@ namespace RandomX {
generateCode(instr);
}
emit(REX_MOV_RR);
emitByte(0xc0 + readReg1);
emitByte(0xc0 + readReg3);
emit(REX_XOR_EAX);
emitByte(0xc0 + readReg2);
emitByte(0xc0 + readReg4);
memcpy(code + codePos, codeReadDataset, readDatasetSize);
codePos += readDatasetSize;
emit(REX_MOV_RR);
emitByte(0xc0 + writeReg1);
memcpy(code + codePos, codeStoreInt, storeIntSize);
codePos += storeIntSize;
emit(REX_XOR_EAX);
emitByte(0xc0 + writeReg2);
memcpy(code + codePos, codeStoreFlt, storeFltSize);
codePos += storeFltSize;
memcpy(code + codePos, codeLoopStore, loopStoreSize);
codePos += loopStoreSize;
emit(SUB_EBX);
emit(JNZ);
emit32(prologueSize - codePos - 4);

View File

@ -1,10 +0,0 @@
;# Integer load fragment: XORs 64 bytes of memory into registers r8-r15.
;# In:  eax = raw address value, rsi = base pointer (the program prologue
;#      comments label rsi as the scratchpad pointer)
;# Out: rcx = rsi + masked offset; r8-r15 updated; flags modified
;# 1048512 = 0xFFFC0: keeps the offset a multiple of 64 and below 2^20.
;# Writing eax also clears the upper 32 bits of rax, so rax is a clean offset.
and eax, 1048512
lea rcx, [rsi+rax]
;# r8..r15 ^= the eight consecutive qwords at [rcx+0 .. rcx+63]
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]

View File

@ -1,5 +1,19 @@
mov rdx, rax
and eax, 1048512
lea rcx, [rsi+rax]
push rcx
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
ror rdx, 32
and edx, 1048512
lea rcx, [rsi+rdx]
push rcx
cvtdq2pd xmm0, qword ptr [rcx+0]
cvtdq2pd xmm1, qword ptr [rcx+8]
cvtdq2pd xmm2, qword ptr [rcx+16]

View File

@ -0,0 +1,18 @@
;# Loop store fragment: writes r8-r15 and xmm0-xmm3 back to memory.
;# Both destination addresses are popped from the stack; they are presumably
;# the two addresses pushed by the loop-load fragment (verify against it).
;# Because the stack is LIFO, the first pop returns the address pushed last.
;# Clobbers: rcx, xmm0-xmm3; rsp moves up by 16 bytes in total.
pop rcx
;# store r8..r15 as eight consecutive qwords at [rcx+0 .. rcx+63]
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10
mov qword ptr [rcx+24], r11
mov qword ptr [rcx+32], r12
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
;# second destination address, used for the floating point registers
pop rcx
;# xmm0..xmm3 *= xmm4..xmm7 (packed double precision) before they are stored
mulpd xmm0, xmm4
mulpd xmm1, xmm5
mulpd xmm2, xmm6
mulpd xmm3, xmm7
;# movapd faults on a misaligned operand, so rcx must be 16-byte aligned here
movapd xmmword ptr [rcx+0], xmm0
movapd xmmword ptr [rcx+16], xmm1
movapd xmmword ptr [rcx+32], xmm2
movapd xmmword ptr [rcx+48], xmm3

View File

@ -11,10 +11,9 @@
push rdi ;# RegisterFile& registerFile
mov rcx, rdi
mov rbp, qword ptr [rsi] ;# "mx", "ma"
mov eax, ebp ;# "mx"
mov rdi, qword ptr [rsi+8] ;# uint8_t* dataset
mov rsi, rdx ;# convertible_t* scratchpad
#include "program_prologue_load.inc"
jmp DECL(randomx_loop_begin)
jmp DECL(randomx_program_loop_begin)

View File

@ -1,3 +1,5 @@
mov rax, rbp
;# zero integer registers
xor r8, r8
xor r9, r9

View File

@ -23,11 +23,10 @@
; function arguments
push rcx ; RegisterFile& registerFile
mov rbp, qword ptr [rdx] ; "mx", "ma"
mov eax, ebp ; "mx"
mov rdi, qword ptr [rdx+8] ; uint8_t* dataset
mov rsi, r8 ; convertible_t* scratchpad
mov rbx, r9 ; loop counter
include program_prologue_load.inc
jmp randomx_loop_begin
jmp randomx_program_loop_begin

View File

@ -1,4 +1,5 @@
xor rbp, rax ;# modify "mx"
xor eax, eax
and rbp, -64 ;# align "mx" to the start of a cache line
mov edx, ebp ;# edx = mx
prefetchnta byte ptr [rdi+rdx]

View File

@ -1,11 +0,0 @@
;# Floating point store fragment: scales xmm0-xmm3 and writes them to memory.
;# In:  eax = raw address value, rsi = base pointer (the program prologue
;#      comments label rsi as the scratchpad pointer)
;# Out: rcx = rsi + masked offset; xmm0-xmm3 hold the scaled values; flags modified
;# 1048512 = 0xFFFC0: keeps the offset a multiple of 64 and below 2^20.
and eax, 1048512
lea rcx, [rsi+rax]
;# xmm0..xmm3 *= xmm4..xmm7 (packed double precision) before they are stored
mulpd xmm0, xmm4
mulpd xmm1, xmm5
mulpd xmm2, xmm6
mulpd xmm3, xmm7
;# movapd requires a 16-byte aligned address; the masked offset is 64-byte
;# aligned, so this holds as long as rsi itself is suitably aligned
movapd xmmword ptr [rcx+0], xmm0
movapd xmmword ptr [rcx+16], xmm1
movapd xmmword ptr [rcx+32], xmm2
movapd xmmword ptr [rcx+48], xmm3

View File

@ -1,10 +0,0 @@
;# Integer store fragment: writes r8-r15 to 64 bytes of memory.
;# In:  eax = raw address value, rsi = base pointer (the program prologue
;#      comments label rsi as the scratchpad pointer)
;# Out: rcx = rsi + masked offset; flags modified by the AND
;# 1048512 = 0xFFFC0: keeps the offset a multiple of 64 and below 2^20.
;# Writing eax also clears the upper 32 bits of rax, so rax is a clean offset.
and eax, 1048512
lea rcx, [rsi+rax]
;# store r8..r15 as eight consecutive qwords at [rcx+0 .. rcx+63]
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10
mov qword ptr [rcx+24], r11
mov qword ptr [rcx+32], r12
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15

View File

@ -118,8 +118,11 @@ signMask:
ALIGN 64
program_begin:
xor eax, r8d ;# read address register 1
xor rax, r8 ;# read address register 1
xor rax, r9
mov rdx, rax
and eax, 1048512
push rax
lea rcx, [rsi+rax]
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
@ -129,9 +132,10 @@ program_begin:
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
xor eax, r9d ;# read address register 2
and eax, 1048512
lea rcx, [rsi+rax]
ror rdx, 32
and edx, 1048512
push rdx
lea rcx, [rsi+rdx]
cvtdq2pd xmm0, qword ptr [rcx+0]
cvtdq2pd xmm1, qword ptr [rcx+8]
cvtdq2pd xmm2, qword ptr [rcx+16]
@ -164,9 +168,8 @@ program_begin:
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
mov eax, r12d ;# write address register 1
and eax, 1048512
xor r15, qword ptr [rcx+56]
pop rax
lea rcx, [rsi+rax]
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
@ -176,8 +179,7 @@ program_begin:
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
xor eax, r13d ;# write address register 2
and eax, 1048512
pop rax
lea rcx, [rsi+rax]
mulpd xmm0, xmm4
mulpd xmm1, xmm5
@ -187,6 +189,7 @@ program_begin:
movapd xmmword ptr [rcx+16], xmm1
movapd xmmword ptr [rcx+32], xmm2
movapd xmmword ptr [rcx+48], xmm3
xor eax, eax
dec ebx
jnz program_begin