Mix dataset cacheline with registers r0-r7

This commit is contained in:
tevador 2019-01-13 21:14:59 +01:00
parent 48d85643de
commit a7ffe8c19a
8 changed files with 691 additions and 767 deletions

View file

@ -67,20 +67,16 @@ namespace RandomX {
void AssemblyGeneratorX86::gena(Instruction& instr, int i) { void AssemblyGeneratorX86::gena(Instruction& instr, int i) {
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl; asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl; asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl;
if ((instr.loca & 192) == 0)
asmCode << "\txor " << regMx << ", rax" << std::endl;
asmCode << "\ttest " << regIc8 << ", 63" << std::endl; asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
asmCode << "\tjnz short rx_body_" << i << std::endl; asmCode << "\tjnz short rx_body_" << i << std::endl;
asmCode << "\tcall rx_read" << std::endl;
asmCode << "rx_body_" << i << ":" << std::endl;
if (instr.loca & 3) { if (instr.loca & 3) {
asmCode << "\tcall rx_read_l1" << std::endl;
asmCode << "rx_body_" << i << ":" << std::endl;
if ((instr.loca & 192) == 0)
asmCode << "\txor " << regMx << ", rax" << std::endl;
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl; asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
} }
else { else {
asmCode << "\tcall rx_read_l2" << std::endl;
asmCode << "rx_body_" << i << ":" << std::endl;
if ((instr.loca & 192) == 0)
asmCode << "\txor " << regMx << ", rax" << std::endl;
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl; asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
} }
} }

View file

@ -29,8 +29,7 @@
.global DECL(randomx_program_prologue) .global DECL(randomx_program_prologue)
.global DECL(randomx_program_begin) .global DECL(randomx_program_begin)
.global DECL(randomx_program_epilogue) .global DECL(randomx_program_epilogue)
.global DECL(randomx_program_read_l1) .global DECL(randomx_program_read)
.global DECL(randomx_program_read_l2)
.global DECL(randomx_program_end) .global DECL(randomx_program_end)
.global DECL(randomx_program_transform) .global DECL(randomx_program_transform)
@ -48,22 +47,10 @@ DECL(randomx_program_begin):
DECL(randomx_program_epilogue): DECL(randomx_program_epilogue):
#include "asm/program_epilogue_linux.inc" #include "asm/program_epilogue_linux.inc"
#define scratchpad_mask and eax, 2040
.align 64 .align 64
DECL(randomx_program_read_l1): DECL(randomx_program_read):
#include "asm/program_read.inc" #include "asm/program_read.inc"
#undef scratchpad_mask
#define scratchpad_mask and eax, 32760
.align 64
DECL(randomx_program_read_l2):
#include "asm/program_read.inc"
#undef scratchpad_mask
.align 64 .align 64
DECL(randomx_program_end): DECL(randomx_program_end):
nop nop

View file

@ -20,8 +20,7 @@ _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
PUBLIC randomx_program_prologue PUBLIC randomx_program_prologue
PUBLIC randomx_program_begin PUBLIC randomx_program_begin
PUBLIC randomx_program_epilogue PUBLIC randomx_program_epilogue
PUBLIC randomx_program_read_l1 PUBLIC randomx_program_read
PUBLIC randomx_program_read_l2
PUBLIC randomx_program_end PUBLIC randomx_program_end
PUBLIC randomx_program_transform PUBLIC randomx_program_transform
@ -41,23 +40,10 @@ randomx_program_epilogue PROC
include asm/program_epilogue_win64.inc include asm/program_epilogue_win64.inc
randomx_program_epilogue ENDP randomx_program_epilogue ENDP
scratchpad_mask MACRO
and eax, 2040
ENDM
ALIGN 64 ALIGN 64
randomx_program_read_l1 PROC randomx_program_read PROC
include asm/program_read.inc include asm/program_read.inc
randomx_program_read_l1 ENDP randomx_program_read ENDP
scratchpad_mask MACRO
and eax, 32760
ENDM
ALIGN 64
randomx_program_read_l2 PROC
include asm/program_read.inc
randomx_program_read_l2 ENDP
ALIGN 64 ALIGN 64
randomx_program_end PROC randomx_program_end PROC

View file

@ -22,7 +22,6 @@ extern "C" {
void randomx_program_begin(); void randomx_program_begin();
void randomx_program_epilogue(); void randomx_program_epilogue();
void randomx_program_transform(); void randomx_program_transform();
void randomx_program_read_l1(); void randomx_program_read();
void randomx_program_read_l2();
void randomx_program_end(); void randomx_program_end();
} }

View file

@ -102,22 +102,19 @@ namespace RandomX {
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin; const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin;
const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue; const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
const uint8_t* codeReadDatasetL1 = (uint8_t*)&randomx_program_read_l1; const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read;
const uint8_t* codeReadDatasetL2 = (uint8_t*)&randomx_program_read_l2;
const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
const uint32_t* addressTransformations = (uint32_t*)&randomx_program_transform; const uint32_t* addressTransformations = (uint32_t*)&randomx_program_transform;
const int32_t prologueSize = codeProgramBegin - codePrologue; const int32_t prologueSize = codeProgramBegin - codePrologue;
const int32_t epilogueSize = codeReadDatasetL1 - codeEpilogue; const int32_t epilogueSize = codeReadDataset - codeEpilogue;
const int32_t readDatasetL1Size = codeReadDatasetL2 - codeReadDatasetL1; const int32_t readDatasetSize = codeProgramEnd - codeReadDataset;
const int32_t readDatasetL2Size = codeProgramEnd - codeReadDatasetL2;
const int32_t readDatasetL2Offset = CodeSize - readDatasetL2Size; const int32_t readDatasetOffset = CodeSize - readDatasetSize;
const int32_t readDatasetL1Offset = readDatasetL2Offset - readDatasetL1Size; const int32_t epilogueOffset = readDatasetOffset - epilogueSize;
const int32_t epilogueOffset = readDatasetL1Offset - epilogueSize;
size_t JitCompilerX86::getCodeSize() { size_t JitCompilerX86::getCodeSize() {
return codePos - prologueSize + readDatasetL1Size + readDatasetL2Size; return codePos - prologueSize + readDatasetSize;
} }
JitCompilerX86::JitCompilerX86() { JitCompilerX86::JitCompilerX86() {
@ -131,9 +128,8 @@ namespace RandomX {
throw std::runtime_error("mmap failed"); throw std::runtime_error("mmap failed");
#endif #endif
memcpy(code, codePrologue, prologueSize); memcpy(code, codePrologue, prologueSize);
memcpy(code + CodeSize - epilogueSize - readDatasetL1Size - readDatasetL2Size, codeEpilogue, epilogueSize); memcpy(code + CodeSize - epilogueSize - readDatasetSize, codeEpilogue, epilogueSize);
memcpy(code + CodeSize - readDatasetL1Size - readDatasetL2Size, codeReadDatasetL1, readDatasetL1Size); memcpy(code + CodeSize - readDatasetSize, codeReadDataset, readDatasetSize);
memcpy(code + CodeSize - readDatasetL2Size, codeReadDatasetL2, readDatasetL2Size);
} }
void JitCompilerX86::generateProgram(Pcg32& gen) { void JitCompilerX86::generateProgram(Pcg32& gen) {
@ -150,10 +146,8 @@ namespace RandomX {
emitByte(0xe9); emitByte(0xe9);
emit(instructionOffsets[0] - (codePos + 4)); emit(instructionOffsets[0] - (codePos + 4));
fixCallOffsets(); fixCallOffsets();
uint32_t transformL1 = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; uint32_t transform = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
uint32_t transformL2 = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; *reinterpret_cast<uint32_t*>(code + readDatasetOffset) = transform;
*reinterpret_cast<uint32_t*>(code + readDatasetL1Offset + 1) = transformL1;
*reinterpret_cast<uint32_t*>(code + readDatasetL2Offset + 1) = transformL2;
} }
void JitCompilerX86::generateCode(Instruction& instr, int i) { void JitCompilerX86::generateCode(Instruction& instr, int i) {
@ -176,18 +170,13 @@ namespace RandomX {
emit(instr.addra); emit(instr.addra);
emit(uint16_t(0x8b41)); //mov emit(uint16_t(0x8b41)); //mov
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emit(0x753fc3f6); //test bl,0x3f; jne
emit(uint16_t(0xe805));
if (instr.loca & 3) { //A.LOC.W
emit(readDatasetL1Offset - (codePos + 4));
}
else {
emit(readDatasetL2Offset - (codePos + 4));
}
if ((instr.loca & 192) == 0) { //A.LOC.X if ((instr.loca & 192) == 0) { //A.LOC.X
emit(uint16_t(0x3348)); emit(uint16_t(0x3348));
emitByte(0xe8); //xor rbp, rax emitByte(0xe8); //xor rbp, rax
} }
emit(0x753fc3f6); //test bl,0x3f; jne
emit(uint16_t(0xe805));
emit(readDatasetOffset - (codePos + 4));
emitByte(0x25); //and eax, emitByte(0x25); //and eax,
if (instr.loca & 3) { if (instr.loca & 3) {
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad

View file

@ -1,4 +1,3 @@
push rax ;# preserve eax
db 0, 0, 0, 0 ;# TransformAddress placeholder db 0, 0, 0, 0 ;# TransformAddress placeholder
mov rcx, qword ptr [rdi] ;# load the dataset address mov rcx, qword ptr [rdi] ;# load the dataset address
xor rbp, rax ;# modify "mx" xor rbp, rax ;# modify "mx"
@ -9,24 +8,13 @@
;# read cacheline "ma" ;# read cacheline "ma"
ror rbp, 32 ;# swap "ma" and "mx" ror rbp, 32 ;# swap "ma" and "mx"
mov edx, ebp ;# edx = ma mov edx, ebp ;# edx = ma
scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8
lea rax, [rsi+rax*8] ;# scratchpad cache line
lea rcx, [rcx+rdx] ;# dataset cache line lea rcx, [rcx+rdx] ;# dataset cache line
mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now) xor r8, qword ptr [rcx+0]
xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline xor r9, qword ptr [rcx+8]
mov rdx, qword ptr [rcx+8] xor r10, qword ptr [rcx+16]
xor qword ptr [rax+8], rdx xor r11, qword ptr [rcx+24]
mov rdx, qword ptr [rcx+16] xor r12, qword ptr [rcx+32]
xor qword ptr [rax+16], rdx xor r13, qword ptr [rcx+40]
mov rdx, qword ptr [rcx+24] xor r14, qword ptr [rcx+48]
xor qword ptr [rax+24], rdx xor r15, qword ptr [rcx+56]
mov rdx, qword ptr [rcx+32]
xor qword ptr [rax+32], rdx
mov rdx, qword ptr [rcx+40]
xor qword ptr [rax+40], rdx
mov rdx, qword ptr [rcx+48]
xor qword ptr [rax+48], rdx
mov rdx, qword ptr [rcx+56]
xor qword ptr [rax+56], rdx
pop rax ;# restore eax
ret ret

View file

@ -221,54 +221,33 @@ TransformAddress MACRO reg32, reg64
;xor reg32, -8 ;# C = all except 0 to 7 ;xor reg32, -8 ;# C = all except 0 to 7
ENDM ENDM
ReadMemoryRandom MACRO spmask ALIGN 64
rx_read:
;# IN eax = random 32-bit address ;# IN eax = random 32-bit address
;# GLOBAL rdi = address of the dataset address ;# GLOBAL rdi = address of the dataset address
;# GLOBAL rsi = address of the scratchpad ;# GLOBAL rsi = address of the scratchpad
;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma" ;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma"
;# MODIFY rcx, rdx ;# MODIFY rcx, rdx
push rax ;# preserve eax
TransformAddress eax, rax ;# TransformAddress function TransformAddress eax, rax ;# TransformAddress function
mov rcx, qword ptr [rdi] ;# load the dataset address mov rcx, qword ptr [rdi] ;# load the dataset address
xor rbp, rax ;# modify "mx" xor rbp, rax ;# modify "mx"
; prefetch cacheline "mx" ;# prefetch cacheline "mx"
and rbp, -64 ;# align "mx" to the start of a cache line and rbp, -64 ;# align "mx" to the start of a cache line
mov edx, ebp ;# edx = mx mov edx, ebp ;# edx = mx
prefetchnta byte ptr [rcx+rdx] prefetchnta byte ptr [rcx+rdx]
; read cacheline "ma" ;# read cacheline "ma"
ror rbp, 32 ;# swap "ma" and "mx" ror rbp, 32 ;# swap "ma" and "mx"
mov edx, ebp ;# edx = ma mov edx, ebp ;# edx = ma
and eax, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8
lea rax, [rsi+rax*8] ;# scratchpad cache line
lea rcx, [rcx+rdx] ;# dataset cache line lea rcx, [rcx+rdx] ;# dataset cache line
mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now) xor r8, qword ptr [rcx+0]
xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline xor r9, qword ptr [rcx+8]
mov rdx, qword ptr [rcx+8] xor r10, qword ptr [rcx+16]
xor qword ptr [rax+8], rdx xor r11, qword ptr [rcx+24]
mov rdx, qword ptr [rcx+16] xor r12, qword ptr [rcx+32]
xor qword ptr [rax+16], rdx xor r13, qword ptr [rcx+40]
mov rdx, qword ptr [rcx+24] xor r14, qword ptr [rcx+48]
xor qword ptr [rax+24], rdx xor r15, qword ptr [rcx+56]
mov rdx, qword ptr [rcx+32]
xor qword ptr [rax+32], rdx
mov rdx, qword ptr [rcx+40]
xor qword ptr [rax+40], rdx
mov rdx, qword ptr [rcx+48]
xor qword ptr [rax+48], rdx
mov rdx, qword ptr [rcx+56]
xor qword ptr [rax+56], rdx
pop rax ;# restore eax
ret ret
ENDM
ALIGN 64
rx_read_l1:
ReadMemoryRandom 2047
ALIGN 64
rx_read_l2:
ReadMemoryRandom 32767
executeProgram ENDP executeProgram ENDP
_RANDOMX_EXECUTE_PROGRAM ENDS _RANDOMX_EXECUTE_PROGRAM ENDS

File diff suppressed because it is too large Load diff