mirror of
https://git.wownero.com/wownero/RandomWOW.git
synced 2024-08-15 00:23:14 +00:00
Mix dataset cacheline with registers r0-r7
This commit is contained in:
parent
48d85643de
commit
a7ffe8c19a
8 changed files with 691 additions and 767 deletions
|
@ -67,20 +67,16 @@ namespace RandomX {
|
|||
void AssemblyGeneratorX86::gena(Instruction& instr, int i) {
|
||||
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
|
||||
asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl;
|
||||
if ((instr.loca & 192) == 0)
|
||||
asmCode << "\txor " << regMx << ", rax" << std::endl;
|
||||
asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
|
||||
asmCode << "\tjnz short rx_body_" << i << std::endl;
|
||||
asmCode << "\tcall rx_read" << std::endl;
|
||||
asmCode << "rx_body_" << i << ":" << std::endl;
|
||||
if (instr.loca & 3) {
|
||||
asmCode << "\tcall rx_read_l1" << std::endl;
|
||||
asmCode << "rx_body_" << i << ":" << std::endl;
|
||||
if ((instr.loca & 192) == 0)
|
||||
asmCode << "\txor " << regMx << ", rax" << std::endl;
|
||||
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
|
||||
}
|
||||
else {
|
||||
asmCode << "\tcall rx_read_l2" << std::endl;
|
||||
asmCode << "rx_body_" << i << ":" << std::endl;
|
||||
if ((instr.loca & 192) == 0)
|
||||
asmCode << "\txor " << regMx << ", rax" << std::endl;
|
||||
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -29,8 +29,7 @@
|
|||
.global DECL(randomx_program_prologue)
|
||||
.global DECL(randomx_program_begin)
|
||||
.global DECL(randomx_program_epilogue)
|
||||
.global DECL(randomx_program_read_l1)
|
||||
.global DECL(randomx_program_read_l2)
|
||||
.global DECL(randomx_program_read)
|
||||
.global DECL(randomx_program_end)
|
||||
.global DECL(randomx_program_transform)
|
||||
|
||||
|
@ -48,22 +47,10 @@ DECL(randomx_program_begin):
|
|||
DECL(randomx_program_epilogue):
|
||||
#include "asm/program_epilogue_linux.inc"
|
||||
|
||||
#define scratchpad_mask and eax, 2040
|
||||
|
||||
.align 64
|
||||
DECL(randomx_program_read_l1):
|
||||
DECL(randomx_program_read):
|
||||
#include "asm/program_read.inc"
|
||||
|
||||
#undef scratchpad_mask
|
||||
|
||||
#define scratchpad_mask and eax, 32760
|
||||
|
||||
.align 64
|
||||
DECL(randomx_program_read_l2):
|
||||
#include "asm/program_read.inc"
|
||||
|
||||
#undef scratchpad_mask
|
||||
|
||||
.align 64
|
||||
DECL(randomx_program_end):
|
||||
nop
|
||||
|
|
|
@ -20,8 +20,7 @@ _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
|
|||
PUBLIC randomx_program_prologue
|
||||
PUBLIC randomx_program_begin
|
||||
PUBLIC randomx_program_epilogue
|
||||
PUBLIC randomx_program_read_l1
|
||||
PUBLIC randomx_program_read_l2
|
||||
PUBLIC randomx_program_read
|
||||
PUBLIC randomx_program_end
|
||||
PUBLIC randomx_program_transform
|
||||
|
||||
|
@ -41,23 +40,10 @@ randomx_program_epilogue PROC
|
|||
include asm/program_epilogue_win64.inc
|
||||
randomx_program_epilogue ENDP
|
||||
|
||||
scratchpad_mask MACRO
|
||||
and eax, 2040
|
||||
ENDM
|
||||
|
||||
ALIGN 64
|
||||
randomx_program_read_l1 PROC
|
||||
randomx_program_read PROC
|
||||
include asm/program_read.inc
|
||||
randomx_program_read_l1 ENDP
|
||||
|
||||
scratchpad_mask MACRO
|
||||
and eax, 32760
|
||||
ENDM
|
||||
|
||||
ALIGN 64
|
||||
randomx_program_read_l2 PROC
|
||||
include asm/program_read.inc
|
||||
randomx_program_read_l2 ENDP
|
||||
randomx_program_read ENDP
|
||||
|
||||
ALIGN 64
|
||||
randomx_program_end PROC
|
||||
|
|
|
@ -22,7 +22,6 @@ extern "C" {
|
|||
void randomx_program_begin();
|
||||
void randomx_program_epilogue();
|
||||
void randomx_program_transform();
|
||||
void randomx_program_read_l1();
|
||||
void randomx_program_read_l2();
|
||||
void randomx_program_read();
|
||||
void randomx_program_end();
|
||||
}
|
|
@ -102,22 +102,19 @@ namespace RandomX {
|
|||
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
|
||||
const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin;
|
||||
const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
|
||||
const uint8_t* codeReadDatasetL1 = (uint8_t*)&randomx_program_read_l1;
|
||||
const uint8_t* codeReadDatasetL2 = (uint8_t*)&randomx_program_read_l2;
|
||||
const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read;
|
||||
const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
|
||||
const uint32_t* addressTransformations = (uint32_t*)&randomx_program_transform;
|
||||
|
||||
const int32_t prologueSize = codeProgramBegin - codePrologue;
|
||||
const int32_t epilogueSize = codeReadDatasetL1 - codeEpilogue;
|
||||
const int32_t readDatasetL1Size = codeReadDatasetL2 - codeReadDatasetL1;
|
||||
const int32_t readDatasetL2Size = codeProgramEnd - codeReadDatasetL2;
|
||||
const int32_t epilogueSize = codeReadDataset - codeEpilogue;
|
||||
const int32_t readDatasetSize = codeProgramEnd - codeReadDataset;
|
||||
|
||||
const int32_t readDatasetL2Offset = CodeSize - readDatasetL2Size;
|
||||
const int32_t readDatasetL1Offset = readDatasetL2Offset - readDatasetL1Size;
|
||||
const int32_t epilogueOffset = readDatasetL1Offset - epilogueSize;
|
||||
const int32_t readDatasetOffset = CodeSize - readDatasetSize;
|
||||
const int32_t epilogueOffset = readDatasetOffset - epilogueSize;
|
||||
|
||||
size_t JitCompilerX86::getCodeSize() {
|
||||
return codePos - prologueSize + readDatasetL1Size + readDatasetL2Size;
|
||||
return codePos - prologueSize + readDatasetSize;
|
||||
}
|
||||
|
||||
JitCompilerX86::JitCompilerX86() {
|
||||
|
@ -131,9 +128,8 @@ namespace RandomX {
|
|||
throw std::runtime_error("mmap failed");
|
||||
#endif
|
||||
memcpy(code, codePrologue, prologueSize);
|
||||
memcpy(code + CodeSize - epilogueSize - readDatasetL1Size - readDatasetL2Size, codeEpilogue, epilogueSize);
|
||||
memcpy(code + CodeSize - readDatasetL1Size - readDatasetL2Size, codeReadDatasetL1, readDatasetL1Size);
|
||||
memcpy(code + CodeSize - readDatasetL2Size, codeReadDatasetL2, readDatasetL2Size);
|
||||
memcpy(code + CodeSize - epilogueSize - readDatasetSize, codeEpilogue, epilogueSize);
|
||||
memcpy(code + CodeSize - readDatasetSize, codeReadDataset, readDatasetSize);
|
||||
}
|
||||
|
||||
void JitCompilerX86::generateProgram(Pcg32& gen) {
|
||||
|
@ -150,10 +146,8 @@ namespace RandomX {
|
|||
emitByte(0xe9);
|
||||
emit(instructionOffsets[0] - (codePos + 4));
|
||||
fixCallOffsets();
|
||||
uint32_t transformL1 = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
|
||||
uint32_t transformL2 = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
|
||||
*reinterpret_cast<uint32_t*>(code + readDatasetL1Offset + 1) = transformL1;
|
||||
*reinterpret_cast<uint32_t*>(code + readDatasetL2Offset + 1) = transformL2;
|
||||
uint32_t transform = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
|
||||
*reinterpret_cast<uint32_t*>(code + readDatasetOffset) = transform;
|
||||
}
|
||||
|
||||
void JitCompilerX86::generateCode(Instruction& instr, int i) {
|
||||
|
@ -176,18 +170,13 @@ namespace RandomX {
|
|||
emit(instr.addra);
|
||||
emit(uint16_t(0x8b41)); //mov
|
||||
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
|
||||
emit(0x753fc3f6); //test bl,0x3f; jne
|
||||
emit(uint16_t(0xe805));
|
||||
if (instr.loca & 3) { //A.LOC.W
|
||||
emit(readDatasetL1Offset - (codePos + 4));
|
||||
}
|
||||
else {
|
||||
emit(readDatasetL2Offset - (codePos + 4));
|
||||
}
|
||||
if ((instr.loca & 192) == 0) { //A.LOC.X
|
||||
emit(uint16_t(0x3348));
|
||||
emitByte(0xe8); //xor rbp, rax
|
||||
}
|
||||
emit(0x753fc3f6); //test bl,0x3f; jne
|
||||
emit(uint16_t(0xe805));
|
||||
emit(readDatasetOffset - (codePos + 4));
|
||||
emitByte(0x25); //and eax,
|
||||
if (instr.loca & 3) {
|
||||
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
push rax ;# preserve eax
|
||||
db 0, 0, 0, 0 ;# TransformAddress placeholder
|
||||
mov rcx, qword ptr [rdi] ;# load the dataset address
|
||||
xor rbp, rax ;# modify "mx"
|
||||
|
@ -9,24 +8,13 @@
|
|||
;# read cacheline "ma"
|
||||
ror rbp, 32 ;# swap "ma" and "mx"
|
||||
mov edx, ebp ;# edx = ma
|
||||
scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8
|
||||
lea rax, [rsi+rax*8] ;# scratchpad cache line
|
||||
lea rcx, [rcx+rdx] ;# dataset cache line
|
||||
mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now)
|
||||
xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
|
||||
mov rdx, qword ptr [rcx+8]
|
||||
xor qword ptr [rax+8], rdx
|
||||
mov rdx, qword ptr [rcx+16]
|
||||
xor qword ptr [rax+16], rdx
|
||||
mov rdx, qword ptr [rcx+24]
|
||||
xor qword ptr [rax+24], rdx
|
||||
mov rdx, qword ptr [rcx+32]
|
||||
xor qword ptr [rax+32], rdx
|
||||
mov rdx, qword ptr [rcx+40]
|
||||
xor qword ptr [rax+40], rdx
|
||||
mov rdx, qword ptr [rcx+48]
|
||||
xor qword ptr [rax+48], rdx
|
||||
mov rdx, qword ptr [rcx+56]
|
||||
xor qword ptr [rax+56], rdx
|
||||
pop rax ;# restore eax
|
||||
xor r8, qword ptr [rcx+0]
|
||||
xor r9, qword ptr [rcx+8]
|
||||
xor r10, qword ptr [rcx+16]
|
||||
xor r11, qword ptr [rcx+24]
|
||||
xor r12, qword ptr [rcx+32]
|
||||
xor r13, qword ptr [rcx+40]
|
||||
xor r14, qword ptr [rcx+48]
|
||||
xor r15, qword ptr [rcx+56]
|
||||
ret
|
|
@ -221,54 +221,33 @@ TransformAddress MACRO reg32, reg64
|
|||
;xor reg32, -8 ;# C = all except 0 to 7
|
||||
ENDM
|
||||
|
||||
ReadMemoryRandom MACRO spmask
|
||||
ALIGN 64
|
||||
rx_read:
|
||||
;# IN eax = random 32-bit address
|
||||
;# GLOBAL rdi = address of the dataset address
|
||||
;# GLOBAL rsi = address of the scratchpad
|
||||
;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma"
|
||||
;# MODIFY rcx, rdx
|
||||
push rax ;# preserve eax
|
||||
TransformAddress eax, rax ;# TransformAddress function
|
||||
mov rcx, qword ptr [rdi] ;# load the dataset address
|
||||
xor rbp, rax ;# modify "mx"
|
||||
; prefetch cacheline "mx"
|
||||
;# prefetch cacheline "mx"
|
||||
and rbp, -64 ;# align "mx" to the start of a cache line
|
||||
mov edx, ebp ;# edx = mx
|
||||
prefetchnta byte ptr [rcx+rdx]
|
||||
; read cacheline "ma"
|
||||
;# read cacheline "ma"
|
||||
ror rbp, 32 ;# swap "ma" and "mx"
|
||||
mov edx, ebp ;# edx = ma
|
||||
and eax, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8
|
||||
lea rax, [rsi+rax*8] ;# scratchpad cache line
|
||||
lea rcx, [rcx+rdx] ;# dataset cache line
|
||||
mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now)
|
||||
xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
|
||||
mov rdx, qword ptr [rcx+8]
|
||||
xor qword ptr [rax+8], rdx
|
||||
mov rdx, qword ptr [rcx+16]
|
||||
xor qword ptr [rax+16], rdx
|
||||
mov rdx, qword ptr [rcx+24]
|
||||
xor qword ptr [rax+24], rdx
|
||||
mov rdx, qword ptr [rcx+32]
|
||||
xor qword ptr [rax+32], rdx
|
||||
mov rdx, qword ptr [rcx+40]
|
||||
xor qword ptr [rax+40], rdx
|
||||
mov rdx, qword ptr [rcx+48]
|
||||
xor qword ptr [rax+48], rdx
|
||||
mov rdx, qword ptr [rcx+56]
|
||||
xor qword ptr [rax+56], rdx
|
||||
pop rax ;# restore eax
|
||||
xor r8, qword ptr [rcx+0]
|
||||
xor r9, qword ptr [rcx+8]
|
||||
xor r10, qword ptr [rcx+16]
|
||||
xor r11, qword ptr [rcx+24]
|
||||
xor r12, qword ptr [rcx+32]
|
||||
xor r13, qword ptr [rcx+40]
|
||||
xor r14, qword ptr [rcx+48]
|
||||
xor r15, qword ptr [rcx+56]
|
||||
ret
|
||||
ENDM
|
||||
|
||||
ALIGN 64
|
||||
rx_read_l1:
|
||||
ReadMemoryRandom 2047
|
||||
|
||||
ALIGN 64
|
||||
rx_read_l2:
|
||||
ReadMemoryRandom 32767
|
||||
|
||||
executeProgram ENDP
|
||||
|
||||
_RANDOMX_EXECUTE_PROGRAM ENDS
|
||||
|
|
1296
src/program.inc
1296
src/program.inc
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue