mirror of
https://git.wownero.com/wownero/RandomWOW.git
synced 2024-08-15 00:23:14 +00:00
Mix dataset cacheline with registers r0-r7
This commit is contained in:
parent
48d85643de
commit
a7ffe8c19a
8 changed files with 691 additions and 767 deletions
|
@ -67,20 +67,16 @@ namespace RandomX {
|
||||||
void AssemblyGeneratorX86::gena(Instruction& instr, int i) {
|
void AssemblyGeneratorX86::gena(Instruction& instr, int i) {
|
||||||
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
|
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
|
||||||
asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl;
|
asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl;
|
||||||
|
if ((instr.loca & 192) == 0)
|
||||||
|
asmCode << "\txor " << regMx << ", rax" << std::endl;
|
||||||
asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
|
asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
|
||||||
asmCode << "\tjnz short rx_body_" << i << std::endl;
|
asmCode << "\tjnz short rx_body_" << i << std::endl;
|
||||||
|
asmCode << "\tcall rx_read" << std::endl;
|
||||||
|
asmCode << "rx_body_" << i << ":" << std::endl;
|
||||||
if (instr.loca & 3) {
|
if (instr.loca & 3) {
|
||||||
asmCode << "\tcall rx_read_l1" << std::endl;
|
|
||||||
asmCode << "rx_body_" << i << ":" << std::endl;
|
|
||||||
if ((instr.loca & 192) == 0)
|
|
||||||
asmCode << "\txor " << regMx << ", rax" << std::endl;
|
|
||||||
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
|
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
asmCode << "\tcall rx_read_l2" << std::endl;
|
|
||||||
asmCode << "rx_body_" << i << ":" << std::endl;
|
|
||||||
if ((instr.loca & 192) == 0)
|
|
||||||
asmCode << "\txor " << regMx << ", rax" << std::endl;
|
|
||||||
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
|
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,8 +29,7 @@
|
||||||
.global DECL(randomx_program_prologue)
|
.global DECL(randomx_program_prologue)
|
||||||
.global DECL(randomx_program_begin)
|
.global DECL(randomx_program_begin)
|
||||||
.global DECL(randomx_program_epilogue)
|
.global DECL(randomx_program_epilogue)
|
||||||
.global DECL(randomx_program_read_l1)
|
.global DECL(randomx_program_read)
|
||||||
.global DECL(randomx_program_read_l2)
|
|
||||||
.global DECL(randomx_program_end)
|
.global DECL(randomx_program_end)
|
||||||
.global DECL(randomx_program_transform)
|
.global DECL(randomx_program_transform)
|
||||||
|
|
||||||
|
@ -48,22 +47,10 @@ DECL(randomx_program_begin):
|
||||||
DECL(randomx_program_epilogue):
|
DECL(randomx_program_epilogue):
|
||||||
#include "asm/program_epilogue_linux.inc"
|
#include "asm/program_epilogue_linux.inc"
|
||||||
|
|
||||||
#define scratchpad_mask and eax, 2040
|
|
||||||
|
|
||||||
.align 64
|
.align 64
|
||||||
DECL(randomx_program_read_l1):
|
DECL(randomx_program_read):
|
||||||
#include "asm/program_read.inc"
|
#include "asm/program_read.inc"
|
||||||
|
|
||||||
#undef scratchpad_mask
|
|
||||||
|
|
||||||
#define scratchpad_mask and eax, 32760
|
|
||||||
|
|
||||||
.align 64
|
|
||||||
DECL(randomx_program_read_l2):
|
|
||||||
#include "asm/program_read.inc"
|
|
||||||
|
|
||||||
#undef scratchpad_mask
|
|
||||||
|
|
||||||
.align 64
|
.align 64
|
||||||
DECL(randomx_program_end):
|
DECL(randomx_program_end):
|
||||||
nop
|
nop
|
||||||
|
|
|
@ -20,8 +20,7 @@ _RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
|
||||||
PUBLIC randomx_program_prologue
|
PUBLIC randomx_program_prologue
|
||||||
PUBLIC randomx_program_begin
|
PUBLIC randomx_program_begin
|
||||||
PUBLIC randomx_program_epilogue
|
PUBLIC randomx_program_epilogue
|
||||||
PUBLIC randomx_program_read_l1
|
PUBLIC randomx_program_read
|
||||||
PUBLIC randomx_program_read_l2
|
|
||||||
PUBLIC randomx_program_end
|
PUBLIC randomx_program_end
|
||||||
PUBLIC randomx_program_transform
|
PUBLIC randomx_program_transform
|
||||||
|
|
||||||
|
@ -41,23 +40,10 @@ randomx_program_epilogue PROC
|
||||||
include asm/program_epilogue_win64.inc
|
include asm/program_epilogue_win64.inc
|
||||||
randomx_program_epilogue ENDP
|
randomx_program_epilogue ENDP
|
||||||
|
|
||||||
scratchpad_mask MACRO
|
|
||||||
and eax, 2040
|
|
||||||
ENDM
|
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
randomx_program_read_l1 PROC
|
randomx_program_read PROC
|
||||||
include asm/program_read.inc
|
include asm/program_read.inc
|
||||||
randomx_program_read_l1 ENDP
|
randomx_program_read ENDP
|
||||||
|
|
||||||
scratchpad_mask MACRO
|
|
||||||
and eax, 32760
|
|
||||||
ENDM
|
|
||||||
|
|
||||||
ALIGN 64
|
|
||||||
randomx_program_read_l2 PROC
|
|
||||||
include asm/program_read.inc
|
|
||||||
randomx_program_read_l2 ENDP
|
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
randomx_program_end PROC
|
randomx_program_end PROC
|
||||||
|
|
|
@ -22,7 +22,6 @@ extern "C" {
|
||||||
void randomx_program_begin();
|
void randomx_program_begin();
|
||||||
void randomx_program_epilogue();
|
void randomx_program_epilogue();
|
||||||
void randomx_program_transform();
|
void randomx_program_transform();
|
||||||
void randomx_program_read_l1();
|
void randomx_program_read();
|
||||||
void randomx_program_read_l2();
|
|
||||||
void randomx_program_end();
|
void randomx_program_end();
|
||||||
}
|
}
|
|
@ -102,22 +102,19 @@ namespace RandomX {
|
||||||
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
|
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
|
||||||
const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin;
|
const uint8_t* codeProgramBegin = (uint8_t*)&randomx_program_begin;
|
||||||
const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
|
const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
|
||||||
const uint8_t* codeReadDatasetL1 = (uint8_t*)&randomx_program_read_l1;
|
const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read;
|
||||||
const uint8_t* codeReadDatasetL2 = (uint8_t*)&randomx_program_read_l2;
|
|
||||||
const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
|
const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
|
||||||
const uint32_t* addressTransformations = (uint32_t*)&randomx_program_transform;
|
const uint32_t* addressTransformations = (uint32_t*)&randomx_program_transform;
|
||||||
|
|
||||||
const int32_t prologueSize = codeProgramBegin - codePrologue;
|
const int32_t prologueSize = codeProgramBegin - codePrologue;
|
||||||
const int32_t epilogueSize = codeReadDatasetL1 - codeEpilogue;
|
const int32_t epilogueSize = codeReadDataset - codeEpilogue;
|
||||||
const int32_t readDatasetL1Size = codeReadDatasetL2 - codeReadDatasetL1;
|
const int32_t readDatasetSize = codeProgramEnd - codeReadDataset;
|
||||||
const int32_t readDatasetL2Size = codeProgramEnd - codeReadDatasetL2;
|
|
||||||
|
|
||||||
const int32_t readDatasetL2Offset = CodeSize - readDatasetL2Size;
|
const int32_t readDatasetOffset = CodeSize - readDatasetSize;
|
||||||
const int32_t readDatasetL1Offset = readDatasetL2Offset - readDatasetL1Size;
|
const int32_t epilogueOffset = readDatasetOffset - epilogueSize;
|
||||||
const int32_t epilogueOffset = readDatasetL1Offset - epilogueSize;
|
|
||||||
|
|
||||||
size_t JitCompilerX86::getCodeSize() {
|
size_t JitCompilerX86::getCodeSize() {
|
||||||
return codePos - prologueSize + readDatasetL1Size + readDatasetL2Size;
|
return codePos - prologueSize + readDatasetSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
JitCompilerX86::JitCompilerX86() {
|
JitCompilerX86::JitCompilerX86() {
|
||||||
|
@ -131,9 +128,8 @@ namespace RandomX {
|
||||||
throw std::runtime_error("mmap failed");
|
throw std::runtime_error("mmap failed");
|
||||||
#endif
|
#endif
|
||||||
memcpy(code, codePrologue, prologueSize);
|
memcpy(code, codePrologue, prologueSize);
|
||||||
memcpy(code + CodeSize - epilogueSize - readDatasetL1Size - readDatasetL2Size, codeEpilogue, epilogueSize);
|
memcpy(code + CodeSize - epilogueSize - readDatasetSize, codeEpilogue, epilogueSize);
|
||||||
memcpy(code + CodeSize - readDatasetL1Size - readDatasetL2Size, codeReadDatasetL1, readDatasetL1Size);
|
memcpy(code + CodeSize - readDatasetSize, codeReadDataset, readDatasetSize);
|
||||||
memcpy(code + CodeSize - readDatasetL2Size, codeReadDatasetL2, readDatasetL2Size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitCompilerX86::generateProgram(Pcg32& gen) {
|
void JitCompilerX86::generateProgram(Pcg32& gen) {
|
||||||
|
@ -150,10 +146,8 @@ namespace RandomX {
|
||||||
emitByte(0xe9);
|
emitByte(0xe9);
|
||||||
emit(instructionOffsets[0] - (codePos + 4));
|
emit(instructionOffsets[0] - (codePos + 4));
|
||||||
fixCallOffsets();
|
fixCallOffsets();
|
||||||
uint32_t transformL1 = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
|
uint32_t transform = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
|
||||||
uint32_t transformL2 = addressTransformations[gen.getUniform(0, TransformationCount - 1)];
|
*reinterpret_cast<uint32_t*>(code + readDatasetOffset) = transform;
|
||||||
*reinterpret_cast<uint32_t*>(code + readDatasetL1Offset + 1) = transformL1;
|
|
||||||
*reinterpret_cast<uint32_t*>(code + readDatasetL2Offset + 1) = transformL2;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitCompilerX86::generateCode(Instruction& instr, int i) {
|
void JitCompilerX86::generateCode(Instruction& instr, int i) {
|
||||||
|
@ -176,18 +170,13 @@ namespace RandomX {
|
||||||
emit(instr.addra);
|
emit(instr.addra);
|
||||||
emit(uint16_t(0x8b41)); //mov
|
emit(uint16_t(0x8b41)); //mov
|
||||||
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
|
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
|
||||||
emit(0x753fc3f6); //test bl,0x3f; jne
|
|
||||||
emit(uint16_t(0xe805));
|
|
||||||
if (instr.loca & 3) { //A.LOC.W
|
|
||||||
emit(readDatasetL1Offset - (codePos + 4));
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
emit(readDatasetL2Offset - (codePos + 4));
|
|
||||||
}
|
|
||||||
if ((instr.loca & 192) == 0) { //A.LOC.X
|
if ((instr.loca & 192) == 0) { //A.LOC.X
|
||||||
emit(uint16_t(0x3348));
|
emit(uint16_t(0x3348));
|
||||||
emitByte(0xe8); //xor rbp, rax
|
emitByte(0xe8); //xor rbp, rax
|
||||||
}
|
}
|
||||||
|
emit(0x753fc3f6); //test bl,0x3f; jne
|
||||||
|
emit(uint16_t(0xe805));
|
||||||
|
emit(readDatasetOffset - (codePos + 4));
|
||||||
emitByte(0x25); //and eax,
|
emitByte(0x25); //and eax,
|
||||||
if (instr.loca & 3) {
|
if (instr.loca & 3) {
|
||||||
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
|
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
push rax ;# preserve eax
|
|
||||||
db 0, 0, 0, 0 ;# TransformAddress placeholder
|
db 0, 0, 0, 0 ;# TransformAddress placeholder
|
||||||
mov rcx, qword ptr [rdi] ;# load the dataset address
|
mov rcx, qword ptr [rdi] ;# load the dataset address
|
||||||
xor rbp, rax ;# modify "mx"
|
xor rbp, rax ;# modify "mx"
|
||||||
|
@ -9,24 +8,13 @@
|
||||||
;# read cacheline "ma"
|
;# read cacheline "ma"
|
||||||
ror rbp, 32 ;# swap "ma" and "mx"
|
ror rbp, 32 ;# swap "ma" and "mx"
|
||||||
mov edx, ebp ;# edx = ma
|
mov edx, ebp ;# edx = ma
|
||||||
scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8
|
|
||||||
lea rax, [rsi+rax*8] ;# scratchpad cache line
|
|
||||||
lea rcx, [rcx+rdx] ;# dataset cache line
|
lea rcx, [rcx+rdx] ;# dataset cache line
|
||||||
mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now)
|
xor r8, qword ptr [rcx+0]
|
||||||
xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
|
xor r9, qword ptr [rcx+8]
|
||||||
mov rdx, qword ptr [rcx+8]
|
xor r10, qword ptr [rcx+16]
|
||||||
xor qword ptr [rax+8], rdx
|
xor r11, qword ptr [rcx+24]
|
||||||
mov rdx, qword ptr [rcx+16]
|
xor r12, qword ptr [rcx+32]
|
||||||
xor qword ptr [rax+16], rdx
|
xor r13, qword ptr [rcx+40]
|
||||||
mov rdx, qword ptr [rcx+24]
|
xor r14, qword ptr [rcx+48]
|
||||||
xor qword ptr [rax+24], rdx
|
xor r15, qword ptr [rcx+56]
|
||||||
mov rdx, qword ptr [rcx+32]
|
|
||||||
xor qword ptr [rax+32], rdx
|
|
||||||
mov rdx, qword ptr [rcx+40]
|
|
||||||
xor qword ptr [rax+40], rdx
|
|
||||||
mov rdx, qword ptr [rcx+48]
|
|
||||||
xor qword ptr [rax+48], rdx
|
|
||||||
mov rdx, qword ptr [rcx+56]
|
|
||||||
xor qword ptr [rax+56], rdx
|
|
||||||
pop rax ;# restore eax
|
|
||||||
ret
|
ret
|
|
@ -221,54 +221,33 @@ TransformAddress MACRO reg32, reg64
|
||||||
;xor reg32, -8 ;# C = all except 0 to 7
|
;xor reg32, -8 ;# C = all except 0 to 7
|
||||||
ENDM
|
ENDM
|
||||||
|
|
||||||
ReadMemoryRandom MACRO spmask
|
ALIGN 64
|
||||||
|
rx_read:
|
||||||
;# IN eax = random 32-bit address
|
;# IN eax = random 32-bit address
|
||||||
;# GLOBAL rdi = address of the dataset address
|
;# GLOBAL rdi = address of the dataset address
|
||||||
;# GLOBAL rsi = address of the scratchpad
|
;# GLOBAL rsi = address of the scratchpad
|
||||||
;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma"
|
;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma"
|
||||||
;# MODIFY rcx, rdx
|
;# MODIFY rcx, rdx
|
||||||
push rax ;# preserve eax
|
|
||||||
TransformAddress eax, rax ;# TransformAddress function
|
TransformAddress eax, rax ;# TransformAddress function
|
||||||
mov rcx, qword ptr [rdi] ;# load the dataset address
|
mov rcx, qword ptr [rdi] ;# load the dataset address
|
||||||
xor rbp, rax ;# modify "mx"
|
xor rbp, rax ;# modify "mx"
|
||||||
; prefetch cacheline "mx"
|
;# prefetch cacheline "mx"
|
||||||
and rbp, -64 ;# align "mx" to the start of a cache line
|
and rbp, -64 ;# align "mx" to the start of a cache line
|
||||||
mov edx, ebp ;# edx = mx
|
mov edx, ebp ;# edx = mx
|
||||||
prefetchnta byte ptr [rcx+rdx]
|
prefetchnta byte ptr [rcx+rdx]
|
||||||
; read cacheline "ma"
|
;# read cacheline "ma"
|
||||||
ror rbp, 32 ;# swap "ma" and "mx"
|
ror rbp, 32 ;# swap "ma" and "mx"
|
||||||
mov edx, ebp ;# edx = ma
|
mov edx, ebp ;# edx = ma
|
||||||
and eax, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8
|
|
||||||
lea rax, [rsi+rax*8] ;# scratchpad cache line
|
|
||||||
lea rcx, [rcx+rdx] ;# dataset cache line
|
lea rcx, [rcx+rdx] ;# dataset cache line
|
||||||
mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now)
|
xor r8, qword ptr [rcx+0]
|
||||||
xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
|
xor r9, qword ptr [rcx+8]
|
||||||
mov rdx, qword ptr [rcx+8]
|
xor r10, qword ptr [rcx+16]
|
||||||
xor qword ptr [rax+8], rdx
|
xor r11, qword ptr [rcx+24]
|
||||||
mov rdx, qword ptr [rcx+16]
|
xor r12, qword ptr [rcx+32]
|
||||||
xor qword ptr [rax+16], rdx
|
xor r13, qword ptr [rcx+40]
|
||||||
mov rdx, qword ptr [rcx+24]
|
xor r14, qword ptr [rcx+48]
|
||||||
xor qword ptr [rax+24], rdx
|
xor r15, qword ptr [rcx+56]
|
||||||
mov rdx, qword ptr [rcx+32]
|
|
||||||
xor qword ptr [rax+32], rdx
|
|
||||||
mov rdx, qword ptr [rcx+40]
|
|
||||||
xor qword ptr [rax+40], rdx
|
|
||||||
mov rdx, qword ptr [rcx+48]
|
|
||||||
xor qword ptr [rax+48], rdx
|
|
||||||
mov rdx, qword ptr [rcx+56]
|
|
||||||
xor qword ptr [rax+56], rdx
|
|
||||||
pop rax ;# restore eax
|
|
||||||
ret
|
ret
|
||||||
ENDM
|
|
||||||
|
|
||||||
ALIGN 64
|
|
||||||
rx_read_l1:
|
|
||||||
ReadMemoryRandom 2047
|
|
||||||
|
|
||||||
ALIGN 64
|
|
||||||
rx_read_l2:
|
|
||||||
ReadMemoryRandom 32767
|
|
||||||
|
|
||||||
executeProgram ENDP
|
executeProgram ENDP
|
||||||
|
|
||||||
_RANDOMX_EXECUTE_PROGRAM ENDS
|
_RANDOMX_EXECUTE_PROGRAM ENDS
|
||||||
|
|
1296
src/program.inc
1296
src/program.inc
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue