mirror of
https://git.wownero.com/wownero/RandomWOW.git
synced 2024-08-15 00:23:14 +00:00
Reduced x86 code size by 512 bytes (and ecx -> and eax)
This commit is contained in:
parent
1426fcbab5
commit
67e741ff22
8 changed files with 1841 additions and 1842 deletions
|
@ -66,34 +66,34 @@ namespace RandomX {
|
|||
|
||||
void AssemblyGeneratorX86::gena(Instruction& instr, int i) {
|
||||
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
|
||||
asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl;
|
||||
asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl;
|
||||
asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
|
||||
asmCode << "\tjnz short rx_body_" << i << std::endl;
|
||||
if (instr.loca & 3) {
|
||||
asmCode << "\tcall rx_read_l1" << std::endl;
|
||||
asmCode << "rx_body_" << i << ":" << std::endl;
|
||||
if ((instr.loca & 192) == 0)
|
||||
asmCode << "\txor " << regMx << ", rcx" << std::endl;
|
||||
asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl;
|
||||
asmCode << "\txor " << regMx << ", rax" << std::endl;
|
||||
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
|
||||
}
|
||||
else {
|
||||
asmCode << "\tcall rx_read_l2" << std::endl;
|
||||
asmCode << "rx_body_" << i << ":" << std::endl;
|
||||
if ((instr.loca & 192) == 0)
|
||||
asmCode << "\txor " << regMx << ", rcx" << std::endl;
|
||||
asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl;
|
||||
asmCode << "\txor " << regMx << ", rax" << std::endl;
|
||||
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::genar(Instruction& instr, int i) {
|
||||
gena(instr, i);
|
||||
asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl;
|
||||
asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rax*8]" << std::endl;
|
||||
}
|
||||
|
||||
|
||||
void AssemblyGeneratorX86::genaf(Instruction& instr, int i) {
|
||||
gena(instr, i);
|
||||
asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl;
|
||||
asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rax*8]" << std::endl;
|
||||
}
|
||||
|
||||
void AssemblyGeneratorX86::genbiashift(Instruction& instr, const char* instrx86) {
|
||||
|
|
|
@ -48,7 +48,7 @@ DECL(randomx_program_begin):
|
|||
DECL(randomx_program_epilogue):
|
||||
#include "asm/program_epilogue_linux.inc"
|
||||
|
||||
#define scratchpad_mask and ecx, 2040
|
||||
#define scratchpad_mask and eax, 2040
|
||||
|
||||
.align 64
|
||||
DECL(randomx_program_read_l1):
|
||||
|
@ -56,7 +56,7 @@ DECL(randomx_program_read_l1):
|
|||
|
||||
#undef scratchpad_mask
|
||||
|
||||
#define scratchpad_mask and ecx, 32760
|
||||
#define scratchpad_mask and eax, 32760
|
||||
|
||||
.align 64
|
||||
DECL(randomx_program_read_l2):
|
||||
|
|
|
@ -42,7 +42,7 @@ randomx_program_epilogue PROC
|
|||
randomx_program_epilogue ENDP
|
||||
|
||||
scratchpad_mask MACRO
|
||||
and ecx, 2040
|
||||
and eax, 2040
|
||||
ENDM
|
||||
|
||||
ALIGN 64
|
||||
|
@ -51,7 +51,7 @@ randomx_program_read_l1 PROC
|
|||
randomx_program_read_l1 ENDP
|
||||
|
||||
scratchpad_mask MACRO
|
||||
and ecx, 32760
|
||||
and eax, 32760
|
||||
ENDM
|
||||
|
||||
ALIGN 64
|
||||
|
|
|
@ -175,7 +175,7 @@ namespace RandomX {
|
|||
emitByte(0xf0 + (instr.rega % RegistersCount));
|
||||
emit(instr.addra);
|
||||
emit(uint16_t(0x8b41)); //mov
|
||||
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
|
||||
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
|
||||
emit(0x753fc3f6); //test bl,0x3f; jne
|
||||
emit(uint16_t(0xe805));
|
||||
if (instr.loca & 3) { //A.LOC.W
|
||||
|
@ -186,9 +186,9 @@ namespace RandomX {
|
|||
}
|
||||
if ((instr.loca & 192) == 0) { //A.LOC.X
|
||||
emit(uint16_t(0x3348));
|
||||
emitByte(0xe9); //xor rbp, rcx
|
||||
emitByte(0xe8); //xor rbp, rax
|
||||
}
|
||||
emit(uint16_t(0xe181)); //and ecx,
|
||||
emitByte(0x25); //and eax,
|
||||
if (instr.loca & 3) {
|
||||
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
|
||||
}
|
||||
|
@ -199,14 +199,13 @@ namespace RandomX {
|
|||
|
||||
void JitCompilerX86::genar(Instruction& instr) {
|
||||
gena(instr);
|
||||
emit(0xce048b48); //mov rax,QWORD PTR [rsi+rcx*8]
|
||||
emit(0xdc580f66);
|
||||
emit(0xc6048b48); //mov rax,QWORD PTR [rsi+rax*8]
|
||||
}
|
||||
|
||||
void JitCompilerX86::genaf(Instruction& instr) {
|
||||
gena(instr);
|
||||
emitByte(0xf3);
|
||||
emit(0xce04e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rcx*8]
|
||||
emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
|
||||
}
|
||||
|
||||
void JitCompilerX86::genbiashift(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {
|
||||
|
|
|
@ -1,32 +1,32 @@
|
|||
push rcx ;# preserve ecx
|
||||
push rax ;# preserve eax
|
||||
db 0, 0, 0, 0 ;# TransformAddress placeholder
|
||||
mov rax, qword ptr [rdi] ;# load the dataset address
|
||||
xor rbp, rcx ;# modify "mx"
|
||||
mov rcx, qword ptr [rdi] ;# load the dataset address
|
||||
xor rbp, rax ;# modify "mx"
|
||||
;# prefetch cacheline "mx"
|
||||
and rbp, -64 ;# align "mx" to the start of a cache line
|
||||
mov edx, ebp ;# edx = mx
|
||||
prefetchnta byte ptr [rax+rdx]
|
||||
prefetchnta byte ptr [rcx+rdx]
|
||||
;# read cacheline "ma"
|
||||
ror rbp, 32 ;# swap "ma" and "mx"
|
||||
mov edx, ebp ;# edx = ma
|
||||
scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8
|
||||
lea rcx, [rsi+rcx*8] ;# scratchpad cache line
|
||||
lea rax, [rax+rdx] ;# dataset cache line
|
||||
mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now)
|
||||
xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
|
||||
mov rdx, qword ptr [rax+8]
|
||||
xor qword ptr [rcx+8], rdx
|
||||
mov rdx, qword ptr [rax+16]
|
||||
xor qword ptr [rcx+16], rdx
|
||||
mov rdx, qword ptr [rax+24]
|
||||
xor qword ptr [rcx+24], rdx
|
||||
mov rdx, qword ptr [rax+32]
|
||||
xor qword ptr [rcx+32], rdx
|
||||
mov rdx, qword ptr [rax+40]
|
||||
xor qword ptr [rcx+40], rdx
|
||||
mov rdx, qword ptr [rax+48]
|
||||
xor qword ptr [rcx+48], rdx
|
||||
mov rdx, qword ptr [rax+56]
|
||||
xor qword ptr [rcx+56], rdx
|
||||
pop rcx ;# restore ecx
|
||||
lea rax, [rsi+rax*8] ;# scratchpad cache line
|
||||
lea rcx, [rcx+rdx] ;# dataset cache line
|
||||
mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now)
|
||||
xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
|
||||
mov rdx, qword ptr [rcx+8]
|
||||
xor qword ptr [rax+8], rdx
|
||||
mov rdx, qword ptr [rcx+16]
|
||||
xor qword ptr [rax+16], rdx
|
||||
mov rdx, qword ptr [rcx+24]
|
||||
xor qword ptr [rax+24], rdx
|
||||
mov rdx, qword ptr [rcx+32]
|
||||
xor qword ptr [rax+32], rdx
|
||||
mov rdx, qword ptr [rcx+40]
|
||||
xor qword ptr [rax+40], rdx
|
||||
mov rdx, qword ptr [rcx+48]
|
||||
xor qword ptr [rax+48], rdx
|
||||
mov rdx, qword ptr [rcx+56]
|
||||
xor qword ptr [rax+56], rdx
|
||||
pop rax ;# restore eax
|
||||
ret
|
|
@ -1,154 +1,154 @@
|
|||
;# 90 address transformations
|
||||
;# forced REX prefix is used to make all transformations 4 bytes long
|
||||
lea ecx, [rcx+rcx*8+109]
|
||||
lea eax, [rax+rax*8+109]
|
||||
db 64
|
||||
xor ecx, 96
|
||||
lea ecx, [rcx+rcx*8-19]
|
||||
xor eax, 96
|
||||
lea eax, [rax+rax*8-19]
|
||||
db 64
|
||||
add ecx, -98
|
||||
add eax, -98
|
||||
db 64
|
||||
add ecx, -21
|
||||
add eax, -21
|
||||
db 64
|
||||
xor ecx, -80
|
||||
lea ecx, [rcx+rcx*8-92]
|
||||
xor eax, -80
|
||||
lea eax, [rax+rax*8-92]
|
||||
db 64
|
||||
add ecx, 113
|
||||
lea ecx, [rcx+rcx*8+100]
|
||||
add eax, 113
|
||||
lea eax, [rax+rax*8+100]
|
||||
db 64
|
||||
add ecx, -39
|
||||
add eax, -39
|
||||
db 64
|
||||
xor ecx, 120
|
||||
lea ecx, [rcx+rcx*8-119]
|
||||
xor eax, 120
|
||||
lea eax, [rax+rax*8-119]
|
||||
db 64
|
||||
add ecx, -113
|
||||
add eax, -113
|
||||
db 64
|
||||
add ecx, 111
|
||||
add eax, 111
|
||||
db 64
|
||||
xor ecx, 104
|
||||
lea ecx, [rcx+rcx*8-83]
|
||||
lea ecx, [rcx+rcx*8+127]
|
||||
xor eax, 104
|
||||
lea eax, [rax+rax*8-83]
|
||||
lea eax, [rax+rax*8+127]
|
||||
db 64
|
||||
xor ecx, -112
|
||||
xor eax, -112
|
||||
db 64
|
||||
add ecx, 89
|
||||
add eax, 89
|
||||
db 64
|
||||
add ecx, -32
|
||||
add eax, -32
|
||||
db 64
|
||||
add ecx, 104
|
||||
add eax, 104
|
||||
db 64
|
||||
xor ecx, -120
|
||||
xor eax, -120
|
||||
db 64
|
||||
xor ecx, 24
|
||||
lea ecx, [rcx+rcx*8+9]
|
||||
xor eax, 24
|
||||
lea eax, [rax+rax*8+9]
|
||||
db 64
|
||||
add ecx, -31
|
||||
add eax, -31
|
||||
db 64
|
||||
xor ecx, -16
|
||||
xor eax, -16
|
||||
db 64
|
||||
add ecx, 68
|
||||
lea ecx, [rcx+rcx*8-110]
|
||||
add eax, 68
|
||||
lea eax, [rax+rax*8-110]
|
||||
db 64
|
||||
xor ecx, 64
|
||||
xor eax, 64
|
||||
db 64
|
||||
xor ecx, -40
|
||||
xor eax, -40
|
||||
db 64
|
||||
xor ecx, -8
|
||||
xor eax, -8
|
||||
db 64
|
||||
add ecx, -10
|
||||
add eax, -10
|
||||
db 64
|
||||
xor ecx, -32
|
||||
xor eax, -32
|
||||
db 64
|
||||
add ecx, 14
|
||||
lea ecx, [rcx+rcx*8-46]
|
||||
add eax, 14
|
||||
lea eax, [rax+rax*8-46]
|
||||
db 64
|
||||
xor ecx, -104
|
||||
lea ecx, [rcx+rcx*8+36]
|
||||
xor eax, -104
|
||||
lea eax, [rax+rax*8+36]
|
||||
db 64
|
||||
add ecx, 100
|
||||
lea ecx, [rcx+rcx*8-65]
|
||||
lea ecx, [rcx+rcx*8+27]
|
||||
lea ecx, [rcx+rcx*8+91]
|
||||
add eax, 100
|
||||
lea eax, [rax+rax*8-65]
|
||||
lea eax, [rax+rax*8+27]
|
||||
lea eax, [rax+rax*8+91]
|
||||
db 64
|
||||
add ecx, -101
|
||||
add eax, -101
|
||||
db 64
|
||||
add ecx, -94
|
||||
lea ecx, [rcx+rcx*8-10]
|
||||
add eax, -94
|
||||
lea eax, [rax+rax*8-10]
|
||||
db 64
|
||||
xor ecx, 80
|
||||
xor eax, 80
|
||||
db 64
|
||||
add ecx, -108
|
||||
add eax, -108
|
||||
db 64
|
||||
add ecx, -58
|
||||
add eax, -58
|
||||
db 64
|
||||
xor ecx, 48
|
||||
lea ecx, [rcx+rcx*8+73]
|
||||
xor eax, 48
|
||||
lea eax, [rax+rax*8+73]
|
||||
db 64
|
||||
xor ecx, -48
|
||||
xor eax, -48
|
||||
db 64
|
||||
xor ecx, 32
|
||||
xor eax, 32
|
||||
db 64
|
||||
xor ecx, -96
|
||||
xor eax, -96
|
||||
db 64
|
||||
add ecx, 118
|
||||
add eax, 118
|
||||
db 64
|
||||
add ecx, 91
|
||||
lea ecx, [rcx+rcx*8+18]
|
||||
add eax, 91
|
||||
lea eax, [rax+rax*8+18]
|
||||
db 64
|
||||
add ecx, -11
|
||||
lea ecx, [rcx+rcx*8+63]
|
||||
add eax, -11
|
||||
lea eax, [rax+rax*8+63]
|
||||
db 64
|
||||
add ecx, 114
|
||||
lea ecx, [rcx+rcx*8+45]
|
||||
add eax, 114
|
||||
lea eax, [rax+rax*8+45]
|
||||
db 64
|
||||
add ecx, -67
|
||||
add eax, -67
|
||||
db 64
|
||||
add ecx, 53
|
||||
lea ecx, [rcx+rcx*8-101]
|
||||
lea ecx, [rcx+rcx*8-1]
|
||||
add eax, 53
|
||||
lea eax, [rax+rax*8-101]
|
||||
lea eax, [rax+rax*8-1]
|
||||
db 64
|
||||
xor ecx, 16
|
||||
lea ecx, [rcx+rcx*8-37]
|
||||
lea ecx, [rcx+rcx*8-28]
|
||||
lea ecx, [rcx+rcx*8-55]
|
||||
xor eax, 16
|
||||
lea eax, [rax+rax*8-37]
|
||||
lea eax, [rax+rax*8-28]
|
||||
lea eax, [rax+rax*8-55]
|
||||
db 64
|
||||
xor ecx, -88
|
||||
xor eax, -88
|
||||
db 64
|
||||
xor ecx, -72
|
||||
xor eax, -72
|
||||
db 64
|
||||
add ecx, 36
|
||||
add eax, 36
|
||||
db 64
|
||||
xor ecx, -56
|
||||
xor eax, -56
|
||||
db 64
|
||||
add ecx, 116
|
||||
add eax, 116
|
||||
db 64
|
||||
xor ecx, 88
|
||||
xor eax, 88
|
||||
db 64
|
||||
xor ecx, -128
|
||||
xor eax, -128
|
||||
db 64
|
||||
add ecx, 50
|
||||
add eax, 50
|
||||
db 64
|
||||
add ecx, 105
|
||||
add eax, 105
|
||||
db 64
|
||||
add ecx, -37
|
||||
add eax, -37
|
||||
db 64
|
||||
xor ecx, 112
|
||||
xor eax, 112
|
||||
db 64
|
||||
xor ecx, 8
|
||||
xor eax, 8
|
||||
db 64
|
||||
xor ecx, -24
|
||||
lea ecx, [rcx+rcx*8+118]
|
||||
xor eax, -24
|
||||
lea eax, [rax+rax*8+118]
|
||||
db 64
|
||||
xor ecx, 72
|
||||
xor eax, 72
|
||||
db 64
|
||||
xor ecx, -64
|
||||
xor eax, -64
|
||||
db 64
|
||||
add ecx, 40
|
||||
lea ecx, [rcx+rcx*8-74]
|
||||
lea ecx, [rcx+rcx*8+82]
|
||||
lea ecx, [rcx+rcx*8+54]
|
||||
add eax, 40
|
||||
lea eax, [rax+rax*8-74]
|
||||
lea eax, [rax+rax*8+82]
|
||||
lea eax, [rax+rax*8+54]
|
||||
db 64
|
||||
xor ecx, 56
|
||||
xor eax, 56
|
||||
db 64
|
||||
xor ecx, 40
|
||||
xor eax, 40
|
||||
db 64
|
||||
add ecx, 87
|
||||
add eax, 87
|
|
@ -222,42 +222,42 @@ TransformAddress MACRO reg32, reg64
|
|||
ENDM
|
||||
|
||||
ReadMemoryRandom MACRO spmask
|
||||
;# IN ecx = random 32-bit address
|
||||
;# IN eax = random 32-bit address
|
||||
;# GLOBAL rdi = address of the dataset address
|
||||
;# GLOBAL rsi = address of the scratchpad
|
||||
;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma"
|
||||
;# MODIFY rcx, rdx
|
||||
push rcx ;# preserve ecx
|
||||
TransformAddress ecx, rcx ;# TransformAddress function
|
||||
mov rax, qword ptr [rdi] ;# load the dataset address
|
||||
xor rbp, rcx ;# modify "mx"
|
||||
push rax ;# preserve eax
|
||||
TransformAddress eax, rax ;# TransformAddress function
|
||||
mov rcx, qword ptr [rdi] ;# load the dataset address
|
||||
xor rbp, rax ;# modify "mx"
|
||||
; prefetch cacheline "mx"
|
||||
and rbp, -64 ;# align "mx" to the start of a cache line
|
||||
mov edx, ebp ;# edx = mx
|
||||
prefetchnta byte ptr [rax+rdx]
|
||||
prefetchnta byte ptr [rcx+rdx]
|
||||
; read cacheline "ma"
|
||||
ror rbp, 32 ;# swap "ma" and "mx"
|
||||
mov edx, ebp ;# edx = ma
|
||||
and ecx, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8
|
||||
lea rcx, [rsi+rcx*8] ;# scratchpad cache line
|
||||
lea rax, [rax+rdx] ;# dataset cache line
|
||||
mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now)
|
||||
xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
|
||||
mov rdx, qword ptr [rax+8]
|
||||
xor qword ptr [rcx+8], rdx
|
||||
mov rdx, qword ptr [rax+16]
|
||||
xor qword ptr [rcx+16], rdx
|
||||
mov rdx, qword ptr [rax+24]
|
||||
xor qword ptr [rcx+24], rdx
|
||||
mov rdx, qword ptr [rax+32]
|
||||
xor qword ptr [rcx+32], rdx
|
||||
mov rdx, qword ptr [rax+40]
|
||||
xor qword ptr [rcx+40], rdx
|
||||
mov rdx, qword ptr [rax+48]
|
||||
xor qword ptr [rcx+48], rdx
|
||||
mov rdx, qword ptr [rax+56]
|
||||
xor qword ptr [rcx+56], rdx
|
||||
pop rcx ;# restore ecx
|
||||
and eax, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8
|
||||
lea rax, [rsi+rax*8] ;# scratchpad cache line
|
||||
lea rcx, [rcx+rdx] ;# dataset cache line
|
||||
mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now)
|
||||
xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
|
||||
mov rdx, qword ptr [rcx+8]
|
||||
xor qword ptr [rax+8], rdx
|
||||
mov rdx, qword ptr [rcx+16]
|
||||
xor qword ptr [rax+16], rdx
|
||||
mov rdx, qword ptr [rcx+24]
|
||||
xor qword ptr [rax+24], rdx
|
||||
mov rdx, qword ptr [rcx+32]
|
||||
xor qword ptr [rax+32], rdx
|
||||
mov rdx, qword ptr [rcx+40]
|
||||
xor qword ptr [rax+40], rdx
|
||||
mov rdx, qword ptr [rcx+48]
|
||||
xor qword ptr [rax+48], rdx
|
||||
mov rdx, qword ptr [rcx+56]
|
||||
xor qword ptr [rax+56], rdx
|
||||
pop rax ;# restore eax
|
||||
ret
|
||||
ENDM
|
||||
|
||||
|
|
3372
src/program.inc
3372
src/program.inc
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue