Reduced x86 code size by 512 bytes (and ecx -> and eax)

This commit is contained in:
tevador 2019-01-12 20:27:35 +01:00
parent 1426fcbab5
commit 67e741ff22
8 changed files with 1841 additions and 1842 deletions

View file

@ -66,34 +66,34 @@ namespace RandomX {
// Emits the assembly text that resolves operand A of instruction `i`.
// The register selected by instr.rega is XORed with the address constant
// instr.addra, and its low 32 bits are copied into the address register.
// The scratchpad read routine is called only when (regIc8 & 63) == 0;
// otherwise the emitted `jnz` skips straight to the rx_body_<i> label.
// Finally the address register is masked with ScratchpadL1 - 1 or
// ScratchpadL2 - 1.
// NOTE(review): this listing is a rendered diff without +/- markers, so every
// changed statement appears twice: the ecx/rcx form (before the commit)
// directly followed by the eax/rax form (after). Only one statement of each
// pair exists in the real file.
void AssemblyGeneratorX86::gena(Instruction& instr, int i) {
// Mix the address constant into the source register (printed as 0<hex>h).
asmCode << "\txor " << regR[instr.rega % RegistersCount] << ", 0" << std::hex << instr.addra << "h" << std::dec << std::endl;
asmCode << "\tmov ecx, " << regR32[instr.rega % RegistersCount] << std::endl;
asmCode << "\tmov eax, " << regR32[instr.rega % RegistersCount] << std::endl;
// Skip the read routine unless the low six bits of regIc8 are all zero.
asmCode << "\ttest " << regIc8 << ", 63" << std::endl;
asmCode << "\tjnz short rx_body_" << i << std::endl;
// A non-zero (loca & 3) selects the L1 routine and mask; otherwise L2 is used.
if (instr.loca & 3) {
asmCode << "\tcall rx_read_l1" << std::endl;
asmCode << "rx_body_" << i << ":" << std::endl;
// When bits 6-7 of loca are clear, the address is also XORed into regMx.
if ((instr.loca & 192) == 0)
asmCode << "\txor " << regMx << ", rcx" << std::endl;
asmCode << "\tand ecx, " << (ScratchpadL1 - 1) << std::endl;
asmCode << "\txor " << regMx << ", rax" << std::endl;
asmCode << "\tand eax, " << (ScratchpadL1 - 1) << std::endl;
}
else {
asmCode << "\tcall rx_read_l2" << std::endl;
asmCode << "rx_body_" << i << ":" << std::endl;
// Same conditional regMx update as in the L1 branch.
if ((instr.loca & 192) == 0)
asmCode << "\txor " << regMx << ", rcx" << std::endl;
asmCode << "\tand ecx, " << (ScratchpadL2 - 1) << std::endl;
asmCode << "\txor " << regMx << ", rax" << std::endl;
asmCode << "\tand eax, " << (ScratchpadL2 - 1) << std::endl;
}
}
// Emits operand-A address generation (gena) followed by a 64-bit load of the
// addressed scratchpad quadword into rax.
// NOTE(review): the two `mov` lines are the before (rcx index) and after
// (rax index) forms of the same statement in this diff rendering.
void AssemblyGeneratorX86::genar(Instruction& instr, int i) {
gena(instr, i);
asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl;
asmCode << "\tmov rax, qword ptr [" << regScratchpadAddr << "+rax*8]" << std::endl;
}
// Emits operand-A address generation (gena) followed by a cvtdq2pd that
// converts the addressed scratchpad quadword (two packed 32-bit integers)
// into two doubles in xmm0.
// NOTE(review): the two `cvtdq2pd` lines are the before (rcx index) and after
// (rax index) forms of the same statement in this diff rendering.
void AssemblyGeneratorX86::genaf(Instruction& instr, int i) {
gena(instr, i);
asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rcx*8]" << std::endl;
asmCode << "\tcvtdq2pd xmm0, qword ptr [" << regScratchpadAddr << "+rax*8]" << std::endl;
}
void AssemblyGeneratorX86::genbiashift(Instruction& instr, const char* instrx86) {

View file

@ -48,7 +48,7 @@ DECL(randomx_program_begin):
DECL(randomx_program_epilogue):
#include "asm/program_epilogue_linux.inc"
/* Address mask for the routine that follows (randomx_program_read_l1):
   2040 = 2048 - 8, i.e. an index below 2048 that is a multiple of 8.
   NOTE(review): the ecx line is the pre-commit form and the eax line the
   post-commit form in this diff rendering; only one exists in the real file. */
#define scratchpad_mask and ecx, 2040
#define scratchpad_mask and eax, 2040
.align 64
DECL(randomx_program_read_l1):
@ -56,7 +56,7 @@ DECL(randomx_program_read_l1):
/* Redefine the address mask for the routine that follows
   (randomx_program_read_l2): 32760 = 32768 - 8, i.e. an index below 32768
   that is a multiple of 8.
   NOTE(review): the ecx line is the pre-commit form and the eax line the
   post-commit form in this diff rendering; only one exists in the real file. */
#undef scratchpad_mask
#define scratchpad_mask and ecx, 32760
#define scratchpad_mask and eax, 32760
.align 64
DECL(randomx_program_read_l2):

View file

@ -42,7 +42,7 @@ randomx_program_epilogue PROC
randomx_program_epilogue ENDP
;# Address mask for the read routine that follows (presumably
;# randomx_program_read_l1): 2040 = 2048 - 8, i.e. an index below 2048 that
;# is a multiple of 8.
;# NOTE(review): the ecx line is the pre-commit form and the eax line the
;# post-commit form in this diff rendering; only one exists in the real file.
scratchpad_mask MACRO
and ecx, 2040
and eax, 2040
ENDM
ALIGN 64
@ -51,7 +51,7 @@ randomx_program_read_l1 PROC
randomx_program_read_l1 ENDP
;# Address mask for the read routine that follows (presumably
;# randomx_program_read_l2): 32760 = 32768 - 8, i.e. an index below 32768
;# that is a multiple of 8.
;# NOTE(review): the ecx line is the pre-commit form and the eax line the
;# post-commit form in this diff rendering; only one exists in the real file.
scratchpad_mask MACRO
and ecx, 32760
and eax, 32760
ENDM
ALIGN 64

View file

@ -175,7 +175,7 @@ namespace RandomX {
emitByte(0xf0 + (instr.rega % RegistersCount));
emit(instr.addra);
emit(uint16_t(0x8b41)); //mov
emitByte(0xc8 + (instr.rega % RegistersCount)); //ecx, rega
emitByte(0xc0 + (instr.rega % RegistersCount)); //eax, rega
emit(0x753fc3f6); //test bl,0x3f; jne
emit(uint16_t(0xe805));
if (instr.loca & 3) { //A.LOC.W
@ -186,9 +186,9 @@ namespace RandomX {
}
if ((instr.loca & 192) == 0) { //A.LOC.X
emit(uint16_t(0x3348));
emitByte(0xe9); //xor rbp, rcx
emitByte(0xe8); //xor rbp, rax
}
emit(uint16_t(0xe181)); //and ecx,
emitByte(0x25); //and eax,
if (instr.loca & 3) {
emit(ScratchpadL1 - 1); //first 16 KiB of scratchpad
}
@ -199,14 +199,13 @@ namespace RandomX {
// Emits the machine code for operand-A address generation (gena) followed by
// a 64-bit load of the addressed scratchpad quadword into rax.
// NOTE(review): this is a rendered diff without +/- markers; the rcx-indexed
// emit is the pre-commit form and the rax-indexed emit the post-commit form.
void JitCompilerX86::genar(Instruction& instr) {
gena(instr);
emit(0xce048b48); //mov rax,QWORD PTR [rsi+rcx*8]
// NOTE(review): bytes 66 0F 58 DC decode as `addpd xmm3, xmm4`. The hunk header
// shrinks by one line, so this looks like a pre-commit line that the commit
// removed — confirm against the repository.
emit(0xdc580f66);
emit(0xc6048b48); //mov rax,QWORD PTR [rsi+rax*8]
}
// Emits the machine code for operand-A address generation (gena) followed by
// cvtdq2pd, which converts the addressed scratchpad quadword (two packed
// 32-bit integers) into two doubles in xmm0.
// NOTE(review): this is a rendered diff without +/- markers; the rcx-indexed
// emit is the pre-commit form and the rax-indexed emit the post-commit form.
void JitCompilerX86::genaf(Instruction& instr) {
gena(instr);
// 0xf3 is the mandatory prefix byte of cvtdq2pd; the opcode, ModRM and SIB follow.
emitByte(0xf3);
emit(0xce04e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rcx*8]
emit(0xc604e60f); //cvtdq2pd xmm0,QWORD PTR [rsi+rax*8]
}
void JitCompilerX86::genbiashift(Instruction& instr, uint16_t opcodeReg, uint16_t opcodeImm) {

View file

@ -1,32 +1,32 @@
;# Body of the scratchpad read routine (ends in `ret`).
;# Steps: save the address register, apply the 4-byte TransformAddress slot,
;# XOR the address into rbp ("mx" in the low half, "ma" in the high half),
;# prefetch the dataset cache line at "mx", swap the halves, then XOR the
;# 64-byte dataset cache line at "ma" into the scratchpad line selected by
;# the masked address, and restore the address register.
;# NOTE(review): this is a rendered diff without +/- markers. Each changed
;# line appears twice: first the pre-commit form (address in rcx, dataset
;# pointer in rax), then the post-commit form (address in rax, dataset
;# pointer in rcx). Only one form exists in the real file.
push rcx ;# preserve ecx
push rax ;# preserve eax
db 0, 0, 0, 0 ;# TransformAddress placeholder
mov rax, qword ptr [rdi] ;# load the dataset address
xor rbp, rcx ;# modify "mx"
mov rcx, qword ptr [rdi] ;# load the dataset address
xor rbp, rax ;# modify "mx"
;# prefetch cacheline "mx"
and rbp, -64 ;# align "mx" to the start of a cache line
mov edx, ebp ;# edx = mx
prefetchnta byte ptr [rax+rdx]
prefetchnta byte ptr [rcx+rdx]
;# read cacheline "ma"
ror rbp, 32 ;# swap "ma" and "mx"
mov edx, ebp ;# edx = ma
scratchpad_mask ;# limit address to the specified scratchpad size aligned to multiple of 8
;# pre-commit form of the 64-byte XOR (scratchpad line in rcx, dataset line in rax)
lea rcx, [rsi+rcx*8] ;# scratchpad cache line
lea rax, [rax+rdx] ;# dataset cache line
mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now)
xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
mov rdx, qword ptr [rax+8]
xor qword ptr [rcx+8], rdx
mov rdx, qword ptr [rax+16]
xor qword ptr [rcx+16], rdx
mov rdx, qword ptr [rax+24]
xor qword ptr [rcx+24], rdx
mov rdx, qword ptr [rax+32]
xor qword ptr [rcx+32], rdx
mov rdx, qword ptr [rax+40]
xor qword ptr [rcx+40], rdx
mov rdx, qword ptr [rax+48]
xor qword ptr [rcx+48], rdx
mov rdx, qword ptr [rax+56]
xor qword ptr [rcx+56], rdx
pop rcx ;# restore ecx
;# post-commit form of the 64-byte XOR (scratchpad line in rax, dataset line in rcx)
lea rax, [rsi+rax*8] ;# scratchpad cache line
lea rcx, [rcx+rdx] ;# dataset cache line
mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now)
xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
mov rdx, qword ptr [rcx+8]
xor qword ptr [rax+8], rdx
mov rdx, qword ptr [rcx+16]
xor qword ptr [rax+16], rdx
mov rdx, qword ptr [rcx+24]
xor qword ptr [rax+24], rdx
mov rdx, qword ptr [rcx+32]
xor qword ptr [rax+32], rdx
mov rdx, qword ptr [rcx+40]
xor qword ptr [rax+40], rdx
mov rdx, qword ptr [rcx+48]
xor qword ptr [rax+48], rdx
mov rdx, qword ptr [rcx+56]
xor qword ptr [rax+56], rdx
pop rax ;# restore eax
ret

View file

@ -1,154 +1,154 @@
;# 90 address transformations
;# a forced REX prefix (db 64 = 40h) pads the 3-byte add/xor forms so that every transformation is 4 bytes long
lea ecx, [rcx+rcx*8+109]
lea eax, [rax+rax*8+109]
db 64
xor ecx, 96
lea ecx, [rcx+rcx*8-19]
xor eax, 96
lea eax, [rax+rax*8-19]
db 64
add ecx, -98
add eax, -98
db 64
add ecx, -21
add eax, -21
db 64
xor ecx, -80
lea ecx, [rcx+rcx*8-92]
xor eax, -80
lea eax, [rax+rax*8-92]
db 64
add ecx, 113
lea ecx, [rcx+rcx*8+100]
add eax, 113
lea eax, [rax+rax*8+100]
db 64
add ecx, -39
add eax, -39
db 64
xor ecx, 120
lea ecx, [rcx+rcx*8-119]
xor eax, 120
lea eax, [rax+rax*8-119]
db 64
add ecx, -113
add eax, -113
db 64
add ecx, 111
add eax, 111
db 64
xor ecx, 104
lea ecx, [rcx+rcx*8-83]
lea ecx, [rcx+rcx*8+127]
xor eax, 104
lea eax, [rax+rax*8-83]
lea eax, [rax+rax*8+127]
db 64
xor ecx, -112
xor eax, -112
db 64
add ecx, 89
add eax, 89
db 64
add ecx, -32
add eax, -32
db 64
add ecx, 104
add eax, 104
db 64
xor ecx, -120
xor eax, -120
db 64
xor ecx, 24
lea ecx, [rcx+rcx*8+9]
xor eax, 24
lea eax, [rax+rax*8+9]
db 64
add ecx, -31
add eax, -31
db 64
xor ecx, -16
xor eax, -16
db 64
add ecx, 68
lea ecx, [rcx+rcx*8-110]
add eax, 68
lea eax, [rax+rax*8-110]
db 64
xor ecx, 64
xor eax, 64
db 64
xor ecx, -40
xor eax, -40
db 64
xor ecx, -8
xor eax, -8
db 64
add ecx, -10
add eax, -10
db 64
xor ecx, -32
xor eax, -32
db 64
add ecx, 14
lea ecx, [rcx+rcx*8-46]
add eax, 14
lea eax, [rax+rax*8-46]
db 64
xor ecx, -104
lea ecx, [rcx+rcx*8+36]
xor eax, -104
lea eax, [rax+rax*8+36]
db 64
add ecx, 100
lea ecx, [rcx+rcx*8-65]
lea ecx, [rcx+rcx*8+27]
lea ecx, [rcx+rcx*8+91]
add eax, 100
lea eax, [rax+rax*8-65]
lea eax, [rax+rax*8+27]
lea eax, [rax+rax*8+91]
db 64
add ecx, -101
add eax, -101
db 64
add ecx, -94
lea ecx, [rcx+rcx*8-10]
add eax, -94
lea eax, [rax+rax*8-10]
db 64
xor ecx, 80
xor eax, 80
db 64
add ecx, -108
add eax, -108
db 64
add ecx, -58
add eax, -58
db 64
xor ecx, 48
lea ecx, [rcx+rcx*8+73]
xor eax, 48
lea eax, [rax+rax*8+73]
db 64
xor ecx, -48
xor eax, -48
db 64
xor ecx, 32
xor eax, 32
db 64
xor ecx, -96
xor eax, -96
db 64
add ecx, 118
add eax, 118
db 64
add ecx, 91
lea ecx, [rcx+rcx*8+18]
add eax, 91
lea eax, [rax+rax*8+18]
db 64
add ecx, -11
lea ecx, [rcx+rcx*8+63]
add eax, -11
lea eax, [rax+rax*8+63]
db 64
add ecx, 114
lea ecx, [rcx+rcx*8+45]
add eax, 114
lea eax, [rax+rax*8+45]
db 64
add ecx, -67
add eax, -67
db 64
add ecx, 53
lea ecx, [rcx+rcx*8-101]
lea ecx, [rcx+rcx*8-1]
add eax, 53
lea eax, [rax+rax*8-101]
lea eax, [rax+rax*8-1]
db 64
xor ecx, 16
lea ecx, [rcx+rcx*8-37]
lea ecx, [rcx+rcx*8-28]
lea ecx, [rcx+rcx*8-55]
xor eax, 16
lea eax, [rax+rax*8-37]
lea eax, [rax+rax*8-28]
lea eax, [rax+rax*8-55]
db 64
xor ecx, -88
xor eax, -88
db 64
xor ecx, -72
xor eax, -72
db 64
add ecx, 36
add eax, 36
db 64
xor ecx, -56
xor eax, -56
db 64
add ecx, 116
add eax, 116
db 64
xor ecx, 88
xor eax, 88
db 64
xor ecx, -128
xor eax, -128
db 64
add ecx, 50
add eax, 50
db 64
add ecx, 105
add eax, 105
db 64
add ecx, -37
add eax, -37
db 64
xor ecx, 112
xor eax, 112
db 64
xor ecx, 8
xor eax, 8
db 64
xor ecx, -24
lea ecx, [rcx+rcx*8+118]
xor eax, -24
lea eax, [rax+rax*8+118]
db 64
xor ecx, 72
xor eax, 72
db 64
xor ecx, -64
xor eax, -64
db 64
add ecx, 40
lea ecx, [rcx+rcx*8-74]
lea ecx, [rcx+rcx*8+82]
lea ecx, [rcx+rcx*8+54]
add eax, 40
lea eax, [rax+rax*8-74]
lea eax, [rax+rax*8+82]
lea eax, [rax+rax*8+54]
db 64
xor ecx, 56
xor eax, 56
db 64
xor ecx, 40
xor eax, 40
db 64
add ecx, 87
add eax, 87

View file

@ -222,42 +222,42 @@ TransformAddress MACRO reg32, reg64
ENDM
;# ReadMemoryRandom: expands to a complete read routine ending in `ret`.
;# It transforms the incoming address, XORs it into rbp, prefetches the
;# dataset cache line at "mx", swaps "mx"/"ma", and XORs the 64-byte dataset
;# cache line at "ma" into the scratchpad line selected by the masked address.
;# spmask: scratchpad index mask; the code applies `spmask-7` so the index is
;#         a multiple of 8.
;# NOTE(review): this is a rendered diff without +/- markers. Each changed
;# line appears twice: first the pre-commit form (address in ecx/rcx,
;# dataset pointer in rax), then the post-commit form (address in eax/rax,
;# dataset pointer in rcx). Only one form exists in the real file.
ReadMemoryRandom MACRO spmask
;# IN ecx = random 32-bit address
;# IN eax = random 32-bit address
;# GLOBAL rdi = address of the dataset address
;# GLOBAL rsi = address of the scratchpad
;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma"
;# MODIFY rcx, rdx
push rcx ;# preserve ecx
TransformAddress ecx, rcx ;# TransformAddress function
mov rax, qword ptr [rdi] ;# load the dataset address
xor rbp, rcx ;# modify "mx"
push rax ;# preserve eax
TransformAddress eax, rax ;# TransformAddress function
mov rcx, qword ptr [rdi] ;# load the dataset address
xor rbp, rax ;# modify "mx"
; prefetch cacheline "mx"
and rbp, -64 ;# align "mx" to the start of a cache line
mov edx, ebp ;# edx = mx
prefetchnta byte ptr [rax+rdx]
prefetchnta byte ptr [rcx+rdx]
; read cacheline "ma"
ror rbp, 32 ;# swap "ma" and "mx"
mov edx, ebp ;# edx = ma
;# pre-commit form of the mask and 64-byte XOR (scratchpad line in rcx, dataset line in rax)
and ecx, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8
lea rcx, [rsi+rcx*8] ;# scratchpad cache line
lea rax, [rax+rdx] ;# dataset cache line
mov rdx, qword ptr [rax+0] ;# load first dataset quadword (prefetched into the cache by now)
xor qword ptr [rcx+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
mov rdx, qword ptr [rax+8]
xor qword ptr [rcx+8], rdx
mov rdx, qword ptr [rax+16]
xor qword ptr [rcx+16], rdx
mov rdx, qword ptr [rax+24]
xor qword ptr [rcx+24], rdx
mov rdx, qword ptr [rax+32]
xor qword ptr [rcx+32], rdx
mov rdx, qword ptr [rax+40]
xor qword ptr [rcx+40], rdx
mov rdx, qword ptr [rax+48]
xor qword ptr [rcx+48], rdx
mov rdx, qword ptr [rax+56]
xor qword ptr [rcx+56], rdx
pop rcx ;# restore ecx
;# post-commit form of the mask and 64-byte XOR (scratchpad line in rax, dataset line in rcx)
and eax, spmask-7 ;# limit address to the specified scratchpad size aligned to multiple of 8
lea rax, [rsi+rax*8] ;# scratchpad cache line
lea rcx, [rcx+rdx] ;# dataset cache line
mov rdx, qword ptr [rcx+0] ;# load first dataset quadword (prefetched into the cache by now)
xor qword ptr [rax+0], rdx ;# XOR the dataset item with a scratchpad item, repeat for the rest of the cacheline
mov rdx, qword ptr [rcx+8]
xor qword ptr [rax+8], rdx
mov rdx, qword ptr [rcx+16]
xor qword ptr [rax+16], rdx
mov rdx, qword ptr [rcx+24]
xor qword ptr [rax+24], rdx
mov rdx, qword ptr [rcx+32]
xor qword ptr [rax+32], rdx
mov rdx, qword ptr [rcx+40]
xor qword ptr [rax+40], rdx
mov rdx, qword ptr [rcx+48]
xor qword ptr [rax+48], rdx
mov rdx, qword ptr [rcx+56]
xor qword ptr [rax+56], rdx
pop rax ;# restore eax
ret
ENDM

File diff suppressed because it is too large Load diff