mirror of
https://git.wownero.com/wownero/RandomWOW.git
synced 2024-08-15 00:23:14 +00:00
3c8c7ee097
* Optimized dataset read There was a false dependency on readReg2 and readReg3 (caused by `xor rbp, rax` instruction) when reading dataset item (see design.md - 4.6.2 Loop execution, steps 5 and 7). This change uses `ma` register to read dataset item before the whole `rbp` (`ma` and `mx`) is changed, so superscalar and out-of-order CPU can start executing it earlier. Results: https://i.imgur.com/Bpeq9mx.png ~1% speedup on modern Intel/AMD CPUs. * ARMv8: optimized dataset read Break dependency from readReg2 and readReg3. * Fixed light mode hashing
35 lines
751 B
PHP
35 lines
751 B
PHP
;# Prologue fragment (no label of its own): saves the callee-saved integer
;# registers, moves the incoming arguments into the registers used by the
;# code that follows, zeroes r8-r15 and loads xmm8-xmm11 from the register file.
;#
;# Incoming registers, as consumed below:
;#   rdi = RegisterFile& registerFile
;#   rsi = pointer to a block with "mx"/"ma" at [rsi] and the dataset pointer at [rsi+8]
;#   rdx = uint8_t* scratchpad
;#   rcx = loop counter
;#
;# State when the fragment falls through:
;#   rbx       = loop counter
;#   rcx       = registerFile + 120
;#   rax       = the "mx"/"ma" qword exactly as loaded
;#   rbp       = the same qword rotated by 32 bits (32-bit halves swapped)
;#   rdi       = dataset pointer, rsi = scratchpad pointer
;#   r8-r15    = 0
;#   xmm8-11   = the four 16-byte values at registerFile+192, +208, +224, +240
;#   [rsp]     = registerFile pointer; the six saved registers sit above it
;#   rflags    = modified (ror / xor)
;#
;# NOTE(review): the bare `|` lines throughout are not assembler input; they look
;# like table-separator residue from the web page this text was copied from.
;# Confirm against the upstream file before assembling.
;#
;# callee-saved registers - System V AMD64 ABI
;# Six pushes here plus `push rdi` below move rsp down by 7*8 = 56 bytes.
;# NOTE(review): if this is entered through a normal `call` (rsp % 16 == 8 on
;# entry), rsp is 16-byte aligned after the seventh push - confirm the entry path.
|
|
push rbx
|
|
push rbp
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
|
|
;# function arguments
;# Order matters: rcx is copied to rbx before it is reused for registerFile, and
;# both loads through rsi happen before rsi is overwritten with the scratchpad.
|
|
mov rbx, rcx ;# loop counter
|
|
push rdi ;# RegisterFile& registerFile (kept on the stack; last push, so it ends up at [rsp])
|
|
mov rcx, rdi ;# rcx = registerFile, used as the base for the constant loads below
|
|
mov rbp, qword ptr [rsi] ;# "mx", "ma" (loaded together as one 64-bit value)
|
|
mov rdi, qword ptr [rsi+8] ;# uint8_t* dataset (read from offset 8 of the same block)
|
|
mov rsi, rdx ;# uint8_t* scratchpad
|
|
|
|
mov rax, rbp ;# rax keeps an unrotated copy of the "mx"/"ma" qword
|
|
ror rbp, 32 ;# swap the two 32-bit halves of rbp
|
|
|
|
;# zero integer registers
;# r8-r15 are all cleared; r12-r15 were saved by the pushes above.
|
|
xor r8, r8
|
|
xor r9, r9
|
|
xor r10, r10
|
|
xor r11, r11
|
|
xor r12, r12
|
|
xor r13, r13
|
|
xor r14, r14
|
|
xor r15, r15
|
|
|
|
;# load constant registers
;# rcx is advanced by 120 first, so the loads below read registerFile+192,
;# +208, +224 and +240. rcx stays at registerFile+120 afterwards.
;# NOTE(review): presumably the +120 bias keeps every offset within a signed
;# 8-bit displacement (192..240 would not fit) - confirm.
;# movapd requires a 16-byte-aligned memory operand, so registerFile+192 must
;# be 16-byte aligned.
|
|
lea rcx, [rcx+120]
|
|
movapd xmm8, xmmword ptr [rcx+72] ;# registerFile+192
|
|
movapd xmm9, xmmword ptr [rcx+88] ;# registerFile+208
|
|
movapd xmm10, xmmword ptr [rcx+104] ;# registerFile+224
|
|
movapd xmm11, xmmword ptr [rcx+120] ;# registerFile+240
|