mirror of
https://git.wownero.com/wownero/RandomWOW.git
synced 2024-08-15 00:23:14 +00:00
Optimized dataset read (#211)
* Optimized dataset read There was a false dependency on readReg2 and readReg3 (caused by `xor rbp, rax` instruction) when reading dataset item (see design.md - 4.6.2 Loop execution, steps 5 and 7). This change uses `ma` register to read dataset item before the whole `rbp` (`ma` and `mx`) is changed, so superscalar and out-of-order CPU can start executing it earlier. Results: https://i.imgur.com/Bpeq9mx.png ~1% speedup on modern Intel/AMD CPUs. * ARMv8: optimized dataset read Break dependency from readReg2 and readReg3. * Fixed light mode hashing
This commit is contained in:
parent
c12097400b
commit
3c8c7ee097
5 changed files with 20 additions and 18 deletions
|
@ -15,6 +15,7 @@
|
|||
mov rsi, rdx ;# uint8_t* scratchpad
|
||||
|
||||
mov rax, rbp
|
||||
ror rbp, 32
|
||||
|
||||
;# zero integer registers
|
||||
xor r8, r8
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
mov rbx, r9 ;# loop counter
|
||||
|
||||
mov rax, rbp
|
||||
ror rbp, 32
|
||||
|
||||
;# zero integer registers
|
||||
xor r8, r8
|
||||
|
|
|
@ -1,17 +1,16 @@
|
|||
mov ecx, ebp ;# ecx = ma
|
||||
and ecx, RANDOMX_DATASET_BASE_MASK
|
||||
xor r8, qword ptr [rdi+rcx]
|
||||
ror rbp, 32 ;# swap "ma" and "mx"
|
||||
xor rbp, rax ;# modify "mx"
|
||||
mov edx, ebp ;# edx = mx
|
||||
and edx, RANDOMX_DATASET_BASE_MASK
|
||||
prefetchnta byte ptr [rdi+rdx]
|
||||
ror rbp, 32 ;# swap "ma" and "mx"
|
||||
mov edx, ebp ;# edx = ma
|
||||
and edx, RANDOMX_DATASET_BASE_MASK
|
||||
lea rcx, [rdi+rdx] ;# dataset cache line
|
||||
xor r8, qword ptr [rcx+0]
|
||||
xor r9, qword ptr [rcx+8]
|
||||
xor r10, qword ptr [rcx+16]
|
||||
xor r11, qword ptr [rcx+24]
|
||||
xor r12, qword ptr [rcx+32]
|
||||
xor r13, qword ptr [rcx+40]
|
||||
xor r14, qword ptr [rcx+48]
|
||||
xor r15, qword ptr [rcx+56]
|
||||
xor r9, qword ptr [rdi+rcx+8]
|
||||
xor r10, qword ptr [rdi+rcx+16]
|
||||
xor r11, qword ptr [rdi+rcx+24]
|
||||
xor r12, qword ptr [rdi+rcx+32]
|
||||
xor r13, qword ptr [rdi+rcx+40]
|
||||
xor r14, qword ptr [rdi+rcx+48]
|
||||
xor r15, qword ptr [rdi+rcx+56]
|
||||
|
|
@ -8,10 +8,10 @@
|
|||
mov qword ptr [rsp+16], r13
|
||||
mov qword ptr [rsp+8], r14
|
||||
mov qword ptr [rsp+0], r15
|
||||
xor rbp, rax ;# modify "mx"
|
||||
ror rbp, 32 ;# swap "ma" and "mx"
|
||||
mov ebx, ebp ;# ecx = ma
|
||||
and ebx, RANDOMX_DATASET_BASE_MASK
|
||||
shr ebx, 6 ;# ebx = Dataset block number
|
||||
xor rbp, rax ;# modify "mx"
|
||||
mov rbx, rbp ;# ebx = ma
|
||||
shr rbx, 38
|
||||
and ebx, RANDOMX_DATASET_BASE_MASK / 64 ;# ebx = Dataset block number
|
||||
;# add ebx, datasetOffset / 64
|
||||
;# call 32768
|
|
@ -307,6 +307,9 @@ literal_v14: .fill 2,8,0
|
|||
literal_v15: .fill 2,8,0
|
||||
|
||||
DECL(randomx_program_aarch64_vm_instructions_end):
|
||||
# Calculate dataset pointer for dataset read
|
||||
# Do it here to break false dependency from readReg2 and readReg3 (see next line)
|
||||
lsr x10, x9, 32
|
||||
|
||||
# mx ^= r[readReg2] ^ r[readReg3];
|
||||
eor x9, x9, x18
|
||||
|
@ -324,8 +327,6 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
|
|||
# mx <-> ma
|
||||
ror x9, x9, 32
|
||||
|
||||
# Calculate dataset pointer for dataset read
|
||||
mov w10, w9
|
||||
DECL(randomx_program_aarch64_cacheline_align_mask2):
|
||||
# Actual mask will be inserted by JIT compiler
|
||||
and x10, x10, 1
|
||||
|
|
Loading…
Reference in a new issue