mirror of
				https://git.wownero.com/wownero/RandomWOW.git
				synced 2024-08-15 00:23:14 +00:00 
			
		
		
		
	Optimized dataset read (#211)
* Optimized dataset read

  There was a false dependency on readReg2 and readReg3 (caused by the `xor rbp, rax` instruction) when reading a dataset item (see design.md - 4.6.2 Loop execution, steps 5 and 7). This change uses the `ma` register to read the dataset item before the whole `rbp` (`ma` and `mx`) is changed, so superscalar and out-of-order CPUs can start executing the read earlier.

  Results: https://i.imgur.com/Bpeq9mx.png

  ~1% speedup on modern Intel/AMD CPUs.

* ARMv8: optimized dataset read

  Break the dependency on readReg2 and readReg3.

* Fixed light mode hashing
This commit is contained in:
		
							parent
							
								
									c12097400b
								
							
						
					
					
						commit
						3c8c7ee097
					
				
					 5 changed files with 20 additions and 18 deletions
				
			
		|  | @ -15,6 +15,7 @@ | |||
| 	mov rsi, rdx                ;# uint8_t* scratchpad
 | ||||
| 
 | ||||
| 	mov rax, rbp | ||||
| 	ror rbp, 32 | ||||
| 
 | ||||
| 	;# zero integer registers
 | ||||
| 	xor r8, r8 | ||||
|  |  | |||
|  | @ -28,6 +28,7 @@ | |||
| 	mov rbx, r9                 ;# loop counter
 | ||||
| 
 | ||||
| 	mov rax, rbp | ||||
| 	ror rbp, 32 | ||||
| 
 | ||||
| 	;# zero integer registers
 | ||||
| 	xor r8, r8 | ||||
|  |  | |||
|  | @ -1,17 +1,16 @@ | |||
| 	mov ecx, ebp                       ;# ecx = ma
 | ||||
| 	and ecx, RANDOMX_DATASET_BASE_MASK | ||||
| 	xor r8, qword ptr [rdi+rcx] | ||||
| 	ror rbp, 32                        ;# swap "ma" and "mx"
 | ||||
| 	xor rbp, rax                       ;# modify "mx"
 | ||||
| 	mov edx, ebp                       ;# edx = mx
 | ||||
| 	and edx, RANDOMX_DATASET_BASE_MASK | ||||
| 	prefetchnta byte ptr [rdi+rdx] | ||||
| 	ror rbp, 32                        ;# swap "ma" and "mx"
 | ||||
| 	mov edx, ebp                       ;# edx = ma
 | ||||
| 	and edx, RANDOMX_DATASET_BASE_MASK | ||||
| 	lea rcx, [rdi+rdx]                 ;# dataset cache line
 | ||||
| 	xor r8,  qword ptr [rcx+0] | ||||
| 	xor r9,  qword ptr [rcx+8] | ||||
| 	xor r10, qword ptr [rcx+16] | ||||
| 	xor r11, qword ptr [rcx+24] | ||||
| 	xor r12, qword ptr [rcx+32] | ||||
| 	xor r13, qword ptr [rcx+40] | ||||
| 	xor r14, qword ptr [rcx+48] | ||||
| 	xor r15, qword ptr [rcx+56] | ||||
| 	xor r9,  qword ptr [rdi+rcx+8] | ||||
| 	xor r10, qword ptr [rdi+rcx+16] | ||||
| 	xor r11, qword ptr [rdi+rcx+24] | ||||
| 	xor r12, qword ptr [rdi+rcx+32] | ||||
| 	xor r13, qword ptr [rdi+rcx+40] | ||||
| 	xor r14, qword ptr [rdi+rcx+48] | ||||
| 	xor r15, qword ptr [rdi+rcx+56] | ||||
| 	 | ||||
|  | @ -8,10 +8,10 @@ | |||
| 	mov qword ptr [rsp+16], r13 | ||||
| 	mov qword ptr [rsp+8], r14 | ||||
| 	mov qword ptr [rsp+0], r15 | ||||
| 	xor rbp, rax                       ;# modify "mx"
 | ||||
| 	ror rbp, 32                        ;# swap "ma" and "mx"
 | ||||
| 	mov ebx, ebp                       ;# ecx = ma
 | ||||
| 	and ebx, RANDOMX_DATASET_BASE_MASK | ||||
| 	shr ebx, 6                         ;# ebx = Dataset block number
 | ||||
| 	xor rbp, rax                       ;# modify "mx"
 | ||||
| 	mov rbx, rbp                       ;# ebx = ma
 | ||||
| 	shr rbx, 38 | ||||
| 	and ebx, RANDOMX_DATASET_BASE_MASK / 64 ;# ebx = Dataset block number
 | ||||
| 	;# add ebx, datasetOffset / 64
 | ||||
| 	;# call 32768
 | ||||
|  | @ -307,6 +307,9 @@ literal_v14: .fill 2,8,0 | |||
| literal_v15: .fill 2,8,0 | ||||
| 
 | ||||
| DECL(randomx_program_aarch64_vm_instructions_end): | ||||
| 	# Calculate dataset pointer for dataset read | ||||
| 	# Do it here to break false dependency from readReg2 and readReg3 (see next line) | ||||
| 	lsr	x10, x9, 32 | ||||
| 
 | ||||
| 	# mx ^= r[readReg2] ^ r[readReg3];
 | ||||
| 	eor	x9, x9, x18 | ||||
|  | @ -324,8 +327,6 @@ DECL(randomx_program_aarch64_cacheline_align_mask1): | |||
| 	# mx <-> ma | ||||
| 	ror	x9, x9, 32 | ||||
| 
 | ||||
| 	# Calculate dataset pointer for dataset read | ||||
| 	mov	w10, w9 | ||||
| DECL(randomx_program_aarch64_cacheline_align_mask2): | ||||
| 	# Actual mask will be inserted by JIT compiler | ||||
| 	and	x10, x10, 1 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue