mirror of
				https://git.wownero.com/wownero/RandomWOW.git
				synced 2024-08-15 00:23:14 +00:00 
			
		
		
		
	initBlock: cycle columns, asm implementation
This commit is contained in:
		
							parent
							
								
									55a22febbd
								
							
						
					
					
						commit
						edde7672e0
					
				
					 5 changed files with 249 additions and 30 deletions
				
			
		|  | @ -351,7 +351,7 @@ namespace RandomX { | |||
| 				//mem.mx &= CacheLineAlignMask;
 | ||||
| 				Cache& cache = mem.ds.cache; | ||||
| 				uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; | ||||
| 				initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize); | ||||
| 				initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8); | ||||
| 				for (int i = 0; i < RegistersCount; ++i) | ||||
| 					r[i] ^= datasetLine[i]; | ||||
| 				std::swap(mem.mx, mem.ma); | ||||
|  |  | |||
|  | @ -54,7 +54,7 @@ namespace RandomX { | |||
| #endif | ||||
| 		uint32_t currentBlock = addr / CacheLineSize; | ||||
| 		if (currentBlock != startBlock || output != currentLine.data()) { | ||||
| 			initBlock(cache, (uint8_t*)currentLine.data(), currentBlock); | ||||
| 			initBlock(cache, (uint8_t*)currentLine.data(), currentBlock, RANDOMX_CACHE_ACCESSES / 8); | ||||
| 		} | ||||
| 		else { | ||||
| 			sync(); | ||||
|  | @ -81,7 +81,7 @@ namespace RandomX { | |||
| 
 | ||||
| 	void LightClientAsyncWorker::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { | ||||
| 		for (uint32_t i = 0; i < blockCount; ++i) { | ||||
| 			initBlock(cache, (uint8_t*)out + CacheLineSize * i, startBlock + i); | ||||
| 			initBlock(cache, (uint8_t*)out + CacheLineSize * i, startBlock + i, RANDOMX_CACHE_ACCESSES / 8); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
|  | @ -101,7 +101,7 @@ namespace RandomX { | |||
| 			std::cout << sw.getElapsed() << ": runWorker-getBlocks " << startBlock << "/" << blockCount << std::endl; | ||||
| #endif | ||||
| 			//getBlocks(output, startBlock, blockCount);
 | ||||
| 			initBlock(cache, (uint8_t*)output, startBlock); | ||||
| 			initBlock(cache, (uint8_t*)output, startBlock, RANDOMX_CACHE_ACCESSES / 8); | ||||
| 			hasWork = false; | ||||
| #ifdef TRACE | ||||
| 			std::cout << sw.getElapsed() << ": runWorker-finished " << startBlock << "/" << blockCount << std::endl; | ||||
|  |  | |||
|  | @ -40,34 +40,65 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>. | |||
| 
 | ||||
| namespace RandomX { | ||||
| 
 | ||||
| 	void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber) { | ||||
| #if !defined(_M_X64) | ||||
| 	static FORCE_INLINE uint8_t* selectMixBlock(const Cache& cache, uint64_t& currentIndex, uint64_t& nextIndex) { | ||||
| 		uint8_t* mixBlock; | ||||
| 		if (RANDOMX_ARGON_GROWTH == 0) { | ||||
| 			constexpr uint32_t mask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1); | ||||
| 			mixBlock = cache.memory + (currentIndex & mask) * CacheLineSize; | ||||
| 		} | ||||
| 		else { | ||||
| 			const uint32_t modulus = cache.size / CacheLineSize; | ||||
| 			mixBlock = cache.memory + (currentIndex % modulus) * CacheLineSize; | ||||
| 		} | ||||
| 		PREFETCHNTA(mixBlock); | ||||
| 		nextIndex = squareHash(currentIndex + nextIndex); | ||||
| 		return mixBlock; | ||||
| 	} | ||||
| 
 | ||||
| 	static FORCE_INLINE void mixCache(uint8_t* mixBlock, uint64_t& c0, uint64_t& c1, uint64_t& c2, uint64_t& c3, uint64_t& c4, uint64_t& c5, uint64_t& c6, uint64_t& c7) { | ||||
| 		c0 ^= load64(mixBlock + 0); | ||||
| 		c1 ^= load64(mixBlock + 8); | ||||
| 		c2 ^= load64(mixBlock + 16); | ||||
| 		c3 ^= load64(mixBlock + 24); | ||||
| 		c4 ^= load64(mixBlock + 32); | ||||
| 		c5 ^= load64(mixBlock + 40); | ||||
| 		c6 ^= load64(mixBlock + 48); | ||||
| 		c7 ^= load64(mixBlock + 56); | ||||
| 	} | ||||
| 
 | ||||
| 	void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations) { | ||||
| 		uint64_t c0, c1, c2, c3, c4, c5, c6, c7; | ||||
| 
 | ||||
| 		c0 = 4ULL * blockNumber; | ||||
| 		c0 = blockNumber; | ||||
| 		c1 = c2 = c3 = c4 = c5 = c6 = c7 = 0; | ||||
| 
 | ||||
| 		constexpr uint32_t mask = (CacheSize - 1) & CacheLineAlignMask; | ||||
| 		uint8_t* mixBlock; | ||||
| 
 | ||||
| 		for (auto i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { | ||||
| 			const uint8_t* mixBlock; | ||||
| 			if (RANDOMX_ARGON_GROWTH == 0) { | ||||
| 				constexpr uint32_t mask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1); | ||||
| 				mixBlock = cache.memory + (c0 & mask) * CacheLineSize; | ||||
| 			} | ||||
| 			else { | ||||
| 				const uint32_t modulus = cache.size / CacheLineSize; | ||||
| 				mixBlock = cache.memory + (c0 % modulus) * CacheLineSize; | ||||
| 			} | ||||
| 			PREFETCHNTA(mixBlock); | ||||
| 			c0 = squareHash(c0); | ||||
| 			c0 ^= load64(mixBlock + 0); | ||||
| 			c1 ^= load64(mixBlock + 8); | ||||
| 			c2 ^= load64(mixBlock + 16); | ||||
| 			c3 ^= load64(mixBlock + 24); | ||||
| 			c4 ^= load64(mixBlock + 32); | ||||
| 			c5 ^= load64(mixBlock + 40); | ||||
| 			c6 ^= load64(mixBlock + 48); | ||||
| 			c7 ^= load64(mixBlock + 56); | ||||
| 		for (auto i = 0; i < RANDOMX_CACHE_ACCESSES / 8; ++i) { | ||||
| 			mixBlock = selectMixBlock(cache, c0, c1); | ||||
| 			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); | ||||
| 
 | ||||
| 			mixBlock = selectMixBlock(cache, c1, c2); | ||||
| 			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); | ||||
| 
 | ||||
| 			mixBlock = selectMixBlock(cache, c2, c3); | ||||
| 			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); | ||||
| 
 | ||||
| 			mixBlock = selectMixBlock(cache, c3, c4); | ||||
| 			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); | ||||
| 
 | ||||
| 			mixBlock = selectMixBlock(cache, c4, c5); | ||||
| 			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); | ||||
| 
 | ||||
| 			mixBlock = selectMixBlock(cache, c5, c6); | ||||
| 			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); | ||||
| 
 | ||||
| 			mixBlock = selectMixBlock(cache, c6, c7); | ||||
| 			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); | ||||
| 
 | ||||
| 			mixBlock = selectMixBlock(cache, c7, c0); | ||||
| 			mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7); | ||||
| 		} | ||||
| 
 | ||||
| 		store64(out + 0, c0); | ||||
|  | @ -79,6 +110,7 @@ namespace RandomX { | |||
| 		store64(out + 48, c6); | ||||
| 		store64(out + 56, c7); | ||||
| 	} | ||||
| #endif | ||||
| 
 | ||||
| 	void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { | ||||
| 		uint64_t* datasetLine = (uint64_t*)(memory.ds.dataset.memory + memory.ma); | ||||
|  | @ -95,7 +127,7 @@ namespace RandomX { | |||
| 		memory.mx &= CacheLineAlignMask; //align to cache line
 | ||||
| 		Cache& cache = memory.ds.cache; | ||||
| 		uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; | ||||
| 		initBlock(cache, (uint8_t*)datasetLine, memory.ma / CacheLineSize); | ||||
| 		initBlock(cache, (uint8_t*)datasetLine, memory.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8); | ||||
| 		for (int i = 0; i < RegistersCount; ++i) | ||||
| 			reg[i] ^= datasetLine[i]; | ||||
| 		std::swap(memory.mx, memory.ma); | ||||
|  | @ -128,7 +160,7 @@ namespace RandomX { | |||
| 
 | ||||
| 	void datasetInit(Cache& cache, Dataset& ds, uint32_t startBlock, uint32_t blockCount) { | ||||
| 		for (uint64_t i = startBlock; i < startBlock + blockCount; ++i) { | ||||
| 			initBlock(cache, ds.memory + i * CacheLineSize, i); | ||||
| 			initBlock(cache, ds.memory + i * CacheLineSize, i, RANDOMX_CACHE_ACCESSES / 8); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
|  |  | |||
|  | @ -25,7 +25,10 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>. | |||
| 
 | ||||
| namespace RandomX { | ||||
| 
 | ||||
| 	void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber); | ||||
| #if defined(_M_X64) | ||||
| 	extern "C" | ||||
| #endif | ||||
| 	void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations); | ||||
| 
 | ||||
| 	void datasetAlloc(dataset_t& ds, bool largePages); | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,6 +1,7 @@ | |||
| IFDEF RAX | ||||
| 
 | ||||
| PUBLIC squareHash | ||||
| PUBLIC initBlock | ||||
| 
 | ||||
| .code | ||||
| 
 | ||||
|  | @ -8,6 +9,189 @@ squareHash PROC | |||
| 	include asm/squareHash.inc | ||||
| squareHash ENDP | ||||
| 
 | ||||
; One mixing step of initBlock.
;   cur = column register whose value selects the cache line
;   nxt = column register that is replaced by squareHash(cur + nxt)
; Uses rbx as the line pointer, rcx as the squareHash argument and rax as its
; result; expects rdi to hold the cache memory base.
; NOTE(review): the line mask 4194303 (2^22 - 1) and the shift by 6 (x64) are
; hard-coded; they are presumably meant to equal
; RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1 and CacheLineSize
; from the C++ implementation -- confirm when those constants change.
; NOTE(review): r8-r11 are live across "call squareHash" although they are
; volatile in the Microsoft x64 convention, and the call is made without
; shadow space; this relies on squareHash (asm/squareHash.inc) touching only
; its own scratch registers -- confirm against that file.
INITBLOCK_STEP MACRO cur, nxt
	mov rbx, cur
	and rbx, 4194303
	shl rbx, 6
	add rbx, rdi
	prefetchnta byte ptr [rbx]
	lea rcx, [cur+nxt]
	call squareHash
	mov nxt, rax
	xor r8, qword ptr [rbx+0]
	xor r9, qword ptr [rbx+8]
	xor r10, qword ptr [rbx+16]
	xor r11, qword ptr [rbx+24]
	xor r12, qword ptr [rbx+32]
	xor r13, qword ptr [rbx+40]
	xor r14, qword ptr [rbx+48]
	xor r15, qword ptr [rbx+56]
ENDM

; void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations)
;
; Arguments on entry:
;   rcx = cache        (the first 8 bytes of the object are loaded as the
;                       memory base; presumably Cache::memory -- confirm layout)
;   rdx = out          (receives 64 bytes: columns c0..c7)
;   r8  = blockNumber  (initial value of column c0)
;   r9d = iterations   (number of 8-step rounds; only the low 32 bits are valid)
;
; Register use inside the routine:
;   r8..r15 = columns c0..c7, rsi = remaining rounds,
;   rdi = cache memory base, rbp = out, rbx = current cache line.
initBlock PROC
	push rbx
	push rbp
	push rsi
	push rdi
	push r12
	push r13
	push r14
	push r15
	; "iterations" is a 32-bit unsigned: the upper half of r9 is not guaranteed
	; to be zero, so copy only r9d (the 32-bit move zero-extends into rsi).
	mov esi, r9d
	mov rdi, qword ptr [rcx]
	mov rbp, rdx
	prefetcht0 byte ptr [rbp]
	; c0 (r8) already holds blockNumber; c1..c7 start at zero
	xor r9, r9
	xor r10, r10
	xor r11, r11
	xor r12, r12
	xor r13, r13
	xor r14, r14
	xor r15, r15
	; zero rounds requested: store the initial columns instead of letting the
	; counter wrap around
	test esi, esi
	jz initBlock_store
initBlock_loop:
	INITBLOCK_STEP r8, r9		; c0 selects, c1 is rehashed
	INITBLOCK_STEP r9, r10		; c1 selects, c2 is rehashed
	INITBLOCK_STEP r10, r11		; c2 selects, c3 is rehashed
	INITBLOCK_STEP r11, r12		; c3 selects, c4 is rehashed
	INITBLOCK_STEP r12, r13		; c4 selects, c5 is rehashed
	INITBLOCK_STEP r13, r14		; c5 selects, c6 is rehashed
	INITBLOCK_STEP r14, r15		; c6 selects, c7 is rehashed
	INITBLOCK_STEP r15, r8		; c7 selects, c0 is rehashed
	sub esi, 1
	jnz initBlock_loop
initBlock_store:
	mov qword ptr [rbp+0], r8
	mov qword ptr [rbp+8], r9
	mov qword ptr [rbp+16], r10
	mov qword ptr [rbp+24], r11
	mov qword ptr [rbp+32], r12
	mov qword ptr [rbp+40], r13
	mov qword ptr [rbp+48], r14
	mov qword ptr [rbp+56], r15
	pop r15
	pop r14
	pop r13
	pop r12
	pop rdi
	pop rsi
	pop rbp
	pop rbx
	ret
initBlock ENDP
| 
 | ||||
| ENDIF | ||||
| 
 | ||||
| END | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue