mirror of
https://git.wownero.com/wownero/RandomWOW.git
synced 2024-08-15 00:23:14 +00:00
initBlock: cycle columns, asm implementation
This commit is contained in:
parent
55a22febbd
commit
edde7672e0
5 changed files with 249 additions and 30 deletions
|
@ -351,7 +351,7 @@ namespace RandomX {
|
||||||
//mem.mx &= CacheLineAlignMask;
|
//mem.mx &= CacheLineAlignMask;
|
||||||
Cache& cache = mem.ds.cache;
|
Cache& cache = mem.ds.cache;
|
||||||
uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)];
|
uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)];
|
||||||
initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize);
|
initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8);
|
||||||
for (int i = 0; i < RegistersCount; ++i)
|
for (int i = 0; i < RegistersCount; ++i)
|
||||||
r[i] ^= datasetLine[i];
|
r[i] ^= datasetLine[i];
|
||||||
std::swap(mem.mx, mem.ma);
|
std::swap(mem.mx, mem.ma);
|
||||||
|
|
|
@ -54,7 +54,7 @@ namespace RandomX {
|
||||||
#endif
|
#endif
|
||||||
uint32_t currentBlock = addr / CacheLineSize;
|
uint32_t currentBlock = addr / CacheLineSize;
|
||||||
if (currentBlock != startBlock || output != currentLine.data()) {
|
if (currentBlock != startBlock || output != currentLine.data()) {
|
||||||
initBlock(cache, (uint8_t*)currentLine.data(), currentBlock);
|
initBlock(cache, (uint8_t*)currentLine.data(), currentBlock, RANDOMX_CACHE_ACCESSES / 8);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
sync();
|
sync();
|
||||||
|
@ -81,7 +81,7 @@ namespace RandomX {
|
||||||
|
|
||||||
void LightClientAsyncWorker::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) {
|
void LightClientAsyncWorker::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) {
|
||||||
for (uint32_t i = 0; i < blockCount; ++i) {
|
for (uint32_t i = 0; i < blockCount; ++i) {
|
||||||
initBlock(cache, (uint8_t*)out + CacheLineSize * i, startBlock + i);
|
initBlock(cache, (uint8_t*)out + CacheLineSize * i, startBlock + i, RANDOMX_CACHE_ACCESSES / 8);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -101,7 +101,7 @@ namespace RandomX {
|
||||||
std::cout << sw.getElapsed() << ": runWorker-getBlocks " << startBlock << "/" << blockCount << std::endl;
|
std::cout << sw.getElapsed() << ": runWorker-getBlocks " << startBlock << "/" << blockCount << std::endl;
|
||||||
#endif
|
#endif
|
||||||
//getBlocks(output, startBlock, blockCount);
|
//getBlocks(output, startBlock, blockCount);
|
||||||
initBlock(cache, (uint8_t*)output, startBlock);
|
initBlock(cache, (uint8_t*)output, startBlock, RANDOMX_CACHE_ACCESSES / 8);
|
||||||
hasWork = false;
|
hasWork = false;
|
||||||
#ifdef TRACE
|
#ifdef TRACE
|
||||||
std::cout << sw.getElapsed() << ": runWorker-finished " << startBlock << "/" << blockCount << std::endl;
|
std::cout << sw.getElapsed() << ": runWorker-finished " << startBlock << "/" << blockCount << std::endl;
|
||||||
|
|
|
@ -40,34 +40,65 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
namespace RandomX {
|
namespace RandomX {
|
||||||
|
|
||||||
void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber) {
|
#if !defined(_M_X64)
|
||||||
|
// Chooses the cache line addressed by `currentIndex`, issues a non-temporal
// prefetch for it, and then replaces `nextIndex` with
// squareHash(currentIndex + nextIndex).
//
// cache        - supplies the backing memory (and, in the growth case, its size)
// currentIndex - value reduced to a line index (mask or modulo, see below)
// nextIndex    - in/out: overwritten with the hash of the two indices' sum
//
// Returns a pointer to the first byte of the selected CacheLineSize-byte line.
static FORCE_INLINE uint8_t* selectMixBlock(const Cache& cache, uint64_t& currentIndex, uint64_t& nextIndex) {
	uint64_t lineIndex;
	if (RANDOMX_ARGON_GROWTH != 0) {
		// Line count is taken from the cache at run time, so reduce by modulo.
		const uint32_t lineCount = cache.size / CacheLineSize;
		lineIndex = currentIndex % lineCount;
	}
	else {
		// Line count is a compile-time constant here; the reduction is a bit mask.
		constexpr uint32_t lineMask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1);
		lineIndex = currentIndex & lineMask;
	}
	uint8_t* selected = cache.memory + lineIndex * CacheLineSize;
	// Start fetching the line before the hash below is computed.
	PREFETCHNTA(selected);
	nextIndex = squareHash(currentIndex + nextIndex);
	return selected;
}
|
||||||
|
|
||||||
|
// XORs the eight consecutive 64-bit words read from `mixBlock` (via load64,
// at byte offsets 0, 8, ..., 56) into the accumulators c0..c7, word k into ck.
static FORCE_INLINE void mixCache(uint8_t* mixBlock, uint64_t& c0, uint64_t& c1, uint64_t& c2, uint64_t& c3, uint64_t& c4, uint64_t& c5, uint64_t& c6, uint64_t& c7) {
	c0 = c0 ^ load64(mixBlock + 0 * sizeof(uint64_t));
	c1 = c1 ^ load64(mixBlock + 1 * sizeof(uint64_t));
	c2 = c2 ^ load64(mixBlock + 2 * sizeof(uint64_t));
	c3 = c3 ^ load64(mixBlock + 3 * sizeof(uint64_t));
	c4 = c4 ^ load64(mixBlock + 4 * sizeof(uint64_t));
	c5 = c5 ^ load64(mixBlock + 5 * sizeof(uint64_t));
	c6 = c6 ^ load64(mixBlock + 6 * sizeof(uint64_t));
	c7 = c7 ^ load64(mixBlock + 7 * sizeof(uint64_t));
}
|
||||||
|
|
||||||
|
void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations) {
|
||||||
uint64_t c0, c1, c2, c3, c4, c5, c6, c7;
|
uint64_t c0, c1, c2, c3, c4, c5, c6, c7;
|
||||||
|
|
||||||
c0 = 4ULL * blockNumber;
|
c0 = blockNumber;
|
||||||
c1 = c2 = c3 = c4 = c5 = c6 = c7 = 0;
|
c1 = c2 = c3 = c4 = c5 = c6 = c7 = 0;
|
||||||
|
|
||||||
constexpr uint32_t mask = (CacheSize - 1) & CacheLineAlignMask;
|
uint8_t* mixBlock;
|
||||||
|
|
||||||
for (auto i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
|
for (auto i = 0; i < RANDOMX_CACHE_ACCESSES / 8; ++i) {
|
||||||
const uint8_t* mixBlock;
|
mixBlock = selectMixBlock(cache, c0, c1);
|
||||||
if (RANDOMX_ARGON_GROWTH == 0) {
|
mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
|
||||||
constexpr uint32_t mask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1);
|
|
||||||
mixBlock = cache.memory + (c0 & mask) * CacheLineSize;
|
mixBlock = selectMixBlock(cache, c1, c2);
|
||||||
}
|
mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
|
||||||
else {
|
|
||||||
const uint32_t modulus = cache.size / CacheLineSize;
|
mixBlock = selectMixBlock(cache, c2, c3);
|
||||||
mixBlock = cache.memory + (c0 % modulus) * CacheLineSize;
|
mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
|
||||||
}
|
|
||||||
PREFETCHNTA(mixBlock);
|
mixBlock = selectMixBlock(cache, c3, c4);
|
||||||
c0 = squareHash(c0);
|
mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
|
||||||
c0 ^= load64(mixBlock + 0);
|
|
||||||
c1 ^= load64(mixBlock + 8);
|
mixBlock = selectMixBlock(cache, c4, c5);
|
||||||
c2 ^= load64(mixBlock + 16);
|
mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
|
||||||
c3 ^= load64(mixBlock + 24);
|
|
||||||
c4 ^= load64(mixBlock + 32);
|
mixBlock = selectMixBlock(cache, c5, c6);
|
||||||
c5 ^= load64(mixBlock + 40);
|
mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
|
||||||
c6 ^= load64(mixBlock + 48);
|
|
||||||
c7 ^= load64(mixBlock + 56);
|
mixBlock = selectMixBlock(cache, c6, c7);
|
||||||
|
mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
|
||||||
|
|
||||||
|
mixBlock = selectMixBlock(cache, c7, c0);
|
||||||
|
mixCache(mixBlock, c0, c1, c2, c3, c4, c5, c6, c7);
|
||||||
}
|
}
|
||||||
|
|
||||||
store64(out + 0, c0);
|
store64(out + 0, c0);
|
||||||
|
@ -79,6 +110,7 @@ namespace RandomX {
|
||||||
store64(out + 48, c6);
|
store64(out + 48, c6);
|
||||||
store64(out + 56, c7);
|
store64(out + 56, c7);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) {
|
void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) {
|
||||||
uint64_t* datasetLine = (uint64_t*)(memory.ds.dataset.memory + memory.ma);
|
uint64_t* datasetLine = (uint64_t*)(memory.ds.dataset.memory + memory.ma);
|
||||||
|
@ -95,7 +127,7 @@ namespace RandomX {
|
||||||
memory.mx &= CacheLineAlignMask; //align to cache line
|
memory.mx &= CacheLineAlignMask; //align to cache line
|
||||||
Cache& cache = memory.ds.cache;
|
Cache& cache = memory.ds.cache;
|
||||||
uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)];
|
uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)];
|
||||||
initBlock(cache, (uint8_t*)datasetLine, memory.ma / CacheLineSize);
|
initBlock(cache, (uint8_t*)datasetLine, memory.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8);
|
||||||
for (int i = 0; i < RegistersCount; ++i)
|
for (int i = 0; i < RegistersCount; ++i)
|
||||||
reg[i] ^= datasetLine[i];
|
reg[i] ^= datasetLine[i];
|
||||||
std::swap(memory.mx, memory.ma);
|
std::swap(memory.mx, memory.ma);
|
||||||
|
@ -128,7 +160,7 @@ namespace RandomX {
|
||||||
|
|
||||||
void datasetInit(Cache& cache, Dataset& ds, uint32_t startBlock, uint32_t blockCount) {
|
void datasetInit(Cache& cache, Dataset& ds, uint32_t startBlock, uint32_t blockCount) {
|
||||||
for (uint64_t i = startBlock; i < startBlock + blockCount; ++i) {
|
for (uint64_t i = startBlock; i < startBlock + blockCount; ++i) {
|
||||||
initBlock(cache, ds.memory + i * CacheLineSize, i);
|
initBlock(cache, ds.memory + i * CacheLineSize, i, RANDOMX_CACHE_ACCESSES / 8);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,10 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
namespace RandomX {
|
namespace RandomX {
|
||||||
|
|
||||||
void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber);
|
#if defined(_M_X64)
|
||||||
|
extern "C"
|
||||||
|
#endif
|
||||||
|
void initBlock(const Cache& cache, uint8_t* out, uint64_t blockNumber, unsigned iterations);
|
||||||
|
|
||||||
void datasetAlloc(dataset_t& ds, bool largePages);
|
void datasetAlloc(dataset_t& ds, bool largePages);
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
IFDEF RAX
|
IFDEF RAX
|
||||||
|
|
||||||
PUBLIC squareHash
|
PUBLIC squareHash
|
||||||
|
PUBLIC initBlock
|
||||||
|
|
||||||
.code
|
.code
|
||||||
|
|
||||||
|
@ -8,6 +9,189 @@ squareHash PROC
|
||||||
include asm/squareHash.inc
|
include asm/squareHash.inc
|
||||||
squareHash ENDP
|
squareHash ENDP
|
||||||
|
|
||||||
|
; initBlock - computes one 64-byte output block from the cache.
;
; Arguments:
;   rcx = cache       (the first qword of the object is used as the memory base;
;                      presumably Cache::memory - TODO confirm the struct layout)
;   rdx = out         (receives eight qwords, c0..c7)
;   r8  = blockNumber (initial value of c0)
;   r9d = iterations  (32-bit unsigned; each iteration performs 8 cache accesses)
;
; Register use inside the procedure:
;   r8..r15 = accumulators c0..c7
;   rbx     = address of the cache line currently being mixed in
;   rdi     = cache memory base
;   rbp     = out
;   rsi     = remaining iteration count
;
; NOTE(review): c0..c3 stay live in r8-r11 across "call squareHash", so this
; relies on squareHash (asm/squareHash.inc) leaving r8-r11 and rbx untouched
; and needing no shadow space - confirm against that include file.
; NOTE(review): the line mask 4194303 is hard-coded; it must agree with
; RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1 used by the C++
; implementation - confirm when those constants change.
initBlock PROC
	push rbx
	push rbp
	push rsi
	push rdi
	push r12
	push r13
	push r14
	push r15
	; iterations is a 32-bit argument: the upper half of r9 is not guaranteed
	; to be zero, so take only r9d (the write to esi zero-extends into rsi).
	mov esi, r9d
	mov rdi, qword ptr [rcx]
	mov rbp, rdx
	prefetcht0 byte ptr [rbp]
	; r8 = blockNumber (c0); c1..c7 start at zero
	xor r9, r9
	xor r10, r10
	xor r11, r11
	xor r12, r12
	xor r13, r13
	xor r14, r14
	xor r15, r15
	; A zero count would otherwise wrap in "sub rsi, 1" and loop ~2^64 times.
	test esi, esi
	jz initBlock_store
initBlock_loop:
	; c0: select line by c0, c1 = squareHash(c0 + c1), then mix the line in
	mov rbx, r8
	and rbx, 4194303              ; reduce to a line index
	shl rbx, 6                    ; * 64 bytes per line
	add rbx, rdi
	prefetchnta byte ptr [rbx]
	lea rcx, [r8+r9]
	call squareHash
	mov r9, rax
	xor r8, qword ptr [rbx+0]
	xor r9, qword ptr [rbx+8]
	xor r10, qword ptr [rbx+16]
	xor r11, qword ptr [rbx+24]
	xor r12, qword ptr [rbx+32]
	xor r13, qword ptr [rbx+40]
	xor r14, qword ptr [rbx+48]
	xor r15, qword ptr [rbx+56]
	; c1: select line by c1, c2 = squareHash(c1 + c2)
	mov rbx, r9
	and rbx, 4194303
	shl rbx, 6
	add rbx, rdi
	prefetchnta byte ptr [rbx]
	lea rcx, [r9+r10]
	call squareHash
	mov r10, rax
	xor r8, qword ptr [rbx+0]
	xor r9, qword ptr [rbx+8]
	xor r10, qword ptr [rbx+16]
	xor r11, qword ptr [rbx+24]
	xor r12, qword ptr [rbx+32]
	xor r13, qword ptr [rbx+40]
	xor r14, qword ptr [rbx+48]
	xor r15, qword ptr [rbx+56]
	; c2: select line by c2, c3 = squareHash(c2 + c3)
	mov rbx, r10
	and rbx, 4194303
	shl rbx, 6
	add rbx, rdi
	prefetchnta byte ptr [rbx]
	lea rcx, [r10+r11]
	call squareHash
	mov r11, rax
	xor r8, qword ptr [rbx+0]
	xor r9, qword ptr [rbx+8]
	xor r10, qword ptr [rbx+16]
	xor r11, qword ptr [rbx+24]
	xor r12, qword ptr [rbx+32]
	xor r13, qword ptr [rbx+40]
	xor r14, qword ptr [rbx+48]
	xor r15, qword ptr [rbx+56]
	; c3: select line by c3, c4 = squareHash(c3 + c4)
	mov rbx, r11
	and rbx, 4194303
	shl rbx, 6
	add rbx, rdi
	prefetchnta byte ptr [rbx]
	lea rcx, [r11+r12]
	call squareHash
	mov r12, rax
	xor r8, qword ptr [rbx+0]
	xor r9, qword ptr [rbx+8]
	xor r10, qword ptr [rbx+16]
	xor r11, qword ptr [rbx+24]
	xor r12, qword ptr [rbx+32]
	xor r13, qword ptr [rbx+40]
	xor r14, qword ptr [rbx+48]
	xor r15, qword ptr [rbx+56]
	; c4: select line by c4, c5 = squareHash(c4 + c5)
	mov rbx, r12
	and rbx, 4194303
	shl rbx, 6
	add rbx, rdi
	prefetchnta byte ptr [rbx]
	lea rcx, [r12+r13]
	call squareHash
	mov r13, rax
	xor r8, qword ptr [rbx+0]
	xor r9, qword ptr [rbx+8]
	xor r10, qword ptr [rbx+16]
	xor r11, qword ptr [rbx+24]
	xor r12, qword ptr [rbx+32]
	xor r13, qword ptr [rbx+40]
	xor r14, qword ptr [rbx+48]
	xor r15, qword ptr [rbx+56]
	; c5: select line by c5, c6 = squareHash(c5 + c6)
	mov rbx, r13
	and rbx, 4194303
	shl rbx, 6
	add rbx, rdi
	prefetchnta byte ptr [rbx]
	lea rcx, [r13+r14]
	call squareHash
	mov r14, rax
	xor r8, qword ptr [rbx+0]
	xor r9, qword ptr [rbx+8]
	xor r10, qword ptr [rbx+16]
	xor r11, qword ptr [rbx+24]
	xor r12, qword ptr [rbx+32]
	xor r13, qword ptr [rbx+40]
	xor r14, qword ptr [rbx+48]
	xor r15, qword ptr [rbx+56]
	; c6: select line by c6, c7 = squareHash(c6 + c7)
	mov rbx, r14
	and rbx, 4194303
	shl rbx, 6
	add rbx, rdi
	prefetchnta byte ptr [rbx]
	lea rcx, [r14+r15]
	call squareHash
	mov r15, rax
	xor r8, qword ptr [rbx+0]
	xor r9, qword ptr [rbx+8]
	xor r10, qword ptr [rbx+16]
	xor r11, qword ptr [rbx+24]
	xor r12, qword ptr [rbx+32]
	xor r13, qword ptr [rbx+40]
	xor r14, qword ptr [rbx+48]
	xor r15, qword ptr [rbx+56]
	; c7: select line by c7, c0 = squareHash(c7 + c0) (wraps around to c0)
	mov rbx, r15
	and rbx, 4194303
	shl rbx, 6
	add rbx, rdi
	prefetchnta byte ptr [rbx]
	lea rcx, [r15+r8]
	call squareHash
	mov r8, rax
	xor r8, qword ptr [rbx+0]
	xor r9, qword ptr [rbx+8]
	xor r10, qword ptr [rbx+16]
	xor r11, qword ptr [rbx+24]
	xor r12, qword ptr [rbx+32]
	xor r13, qword ptr [rbx+40]
	xor r14, qword ptr [rbx+48]
	xor r15, qword ptr [rbx+56]
	sub rsi, 1
	jnz initBlock_loop
initBlock_store:
	; write c0..c7 to out
	mov qword ptr [rbp+0], r8
	mov qword ptr [rbp+8], r9
	mov qword ptr [rbp+16], r10
	mov qword ptr [rbp+24], r11
	mov qword ptr [rbp+32], r12
	mov qword ptr [rbp+40], r13
	mov qword ptr [rbp+48], r14
	mov qword ptr [rbp+56], r15
	; restore callee-saved registers in reverse push order
	pop r15
	pop r14
	pop r13
	pop r12
	pop rdi
	pop rsi
	pop rbp
	pop rbx
	ret
initBlock ENDP
|
||||||
|
|
||||||
ENDIF
|
ENDIF
|
||||||
|
|
||||||
END
|
END
|
Loading…
Reference in a new issue