/* Copyright (c) 2018 tevador This file is part of RandomX. RandomX is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. RandomX is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with RandomX. If not, see. */ #include #include #include #include #include "common.hpp" #include "dataset.hpp" #include "Pcg32.hpp" #include "Cache.hpp" #include "virtualMemory.hpp" #if defined(__SSE2__) #include #define PREFETCH(memory) _mm_prefetch((const char *)((memory).ds.dataset + (memory).ma), _MM_HINT_NTA) #else #define PREFETCH(memory) #endif namespace RandomX { template static inline void shuffle(T* buffer, size_t bytes, Pcg32& gen) { auto count = bytes / sizeof(T); for (auto i = count - 1; i >= 1; --i) { int j = gen.getUniform(0, i); std::swap(buffer[j], buffer[i]); } } template static inline __m128i aesenc(__m128i in, __m128i key) { return soft ? soft_aesenc(in, key) : _mm_aesenc_si128(in, key); } template static inline __m128i aesdec(__m128i in, __m128i key) { return soft ? soft_aesdec(in, key) : _mm_aesdec_si128(in, key); } #define AES_ROUND(i) x0 = aesdec(x0, keys[i]); \ x1 = aesenc(x1, keys[i]); \ x2 = aesdec(x2, keys[i]); \ x3 = aesenc(x3, keys[i]) template void initBlock(const uint8_t* intermediate, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) { __m128i x0, x1, x2, x3; __m128i* xit = (__m128i*)intermediate; __m128i* xout = (__m128i*)out; x0 = _mm_cvtsi32_si128(blockNumber); constexpr int mask = (CacheSize / CacheLineSize) - 1; for (auto i = 0; i < DatasetIterations; ++i) { x0 = aesenc(x0, keys[0]); x0 = aesenc(x0, keys[1]); x1 = aesenc(x0, keys[2]); x1 = aesenc(x1, keys[3]); x2 = aesenc(x1, keys[4]); x2 = aesenc(x2, keys[5]); x3 = aesenc(x2, keys[6]); x3 = aesenc(x3, keys[7]); int index = _mm_cvtsi128_si32(x3); index &= mask; __m128i t0 = _mm_load_si128(xit + 4 * index + 0); __m128i t1 = _mm_load_si128(xit + 4 * index + 1); __m128i t2 = _mm_load_si128(xit + 4 * index + 2); __m128i t3 = _mm_load_si128(xit + 4 * index + 3); x0 = _mm_xor_si128(x0, t0); x1 = _mm_xor_si128(x1, t1); x2 = _mm_xor_si128(x2, t2); x3 = _mm_xor_si128(x3, t3); } _mm_store_si128(xout + 0, x0); _mm_store_si128(xout + 1, x1); _mm_store_si128(xout + 2, x2); _mm_store_si128(xout + 3, x3); } template void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); template void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { uint64_t* datasetLine = (uint64_t*)(memory.ds.dataset + memory.ma); memory.mx ^= addr; memory.mx &= -64; //align to cache line std::swap(memory.mx, memory.ma); PREFETCH(memory); for (int i = 0; i < RegistersCount; ++i) reg.r[i].u64 ^= datasetLine[i]; } template void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { Cache* cache = memory.ds.cache; uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; initBlock(cache->getCache(), (uint8_t*)datasetLine, memory.ma / CacheLineSize, cache->getKeys()); for (int i = 0; i < RegistersCount; ++i) reg.r[i].u64 ^= datasetLine[i]; memory.mx ^= addr; memory.mx &= -64; //align to cache line std::swap(memory.mx, memory.ma); } template void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); template void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { ILightClientAsyncWorker* aw = memory.ds.asyncWorker; const uint64_t* datasetLine = aw->getBlock(memory.ma); for (int i = 0; i < RegistersCount; ++i) reg.r[i].u64 ^= datasetLine[i]; memory.mx ^= addr; memory.mx &= -64; //align to cache line std::swap(memory.mx, memory.ma); aw->prepareBlock(memory.ma); } void datasetAlloc(dataset_t& ds, bool largePages) { if (sizeof(size_t) <= 4) throw std::runtime_error("Platform doesn't support enough memory for the dataset"); if (largePages) { ds.dataset = (uint8_t*)allocLargePagesMemory(DatasetSize); } else { ds.dataset = (uint8_t*)_mm_malloc(DatasetSize, 64); if (ds.dataset == nullptr) { throw std::runtime_error("Dataset memory allocation failed. >4 GiB of free virtual memory is needed."); } } } template void datasetInit(Cache* cache, dataset_t ds, uint32_t startBlock, uint32_t blockCount) { for (uint32_t i = startBlock; i < startBlock + blockCount; ++i) { initBlock(cache->getCache(), ds.dataset + i * CacheLineSize, i, cache->getKeys()); } } template void datasetInit(Cache*, dataset_t, uint32_t, uint32_t); template void datasetInit(Cache*, dataset_t, uint32_t, uint32_t); template void datasetInitCache(const void* seed, dataset_t& ds, bool largePages) { ds.cache = new(Cache::alloc(largePages)) Cache(); ds.cache->initialize(seed, SeedSize); } template void datasetInitCache(const void*, dataset_t&, bool); template void datasetInitCache(const void*, dataset_t&, bool); template void aesBench(uint32_t blockCount) { alignas(16) KeysContainer keys; alignas(16) uint8_t buffer[CacheLineSize]; for (uint32_t block = 0; block < blockCount; ++block) { initBlock(buffer, buffer, 0, keys); } } template void aesBench(uint32_t blockCount); template void aesBench(uint32_t blockCount); }