diff --git a/makefile b/makefile index 8aff8c5..3ffacec 100644 --- a/makefile +++ b/makefile @@ -9,10 +9,9 @@ endif BINDIR=bin SRCDIR=src OBJDIR=obj -LDFLAGS= +LDFLAGS=-lpthread TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) -ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o) -SRC1=$(addprefix $(SRCDIR)/,TestAluFpu.cpp instructions.hpp Pcg32.hpp) +ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o) all: release test @@ -52,7 +51,7 @@ $(OBJDIR)/blake2b.o: $(addprefix $(SRCDIR)/blake2/,blake2b.c blake2.h blake2-imp $(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachine.cpp CompiledVirtualMachine.hpp Pcg32.hpp common.hpp instructions.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/CompiledVirtualMachine.cpp -o $@ -$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp argon2_core.h) | $(OBJDIR) +$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/dataset.cpp -o $@ $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp) | $(OBJDIR) @@ -72,6 +71,9 @@ $(OBJDIR)/main.o: $(addprefix $(SRCDIR)/,main.cpp InterpretedVirtualMachine.hpp $(OBJDIR)/Program.o: $(addprefix $(SRCDIR)/,Program.cpp Program.hpp Pcg32.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/Program.cpp -o $@ + +$(OBJDIR)/Cache.o: $(addprefix $(SRCDIR)/,Cache.cpp Cache.hpp Pcg32.hpp argon2_core.h) | $(OBJDIR) + $(CXX) $(CXXFLAGS) -c $(SRCDIR)/Cache.cpp -o $@ $(OBJDIR)/softAes.o: $(addprefix $(SRCDIR)/,softAes.cpp softAes.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/softAes.cpp -o $@ diff --git a/src/Cache.cpp b/src/Cache.cpp new file mode 100644 index 0000000..171fa58 --- /dev/null +++ b/src/Cache.cpp @@ -0,0 +1,147 @@ +/* +Copyright (c) 2018 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. +*/ + +#include +#include "Cache.hpp" +#include "softAes.h" +#include "argon2.h" +#include "Pcg32.hpp" +#include "argon2_core.h" + +namespace RandomX { + + static_assert(ArgonMemorySize % (ArgonLanes * ARGON2_SYNC_POINTS) == 0, "ArgonMemorySize - invalid value"); + + // This will shift and xor tmp1 into itself as 4 32-bit vals such as + // sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1) + static inline __m128i sl_xor(__m128i tmp1) { + __m128i tmp4; + tmp4 = _mm_slli_si128(tmp1, 0x04); + tmp1 = _mm_xor_si128(tmp1, tmp4); + tmp4 = _mm_slli_si128(tmp4, 0x04); + tmp1 = _mm_xor_si128(tmp1, tmp4); + tmp4 = _mm_slli_si128(tmp4, 0x04); + tmp1 = _mm_xor_si128(tmp1, tmp4); + return tmp1; + } + + template + static inline void aesGenKeys(__m128i* xout0, __m128i* xout2) { + __m128i xout1 = soft ? soft_aeskeygenassist(*xout2, rcon) : _mm_aeskeygenassist_si128(*xout2, rcon); + xout1 = _mm_shuffle_epi32(xout1, 0xFF); + *xout0 = sl_xor(*xout0); + *xout0 = _mm_xor_si128(*xout0, xout1); + xout1 = soft ? soft_aeskeygenassist(*xout0, 0x00) : _mm_aeskeygenassist_si128(*xout0, 0x00); + xout1 = _mm_shuffle_epi32(xout1, 0xAA); + *xout2 = sl_xor(*xout2); + *xout2 = _mm_xor_si128(*xout2, xout1); + } + + template + static inline void expandAesKeys(const __m128i* seed, __m128i* keys) { + __m128i xout0, xout2; + xout0 = _mm_load_si128(seed); + xout2 = _mm_load_si128(seed + 1); + *keys++ = xout0; + *keys++ = xout2; + aesGenKeys<0x01, soft>(&xout0, &xout2); + *keys++ = xout0; + *keys++ = xout2; + aesGenKeys<0x02, soft>(&xout0, &xout2); + *keys++ = xout0; + *keys++ = xout2; + aesGenKeys<0x04, soft>(&xout0, &xout2); + *keys++ = xout0; + *keys++ = xout2; + aesGenKeys<0x08, soft>(&xout0, &xout2); + *keys++ = xout0; + *keys++ = xout2; + } + + void Cache::argonFill(const void* seed, size_t seedSize) { + uint32_t memory_blocks, segment_length; + argon2_instance_t instance; + argon2_context context; + + context.out = nullptr; + context.outlen = 0; + context.pwd = CONST_CAST(uint8_t *)seed; + context.pwdlen = (uint32_t)seedSize; + context.salt = CONST_CAST(uint8_t *)ArgonSalt; + context.saltlen = (uint32_t)ArgonSaltSize; + context.secret = NULL; + context.secretlen = 0; + context.ad = NULL; + context.adlen = 0; + context.t_cost = ArgonIterations; + context.m_cost = ArgonMemorySize; + context.lanes = ArgonLanes; + context.threads = 1; + context.allocate_cbk = NULL; + context.free_cbk = NULL; + context.flags = ARGON2_DEFAULT_FLAGS; + context.version = ARGON2_VERSION_NUMBER; + + /* 2. Align memory size */ + /* Minimum memory_blocks = 8L blocks, where L is the number of lanes */ + memory_blocks = context.m_cost; + + segment_length = memory_blocks / (context.lanes * ARGON2_SYNC_POINTS); + + instance.version = context.version; + instance.memory = NULL; + instance.passes = context.t_cost; + instance.memory_blocks = memory_blocks; + instance.segment_length = segment_length; + instance.lane_length = segment_length * ARGON2_SYNC_POINTS; + instance.lanes = context.lanes; + instance.threads = context.threads; + instance.type = Argon2_d; + instance.memory = (block*)memory; + + if (instance.threads > instance.lanes) { + instance.threads = instance.lanes; + } + + /* 3. Initialization: Hashing inputs, allocating memory, filling first + * blocks + */ + argon_initialize(&instance, &context); + + fill_memory_blocks(&instance); + } + + template + void Cache::initialize(const void* seed, size_t seedSize) { + //Argon2d memory fill + argonFill(seed, seedSize); + + //Circular shift of the cache buffer by 512 bytes + //realized by copying the first 512 bytes to the back + //of the buffer and shifting the start by 512 bytes + memcpy(memory + CacheSize, memory, CacheShift); + + //AES keys + expandAesKeys((__m128i*)seed, keys.data()); + } + + template void Cache::initialize(const void*, size_t); + + template void Cache::initialize(const void*, size_t); +} \ No newline at end of file diff --git a/src/Cache.hpp b/src/Cache.hpp new file mode 100644 index 0000000..7a34ee8 --- /dev/null +++ b/src/Cache.hpp @@ -0,0 +1,57 @@ +/* +Copyright (c) 2018 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. +*/ + +#pragma once + +#include +#include +#include "common.hpp" +#include "dataset.hpp" + +namespace RandomX { + + class Cache { + public: + void* operator new(size_t size) { + void* ptr = _mm_malloc(size, sizeof(__m128i)); + if (ptr == nullptr) + throw std::bad_alloc(); + return ptr; + } + + void operator delete(void* ptr) { + _mm_free(ptr); + } + + template + void initialize(const void* seed, size_t seedSize); + + const KeysContainer& getKeys() const { + return keys; + } + + const uint8_t* getCache() { + return memory + CacheShift; + } + private: + alignas(16) KeysContainer keys; + uint8_t memory[CacheSize + CacheShift]; + void argonFill(const void* seed, size_t seedSize); + }; +} \ No newline at end of file diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index fdb1498..35e4ba5 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -31,11 +31,11 @@ namespace RandomX { #endif } - void CompiledVirtualMachine::initializeDataset(const void* seed, bool lightClient) { + void CompiledVirtualMachine::setDataset(dataset_t ds, bool lightClient) { if (lightClient) { throw std::runtime_error("Compiled VM does not support light-client mode"); } - VirtualMachine::initializeDataset(seed, lightClient); + VirtualMachine::setDataset(ds, lightClient); } void CompiledVirtualMachine::initializeProgram(const void* seed) { diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp index b5b1d63..c2e108d 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -27,7 +27,7 @@ namespace RandomX { class CompiledVirtualMachine : public VirtualMachine { public: CompiledVirtualMachine(bool softAes); - void initializeDataset(const void* seed, bool light = false) override; + void setDataset(dataset_t ds, bool light = false) override; void initializeProgram(const void* seed) override; virtual void execute() override; void* getProgram() { diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp index 4ea7ac0..21c52ac 100644 --- a/src/VirtualMachine.cpp +++ b/src/VirtualMachine.cpp @@ -20,58 +20,65 @@ along with RandomX. If not, see. #include "VirtualMachine.hpp" #include "common.hpp" #include "dataset.hpp" +#include "Cache.hpp" #include "t1ha/t1ha.h" #include "blake2/blake2.h" #include namespace RandomX { VirtualMachine::VirtualMachine(bool softAes) : softAes(softAes), lightClient(false) { - mem.dataset = nullptr; + mem.ds.dataset = nullptr; } - void VirtualMachine::initializeDataset(const void* seed, bool light) { + VirtualMachine::~VirtualMachine() { if (lightClient) { - _mm_free(mem.lcm->cache); - _mm_free(mem.lcm->block); + delete mem.ds.lightDataset->block; + delete mem.ds.lightDataset; + } + } + + void VirtualMachine::setDataset(dataset_t ds, bool light) { + if (mem.ds.dataset != nullptr) { + throw std::runtime_error("Dataset is already initialized"); } - _mm_free(mem.dataset); lightClient = light; if (light) { + auto lds = mem.ds.lightDataset = new LightClientDataset(); + lds->cache = ds.cache; + lds->block = (uint8_t*)_mm_malloc(DatasetBlockSize, sizeof(__m128i)); + lds->blockNumber = -1; + if (lds->block == nullptr) { + throw std::bad_alloc(); + } if (softAes) { - datasetInitLight(seed, mem.lcm); readDataset = &datasetReadLight; } else { - datasetInitLight(seed, mem.lcm); readDataset = &datasetReadLight; } } else { + mem.ds = ds; readDataset = &datasetRead; - if (softAes) { - datasetInit(seed, mem.dataset); - } - else { - datasetInit(seed, mem.dataset); - } } } void VirtualMachine::initializeScratchpad(uint32_t index) { if (lightClient) { + auto cache = mem.ds.lightDataset->cache; if (softAes) { for (int i = 0; i < ScratchpadSize / DatasetBlockSize; ++i) { - initBlock(mem.lcm->cache + CacheShift, ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, mem.lcm->keys); + initBlock(cache->getCache(), ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, cache->getKeys()); } } else { for (int i = 0; i < ScratchpadSize / DatasetBlockSize; ++i) { - initBlock(mem.lcm->cache + CacheShift, ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, mem.lcm->keys); + initBlock(cache->getCache(), ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, cache->getKeys()); } } } else { - memcpy(scratchpad, mem.dataset + ScratchpadSize * index, ScratchpadSize); + memcpy(scratchpad, mem.ds.dataset + ScratchpadSize * index, ScratchpadSize); } } diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp index 5c83fa5..569718c 100644 --- a/src/VirtualMachine.hpp +++ b/src/VirtualMachine.hpp @@ -26,30 +26,12 @@ namespace RandomX { class VirtualMachine { public: VirtualMachine(bool softAes); - virtual ~VirtualMachine() {} - virtual void initializeDataset(const void* seed, bool light = false); + virtual ~VirtualMachine(); + virtual void setDataset(dataset_t ds, bool light = false); void initializeScratchpad(uint32_t index); virtual void initializeProgram(const void* seed) = 0; virtual void execute() = 0; void getResult(void*); - const RegisterFile& getRegisterFile() const { - return reg; - } - const convertible_t* getScratchpad() const { - return scratchpad; - } - const void* getCache() { - if (lightClient) { - return mem.lcm->cache; - } - return nullptr; - } - const __m128i* getKeys() { - if (lightClient) { - return mem.lcm->keys; - } - return nullptr; - } protected: bool softAes, lightClient; RegisterFile reg; diff --git a/src/argon2_core.c b/src/argon2_core.c index f90a0d7..cf07be5 100644 --- a/src/argon2_core.c +++ b/src/argon2_core.c @@ -473,7 +473,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type type) blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH); } -int initialize(argon2_instance_t *instance, argon2_context *context) { +int argon_initialize(argon2_instance_t *instance, argon2_context *context) { uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; int result = ARGON2_OK; diff --git a/src/argon2_core.h b/src/argon2_core.h index 6886fac..69a6339 100644 --- a/src/argon2_core.h +++ b/src/argon2_core.h @@ -204,7 +204,7 @@ void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance); * @return Zero if successful, -1 if memory failed to allocate. @context->state * will be modified if successful. */ -int initialize(argon2_instance_t *instance, argon2_context *context); +int argon_initialize(argon2_instance_t *instance, argon2_context *context); /* * XORing the last block of each lane, hashing it, making the tag. Deallocates diff --git a/src/common.hpp b/src/common.hpp index 1343e62..04333a5 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -20,8 +20,6 @@ along with RandomX. If not, see. #pragma once #include -#include -#include "intrinPortable.h" namespace RandomX { @@ -55,13 +53,13 @@ namespace RandomX { constexpr bool trace = false; #endif - typedef union { + union convertible_t { double f64; int64_t i64; uint64_t u64; int32_t i32; uint32_t u32; - } convertible_t; + }; constexpr int ProgramLength = 512; constexpr int InstructionCount = 1024 * 1024; @@ -71,34 +69,27 @@ namespace RandomX { constexpr uint32_t ScratchpadL2 = ScratchpadSize / sizeof(convertible_t); constexpr int RegistersCount = 8; + class Cache; + inline int wrapInstr(int i) { return i % RandomX::ProgramLength; } - struct LightClientMemory { - uint8_t* cache; + struct LightClientDataset { + Cache* cache; uint8_t* block; uint32_t blockNumber; - alignas(16) __m128i keys[10]; + }; - void* operator new(size_t size) { - void* ptr = _mm_malloc(size, sizeof(__m128i)); - if (ptr == nullptr) - throw std::bad_alloc(); - return ptr; - } - - void operator delete(void* ptr) { - _mm_free(ptr); - } + union dataset_t { + uint8_t* dataset; + Cache* cache; + LightClientDataset* lightDataset; }; struct MemoryRegisters { addr_t ma, mx; - union { - uint8_t* dataset; - LightClientMemory* lcm; - }; + dataset_t ds; }; static_assert(sizeof(MemoryRegisters) == 2 * sizeof(addr_t) + sizeof(uintptr_t), "Invalid alignment of struct RandomX::MemoryRegisters"); diff --git a/src/dataset.cpp b/src/dataset.cpp index 0738b4f..a265fdf 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -19,135 +19,25 @@ along with RandomX. If not, see. // Parts of this file are originally copyright (c) xmr-stak -#include "common.hpp" -#include "dataset.hpp" -#include "Pcg32.hpp" -#include "argon2_core.h" #include #include #include #include -#if defined(_MSC_VER) -#if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2) -#define __SSE2__ 1 -#endif -#endif +#include "common.hpp" +#include "dataset.hpp" +#include "Pcg32.hpp" +#include "Cache.hpp" #if defined(__SSE2__) #include -#define PREFETCH(memory) _mm_prefetch((const char *)((memory).dataset + (memory).ma), _MM_HINT_T0) +#define PREFETCH(memory) _mm_prefetch((const char *)((memory).ds.dataset + (memory).ma), _MM_HINT_T0) #else #define PREFETCH(memory) #endif namespace RandomX { - void initializeCache(const void* input, size_t inputLength, void* memory) { - uint32_t memory_blocks, segment_length; - argon2_instance_t instance; - argon2_context context; - - context.out = nullptr; - context.outlen = 0; - context.pwd = CONST_CAST(uint8_t *)input; - context.pwdlen = (uint32_t)inputLength; - context.salt = CONST_CAST(uint8_t *)ArgonSalt; - context.saltlen = (uint32_t)ArgonSaltSize; - context.secret = NULL; - context.secretlen = 0; - context.ad = NULL; - context.adlen = 0; - context.t_cost = ArgonIterations; - context.m_cost = ArgonMemorySize; - context.lanes = ArgonLanes; - context.threads = 1; - context.allocate_cbk = NULL; - context.free_cbk = NULL; - context.flags = ARGON2_DEFAULT_FLAGS; - context.version = ARGON2_VERSION_NUMBER; - - /* 2. Align memory size */ - /* Minimum memory_blocks = 8L blocks, where L is the number of lanes */ - memory_blocks = context.m_cost; - - segment_length = memory_blocks / (context.lanes * ARGON2_SYNC_POINTS); - - instance.version = context.version; - instance.memory = NULL; - instance.passes = context.t_cost; - instance.memory_blocks = memory_blocks; - instance.segment_length = segment_length; - instance.lane_length = segment_length * ARGON2_SYNC_POINTS; - instance.lanes = context.lanes; - instance.threads = context.threads; - instance.type = Argon2_d; - instance.memory = (block*)memory; - - if (instance.threads > instance.lanes) { - instance.threads = instance.lanes; - } - - /* 3. Initialization: Hashing inputs, allocating memory, filling first - * blocks - */ - initialize(&instance, &context); - - fill_memory_blocks(&instance); - } - - // This will shift and xor tmp1 into itself as 4 32-bit vals such as - // sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1) - static inline __m128i sl_xor(__m128i tmp1) { - __m128i tmp4; - tmp4 = _mm_slli_si128(tmp1, 0x04); - tmp1 = _mm_xor_si128(tmp1, tmp4); - tmp4 = _mm_slli_si128(tmp4, 0x04); - tmp1 = _mm_xor_si128(tmp1, tmp4); - tmp4 = _mm_slli_si128(tmp4, 0x04); - tmp1 = _mm_xor_si128(tmp1, tmp4); - return tmp1; - } - - template - static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2) { - __m128i xout1 = soft ? soft_aeskeygenassist(*xout2, rcon) : _mm_aeskeygenassist_si128(*xout2, rcon); - xout1 = _mm_shuffle_epi32(xout1, 0xFF); - *xout0 = sl_xor(*xout0); - *xout0 = _mm_xor_si128(*xout0, xout1); - xout1 = soft ? soft_aeskeygenassist(*xout0, 0x00) : _mm_aeskeygenassist_si128(*xout0, 0x00); - xout1 = _mm_shuffle_epi32(xout1, 0xAA); - *xout2 = sl_xor(*xout2); - *xout2 = _mm_xor_si128(*xout2, xout1); - } - - template - void expandAesKeys(const __m128i* seed, __m128i* keys) { - __m128i xout0, xout2; - xout0 = _mm_load_si128(seed); - xout2 = _mm_load_si128(seed + 1); - *keys++ = xout0; - *keys++ = xout2; - aes_genkey_sub<0x01, soft>(&xout0, &xout2); - *keys++ = xout0; - *keys++ = xout2; - aes_genkey_sub<0x02, soft>(&xout0, &xout2); - *keys++ = xout0; - *keys++ = xout2; - aes_genkey_sub<0x04, soft>(&xout0, &xout2); - *keys++ = xout0; - *keys++ = xout2; - aes_genkey_sub<0x08, soft>(&xout0, &xout2); - *keys++ = xout0; - *keys++ = xout2; - } - - template - void expandAesKeys(const __m128i* seed, __m128i* keys); - - template - void expandAesKeys(const __m128i* seed, __m128i* keys); - template static inline void shuffle(T* buffer, size_t bytes, Pcg32& gen) { auto count = bytes / sizeof(T); @@ -157,8 +47,18 @@ namespace RandomX { } } + template + static inline __m128i aesenc(__m128i in, __m128i key) { + return soft ? soft_aesenc(in, key) : _mm_aesenc_si128(in, key); + } + + template + static inline __m128i aesdec(__m128i in, __m128i key) { + return soft ? soft_aesdec(in, key) : _mm_aesdec_si128(in, key); + } + template - void initBlock(uint8_t* in, uint8_t* out, uint32_t blockNumber, const __m128i keys[10]) { + void initBlock(const uint8_t* in, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) { __m128i xin, xout; //Initialization vector = block number extended to 128 bits xout = _mm_cvtsi32_si128(blockNumber); @@ -200,20 +100,20 @@ namespace RandomX { } template - void initBlock(uint8_t* in, uint8_t* out, uint32_t blockNumber, const __m128i keys[10]); + void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); template - void initBlock(uint8_t* in, uint8_t* out, uint32_t blockNumber, const __m128i keys[10]); + void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); template - void initBlock(uint8_t* in, uint8_t* out, uint32_t blockNumber, const __m128i keys[10]); + void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); template - void initBlock(uint8_t* in, uint8_t* out, uint32_t blockNumber, const __m128i keys[10]); + void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); convertible_t datasetRead(addr_t addr, MemoryRegisters& memory) { convertible_t data; - data.u64 = *(uint64_t*)(memory.dataset + memory.ma); + data.u64 = *(uint64_t*)(memory.ds.dataset + memory.ma); memory.ma += 8; memory.mx ^= addr; if ((memory.mx & 0xFFF8) == 0) { @@ -224,24 +124,25 @@ namespace RandomX { } template - void initBlock(uint8_t* cache, uint8_t* block, uint32_t blockNumber, const __m128i k[10]) { + void initBlock(const uint8_t* cache, uint8_t* block, uint32_t blockNumber, const KeysContainer& keys) { if (blockNumber % 2 == 1) { - initBlock(cache + blockNumber * CacheBlockSize, block, blockNumber, k); + initBlock(cache + blockNumber * CacheBlockSize, block, blockNumber, keys); } else { - initBlock(cache + blockNumber * CacheBlockSize, block, blockNumber, k); + initBlock(cache + blockNumber * CacheBlockSize, block, blockNumber, keys); } } template convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory) { convertible_t data; + LightClientDataset* lds = memory.ds.lightDataset; auto blockNumber = memory.ma / DatasetBlockSize; - if (memory.lcm->blockNumber != blockNumber) { - initBlock(memory.lcm->cache + CacheShift, (uint8_t*)memory.lcm->block, blockNumber, memory.lcm->keys); - memory.lcm->blockNumber = blockNumber; + if (lds->blockNumber != blockNumber) { + initBlock(lds->cache->getCache(), (uint8_t*)lds->block, blockNumber, lds->cache->getKeys()); + lds->blockNumber = blockNumber; } - data.u64 = *(uint64_t*)(memory.lcm->block + (memory.ma % DatasetBlockSize)); + data.u64 = *(uint64_t*)(lds->block + (memory.ma % DatasetBlockSize)); memory.ma += 8; memory.mx ^= addr; if ((memory.mx & 0xFFF8) == 0) { @@ -256,54 +157,37 @@ namespace RandomX { template convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory); - template - void datasetInit(const void* seed, uint8_t*& dataset) { + void datasetAlloc(dataset_t& ds) { if (sizeof(size_t) <= 4) throw std::runtime_error("Platform doesn't support enough memory for the dataset"); - dataset = (uint8_t*)_mm_malloc(DatasetSize, sizeof(__m128i)); - if (dataset == nullptr) { - throw std::runtime_error("Dataset memory allocation failed. >4 GiB of virtual memory is needed."); + ds.dataset = (uint8_t*)_mm_malloc(DatasetSize, /*sizeof(__m128i)*/ 64); + if (ds.dataset == nullptr) { + throw std::runtime_error("Dataset memory allocation failed. >4 GiB of free virtual memory is needed."); } - uint8_t* cache = (uint8_t*)_mm_malloc(CacheSize + CacheShift, sizeof(__m128i)); - if (cache == nullptr) { - throw std::bad_alloc(); - } - initializeCache(seed, SeedSize, cache); - memcpy(cache + CacheSize, cache, CacheShift); - alignas(16) __m128i keys[10]; - expandAesKeys((const __m128i*)seed, keys); - for (uint32_t i = 0; i < DatasetBlockCount; ++i) { - initBlock(cache + CacheShift, dataset + i * DatasetBlockSize, i, keys); - } - _mm_free(cache); } - template - void datasetInit(const void*, uint8_t*&); - - template - void datasetInit(const void*, uint8_t*&); - template - void datasetInitLight(const void* seed, LightClientMemory*& lcm) { - lcm = new LightClientMemory(); - lcm->cache = (uint8_t*)_mm_malloc(CacheSize + CacheShift, sizeof(__m128i)); - if (lcm->cache == nullptr) { - throw std::bad_alloc(); + void datasetInit(Cache* cache, dataset_t ds, uint32_t startBlock, uint32_t blockCount) { + for (uint32_t i = startBlock; i < startBlock + blockCount; ++i) { + initBlock(cache->getCache(), ds.dataset + i * DatasetBlockSize, i, cache->getKeys()); } - initializeCache(seed, SeedSize, lcm->cache); - memcpy(lcm->cache + CacheSize, lcm->cache, CacheShift); - expandAesKeys((__m128i*)seed, lcm->keys); - lcm->block = (uint8_t*)_mm_malloc(DatasetBlockSize, sizeof(__m128i)); - if (lcm->block == nullptr) { - throw std::bad_alloc(); - } - lcm->blockNumber = -1; } template - void datasetInitLight(const void*, LightClientMemory*&); + void datasetInit(Cache*, dataset_t, uint32_t, uint32_t); template - void datasetInitLight(const void*, LightClientMemory*&); + void datasetInit(Cache*, dataset_t, uint32_t, uint32_t); + + template + void datasetInitCache(const void* seed, dataset_t& ds) { + ds.cache = new Cache(); + ds.cache->initialize(seed, SeedSize); + } + + template + void datasetInitCache(const void*, dataset_t&); + + template + void datasetInitCache(const void*, dataset_t&); } diff --git a/src/dataset.hpp b/src/dataset.hpp index 42d63e6..bb29197 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -20,43 +20,30 @@ along with RandomX. If not, see. #pragma once #include +#include #include "intrinPortable.h" -#include "argon2.h" #include "common.hpp" #include "softAes.h" namespace RandomX { - static_assert(ArgonMemorySize % (ArgonLanes * ARGON2_SYNC_POINTS) == 0, "ArgonMemorySize - invalid value"); - - void initializeCache(const void* input, size_t inputLength, void* memory); - - template - void expandAesKeys(const __m128i* seed, __m128i* keys); - - template - inline __m128i aesenc(__m128i in, __m128i key) { - return soft ? soft_aesenc(in, key) : _mm_aesenc_si128(in, key); - } - - template - inline __m128i aesdec(__m128i in, __m128i key) { - return soft ? soft_aesdec(in, key) : _mm_aesdec_si128(in, key); - } + using KeysContainer = std::array<__m128i, 10>; template - void initBlock(uint8_t* in, uint8_t* out, uint32_t blockNumber, const __m128i keys[10]); + void initBlock(const uint8_t* in, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys); template - void initBlock(uint8_t* cache, uint8_t* block, uint32_t blockNumber, const __m128i keys[10]); + void initBlock(const uint8_t* cache, uint8_t* block, uint32_t blockNumber, const KeysContainer& keys); + + void datasetAlloc(dataset_t& ds); template - void datasetInit(const void* seed, uint8_t*& dataset); + void datasetInit(Cache* cache, dataset_t ds, uint32_t startBlock, uint32_t blockCount); convertible_t datasetRead(addr_t addr, MemoryRegisters& memory); template - void datasetInitLight(const void* seed, LightClientMemory*& lcm); + void datasetInitCache(const void* seed, dataset_t& dataset); template convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory); diff --git a/src/main.cpp b/src/main.cpp index f3f8124..b486c1a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -30,6 +30,10 @@ along with RandomX. If not, see. #include "Program.hpp" #include #include "instructions.hpp" +#include +#include +#include "dataset.hpp" +#include "Cache.hpp" const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; @@ -45,7 +49,6 @@ void outputHex(std::ostream& os, const char* data, int length) { os << hexmap[(data[i] & 0xF0) >> 4]; os << hexmap[data[i] & 0x0F]; } - os << std::endl; } void readOption(const char* option, int argc, char** argv, bool& out) { @@ -58,6 +61,15 @@ void readOption(const char* option, int argc, char** argv, bool& out) { out = false; } +void readIntOption(const char* option, int argc, char** argv, int& out, int defaultValue) { + for (int i = 0; i < argc - 1; ++i) { + if (strcmp(argv[i], option) == 0 && (out = atoi(argv[i + 1])) > 0) { + return; + } + } + out = defaultValue; +} + void readInt(int argc, char** argv, int& out, int defaultValue) { for (int i = 0; i < argc; ++i) { if (*argv[i] != '-' && (out = atoi(argv[i])) > 0) { @@ -75,81 +87,144 @@ std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) { return os; } -int main(int argc, char** argv) { - bool softAes, lightClient, genAsm, compiled; - int programCount; - readOption("--softAes", argc, argv, softAes); - readOption("--lightClient", argc, argv, lightClient); - readOption("--genAsm", argc, argv, genAsm); - readOption("--compiled", argc, argv, compiled); - readInt(argc, argv, programCount, 1000); +class AtomicHash { +public: + AtomicHash() { + for (int i = 0; i < 4; ++i) + hash[i].store(0); + } + void xorWith(uint64_t update[4]) { + for (int i = 0; i < 4; ++i) + hash[i].fetch_xor(update[i]); + } + void print(std::ostream& os) { + for (int i = 0; i < 4; ++i) + print(hash[i], os); + os << std::endl; + } +private: + void print(std::atomic& hash, std::ostream& os) { + auto h = hash.load(); + outputHex(std::cout, (char*)&h, sizeof(h)); + } + std::atomic hash[4]; +}; +void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash& result, int noncesCount, int thread) { + uint64_t hash[4]; unsigned char blockTemplate[] = { 0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14, 0x5a, 0xc5, 0xfa, 0xd3, 0xaa, 0x3a, 0xf6, 0xea, 0x44, 0xc1, 0x18, 0x69, 0xdc, 0x4f, 0x85, 0x3f, 0x00, 0x2b, 0x2e, 0xea, 0x00, 0x00, 0x00, 0x00, 0x77, 0xb2, 0x06, 0xa0, 0x2c, 0xa5, 0xb1, 0xd4, 0xce, 0x6b, 0xbf, 0xdf, 0x0a, 0xca, 0xc3, 0x8b, 0xde, 0xd3, 0x4d, 0x2d, 0xcd, 0xee, 0xf9, 0x5c, 0xd2, 0x0c, 0xef, 0xc1, 0x2f, 0x61, 0xd5, 0x61, 0x09 }; - int* nonce = (int*)(blockTemplate + 39); - uint8_t hash[RandomX::ResultSize]; + int* noncePtr = (int*)(blockTemplate + 39); + int nonce = atomicNonce.fetch_add(1); - if (genAsm) { - *nonce = programCount; + while (nonce < noncesCount) { + //std::cout << "Thread " << thread << " nonce " << nonce << std::endl; + *noncePtr = nonce; blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); - RandomX::AssemblyGeneratorX86 asmX86; - asmX86.generateProgram(hash); - asmX86.printCode(std::cout); - return 0; + int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 63) << 8); + vm->initializeScratchpad(spIndex); + vm->initializeProgram(hash); + vm->execute(); + vm->getResult(hash); + result.xorWith(hash); + if (RandomX::trace) { + std::cout << "Nonce: " << nonce << " "; + outputHex(std::cout, (char*)hash, sizeof(hash)); + std::cout << std::endl; + } + nonce = atomicNonce.fetch_add(1); } +} + +int main(int argc, char** argv) { + bool softAes, lightClient, genAsm, compiled; + int programCount, threadCount; + readOption("--softAes", argc, argv, softAes); + readOption("--lightClient", argc, argv, lightClient); + readOption("--genAsm", argc, argv, genAsm); + readOption("--compiled", argc, argv, compiled); + readIntOption("--threads", argc, argv, threadCount, 1); + readIntOption("--nonces", argc, argv, programCount, 1000); + + std::atomic atomicNonce(0); + AtomicHash result; + std::vector vms; + std::vector threads; + RandomX::dataset_t dataset; if (softAes) std::cout << "Using software AES." << std::endl; - - char cumulative[RandomX::ResultSize] = { 0 }; - - RandomX::VirtualMachine* vm; + std::cout << "Initializing..." << std::endl; try { - if (compiled) { - vm = new RandomX::CompiledVirtualMachine(softAes); + Stopwatch sw(true); + if (softAes) { + RandomX::datasetInitCache(seed, dataset); } else { - vm = new RandomX::InterpretedVirtualMachine(softAes); + RandomX::datasetInitCache(seed, dataset); } - std::cout << "Initializing..." << std::endl; - Stopwatch sw(true); - vm->initializeDataset(seed, lightClient); - if(lightClient) + if (RandomX::trace) { + std::cout << "Keys: " << std::endl; + for (int i = 0; i < dataset.cache->getKeys().size(); ++i) { + outputHex(std::cout, (char*)&dataset.cache->getKeys()[i], sizeof(__m128i)); + } + std::cout << std::endl; + std::cout << "Cache: " << std::endl; + outputHex(std::cout, (char*)dataset.cache->getCache(), sizeof(__m128i)); + std::cout << std::endl; + } + if (lightClient) { std::cout << "Cache (64 MiB) initialized in " << sw.getElapsed() << " s" << std::endl; - else + } + else { + RandomX::Cache* cache = dataset.cache; + RandomX::datasetAlloc(dataset); + auto perThread = RandomX::DatasetBlockCount / threadCount; + auto remainder = RandomX::DatasetBlockCount % threadCount; + for (int i = 0; i < threadCount; ++i) { + auto count = perThread + (i == threadCount - 1 ? remainder : 0); + if (softAes) { + threads.push_back(std::thread(&RandomX::datasetInit, cache, dataset, i * perThread, count)); + } + else { + threads.push_back(std::thread(&RandomX::datasetInit, cache, dataset, i * perThread, count)); + } + } + for (int i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + delete cache; + threads.clear(); std::cout << "Dataset (4 GiB) initialized in " << sw.getElapsed() << " s" << std::endl; + } + std::cout << "Initializing " << threadCount << " virtual machine(s)..." << std::endl; + for (int i = 0; i < threadCount; ++i) { + RandomX::VirtualMachine* vm; + if (compiled) { + vm = new RandomX::CompiledVirtualMachine(softAes); + } + else { + vm = new RandomX::InterpretedVirtualMachine(softAes); + } + vm->setDataset(dataset, lightClient); + vms.push_back(vm); + } std::cout << "Running benchmark (" << programCount << " programs) ..." << std::endl; sw.restart(); - for (int i = 0; i < programCount; ++i) { - *nonce = i; - if (RandomX::trace) std::cout << "Nonce: " << i << " "; - blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); - int spIndex = hash[24] | ((hash[25] & 63) << 8); - vm->initializeScratchpad(spIndex); - //dump((const char *)vm.getScratchpad(), RandomX::ScratchpadSize, "scratchpad-before.txt"); - //return 0; - vm->initializeProgram(hash); - vm->execute(); - /*std::string fileName("scratchpad-after-"); - fileName = fileName + std::to_string(i) + ".txt"; - dump((const char *)vm.getScratchpad(), RandomX::ScratchpadSize, fileName.c_str());*/ - vm->getResult(hash); - if (RandomX::trace) { - outputHex(std::cout, (char*)hash, sizeof(hash)); - } - ((uint64_t*)cumulative)[0] ^= ((uint64_t*)hash)[0]; - ((uint64_t*)cumulative)[1] ^= ((uint64_t*)hash)[1]; - ((uint64_t*)cumulative)[2] ^= ((uint64_t*)hash)[2]; - ((uint64_t*)cumulative)[3] ^= ((uint64_t*)hash)[3]; + for (int i = 0; i < vms.size(); ++i) { + threads.push_back(std::thread(&mine, vms[i], std::ref(atomicNonce), std::ref(result), programCount, i)); + } + for (int i = 0; i < threads.size(); ++i) { + threads[i].join(); } double elapsed = sw.getElapsed(); std::cout << "Calculated result: "; - outputHex(std::cout, cumulative, sizeof(cumulative)); + result.print(std::cout); if(programCount == 1000) std::cout << "Reference result: d62ed85c39030cd2c5704fca3a23019f1244f2b03447c9a6b39dea5390ed1d10" << std::endl; std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl;