diff --git a/.gitignore b/.gitignore index 35c1e9a..dd437d1 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,6 @@ obj/ *.user *.suo .vs -x64 +x64/ +Release/ +Debug/ \ No newline at end of file diff --git a/makefile b/makefile index 5585b2b..3b39f4b 100644 --- a/makefile +++ b/makefile @@ -3,7 +3,7 @@ AR=gcc-ar PLATFORM=$(shell uname -m) CXXFLAGS=-std=c++11 -CCFLAGS= +CCFLAGS=-std=c99 ARFLAGS=rcs BINDIR=bin SRCDIR=src @@ -80,7 +80,8 @@ $(OBJDIR)/dataset.o: $(SRCDIR)/dataset.cpp $(SRCDIR)/common.hpp $(SRCDIR)/blake2 $(SRCDIR)/configuration.h $(SRCDIR)/randomx.h $(SRCDIR)/dataset.hpp \ $(SRCDIR)/superscalar_program.hpp $(SRCDIR)/instruction.hpp $(SRCDIR)/jit_compiler_x86.hpp \ $(SRCDIR)/allocator.hpp $(SRCDIR)/virtual_memory.hpp $(SRCDIR)/superscalar.hpp \ - $(SRCDIR)/blake2_generator.hpp $(SRCDIR)/reciprocal.h $(SRCDIR)/argon2.h $(SRCDIR)/argon2_core.h + $(SRCDIR)/blake2_generator.hpp $(SRCDIR)/reciprocal.h $(SRCDIR)/argon2.h $(SRCDIR)/argon2_core.h \ + $(SRCDIR)/intrin_portable.h $(OBJDIR)/jit_compiler_x86.o: $(SRCDIR)/jit_compiler_x86.cpp $(SRCDIR)/jit_compiler_x86.hpp \ $(SRCDIR)/common.hpp $(SRCDIR)/blake2/endian.h $(SRCDIR)/configuration.h $(SRCDIR)/randomx.h \ $(SRCDIR)/jit_compiler_x86_static.hpp $(SRCDIR)/superscalar.hpp \ @@ -90,7 +91,6 @@ $(OBJDIR)/jit_compiler_x86.o: $(SRCDIR)/jit_compiler_x86.cpp $(SRCDIR)/jit_compi $(OBJDIR)/jit_compiler_x86_static.o: $(SRCDIR)/jit_compiler_x86_static.S \ $(SRCDIR)/asm/program_prologue_linux.inc $(SRCDIR)/asm/program_xmm_constants.inc \ $(SRCDIR)/asm/program_loop_load.inc $(SRCDIR)/asm/program_read_dataset.inc \ - $(SRCDIR)/asm/program_read_dataset_light.inc \ $(SRCDIR)/asm/program_read_dataset_sshash_init.inc \ $(SRCDIR)/asm/program_read_dataset_sshash_fin.inc \ $(SRCDIR)/asm/program_loop_store.inc $(SRCDIR)/asm/program_epilogue_linux.inc \ diff --git a/src/asm/program_read_dataset_light.inc b/src/asm/program_read_dataset_light.inc deleted file mode 100644 index 65d2b8d..0000000 --- 
a/src/asm/program_read_dataset_light.inc +++ /dev/null @@ -1,5 +0,0 @@ - xor rbp, rax ;# modify "mx" - ror rbp, 32 ;# swap "ma" and "mx" - mov ecx, ebp ;# ecx = ma - and ecx, 2147483584 ;# align "ma" to the start of a cache line - shr ecx, 6 ;# ecx = Dataset block number diff --git a/src/assembly_generator_x86.cpp b/src/assembly_generator_x86.cpp index b73f3a8..165d016 100644 --- a/src/assembly_generator_x86.cpp +++ b/src/assembly_generator_x86.cpp @@ -27,12 +27,12 @@ along with RandomX. If not, see. namespace randomx { - static const char* regR[8] = { "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" }; - static const char* regR32[8] = { "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d" }; - static const char* regFE[8] = { "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" }; - static const char* regF[4] = { "xmm0", "xmm1", "xmm2", "xmm3" }; - static const char* regE[4] = { "xmm4", "xmm5", "xmm6", "xmm7" }; - static const char* regA[4] = { "xmm8", "xmm9", "xmm10", "xmm11" }; + static const char* regR[] = { "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" }; + static const char* regR32[] = { "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d" }; + static const char* regFE[] = { "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" }; + static const char* regF[] = { "xmm0", "xmm1", "xmm2", "xmm3" }; + static const char* regE[] = { "xmm4", "xmm5", "xmm6", "xmm7" }; + static const char* regA[] = { "xmm8", "xmm9", "xmm10", "xmm11" }; static const char* tempRegx = "xmm12"; static const char* mantissaMask = "xmm13"; @@ -49,7 +49,9 @@ namespace randomx { } asmCode.str(std::string()); //clear for (unsigned i = 0; i < prog.getSize(); ++i) { +#if RANDOMX_JUMP asmCode << "randomx_isn_" << i << ":" << std::endl; +#endif Instruction& instr = prog(i); instr.src %= RegistersCount; instr.dst %= RegistersCount; @@ -469,14 +471,14 @@ namespace randomx { } void AssemblyGeneratorX86::h_FADD_R(Instruction& instr, int i) { - instr.dst %= 
4; - instr.src %= 4; + instr.dst %= RegisterCountFlt; + instr.src %= RegisterCountFlt; asmCode << "\taddpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl; traceflt(instr); } void AssemblyGeneratorX86::h_FADD_M(Instruction& instr, int i) { - instr.dst %= 4; + instr.dst %= RegisterCountFlt; genAddressReg(instr); asmCode << "\tcvtdq2pd " << tempRegx << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl; asmCode << "\taddpd " << regF[instr.dst] << ", " << tempRegx << std::endl; @@ -484,14 +486,14 @@ namespace randomx { } void AssemblyGeneratorX86::h_FSUB_R(Instruction& instr, int i) { - instr.dst %= 4; - instr.src %= 4; + instr.dst %= RegisterCountFlt; + instr.src %= RegisterCountFlt; asmCode << "\tsubpd " << regF[instr.dst] << ", " << regA[instr.src] << std::endl; traceflt(instr); } void AssemblyGeneratorX86::h_FSUB_M(Instruction& instr, int i) { - instr.dst %= 4; + instr.dst %= RegisterCountFlt; genAddressReg(instr); asmCode << "\tcvtdq2pd " << tempRegx << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl; asmCode << "\tsubpd " << regF[instr.dst] << ", " << tempRegx << std::endl; @@ -499,20 +501,20 @@ namespace randomx { } void AssemblyGeneratorX86::h_FSCAL_R(Instruction& instr, int i) { - instr.dst %= 4; + instr.dst %= RegisterCountFlt; asmCode << "\txorps " << regF[instr.dst] << ", " << scaleMask << std::endl; traceflt(instr); } void AssemblyGeneratorX86::h_FMUL_R(Instruction& instr, int i) { - instr.dst %= 4; - instr.src %= 4; + instr.dst %= RegisterCountFlt; + instr.src %= RegisterCountFlt; asmCode << "\tmulpd " << regE[instr.dst] << ", " << regA[instr.src] << std::endl; traceflt(instr); } void AssemblyGeneratorX86::h_FDIV_M(Instruction& instr, int i) { - instr.dst %= 4; + instr.dst %= RegisterCountFlt; genAddressReg(instr); asmCode << "\tcvtdq2pd " << tempRegx << ", qword ptr [" << regScratchpadAddr << "+rax]" << std::endl; asmCode << "\tandps " << tempRegx << ", " << mantissaMask << std::endl; @@ -522,7 +524,7 @@ namespace 
randomx { } void AssemblyGeneratorX86::h_FSQRT_R(Instruction& instr, int i) { - instr.dst %= 4; + instr.dst %= RegisterCountFlt; asmCode << "\tsqrtpd " << regE[instr.dst] << ", " << regE[instr.dst] << std::endl; traceflt(instr); } @@ -566,7 +568,7 @@ namespace randomx { void AssemblyGeneratorX86::handleCondition(Instruction& instr, int i) { const int shift = instr.getModShift(); - const int conditionMask = ((1 << RANDOMX_CONDITION_BITS) - 1) << shift; + const int conditionMask = ((1 << RANDOMX_JUMP_BITS) - 1) << shift; int reg = getConditionRegister(); int target = registerUsage[reg] + 1; registerUsage[reg] = i; @@ -579,7 +581,9 @@ namespace randomx { } void AssemblyGeneratorX86::h_COND_R(Instruction& instr, int i) { +#if RANDOMX_JUMP handleCondition(instr, i); +#endif asmCode << "\txor ecx, ecx" << std::endl; asmCode << "\tcmp " << regR32[instr.src] << ", " << (int32_t)instr.getImm32() << std::endl; asmCode << "\tset" << condition(instr) << " cl" << std::endl; @@ -602,7 +606,6 @@ namespace randomx { #define INST_HANDLE(x) REPN(&AssemblyGeneratorX86::h_##x, WT(x)) InstructionGenerator AssemblyGeneratorX86::engine[256] = { - //Integer INST_HANDLE(IADD_RS) INST_HANDLE(IADD_M) INST_HANDLE(ISUB_R) @@ -620,27 +623,18 @@ namespace randomx { INST_HANDLE(IROR_R) INST_HANDLE(IROL_R) INST_HANDLE(ISWAP_R) - - //Common floating point INST_HANDLE(FSWAP_R) - - //Floating point group F INST_HANDLE(FADD_R) INST_HANDLE(FADD_M) INST_HANDLE(FSUB_R) INST_HANDLE(FSUB_M) INST_HANDLE(FSCAL_R) - - //Floating point group E INST_HANDLE(FMUL_R) INST_HANDLE(FDIV_M) INST_HANDLE(FSQRT_R) - - //Control INST_HANDLE(COND_R) INST_HANDLE(CFROUND) INST_HANDLE(ISTORE) - INST_HANDLE(NOP) }; } \ No newline at end of file diff --git a/src/assembly_generator_x86.hpp b/src/assembly_generator_x86.hpp index 60ea7ab..1c27364 100644 --- a/src/assembly_generator_x86.hpp +++ b/src/assembly_generator_x86.hpp @@ -19,6 +19,7 @@ along with RandomX. If not, see. 
#pragma once +#include "common.hpp" #include namespace randomx { @@ -48,40 +49,40 @@ namespace randomx { void traceint(Instruction&); void traceflt(Instruction&); void tracenop(Instruction&); - void h_IADD_RS(Instruction&, int); - void h_IADD_M(Instruction&, int); - void h_ISUB_R(Instruction&, int); - void h_ISUB_M(Instruction&, int); - void h_IMUL_R(Instruction&, int); - void h_IMUL_M(Instruction&, int); - void h_IMULH_R(Instruction&, int); - void h_IMULH_M(Instruction&, int); - void h_ISMULH_R(Instruction&, int); - void h_ISMULH_M(Instruction&, int); - void h_IMUL_RCP(Instruction&, int); - void h_ISDIV_C(Instruction&, int); - void h_INEG_R(Instruction&, int); - void h_IXOR_R(Instruction&, int); - void h_IXOR_M(Instruction&, int); - void h_IROR_R(Instruction&, int); - void h_IROL_R(Instruction&, int); - void h_ISWAP_R(Instruction&, int); - void h_FSWAP_R(Instruction&, int); - void h_FADD_R(Instruction&, int); - void h_FADD_M(Instruction&, int); - void h_FSUB_R(Instruction&, int); - void h_FSUB_M(Instruction&, int); - void h_FSCAL_R(Instruction&, int); - void h_FMUL_R(Instruction&, int); - void h_FDIV_M(Instruction&, int); - void h_FSQRT_R(Instruction&, int); - void h_COND_R(Instruction&, int); - void h_CFROUND(Instruction&, int); - void h_ISTORE(Instruction&, int); - void h_NOP(Instruction&, int); + void h_IADD_RS(Instruction&, int); + void h_IADD_M(Instruction&, int); + void h_ISUB_R(Instruction&, int); + void h_ISUB_M(Instruction&, int); + void h_IMUL_R(Instruction&, int); + void h_IMUL_M(Instruction&, int); + void h_IMULH_R(Instruction&, int); + void h_IMULH_M(Instruction&, int); + void h_ISMULH_R(Instruction&, int); + void h_ISMULH_M(Instruction&, int); + void h_IMUL_RCP(Instruction&, int); + void h_ISDIV_C(Instruction&, int); + void h_INEG_R(Instruction&, int); + void h_IXOR_R(Instruction&, int); + void h_IXOR_M(Instruction&, int); + void h_IROR_R(Instruction&, int); + void h_IROL_R(Instruction&, int); + void h_ISWAP_R(Instruction&, int); + void 
h_FSWAP_R(Instruction&, int); + void h_FADD_R(Instruction&, int); + void h_FADD_M(Instruction&, int); + void h_FSUB_R(Instruction&, int); + void h_FSUB_M(Instruction&, int); + void h_FSCAL_R(Instruction&, int); + void h_FMUL_R(Instruction&, int); + void h_FDIV_M(Instruction&, int); + void h_FSQRT_R(Instruction&, int); + void h_COND_R(Instruction&, int); + void h_CFROUND(Instruction&, int); + void h_ISTORE(Instruction&, int); + void h_NOP(Instruction&, int); static InstructionGenerator engine[256]; std::stringstream asmCode; - int registerUsage[8]; + int registerUsage[RegistersCount]; }; } \ No newline at end of file diff --git a/src/common.hpp b/src/common.hpp index f7a6b1a..3c483bf 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -51,8 +51,6 @@ namespace randomx { static_assert(wtSum == 256, "Sum of instruction frequencies must be 256."); - using addr_t = uint32_t; - constexpr int ArgonBlockSize = 1024; constexpr int ArgonSaltSize = sizeof(RANDOMX_ARGON_SALT) - 1; constexpr int CacheLineSize = RANDOMX_DATASET_ITEM_SIZE; @@ -78,6 +76,10 @@ namespace randomx { #endif #endif +#define RANDOMX_JUMP (RANDOMX_JUMP_BITS > 0) + + using addr_t = uint32_t; + using int_reg_t = uint64_t; struct fpu_reg_t { @@ -95,6 +97,7 @@ namespace randomx { constexpr int ScratchpadL3Mask = (ScratchpadL3 - 1) * 8; constexpr int ScratchpadL3Mask64 = (ScratchpadL3 / 8 - 1) * 64; constexpr int RegistersCount = 8; + constexpr int RegisterCountFlt = RegistersCount / 2; constexpr int RegisterNeedsDisplacement = 5; //x86 r13 register constexpr int RegisterNeedsSib = 4; //x86 r12 register @@ -118,5 +121,3 @@ namespace randomx { typedef void(*CacheDeallocFunc)(randomx_cache*); typedef void(*CacheInitializeFunc)(randomx_cache*, const void*, size_t); } - -std::ostream& operator<<(std::ostream& os, const randomx::RegisterFile& rf); diff --git a/src/configuration.h b/src/configuration.h index e25b061..d155e4e 100644 --- a/src/configuration.h +++ b/src/configuration.h @@ -34,7 +34,10 @@ along with 
RandomX. If not, see. //Number of random Cache accesses per Dataset item. Minimum is 2. #define RANDOMX_CACHE_ACCESSES 8 +//Target latency for SuperscalarHash (in cycles of the reference CPU). #define RANDOMX_SUPERSCALAR_LATENCY 170 + +//The maximum size of a SuperscalarHash program (number of instructions). #define RANDOMX_SUPERSCALAR_MAX_SIZE 512 //Dataset base size in bytes. Must be a power of 2. @@ -61,8 +64,8 @@ along with RandomX. If not, see. //Scratchpad L1 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L2. #define RANDOMX_SCRATCHPAD_L1 (16 * 1024) -//How many register bits must be zero for a jump condition to be triggered -#define RANDOMX_CONDITION_BITS 7 +//How many register bits must be zero for a jump condition to be triggered. If set to 0, jumps are disabled. +#define RANDOMX_JUMP_BITS 7 /* Instruction frequencies (per 256 opcodes) diff --git a/src/dataset.cpp b/src/dataset.cpp index 8321797..31c2adb 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -39,6 +39,8 @@ along with RandomX. If not, see. #include "blake2/endian.h" #include "argon2.h" #include "argon2_core.h" +#include "jit_compiler_x86.hpp" +#include "intrin_portable.h" static_assert(RANDOMX_ARGON_MEMORY % (RANDOMX_ARGON_LANES * ARGON2_SYNC_POINTS) == 0, "RANDOMX_ARGON_MEMORY - invalid value"); static_assert(ARGON2_BLOCK_SIZE == randomx::ArgonBlockSize, "Unpexpected value of ARGON2_BLOCK_SIZE"); @@ -146,6 +148,7 @@ namespace randomx { rl[7] = rl[0] ^ superscalarAdd7; for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { mixBlock = getMixBlock(registerValue, cache->memory); + PREFETCHNTA(mixBlock); SuperscalarProgram& prog = cache->programs[i]; executeSuperscalar(rl, prog, &cache->reciprocalCache); diff --git a/src/dataset.hpp b/src/dataset.hpp index 4e072ff..4458017 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -24,7 +24,6 @@ along with RandomX. If not, see. 
#include #include "common.hpp" #include "superscalar_program.hpp" -#include "jit_compiler_x86.hpp" #include "allocator.hpp" /* Global scope for C binding */ @@ -33,6 +32,10 @@ struct randomx_dataset { randomx::DatasetDeallocFunc dealloc; }; +namespace randomx { + class JitCompilerX86; +} + /* Global scope for C binding */ struct randomx_cache { uint8_t* memory = nullptr; diff --git a/src/instruction.cpp b/src/instruction.cpp index 9f1b681..e1dc557 100644 --- a/src/instruction.cpp +++ b/src/instruction.cpp @@ -29,12 +29,12 @@ namespace randomx { } void Instruction::genAddressReg(std::ostream& os) const { - os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)src << std::showpos << (int32_t)getImm32() << std::noshowpos << "]"; + os << (getModMem() ? "L1" : "L2") << "[r" << (int)src << std::showpos << (int32_t)getImm32() << std::noshowpos << "]"; } void Instruction::genAddressRegDst(std::ostream& os) const { if (getModCond()) - os << ((mod % 4) ? "L1" : "L2"); + os << (getModMem() ? "L1" : "L2"); else os << "L3"; os << "[r" << (int)dst << std::showpos << (int32_t)getImm32() << std::noshowpos << "]"; @@ -49,7 +49,7 @@ namespace randomx { if(dst == RegisterNeedsDisplacement) { os << ", " << (int32_t)getImm32(); } - os << ", LSH " << (int)(mod % 4) << std::endl; + os << ", LSH " << (int)getModMem() << std::endl; } void Instruction::h_IADD_M(std::ostream& os) const { @@ -65,7 +65,6 @@ namespace randomx { } } - //1 uOP void Instruction::h_ISUB_R(std::ostream& os) const { if (src != dst) { os << "r" << (int)dst << ", r" << (int)src << std::endl; @@ -197,57 +196,57 @@ namespace randomx { } void Instruction::h_FSWAP_R(std::ostream& os) const { - const char reg = (dst >= 4) ? 'e' : 'f'; - auto dstIndex = dst % 4; + const char reg = (dst >= RegisterCountFlt) ? 
'e' : 'f'; + auto dstIndex = dst % RegisterCountFlt; os << reg << dstIndex << std::endl; } void Instruction::h_FADD_R(std::ostream& os) const { - auto dstIndex = dst % 4; - auto srcIndex = src % 4; + auto dstIndex = dst % RegisterCountFlt; + auto srcIndex = src % RegisterCountFlt; os << "f" << dstIndex << ", a" << srcIndex << std::endl; } void Instruction::h_FADD_M(std::ostream& os) const { - auto dstIndex = dst % 4; + auto dstIndex = dst % RegisterCountFlt; os << "f" << dstIndex << ", "; genAddressReg(os); os << std::endl; } void Instruction::h_FSUB_R(std::ostream& os) const { - auto dstIndex = dst % 4; - auto srcIndex = src % 4; + auto dstIndex = dst % RegisterCountFlt; + auto srcIndex = src % RegisterCountFlt; os << "f" << dstIndex << ", a" << srcIndex << std::endl; } void Instruction::h_FSUB_M(std::ostream& os) const { - auto dstIndex = dst % 4; + auto dstIndex = dst % RegisterCountFlt; os << "f" << dstIndex << ", "; genAddressReg(os); os << std::endl; } void Instruction::h_FSCAL_R(std::ostream& os) const { - auto dstIndex = dst % 4; + auto dstIndex = dst % RegisterCountFlt; os << "f" << dstIndex << std::endl; } void Instruction::h_FMUL_R(std::ostream& os) const { - auto dstIndex = dst % 4; - auto srcIndex = src % 4; + auto dstIndex = dst % RegisterCountFlt; + auto srcIndex = src % RegisterCountFlt; os << "e" << dstIndex << ", a" << srcIndex << std::endl; } void Instruction::h_FDIV_M(std::ostream& os) const { - auto dstIndex = dst % 4; + auto dstIndex = dst % RegisterCountFlt; os << "e" << dstIndex << ", "; genAddressReg(os); os << std::endl; } void Instruction::h_FSQRT_R(std::ostream& os) const { - auto dstIndex = dst % 4; + auto dstIndex = dst % RegisterCountFlt; os << "e" << dstIndex << std::endl; } @@ -280,7 +279,7 @@ namespace randomx { } void Instruction::h_COND_R(std::ostream& os) const { - os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << (int32_t)getImm32() << "), LSH " << (int)(mod >> 5) << std::endl; + os << 
"r" << (int)dst << ", " << condition(getModCond()) << "(r" << (int)src << ", " << (int32_t)getImm32() << "), LSH " << (int)(getModShift()) << std::endl; } void Instruction::h_ISTORE(std::ostream& os) const { @@ -297,7 +296,6 @@ namespace randomx { #define INST_HANDLE(x) REPN(&Instruction::h_##x, WT(x)) const char* Instruction::names[256] = { - //Integer INST_NAME(IADD_RS) INST_NAME(IADD_M) INST_NAME(ISUB_R) @@ -314,33 +312,22 @@ namespace randomx { INST_NAME(IXOR_M) INST_NAME(IROR_R) INST_NAME(ISWAP_R) - - //Common floating point INST_NAME(FSWAP_R) - - //Floating point group F INST_NAME(FADD_R) INST_NAME(FADD_M) INST_NAME(FSUB_R) INST_NAME(FSUB_M) INST_NAME(FSCAL_R) - - //Floating point group E INST_NAME(FMUL_R) INST_NAME(FDIV_M) INST_NAME(FSQRT_R) - - //Control INST_NAME(COND_R) INST_NAME(CFROUND) - INST_NAME(ISTORE) - INST_NAME(NOP) }; InstructionFormatter Instruction::engine[256] = { - //Integer INST_HANDLE(IADD_RS) INST_HANDLE(IADD_M) INST_HANDLE(ISUB_R) @@ -358,22 +345,15 @@ namespace randomx { INST_HANDLE(IROR_R) INST_HANDLE(IROL_R) INST_HANDLE(ISWAP_R) - - //Common floating point INST_HANDLE(FSWAP_R) - - //Floating point group F INST_HANDLE(FADD_R) INST_HANDLE(FADD_M) INST_HANDLE(FSUB_R) INST_HANDLE(FSUB_M) INST_HANDLE(FSCAL_R) - - //Floating point group E INST_HANDLE(FMUL_R) INST_HANDLE(FDIV_M) INST_HANDLE(FSQRT_R) - INST_HANDLE(COND_R) INST_HANDLE(CFROUND) INST_HANDLE(ISTORE) diff --git a/src/instruction.hpp b/src/instruction.hpp index 0dc382f..f6dbc3b 100644 --- a/src/instruction.hpp +++ b/src/instruction.hpp @@ -103,36 +103,36 @@ namespace randomx { void genAddressReg(std::ostream& os) const; void genAddressImm(std::ostream& os) const; void genAddressRegDst(std::ostream&) const; - void h_IADD_RS(std::ostream&) const; - void h_IADD_M(std::ostream&) const; - void h_ISUB_R(std::ostream&) const; - void h_ISUB_M(std::ostream&) const; - void h_IMUL_R(std::ostream&) const; - void h_IMUL_M(std::ostream&) const; - void h_IMULH_R(std::ostream&) const; - void 
h_IMULH_M(std::ostream&) const; - void h_ISMULH_R(std::ostream&) const; - void h_ISMULH_M(std::ostream&) const; - void h_IMUL_RCP(std::ostream&) const; - void h_INEG_R(std::ostream&) const; - void h_IXOR_R(std::ostream&) const; - void h_IXOR_M(std::ostream&) const; - void h_IROR_R(std::ostream&) const; - void h_IROL_R(std::ostream&) const; - void h_ISWAP_R(std::ostream&) const; - void h_FSWAP_R(std::ostream&) const; - void h_FADD_R(std::ostream&) const; - void h_FADD_M(std::ostream&) const; - void h_FSUB_R(std::ostream&) const; - void h_FSUB_M(std::ostream&) const; - void h_FSCAL_R(std::ostream&) const; - void h_FMUL_R(std::ostream&) const; - void h_FDIV_M(std::ostream&) const; - void h_FSQRT_R(std::ostream&) const; - void h_COND_R(std::ostream&) const; - void h_CFROUND(std::ostream&) const; - void h_ISTORE(std::ostream&) const; - void h_NOP(std::ostream&) const; + void h_IADD_RS(std::ostream&) const; + void h_IADD_M(std::ostream&) const; + void h_ISUB_R(std::ostream&) const; + void h_ISUB_M(std::ostream&) const; + void h_IMUL_R(std::ostream&) const; + void h_IMUL_M(std::ostream&) const; + void h_IMULH_R(std::ostream&) const; + void h_IMULH_M(std::ostream&) const; + void h_ISMULH_R(std::ostream&) const; + void h_ISMULH_M(std::ostream&) const; + void h_IMUL_RCP(std::ostream&) const; + void h_INEG_R(std::ostream&) const; + void h_IXOR_R(std::ostream&) const; + void h_IXOR_M(std::ostream&) const; + void h_IROR_R(std::ostream&) const; + void h_IROL_R(std::ostream&) const; + void h_ISWAP_R(std::ostream&) const; + void h_FSWAP_R(std::ostream&) const; + void h_FADD_R(std::ostream&) const; + void h_FADD_M(std::ostream&) const; + void h_FSUB_R(std::ostream&) const; + void h_FSUB_M(std::ostream&) const; + void h_FSCAL_R(std::ostream&) const; + void h_FMUL_R(std::ostream&) const; + void h_FDIV_M(std::ostream&) const; + void h_FSQRT_R(std::ostream&) const; + void h_COND_R(std::ostream&) const; + void h_CFROUND(std::ostream&) const; + void h_ISTORE(std::ostream&) const; + void 
h_NOP(std::ostream&) const; }; static_assert(sizeof(Instruction) == 8, "Invalid size of struct randomx::Instruction"); diff --git a/src/intrin_portable.h b/src/intrin_portable.h index 32aba08..a28ab66 100644 --- a/src/intrin_portable.h +++ b/src/intrin_portable.h @@ -318,18 +318,6 @@ constexpr uint64_t ieee_get_exponent_mask() { return (uint64_t)(E + 1023U) << 52; } -template -__m128d ieee_set_exponent(__m128d x) { - static_assert(E > -1023, "Invalid exponent value"); - constexpr uint64_t mantissaMask64 = (1ULL << 52) - 1; - const __m128d mantissaMask = _mm_castsi128_pd(_mm_set_epi64x(mantissaMask64, mantissaMask64)); - constexpr uint64_t exponent64 = (uint64_t)(E + 1023U) << 52; - const __m128d exponentMask = _mm_castsi128_pd(_mm_set_epi64x(exponent64, exponent64)); - x = _mm_and_pd(x, mantissaMask); - x = _mm_or_pd(x, exponentMask); - return x; -} - double loadDoublePortable(const void* addr); uint64_t mulh(uint64_t, uint64_t); int64_t smulh(int64_t, int64_t); diff --git a/src/jit_compiler_x86.cpp b/src/jit_compiler_x86.cpp index 2480aa2..7ada8e7 100644 --- a/src/jit_compiler_x86.cpp +++ b/src/jit_compiler_x86.cpp @@ -20,8 +20,6 @@ along with RandomX. If not, see. 
#include #include "jit_compiler_x86.hpp" -#define RANDOMX_JUMP - #if !defined(_M_X64) && !defined(__x86_64__) namespace randomx { @@ -113,7 +111,6 @@ namespace randomx { const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load; const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset; - const uint8_t* codeReadDatasetLight = (uint8_t*)&randomx_program_read_dataset_light; const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init; const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin; const uint8_t* codeDatasetInit = (uint8_t*)&randomx_dataset_init; @@ -128,8 +125,7 @@ namespace randomx { const int32_t prologueSize = codeLoopBegin - codePrologue; const int32_t loopLoadSize = codeProgamStart - codeLoopLoad; - const int32_t readDatasetSize = codeReadDatasetLight - codeReadDataset; - const int32_t readDatasetLightSize = codeReadDatasetLightSshInit - codeReadDatasetLight; + const int32_t readDatasetSize = codeReadDatasetLightSshInit - codeReadDataset; const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit; const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin; const int32_t loopStoreSize = codeLoopEnd - codeLoopStore; @@ -299,7 +295,7 @@ namespace randomx { } void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) { -#ifdef RANDOMX_JUMP +#if RANDOMX_JUMP instructionOffsets.clear(); for (unsigned i = 0; i < 8; ++i) { registerUsage[i] = -1; @@ -336,7 +332,7 @@ namespace randomx { } void JitCompilerX86::generateCode(Instruction& instr, int i) { -#ifdef RANDOMX_JUMP +#if RANDOMX_JUMP instructionOffsets.push_back(codePos); #endif auto generator = engine[instr.opcode]; @@ -467,15 +463,6 @@ namespace randomx { void JitCompilerX86::h_IADD_RS(Instruction& instr, int i) { registerUsage[instr.dst] = i; - /*if 
(instr.src != instr.dst) { - emit(REX_ADD_RR); - emitByte(0xc0 + 8 * instr.dst + instr.src); - } - else { - emit(REX_81); - emitByte(0xc0 + instr.dst); - emit32(instr.getImm32()); - }*/ emit(REX_LEA); if (instr.dst == RegisterNeedsDisplacement) emitByte(0xac); @@ -505,14 +492,6 @@ namespace randomx { emitByte((scale << 6) | (index << 3) | base); } - void JitCompilerX86::h_IADD_RC(Instruction& instr, int i) { - registerUsage[instr.dst] = i; - emit(REX_LEA); - emitByte(0x84 + 8 * instr.dst); - genSIB(0, instr.src, instr.dst); - emit32(instr.getImm32()); - } - void JitCompilerX86::h_ISUB_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { @@ -541,14 +520,6 @@ namespace randomx { } } - void JitCompilerX86::h_IMUL_9C(Instruction& instr, int i) { - registerUsage[instr.dst] = i; - emit(REX_LEA); - emitByte(0x84 + 8 * instr.dst); - genSIB(3, instr.dst, instr.dst); - emit32(instr.getImm32()); - } - void JitCompilerX86::h_IMUL_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { @@ -645,10 +616,6 @@ namespace randomx { } } - void JitCompilerX86::h_ISDIV_C(Instruction& instr, int i) { - - } - void JitCompilerX86::h_INEG_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; emit(REX_NEG); @@ -729,17 +696,14 @@ namespace randomx { } void JitCompilerX86::h_FADD_R(Instruction& instr, int i) { - instr.dst %= 4; - instr.src %= 4; + instr.dst %= RegisterCountFlt; + instr.src %= RegisterCountFlt; emit(REX_ADDPD); emitByte(0xc0 + instr.src + 8 * instr.dst); - //emit(REX_PADD); - //emitByte(PADD_OPCODES[instr.mod % 4]); - //emitByte(0xf8 + instr.dst); } void JitCompilerX86::h_FADD_M(Instruction& instr, int i) { - instr.dst %= 4; + instr.dst %= RegisterCountFlt; genAddressReg(instr); emit(REX_CVTDQ2PD_XMM12); emit(REX_ADDPD); @@ -747,17 +711,14 @@ namespace randomx { } void JitCompilerX86::h_FSUB_R(Instruction& instr, int i) { - instr.dst %= 4; - instr.src %= 4; + instr.dst %= RegisterCountFlt; + 
instr.src %= RegisterCountFlt; emit(REX_SUBPD); emitByte(0xc0 + instr.src + 8 * instr.dst); - //emit(REX_PADD); - //emitByte(PADD_OPCODES[instr.mod % 4]); - //emitByte(0xf8 + instr.dst); } void JitCompilerX86::h_FSUB_M(Instruction& instr, int i) { - instr.dst %= 4; + instr.dst %= RegisterCountFlt; genAddressReg(instr); emit(REX_CVTDQ2PD_XMM12); emit(REX_SUBPD); @@ -765,40 +726,20 @@ namespace randomx { } void JitCompilerX86::h_FSCAL_R(Instruction& instr, int i) { - instr.dst %= 4; + instr.dst %= RegisterCountFlt; emit(REX_XORPS); emitByte(0xc7 + 8 * instr.dst); } void JitCompilerX86::h_FMUL_R(Instruction& instr, int i) { - instr.dst %= 4; - instr.src %= 4; + instr.dst %= RegisterCountFlt; + instr.src %= RegisterCountFlt; emit(REX_MULPD); emitByte(0xe0 + instr.src + 8 * instr.dst); } - void JitCompilerX86::h_FMUL_M(Instruction& instr, int i) { - instr.dst %= 4; - genAddressReg(instr); - emit(REX_CVTDQ2PD_XMM12); - emit(REX_ANDPS_XMM12); - emit(REX_MULPD); - emitByte(0xe4 + 8 * instr.dst); - emit(REX_MAXPD); - emitByte(0xe5 + 8 * instr.dst); - } - - void JitCompilerX86::h_FDIV_R(Instruction& instr, int i) { - instr.dst %= 4; - instr.src %= 4; - emit(REX_DIVPD); - emitByte(0xe0 + instr.src + 8 * instr.dst); - emit(REX_MAXPD); - emitByte(0xe5 + 8 * instr.dst); - } - void JitCompilerX86::h_FDIV_M(Instruction& instr, int i) { - instr.dst %= 4; + instr.dst %= RegisterCountFlt; genAddressReg(instr); emit(REX_CVTDQ2PD_XMM12); emit(REX_ANDPS_XMM12); @@ -807,7 +748,7 @@ namespace randomx { } void JitCompilerX86::h_FSQRT_R(Instruction& instr, int i) { - instr.dst %= 4; + instr.dst %= RegisterCountFlt; emit(SQRTPD); emitByte(0xe4 + 9 * instr.dst); } @@ -883,7 +824,7 @@ namespace randomx { void JitCompilerX86::handleCondition(Instruction& instr, int i) { const int shift = instr.getModShift(); - const int conditionMask = ((1 << RANDOMX_CONDITION_BITS) - 1) << shift; + const int conditionMask = ((1 << RANDOMX_JUMP_BITS) - 1) << shift; int reg = getConditionRegister(); int target = 
registerUsage[reg] + 1; emit(REX_ADD_I); @@ -900,7 +841,7 @@ namespace randomx { } void JitCompilerX86::h_COND_R(Instruction& instr, int i) { -#ifdef RANDOMX_JUMP +#if RANDOMX_JUMP handleCondition(instr, i); #endif emit(XOR_ECX_ECX); @@ -914,40 +855,15 @@ namespace randomx { emitByte(0xc1 + 8 * instr.dst); } - void JitCompilerX86::h_COND_M(Instruction& instr, int i) { -#ifdef RANDOMX_JUMP - handleCondition(instr, i); -#endif - emit(XOR_ECX_ECX); - genAddressReg(instr); - emit(REX_CMP_M32I); - emit32(instr.getImm32()); - emitByte(0x0f); - emitByte(condition(instr)); - emitByte(0xc1); - emit(REX_ADD_RM); - emitByte(0xc1 + 8 * instr.dst); - } - void JitCompilerX86::h_ISTORE(Instruction& instr, int i) { genAddressRegDst(instr); - //if (instr.getModCond()) emit(REX_MOV_MR); - //else - // emit(MOVNTI); - emitByte(0x04 + 8 * instr.src); - emitByte(0x06); - } - - void JitCompilerX86::h_FSTORE(Instruction& instr, int i) { - genAddressRegDst(instr, true); - emit(MOVAPD); emitByte(0x04 + 8 * instr.src); emitByte(0x06); } void JitCompilerX86::h_NOP(Instruction& instr, int i) { - emitByte(0x90); + emit(NOP1); } #include "instruction_weights.hpp" diff --git a/src/jit_compiler_x86.hpp b/src/jit_compiler_x86.hpp index 8bccb1f..964dd93 100644 --- a/src/jit_compiler_x86.hpp +++ b/src/jit_compiler_x86.hpp @@ -110,43 +110,36 @@ namespace randomx { codePos += count; } - void h_IADD_RS(Instruction&, int); - void h_IADD_M(Instruction&, int); - void h_IADD_RC(Instruction&, int); - void h_ISUB_R(Instruction&, int); - void h_ISUB_M(Instruction&, int); - void h_IMUL_9C(Instruction&, int); - void h_IMUL_R(Instruction&, int); - void h_IMUL_M(Instruction&, int); - void h_IMULH_R(Instruction&, int); - void h_IMULH_M(Instruction&, int); - void h_ISMULH_R(Instruction&, int); - void h_ISMULH_M(Instruction&, int); - void h_IMUL_RCP(Instruction&, int); - void h_ISDIV_C(Instruction&, int); - void h_INEG_R(Instruction&, int); - void h_IXOR_R(Instruction&, int); - void h_IXOR_M(Instruction&, int); - 
void h_IROR_R(Instruction&, int); - void h_IROL_R(Instruction&, int); - void h_ISWAP_R(Instruction&, int); - void h_FSWAP_R(Instruction&, int); - void h_FADD_R(Instruction&, int); - void h_FADD_M(Instruction&, int); - void h_FSUB_R(Instruction&, int); - void h_FSUB_M(Instruction&, int); - void h_FSCAL_R(Instruction&, int); - void h_FMUL_R(Instruction&, int); - void h_FMUL_M(Instruction&, int); - void h_FDIV_R(Instruction&, int); - void h_FDIV_M(Instruction&, int); - void h_FSQRT_R(Instruction&, int); - void h_COND_R(Instruction&, int); - void h_COND_M(Instruction&, int); - void h_CFROUND(Instruction&, int); - void h_ISTORE(Instruction&, int); - void h_FSTORE(Instruction&, int); - void h_NOP(Instruction&, int); + void h_IADD_RS(Instruction&, int); + void h_IADD_M(Instruction&, int); + void h_ISUB_R(Instruction&, int); + void h_ISUB_M(Instruction&, int); + void h_IMUL_R(Instruction&, int); + void h_IMUL_M(Instruction&, int); + void h_IMULH_R(Instruction&, int); + void h_IMULH_M(Instruction&, int); + void h_ISMULH_R(Instruction&, int); + void h_ISMULH_M(Instruction&, int); + void h_IMUL_RCP(Instruction&, int); + void h_INEG_R(Instruction&, int); + void h_IXOR_R(Instruction&, int); + void h_IXOR_M(Instruction&, int); + void h_IROR_R(Instruction&, int); + void h_IROL_R(Instruction&, int); + void h_ISWAP_R(Instruction&, int); + void h_FSWAP_R(Instruction&, int); + void h_FADD_R(Instruction&, int); + void h_FADD_M(Instruction&, int); + void h_FSUB_R(Instruction&, int); + void h_FSUB_M(Instruction&, int); + void h_FSCAL_R(Instruction&, int); + void h_FMUL_R(Instruction&, int); + void h_FDIV_M(Instruction&, int); + void h_FSQRT_R(Instruction&, int); + void h_COND_R(Instruction&, int); + void h_CFROUND(Instruction&, int); + void h_ISTORE(Instruction&, int); + void h_NOP(Instruction&, int); }; } \ No newline at end of file diff --git a/src/jit_compiler_x86_static.S b/src/jit_compiler_x86_static.S index 3b8e82e..04dbaa9 100644 --- a/src/jit_compiler_x86_static.S +++ 
b/src/jit_compiler_x86_static.S @@ -31,7 +31,6 @@ .global DECL(randomx_program_loop_load) .global DECL(randomx_program_start) .global DECL(randomx_program_read_dataset) -.global DECL(randomx_program_read_dataset_light) .global DECL(randomx_program_read_dataset_sshash_init) .global DECL(randomx_program_read_dataset_sshash_fin) .global DECL(randomx_program_loop_store) @@ -66,9 +65,6 @@ DECL(randomx_program_start): DECL(randomx_program_read_dataset): #include "asm/program_read_dataset.inc" -DECL(randomx_program_read_dataset_light): - #include "asm/program_read_dataset_light.inc" - DECL(randomx_program_read_dataset_sshash_init): #include "asm/program_read_dataset_sshash_init.inc" diff --git a/src/jit_compiler_x86_static.asm b/src/jit_compiler_x86_static.asm index 3153a8f..92d2ebd 100644 --- a/src/jit_compiler_x86_static.asm +++ b/src/jit_compiler_x86_static.asm @@ -24,7 +24,6 @@ PUBLIC randomx_program_loop_begin PUBLIC randomx_program_loop_load PUBLIC randomx_program_start PUBLIC randomx_program_read_dataset -PUBLIC randomx_program_read_dataset_light PUBLIC randomx_program_read_dataset_sshash_init PUBLIC randomx_program_read_dataset_sshash_fin PUBLIC randomx_dataset_init @@ -62,10 +61,6 @@ randomx_program_read_dataset PROC include asm/program_read_dataset.inc randomx_program_read_dataset ENDP -randomx_program_read_dataset_light PROC - include asm/program_read_dataset_light.inc -randomx_program_read_dataset_light ENDP - randomx_program_read_dataset_sshash_init PROC include asm/program_read_dataset_sshash_init.inc randomx_program_read_dataset_sshash_init ENDP diff --git a/src/jit_compiler_x86_static.hpp b/src/jit_compiler_x86_static.hpp index a3ce44f..09b4703 100644 --- a/src/jit_compiler_x86_static.hpp +++ b/src/jit_compiler_x86_static.hpp @@ -25,7 +25,6 @@ extern "C" { void randomx_program_loop_load(); void randomx_program_start(); void randomx_program_read_dataset(); - void randomx_program_read_dataset_light(); void randomx_program_read_dataset_sshash_init(); void 
randomx_program_read_dataset_sshash_fin(); void randomx_program_loop_store(); diff --git a/src/virtual_machine.cpp b/src/virtual_machine.cpp index e97fad7..3707ba7 100644 --- a/src/virtual_machine.cpp +++ b/src/virtual_machine.cpp @@ -76,22 +76,6 @@ void randomx_vm::initialize() { store64(&config.eMask[1], (program.getEntropy(15) & mask22bit) | maskExp240); } -//TODO -std::ostream& operator<<(std::ostream& os, const randomx::RegisterFile& rf) { - for (int i = 0; i < randomx::RegistersCount; ++i) - os << std::hex << "r" << i << " = " << rf.r[i] << std::endl << std::dec; - for (int i = 0; i < 4; ++i) - os << std::hex << "f" << i << " = " << *(uint64_t*)&rf.f[i].hi << " (" << rf.f[i].hi << ")" << std::endl - << " = " << *(uint64_t*)&rf.f[i].lo << " (" << rf.f[i].lo << ")" << std::endl << std::dec; - for (int i = 0; i < 4; ++i) - os << std::hex << "e" << i << " = " << *(uint64_t*)&rf.e[i].hi << " (" << rf.e[i].hi << ")" << std::endl - << " = " << *(uint64_t*)&rf.e[i].lo << " (" << rf.e[i].lo << ")" << std::endl << std::dec; - for (int i = 0; i < 4; ++i) - os << std::hex << "a" << i << " = " << *(uint64_t*)&rf.a[i].hi << " (" << rf.a[i].hi << ")" << std::endl - << " = " << *(uint64_t*)&rf.a[i].lo << " (" << rf.a[i].lo << ")" << std::endl << std::dec; - return os; -} - namespace randomx { alignas(16) volatile static __m128i aesDummy; diff --git a/src/vm_interpreted.cpp b/src/vm_interpreted.cpp index a5bba0f..2f69855 100644 --- a/src/vm_interpreted.cpp +++ b/src/vm_interpreted.cpp @@ -17,10 +17,6 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. */ -//#define TRACE -//#define FPUCHECK -#define RANDOMX_JUMP - #include #include #include @@ -33,12 +29,6 @@ along with RandomX. If not, see. 
#include "intrin_portable.h" #include "reciprocal.h" -#ifdef FPUCHECK -constexpr bool fpuCheck = true; -#else -constexpr bool fpuCheck = false; -#endif - namespace randomx { static int_reg_t Zero = 0; @@ -53,49 +43,16 @@ namespace randomx { void InterpretedVm::run(void* seed) { VmBase::generateProgram(seed); randomx_vm::initialize(); - for (unsigned i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { - program(i).src %= RegistersCount; - program(i).dst %= RegistersCount; - } execute(); } template - void InterpretedVm::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { - for (int ic = 0; ic < RANDOMX_PROGRAM_SIZE; ++ic) { - executeBytecode(ic, r, f, e, a); + void InterpretedVm::executeBytecode(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) { + for (int pc = 0; pc < RANDOMX_PROGRAM_SIZE; ++pc) { + executeBytecode(pc, r, f, e, a); } } - static void print(int_reg_t r) { - std::cout << std::hex << std::setw(16) << std::setfill('0') << r << std::endl; - } - - static void print(__m128d f) { - uint64_t lo = *(((uint64_t*)&f) + 0); - uint64_t hi = *(((uint64_t*)&f) + 1); - std::cout << std::hex << std::setw(16) << std::setfill('0') << hi << '-' << std::hex << std::setw(16) << std::setfill('0') << lo << std::endl; - } - - static void printState(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { - for (int i = 0; i < 8; ++i) { - std::cout << "r" << i << " = "; print(r[i]); - } - for (int i = 0; i < 4; ++i) { - std::cout << "f" << i << " = "; print(f[i]); - } - for (int i = 0; i < 4; ++i) { - std::cout << "e" << i << " = "; print(e[i]); - } - for (int i = 0; i < 4; ++i) { - std::cout << "a" << i << " = "; print(a[i]); - } - } - - static bool isDenormal(double x) { - return std::fpclassify(x) == FP_SUBNORMAL; - } - template FORCE_INLINE void* InterpretedVm::getScratchpadAddress(InstructionByteCode& ibc) { uint32_t addr = (*ibc.isrc + ibc.imm) & 
ibc.memMask; @@ -113,9 +70,8 @@ namespace randomx { } template - void InterpretedVm::executeBytecode(int& ic, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { - auto& ibc = byteCode[ic]; - if (trace && ibc.type != InstructionType::NOP) std::cout << std::dec << std::setw(3) << ic << " " << program(ic); + void InterpretedVm::executeBytecode(int& pc, int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) { + auto& ibc = byteCode[pc]; switch (ibc.type) { case InstructionType::IADD_RS: { @@ -225,11 +181,11 @@ namespace randomx { } break; case InstructionType::COND_R: { -#ifdef RANDOMX_JUMP +#if RANDOMX_JUMP *ibc.creg += (1 << ibc.shift); - const uint64_t conditionMask = ((1ULL << RANDOMX_CONDITION_BITS) - 1) << ibc.shift; + const uint64_t conditionMask = ((1ULL << RANDOMX_JUMP_BITS) - 1) << ibc.shift; if ((*ibc.creg & conditionMask) == 0) { - ic = ibc.target; + pc = ibc.target; break; } #endif @@ -251,50 +207,23 @@ namespace randomx { default: UNREACHABLE; } - if (trace && ibc.type != InstructionType::NOP) { - if(ibc.type < 20 || ibc.type == 31 || ibc.type == 32) - print(*ibc.idst); - else //if(ibc.type >= 20 && ibc.type <= 30) - print(0); - } -#ifdef FPUCHECK - if (ibc.type >= 26 && ibc.type <= 30) { - double lo = *(((double*)ibc.fdst) + 0); - double hi = *(((double*)ibc.fdst) + 1); - if (lo <= 0 || hi <= 0) { - std::stringstream ss; - ss << "Underflow in operation " << ibc.type; - printState(r, f, e, a); - throw std::runtime_error(ss.str()); - } - } -#endif } template void InterpretedVm::execute() { - int_reg_t r[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; - __m128d f[4]; - __m128d e[4]; - __m128d a[4]; + int_reg_t r[RegistersCount] = { 0 }; + __m128d f[RegisterCountFlt]; + __m128d e[RegisterCountFlt]; + __m128d a[RegisterCountFlt]; - a[0] = _mm_load_pd(&reg.a[0].lo); - a[1] = _mm_load_pd(&reg.a[1].lo); - a[2] = _mm_load_pd(&reg.a[2].lo); - a[3] = _mm_load_pd(&reg.a[3].lo); + for(unsigned i = 0; i 
< RegisterCountFlt; ++i) + a[i] = _mm_load_pd(&reg.a[i].lo); precompileProgram(r, f, e, a); uint32_t spAddr0 = mem.mx; uint32_t spAddr1 = mem.ma; - if (trace) { - std::cout << "execute (reg: r" << config.readReg0 << ", r" << config.readReg1 << ", r" << config.readReg2 << ", r" << config.readReg3 << ")" << std::endl; - std::cout << "spAddr " << std::hex << std::setw(8) << std::setfill('0') << spAddr1 << " / " << std::setw(8) << std::setfill('0') << spAddr0 << std::endl; - std::cout << "ma/mx " << std::hex << std::setw(8) << std::setfill('0') << mem.ma << std::setw(8) << std::setfill('0') << mem.mx << std::endl; - printState(r, f, e, a); - } - for(unsigned ic = 0; ic < RANDOMX_PROGRAM_ITERATIONS; ++ic) { uint64_t spMix = r[config.readReg0] ^ r[config.readReg1]; spAddr0 ^= spMix; @@ -302,31 +231,14 @@ namespace randomx { spAddr1 ^= spMix >> 32; spAddr1 &= ScratchpadL3Mask64; - r[0] ^= load64(scratchpad + spAddr0 + 0); - r[1] ^= load64(scratchpad + spAddr0 + 8); - r[2] ^= load64(scratchpad + spAddr0 + 16); - r[3] ^= load64(scratchpad + spAddr0 + 24); - r[4] ^= load64(scratchpad + spAddr0 + 32); - r[5] ^= load64(scratchpad + spAddr0 + 40); - r[6] ^= load64(scratchpad + spAddr0 + 48); - r[7] ^= load64(scratchpad + spAddr0 + 56); + for (unsigned i = 0; i < RegistersCount; ++i) + r[i] ^= load64(scratchpad + spAddr0 + 8 * i); - f[0] = load_cvt_i32x2(scratchpad + spAddr1 + 0); - f[1] = load_cvt_i32x2(scratchpad + spAddr1 + 8); - f[2] = load_cvt_i32x2(scratchpad + spAddr1 + 16); - f[3] = load_cvt_i32x2(scratchpad + spAddr1 + 24); - e[0] = maskRegisterExponentMantissa(load_cvt_i32x2(scratchpad + spAddr1 + 32)); - e[1] = maskRegisterExponentMantissa(load_cvt_i32x2(scratchpad + spAddr1 + 40)); - e[2] = maskRegisterExponentMantissa(load_cvt_i32x2(scratchpad + spAddr1 + 48)); - e[3] = maskRegisterExponentMantissa(load_cvt_i32x2(scratchpad + spAddr1 + 56)); + for (unsigned i = 0; i < RegisterCountFlt; ++i) + f[i] = load_cvt_i32x2(scratchpad + spAddr1 + 8 * i); - if (trace) { - 
std::cout << "iteration " << std::dec << ic << std::endl; - std::cout << "spAddr " << std::hex << std::setw(8) << std::setfill('0') << spAddr1 << " / " << std::setw(8) << std::setfill('0') << spAddr0 << std::endl; - std::cout << "ma/mx " << std::hex << std::setw(8) << std::setfill('0') << mem.ma << std::setw(8) << std::setfill('0') << mem.mx << std::endl; - printState(r, f, e, a); - std::cout << "-----------------------------------" << std::endl; - } + for (unsigned i = 0; i < RegisterCountFlt; ++i) + e[i] = maskRegisterExponentMantissa(load_cvt_i32x2(scratchpad + spAddr1 + 8 * (RegisterCountFlt + i))); executeBytecode(r, f, e, a); @@ -335,72 +247,33 @@ namespace randomx { datasetRead(datasetOffset + mem.ma, r); std::swap(mem.mx, mem.ma); - if (trace) { - std::cout << "iteration " << std::dec << ic << std::endl; - std::cout << "spAddr " << std::hex << std::setw(8) << std::setfill('0') << spAddr1 << " / " << std::setw(8) << std::setfill('0') << spAddr0 << std::endl; - std::cout << "ma/mx " << std::hex << std::setw(8) << std::setfill('0') << mem.ma << std::setw(8) << std::setfill('0') << mem.mx << std::endl; - printState(r, f, e, a); - std::cout << "===================================" << std::endl; - } + for (unsigned i = 0; i < RegistersCount; ++i) + store64(scratchpad + spAddr1 + 8 * i, r[i]); - store64(scratchpad + spAddr1 + 0, r[0]); - store64(scratchpad + spAddr1 + 8, r[1]); - store64(scratchpad + spAddr1 + 16, r[2]); - store64(scratchpad + spAddr1 + 24, r[3]); - store64(scratchpad + spAddr1 + 32, r[4]); - store64(scratchpad + spAddr1 + 40, r[5]); - store64(scratchpad + spAddr1 + 48, r[6]); - store64(scratchpad + spAddr1 + 56, r[7]); + for (unsigned i = 0; i < RegisterCountFlt; ++i) + f[i] = _mm_xor_pd(f[i], e[i]); - f[0] = _mm_xor_pd(f[0], e[0]); - f[1] = _mm_xor_pd(f[1], e[1]); - f[2] = _mm_xor_pd(f[2], e[2]); - f[3] = _mm_xor_pd(f[3], e[3]); - -#ifdef FPUCHECK - for(int i = 0; i < 4; ++i) { - double lo = *(((double*)&f[i]) + 0); - double hi = 
*(((double*)&f[i]) + 1); - if (isDenormal(lo) || isDenormal(hi)) { - std::stringstream ss; - ss << "Denormal f" << i; - throw std::runtime_error(ss.str()); - } - } -#endif - - _mm_store_pd((double*)(scratchpad + spAddr0 + 0), f[0]); - _mm_store_pd((double*)(scratchpad + spAddr0 + 16), f[1]); - _mm_store_pd((double*)(scratchpad + spAddr0 + 32), f[2]); - _mm_store_pd((double*)(scratchpad + spAddr0 + 48), f[3]); + for (unsigned i = 0; i < RegisterCountFlt; ++i) + _mm_store_pd((double*)(scratchpad + spAddr0 + 16 * i), f[i]); spAddr0 = 0; spAddr1 = 0; } - store64(&reg.r[0], r[0]); - store64(&reg.r[1], r[1]); - store64(&reg.r[2], r[2]); - store64(&reg.r[3], r[3]); - store64(&reg.r[4], r[4]); - store64(&reg.r[5], r[5]); - store64(&reg.r[6], r[6]); - store64(&reg.r[7], r[7]); + for (unsigned i = 0; i < RegistersCount; ++i) + store64(&reg.r[i], r[i]); - _mm_store_pd(&reg.f[0].lo, f[0]); - _mm_store_pd(&reg.f[1].lo, f[1]); - _mm_store_pd(&reg.f[2].lo, f[2]); - _mm_store_pd(&reg.f[3].lo, f[3]); - _mm_store_pd(&reg.e[0].lo, e[0]); - _mm_store_pd(&reg.e[1].lo, e[1]); - _mm_store_pd(&reg.e[2].lo, e[2]); - _mm_store_pd(&reg.e[3].lo, e[3]); + for (unsigned i = 0; i < RegisterCountFlt; ++i) + _mm_store_pd(&reg.f[i].lo, f[i]); + + for (unsigned i = 0; i < RegisterCountFlt; ++i) + _mm_store_pd(&reg.e[i].lo, e[i]); } - static int getConditionRegister(int(&registerUsage)[8]) { + static int getConditionRegister(int(&registerUsage)[RegistersCount]) { int min = INT_MAX; int minIndex; - for (unsigned i = 0; i < 8; ++i) { + for (unsigned i = 0; i < RegistersCount; ++i) { if (registerUsage[i] < min) { min = registerUsage[i]; minIndex = i; @@ -410,7 +283,7 @@ namespace randomx { } template - void InterpretedVm::datasetRead(uint32_t address, int_reg_t(&r)[8]) { + void InterpretedVm::datasetRead(uint32_t address, int_reg_t(&r)[RegistersCount]) { uint64_t* datasetLine = (uint64_t*)(mem.memory + address); for (int i = 0; i < RegistersCount; ++i) r[i] ^= datasetLine[i]; @@ -419,9 +292,9 @@ namespace randomx { #include "instruction_weights.hpp" template - void 
InterpretedVm::precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { - int registerUsage[8]; - for (unsigned i = 0; i < 8; ++i) { + void InterpretedVm::precompileProgram(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) { + int registerUsage[RegistersCount]; + for (unsigned i = 0; i < RegistersCount; ++i) { registerUsage[i] = -1; } for (unsigned i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { @@ -443,7 +316,7 @@ namespace randomx { ibc.shift = instr.getModMem(); ibc.imm = signExtend2sCompl(instr.getImm32()); } - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(IADD_M) { @@ -452,7 +325,7 @@ namespace randomx { ibc.type = InstructionType::IADD_M; ibc.idst = &r[dst]; ibc.imm = signExtend2sCompl(instr.getImm32()); - if (instr.src != instr.dst) { + if (src != dst) { ibc.isrc = &r[src]; ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); } @@ -460,7 +333,7 @@ namespace randomx { ibc.isrc = &Zero; ibc.memMask = ScratchpadL3Mask; } - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(ISUB_R) { @@ -475,7 +348,7 @@ namespace randomx { ibc.imm = signExtend2sCompl(instr.getImm32()); ibc.isrc = &ibc.imm; } - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(ISUB_M) { @@ -484,7 +357,7 @@ namespace randomx { ibc.type = InstructionType::ISUB_M; ibc.idst = &r[dst]; ibc.imm = signExtend2sCompl(instr.getImm32()); - if (instr.src != instr.dst) { + if (src != dst) { ibc.isrc = &r[src]; ibc.memMask = (instr.getModMem() ? 
ScratchpadL1Mask : ScratchpadL2Mask); } @@ -492,7 +365,7 @@ namespace randomx { ibc.isrc = &Zero; ibc.memMask = ScratchpadL3Mask; } - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(IMUL_R) { @@ -507,7 +380,7 @@ namespace randomx { ibc.imm = signExtend2sCompl(instr.getImm32()); ibc.isrc = &ibc.imm; } - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(IMUL_M) { @@ -516,7 +389,7 @@ namespace randomx { ibc.type = InstructionType::IMUL_M; ibc.idst = &r[dst]; ibc.imm = signExtend2sCompl(instr.getImm32()); - if (instr.src != instr.dst) { + if (src != dst) { ibc.isrc = &r[src]; ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); } @@ -524,7 +397,7 @@ namespace randomx { ibc.isrc = &Zero; ibc.memMask = ScratchpadL3Mask; } - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(IMULH_R) { @@ -533,7 +406,7 @@ namespace randomx { ibc.type = InstructionType::IMULH_R; ibc.idst = &r[dst]; ibc.isrc = &r[src]; - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(IMULH_M) { @@ -542,7 +415,7 @@ namespace randomx { ibc.type = InstructionType::IMULH_M; ibc.idst = &r[dst]; ibc.imm = signExtend2sCompl(instr.getImm32()); - if (instr.src != instr.dst) { + if (src != dst) { ibc.isrc = &r[src]; ibc.memMask = (instr.getModMem() ? 
ScratchpadL1Mask : ScratchpadL2Mask); } @@ -550,7 +423,7 @@ namespace randomx { ibc.isrc = &Zero; ibc.memMask = ScratchpadL3Mask; } - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(ISMULH_R) { @@ -559,7 +432,7 @@ namespace randomx { ibc.type = InstructionType::ISMULH_R; ibc.idst = &r[dst]; ibc.isrc = &r[src]; - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(ISMULH_M) { @@ -568,7 +441,7 @@ namespace randomx { ibc.type = InstructionType::ISMULH_M; ibc.idst = &r[dst]; ibc.imm = signExtend2sCompl(instr.getImm32()); - if (instr.src != instr.dst) { + if (src != dst) { ibc.isrc = &r[src]; ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); } @@ -576,7 +449,7 @@ namespace randomx { ibc.isrc = &Zero; ibc.memMask = ScratchpadL3Mask; } - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(IMUL_RCP) { @@ -587,7 +460,7 @@ namespace randomx { ibc.idst = &r[dst]; ibc.imm = randomx_reciprocal(divisor); ibc.isrc = &ibc.imm; - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } else { ibc.type = InstructionType::NOP; @@ -598,7 +471,7 @@ namespace randomx { auto dst = instr.dst % RegistersCount; ibc.type = InstructionType::INEG_R; ibc.idst = &r[dst]; - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(IXOR_R) { @@ -613,7 +486,7 @@ namespace randomx { ibc.imm = signExtend2sCompl(instr.getImm32()); ibc.isrc = &ibc.imm; } - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(IXOR_M) { @@ -622,7 +495,7 @@ namespace randomx { ibc.type = InstructionType::IXOR_M; ibc.idst = &r[dst]; ibc.imm = signExtend2sCompl(instr.getImm32()); - if (instr.src != instr.dst) { + if (src != dst) { ibc.isrc = &r[src]; ibc.memMask = (instr.getModMem() ? 
ScratchpadL1Mask : ScratchpadL2Mask); } @@ -630,7 +503,7 @@ namespace randomx { ibc.isrc = &Zero; ibc.memMask = ScratchpadL3Mask; } - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(IROR_R) { @@ -645,7 +518,7 @@ namespace randomx { ibc.imm = instr.getImm32(); ibc.isrc = &ibc.imm; } - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(IROL_R) { @@ -660,7 +533,7 @@ namespace randomx { ibc.imm = instr.getImm32(); ibc.isrc = &ibc.imm; } - registerUsage[instr.dst] = i; + registerUsage[dst] = i; } break; CASE_REP(ISWAP_R) { @@ -670,8 +543,8 @@ namespace randomx { ibc.idst = &r[dst]; ibc.isrc = &r[src]; ibc.type = InstructionType::ISWAP_R; - registerUsage[instr.dst] = i; - registerUsage[instr.src] = i; + registerUsage[dst] = i; + registerUsage[src] = i; } else { ibc.type = InstructionType::NOP; @@ -681,23 +554,23 @@ namespace randomx { CASE_REP(FSWAP_R) { auto dst = instr.dst % RegistersCount; ibc.type = InstructionType::FSWAP_R; - if (dst < 4) + if (dst < RegisterCountFlt) ibc.fdst = &f[dst]; else - ibc.fdst = &e[dst - 4]; + ibc.fdst = &e[dst - RegisterCountFlt]; } break; CASE_REP(FADD_R) { - auto dst = instr.dst % 4; - auto src = instr.src % 4; + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegisterCountFlt; ibc.type = InstructionType::FADD_R; ibc.fdst = &f[dst]; ibc.fsrc = &a[src]; } break; CASE_REP(FADD_M) { - auto dst = instr.dst % 4; - auto src = instr.src % 8; + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegistersCount; ibc.type = InstructionType::FADD_M; ibc.fdst = &f[dst]; ibc.isrc = &r[src]; @@ -706,16 +579,16 @@ namespace randomx { } break; CASE_REP(FSUB_R) { - auto dst = instr.dst % 4; - auto src = instr.src % 4; + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegisterCountFlt; ibc.type = InstructionType::FSUB_R; ibc.fdst = &f[dst]; ibc.fsrc = &a[src]; } break; CASE_REP(FSUB_M) { - auto dst = instr.dst % 4; - auto src = instr.src % 8; + auto dst = 
instr.dst % RegisterCountFlt; + auto src = instr.src % RegistersCount; ibc.type = InstructionType::FSUB_M; ibc.fdst = &f[dst]; ibc.isrc = &r[src]; @@ -724,22 +597,22 @@ namespace randomx { } break; CASE_REP(FSCAL_R) { - auto dst = instr.dst % 4; + auto dst = instr.dst % RegisterCountFlt; ibc.fdst = &f[dst]; ibc.type = InstructionType::FSCAL_R; } break; CASE_REP(FMUL_R) { - auto dst = instr.dst % 4; - auto src = instr.src % 4; + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegisterCountFlt; ibc.type = InstructionType::FMUL_R; ibc.fdst = &e[dst]; ibc.fsrc = &a[src]; } break; CASE_REP(FDIV_M) { - auto dst = instr.dst % 4; - auto src = instr.src % 8; + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegistersCount; ibc.type = InstructionType::FDIV_M; ibc.fdst = &e[dst]; ibc.isrc = &r[src]; @@ -748,7 +621,7 @@ namespace randomx { } break; CASE_REP(FSQRT_R) { - auto dst = instr.dst % 4; + auto dst = instr.dst % RegisterCountFlt; ibc.type = InstructionType::FSQRT_R; ibc.fdst = &e[dst]; } break; @@ -766,13 +639,13 @@ namespace randomx { ibc.target = registerUsage[reg]; ibc.shift = instr.getModShift(); ibc.creg = &r[reg]; - for (unsigned j = 0; j < 8; ++j) { //mark all registers as used + for (unsigned j = 0; j < RegistersCount; ++j) { //mark all registers as used registerUsage[j] = i; } } break; CASE_REP(CFROUND) { - auto src = instr.src % 8; + auto src = instr.src % RegistersCount; ibc.isrc = &r[src]; ibc.type = InstructionType::CFROUND; ibc.imm = instr.getImm32() & 63; diff --git a/src/vm_interpreted.hpp b/src/vm_interpreted.hpp index 8a15785..e3a3eb4 100644 --- a/src/vm_interpreted.hpp +++ b/src/vm_interpreted.hpp @@ -71,12 +71,12 @@ namespace randomx { void run(void* seed) override; void setDataset(randomx_dataset* dataset) override; protected: - virtual void datasetRead(uint32_t blockNumber, int_reg_t(&r)[8]); + virtual void datasetRead(uint32_t blockNumber, int_reg_t(&r)[RegistersCount]); private: void execute(); - void 
precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); - void executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); - void executeBytecode(int& i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + void precompileProgram(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]); + void executeBytecode(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]); + void executeBytecode(int& i, int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]); void* getScratchpadAddress(InstructionByteCode& ibc); __m128d maskRegisterExponentMantissa(__m128d); diff --git a/vcxproj/benchmark.vcxproj b/vcxproj/benchmark.vcxproj index eba548f..27031e3 100644 --- a/vcxproj/benchmark.vcxproj +++ b/vcxproj/benchmark.vcxproj @@ -106,7 +106,7 @@ MaxSpeed true true - true + false true diff --git a/vcxproj/randomx.vcxproj b/vcxproj/randomx.vcxproj index 0ad01ab..218975a 100644 --- a/vcxproj/randomx.vcxproj +++ b/vcxproj/randomx.vcxproj @@ -26,20 +26,20 @@ - Application + StaticLibrary true v141 MultiByte - Application + StaticLibrary false v141 true MultiByte - Application + StaticLibrary true v141 MultiByte