From 59bbb572c29b4428c4c8cd88fcf19086095f5c10 Mon Sep 17 00:00:00 2001 From: tevador Date: Thu, 28 Mar 2019 15:27:10 +0100 Subject: [PATCH 01/18] WIP --- src/AssemblyGeneratorX86.hpp | 1 + src/Instruction.hpp | 3 + src/JitCompilerX86.cpp | 7 +- src/JitCompilerX86.hpp | 10 + src/LightProgramGenerator.cpp | 342 ++++++++++++++++++++++++++ src/LightProgramGenerator.hpp | 24 ++ src/Program.cpp | 3 +- src/Program.hpp | 36 ++- src/configuration.h | 5 + src/main.cpp | 11 +- src/variant4_random_math.h | 441 ++++++++++++++++++++++++++++++++++ 11 files changed, 874 insertions(+), 9 deletions(-) create mode 100644 src/LightProgramGenerator.cpp create mode 100644 src/LightProgramGenerator.hpp create mode 100644 src/variant4_random_math.h diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 62a6081..d2672a0 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -20,6 +20,7 @@ along with RandomX. If not, see. #pragma once #include "Instruction.hpp" +#include "configuration.h" #include namespace RandomX { diff --git a/src/Instruction.hpp b/src/Instruction.hpp index 7987ea4..d10575f 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -78,6 +78,9 @@ namespace RandomX { uint32_t getImm32() const { return load32(&imm32); } + void setImm32(uint32_t val) { + return store32(&imm32, val); + } const char* getName() const { return names[opcode]; } diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 5ddc382..6c58a88 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -238,12 +238,7 @@ namespace RandomX { emitByte(0xc0 + readReg1); memcpy(code + codePos, codeLoopLoad, loopLoadSize); codePos += loopLoadSize; - for (unsigned i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { - Instruction& instr = prog(i); - instr.src %= RegistersCount; - instr.dst %= RegistersCount; - generateCode(instr, i); - } + generateCode(prog); emit(REX_MOV_RR); emitByte(0xc0 + readReg2); emit(REX_XOR_EAX); diff --git 
a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index e127a40..f2fd330 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -52,6 +52,16 @@ namespace RandomX { uint8_t* code; int32_t codePos; + template + void generateCode(P& prog) { + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction& instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + generateCode(instr, i); + } + } + void generateProgramPrologue(Program&); void generateProgramEpilogue(Program&); int getConditionRegister(); diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp new file mode 100644 index 0000000..dc8fa4e --- /dev/null +++ b/src/LightProgramGenerator.cpp @@ -0,0 +1,342 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. 
+*/ + +#include "blake2/blake2.h" +#include "configuration.h" +#include "Program.hpp" +#include "blake2/endian.h"; +#include + +namespace RandomX { + + namespace LightInstruction { + constexpr int IADD_R = 0; + constexpr int IADD_RC = 1; + constexpr int ISUB_R = 2; + constexpr int IMUL_9C = 3; + constexpr int IMUL_R = 4; + constexpr int IMULH_R = 5; + constexpr int ISMULH_R = 6; + constexpr int IMUL_RCP = 7; + constexpr int IXOR_R = 8; + constexpr int IROR_R = 9; + constexpr int COND_R = 10; + constexpr int COUNT = 11; + } + + const int lightInstruction[] = { + LightInstruction::IADD_RC, + LightInstruction::IADD_RC, + LightInstruction::ISUB_R, + LightInstruction::ISUB_R, + LightInstruction::IMUL_9C, + LightInstruction::IMUL_R, + LightInstruction::IMUL_R, + LightInstruction::IMUL_R, + LightInstruction::IMULH_R, + LightInstruction::ISMULH_R, + LightInstruction::IMUL_RCP, + LightInstruction::IXOR_R, + LightInstruction::IXOR_R, + LightInstruction::IROR_R, + LightInstruction::IROR_R, + LightInstruction::COND_R + }; + + namespace LightInstructionOpcode { + constexpr int IADD_R = 0; + constexpr int IADD_RC = RANDOMX_FREQ_IADD_R + RANDOMX_FREQ_IADD_M; + constexpr int ISUB_R = IADD_RC + RANDOMX_FREQ_IADD_RC; + constexpr int IMUL_9C = ISUB_R + RANDOMX_FREQ_ISUB_R + RANDOMX_FREQ_ISUB_M; + constexpr int IMUL_R = IMUL_9C + RANDOMX_FREQ_IMUL_9C; + constexpr int IMULH_R = IMUL_R + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M; + constexpr int ISMULH_R = IMULH_R + RANDOMX_FREQ_IMULH_R + RANDOMX_FREQ_IMULH_M; + constexpr int IMUL_RCP = ISMULH_R + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M;; + constexpr int IXOR_R = IMUL_RCP + RANDOMX_FREQ_IMUL_RCP + RANDOMX_FREQ_INEG_R; + constexpr int IROR_R = IXOR_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M; + constexpr int COND_R = IROR_R + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R + RANDOMX_FREQ_ISWAP_R + RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + RANDOMX_FREQ_FSCAL_R + 
RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R; + } + + const int lightInstructionOpcode[] = { + LightInstructionOpcode::IADD_R, + LightInstructionOpcode::IADD_RC, + LightInstructionOpcode::ISUB_R, + LightInstructionOpcode::IMUL_9C, + LightInstructionOpcode::IMUL_R, + LightInstructionOpcode::IMULH_R, + LightInstructionOpcode::ISMULH_R, + LightInstructionOpcode::IMUL_RCP, + LightInstructionOpcode::IXOR_R, + LightInstructionOpcode::IROR_R, + LightInstructionOpcode::COND_R + }; + + constexpr int ALU_COUNT_MUL = 1; + constexpr int ALU_COUNT = 4; + constexpr int LIGHT_OPCODE_BITS = 4; + constexpr int V4_SRC_INDEX_BITS = 3; + constexpr int V4_DST_INDEX_BITS = 3; + + static int blakeCounter = 0; + + // If we don't have enough data available, generate more + static FORCE_INLINE void check_data(size_t& data_index, const size_t bytes_needed, uint8_t* data, const size_t data_size) + { + if (data_index + bytes_needed > data_size) + { + std::cout << "Calling Blake " << (++blakeCounter) << std::endl; + blake2b(data, data_size, data, data_size, nullptr, 0); + data_index = 0; + } + } + + void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister) { + + // Source: https://www.agner.org/optimize/instruction_tables.pdf + const int op_latency[LightInstruction::COUNT] = { 1, 2, 1, 2, 3, 5, 5, 4, 1, 2, 5 }; + + // Instruction latencies for theoretical ASIC implementation + const int asic_op_latency[LightInstruction::COUNT] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + + // Available ALUs for each instruction + const int op_ALUs[LightInstruction::COUNT] = { ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT }; + + uint8_t data[64]; + memset(data, 0, sizeof(data)); + memcpy(data, seed, SeedSize); + + // Set data_index past the last byte in data + // to trigger full data update with blake hash + // before we start using it + size_t data_index = sizeof(data); + + int 
code_size; + + do { + uint8_t opcode; + uint8_t dst_index; + uint8_t src_index; + uint32_t imm32 = 0; + + int latency[8]; + int asic_latency[9]; + + // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution + // byte 0: current value of the destination register + // byte 1: instruction opcode + // byte 2: current value of the source register + // + // Registers R4-R8 are constant and are treated as having the same value because when we do + // the same operation twice with two constant source registers, it can be optimized into a single operation + uint64_t inst_data[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; + + bool alu_busy[RANDOMX_LPROG_LATENCY + 1][ALU_COUNT]; + bool is_rotation[LightInstruction::COUNT]; + bool rotated[8]; + int rotate_count = 0; + + memset(latency, 0, sizeof(latency)); + memset(asic_latency, 0, sizeof(asic_latency)); + memset(alu_busy, 0, sizeof(alu_busy)); + memset(is_rotation, 0, sizeof(is_rotation)); + memset(rotated, 0, sizeof(rotated)); + is_rotation[LightInstruction::IROR_R] = true; + + int num_retries = 0; + code_size = 0; + + int total_iterations = 0; + + // Generate random code to achieve minimal required latency for our abstract CPU + // Try to get this latency for all 4 registers + while (((latency[0] < RANDOMX_LPROG_LATENCY) || (latency[1] < RANDOMX_LPROG_LATENCY) || (latency[2] < RANDOMX_LPROG_LATENCY) || (latency[3] < RANDOMX_LPROG_LATENCY) + || (latency[4] < RANDOMX_LPROG_LATENCY) || (latency[5] < RANDOMX_LPROG_LATENCY) || (latency[6] < RANDOMX_LPROG_LATENCY) || (latency[7] < RANDOMX_LPROG_LATENCY)) && (num_retries < 64)) + { + // Fail-safe to guarantee loop termination + ++total_iterations; + if (total_iterations > 1024) { + std::cout << "total_iterations = " << total_iterations << std::endl; + break; + } + + check_data(data_index, 1, data, sizeof(data)); + const uint8_t b1 = data[data_index++]; + int instrType = lightInstruction[b1 & ((1 << LIGHT_OPCODE_BITS) - 1)]; + + 
check_data(data_index, 1, data, sizeof(data)); + const uint8_t b2 = data[data_index++]; + dst_index = b2 & ((1 << V4_DST_INDEX_BITS) - 1); + src_index = (b2 >> (V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1); + + const int a = dst_index; + int b = src_index; + + // Don't do rotation with the same destination twice because it's equal to a single rotation + if (is_rotation[instrType] && rotated[a]) + { + continue; + } + + // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized: + // 2x IADD_RC(a, b, C) = IADD_RC(a, b*2, C1+C2) + // 2x ISUB_R(a, b) = ISUB_R(a, 2*b) + // 2x IMUL_R(a, b) = IMUL_R(a, b*b) + // 2x IMUL_9C(a, C) = 9 * (9 * a + C1) + C2 = 81 * a + (9 * C1 + C2) + // 2x IMUL_RCP(a, C) = a * (C * C) + // 2x IXOR_R = NOP + // 2x IROR_R(a, b) = IROR_R(a, 2*b) + if (instrType != LightInstruction::IMULH_R && instrType != LightInstruction::ISMULH_R && ((inst_data[a] & 0xFFFF00) == (instrType << 8) + ((inst_data[b] & 255) << 16))) + { + continue; + } + + if ((instrType == LightInstruction::IADD_RC) || (instrType == LightInstruction::IMUL_9C) || (instrType == LightInstruction::IMUL_RCP) || (instrType == LightInstruction::COND_R) || ((instrType != LightInstruction::IMULH_R) && (instrType != LightInstruction::ISMULH_R) && (a == b))) + { + check_data(data_index, 4, data, sizeof(data)); + imm32 = load32(&data[data_index++]); + } + + // Find which ALU is available (and when) for this instruction + int next_latency = (latency[a] > latency[b]) ? 
latency[a] : latency[b]; + int alu_index = -1; + while (next_latency < RANDOMX_LPROG_LATENCY) + { + for (int i = op_ALUs[instrType] - 1; i >= 0; --i) + { + if (!alu_busy[next_latency][i]) + { + // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check + if ((instrType == LightInstruction::IADD_RC || instrType == LightInstruction::IMUL_9C || instrType == LightInstruction::IMULH_R || instrType == LightInstruction::ISMULH_R) && alu_busy[next_latency + 1][i]) + { + continue; + } + + // Rotation can only start when previous rotation is finished, so do an additional availability check + if (is_rotation[instrType] && (next_latency < rotate_count * op_latency[instrType])) + { + continue; + } + + alu_index = i; + break; + } + } + if (alu_index >= 0) + { + break; + } + ++next_latency; + } + + // Don't generate instructions that leave some register unchanged for more than 15 cycles + if (next_latency > latency[a] + 15) + { + continue; + } + + next_latency += op_latency[instrType]; + + if (next_latency <= RANDOMX_LPROG_LATENCY) + { + if (is_rotation[instrType]) + { + ++rotate_count; + } + + // Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined + alu_busy[next_latency - op_latency[instrType]][alu_index] = true; + latency[a] = next_latency; + + // ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple + asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? 
asic_latency[a] : asic_latency[b]) + asic_op_latency[instrType]; + + rotated[a] = is_rotation[instrType]; + + inst_data[a] = code_size + (instrType << 8) + ((inst_data[b] & 255) << 16); + + prog(code_size).opcode = lightInstructionOpcode[instrType]; + prog(code_size).dst = dst_index; + prog(code_size).src = src_index; + prog(code_size).setImm32(imm32); + + if (instrType == LightInstruction::IADD_RC || instrType == LightInstruction::IMUL_9C || instrType == LightInstruction::IMULH_R || instrType == LightInstruction::ISMULH_R) + { + // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too + alu_busy[next_latency - op_latency[instrType] + 1][alu_index] = true; + } + + ++code_size; + if (code_size >= RANDOMX_LPROG_MIN_SIZE) + { + break; + } + } + else + { + ++num_retries; + std::cout << "Retry " << num_retries << " with code_size = " << code_size << ", next_latency = " << next_latency << std::endl; + } + } + + // ASIC has more execution resources and can extract as much parallelism from the code as possible + // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC + // Get this latency for at least 1 of the 4 registers + const int prev_code_size = code_size; + if ((code_size < RANDOMX_LPROG_MAX_SIZE) && (asic_latency[indexRegister] < RANDOMX_LPROG_ASIC_LATENCY)) + { + int min_idx = indexRegister; + int max_idx = 0; + for (int i = 1; i < 8; ++i) + { + //if (asic_latency[i] < asic_latency[min_idx]) min_idx = i; + if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; + } + + const int pattern[3] = { LightInstruction::IMUL_R, LightInstruction::IROR_R, LightInstruction::IMUL_R }; + const int instrType = pattern[(code_size - prev_code_size) % 3]; + latency[min_idx] = latency[max_idx] + op_latency[instrType]; + asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[instrType]; + + prog(code_size).opcode = lightInstructionOpcode[instrType]; + 
prog(code_size).dst = min_idx; + prog(code_size).src = max_idx; + + ++code_size; + } + + for (int i = 0; i < 8; ++i) { + std::cout << "Latency " << i << " = " << latency[i] << std::endl; + } + + std::cout << "Code size = " << code_size << std::endl; + std::cout << "ALUs:" << std::endl; + for (int i = 0; i < RANDOMX_LPROG_LATENCY + 1; ++i) { + for (int j = 0; j < ALU_COUNT; ++j) { + std::cout << (alu_busy[i][j] ? '*' : '_'); + } + std::cout << std::endl; + } + + // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time + // It never does more than 4 iterations for all block heights < 10,000,000 + } while ((code_size < RANDOMX_LPROG_MIN_SIZE) || (code_size > RANDOMX_LPROG_MAX_SIZE)); + + prog.setSize(code_size); + } +} \ No newline at end of file diff --git a/src/LightProgramGenerator.hpp b/src/LightProgramGenerator.hpp new file mode 100644 index 0000000..71c4a7c --- /dev/null +++ b/src/LightProgramGenerator.hpp @@ -0,0 +1,24 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. +*/ + +#include "Program.hpp" + +namespace RandomX { + void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister); +} \ No newline at end of file diff --git a/src/Program.cpp b/src/Program.cpp index ebd271d..2b10f0b 100644 --- a/src/Program.cpp +++ b/src/Program.cpp @@ -21,7 +21,8 @@ along with RandomX. If not, see. 
#include "hashAes1Rx4.hpp" namespace RandomX { - void Program::print(std::ostream& os) const { + template + void ProgramBase::print(std::ostream& os) const { for (int i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { auto instr = programBuffer[i]; os << instr; diff --git a/src/Program.hpp b/src/Program.hpp index 621b614..53c973b 100644 --- a/src/Program.hpp +++ b/src/Program.hpp @@ -39,11 +39,45 @@ namespace RandomX { uint64_t getEntropy(int i) { return load64(&entropyBuffer[i]); } + uint32_t getSize() { + return RANDOMX_PROGRAM_SIZE; + } private: - void print(std::ostream&) const; + void print(std::ostream& os) const { + for (int i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { + auto instr = programBuffer[i]; + os << instr; + } + } uint64_t entropyBuffer[16]; Instruction programBuffer[RANDOMX_PROGRAM_SIZE]; }; + class LightProgram { + public: + Instruction& operator()(int pc) { + return programBuffer[pc]; + } + friend std::ostream& operator<<(std::ostream& os, const LightProgram& p) { + p.print(os); + return os; + } + uint32_t getSize() { + return size; + } + void setSize(uint32_t val) { + size = val; + } + private: + void print(std::ostream& os) const { + for (unsigned i = 0; i < size; ++i) { + auto instr = programBuffer[i]; + os << instr; + } + } + Instruction programBuffer[RANDOMX_LPROG_MAX_SIZE]; + uint32_t size; + }; + static_assert(sizeof(Program) % 64 == 0, "Invalid size of class Program"); } diff --git a/src/configuration.h b/src/configuration.h index 8780998..95c1412 100644 --- a/src/configuration.h +++ b/src/configuration.h @@ -37,6 +37,11 @@ along with RandomX. If not, see. //Number of random Cache accesses per Dataset block. Minimum is 2. #define RANDOMX_CACHE_ACCESSES 8 +#define RANDOMX_LPROG_LATENCY 168 +#define RANDOMX_LPROG_ASIC_LATENCY 84 +#define RANDOMX_LPROG_MIN_SIZE 225 +#define RANDOMX_LPROG_MAX_SIZE 512 + //Dataset size in bytes. Must be a power of 2. 
#define RANDOMX_DATASET_SIZE (2ULL * 1024 * 1024 * 1024) diff --git a/src/main.cpp b/src/main.cpp index a28bc52..61bb2ff 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -36,6 +36,7 @@ along with RandomX. If not, see. #include "dataset.hpp" #include "Cache.hpp" #include "hashAes1Rx4.hpp" +#include "LightProgramGenerator.hpp" const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; @@ -203,7 +204,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, Atomi } int main(int argc, char** argv) { - bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit; + bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit, genLight; int programCount, threadCount, initThreadCount, epoch; readOption("--softAes", argc, argv, softAes); @@ -218,6 +219,14 @@ int main(int argc, char** argv) { readOption("--jit", argc, argv, jit); readOption("--genNative", argc, argv, genNative); readOption("--help", argc, argv, help); + readOption("--genLight", argc, argv, genLight); + + if (genLight) { + RandomX::LightProgram p; + RandomX::generateLightProgram(p, seed, 0); + std::cout << p << std::endl; + return 0; + } if (genAsm) { if (softAes) diff --git a/src/variant4_random_math.h b/src/variant4_random_math.h new file mode 100644 index 0000000..3ae1841 --- /dev/null +++ b/src/variant4_random_math.h @@ -0,0 +1,441 @@ +#ifndef VARIANT4_RANDOM_MATH_H +#define VARIANT4_RANDOM_MATH_H + +// Register size can be configured to either 32 bit (uint32_t) or 64 bit (uint64_t) +typedef uint32_t v4_reg; + +enum V4_Settings +{ + // Generate code with minimal theoretical latency = 45 cycles, which is equivalent to 15 multiplications + TOTAL_LATENCY = 15 * 3, + + // Always generate at least 60 instructions + NUM_INSTRUCTIONS_MIN = 60, + + // Never generate more than 70 instructions (final RET instruction doesn't 
count here) + NUM_INSTRUCTIONS_MAX = 70, + + // Available ALUs for MUL + // Modern CPUs typically have only 1 ALU which can do multiplications + ALU_COUNT_MUL = 1, + + // Total available ALUs + // Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code + ALU_COUNT = 3, +}; + +enum V4_InstructionList +{ + MUL, // a*b + ADD, // a+b + C, C is an unsigned 32-bit constant + SUB, // a-b + ROR, // rotate right "a" by "b & 31" bits + ROL, // rotate left "a" by "b & 31" bits + XOR, // a^b + RET, // finish execution + V4_INSTRUCTION_COUNT = RET, +}; + +// V4_InstructionDefinition is used to generate code from random data +// Every random sequence of bytes is a valid code +// +// There are 9 registers in total: +// - 4 variable registers +// - 5 constant registers initialized from loop variables +// This is why dst_index is 2 bits +enum V4_InstructionDefinition +{ + V4_OPCODE_BITS = 3, + V4_DST_INDEX_BITS = 2, + V4_SRC_INDEX_BITS = 3, +}; + +struct V4_Instruction +{ + uint8_t opcode; + uint8_t dst_index; + uint8_t src_index; + uint32_t C; +}; + +#ifndef FORCEINLINE +#if defined(__GNUC__) +#define FORCEINLINE __attribute__((always_inline)) inline +#elif defined(_MSC_VER) +#define FORCEINLINE __forceinline +#else +#define FORCEINLINE inline +#endif +#endif + +#ifndef UNREACHABLE_CODE +#if defined(__GNUC__) +#define UNREACHABLE_CODE __builtin_unreachable() +#elif defined(_MSC_VER) +#define UNREACHABLE_CODE __assume(false) +#else +#define UNREACHABLE_CODE +#endif +#endif + +// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU: +// every switch-case will point to the same destination on every iteration of Cryptonight main loop +// +// This is about as fast as it can get without using low-level machine code generation +static FORCEINLINE void v4_random_math(const struct V4_Instruction* code, v4_reg* r) +{ + enum + { + REG_BITS = sizeof(v4_reg) * 8, + }; + +#define V4_EXEC(i) \ + { 
\ + const struct V4_Instruction* op = code + i; \ + const v4_reg src = r[op->src_index]; \ + v4_reg* dst = r + op->dst_index; \ + switch (op->opcode) \ + { \ + case MUL: \ + *dst *= src; \ + break; \ + case ADD: \ + *dst += src + op->C; \ + break; \ + case SUB: \ + *dst -= src; \ + break; \ + case ROR: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case ROL: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case XOR: \ + *dst ^= src; \ + break; \ + case RET: \ + return; \ + default: \ + UNREACHABLE_CODE; \ + break; \ + } \ + } + +#define V4_EXEC_10(j) \ + V4_EXEC(j + 0) \ + V4_EXEC(j + 1) \ + V4_EXEC(j + 2) \ + V4_EXEC(j + 3) \ + V4_EXEC(j + 4) \ + V4_EXEC(j + 5) \ + V4_EXEC(j + 6) \ + V4_EXEC(j + 7) \ + V4_EXEC(j + 8) \ + V4_EXEC(j + 9) + + // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency + // I've checked all block heights < 10,000,000 and here is the distribution of program sizes: + // + // 60 27960 + // 61 105054 + // 62 2452759 + // 63 5115997 + // 64 1022269 + // 65 1109635 + // 66 153145 + // 67 8550 + // 68 4529 + // 69 102 + + // Unroll 70 instructions here + V4_EXEC_10(0); // instructions 0-9 + V4_EXEC_10(10); // instructions 10-19 + V4_EXEC_10(20); // instructions 20-29 + V4_EXEC_10(30); // instructions 30-39 + V4_EXEC_10(40); // instructions 40-49 + V4_EXEC_10(50); // instructions 50-59 + V4_EXEC_10(60); // instructions 60-69 + +#undef V4_EXEC_10 +#undef V4_EXEC +} + +// If we don't have enough data available, generate more +static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size) +{ + if (*data_index + bytes_needed > data_size) + { + hash_extra_blake(data, data_size, (char*) data); + *data_index = 0; + } +} + +// Generates as many random math operations as 
possible with given latency and ALU restrictions +// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions +static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_t height) +{ + // MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle + // These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake + // + // AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors + // Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors + // AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same + // Source: https://www.agner.org/optimize/instruction_tables.pdf + const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 }; + + // Instruction latencies for theoretical ASIC implementation + const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 }; + + // Available ALUs for each instruction + const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT }; + + int8_t data[32]; + memset(data, 0, sizeof(data)); + uint64_t tmp = SWAP64LE(height); + memcpy(data, &tmp, sizeof(uint64_t)); + data[20] = -38; // change seed + + // Set data_index past the last byte in data + // to trigger full data update with blake hash + // before we start using it + size_t data_index = sizeof(data); + + int code_size; + + // There is a small chance (1.8%) that register R8 won't be used in the generated program + // So we keep track of it and try again if it's not used + bool r8_used; + do { + int latency[9]; + int asic_latency[9]; + + // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution + // byte 0: current value of the 
destination register + // byte 1: instruction opcode + // byte 2: current value of the source register + // + // Registers R4-R8 are constant and are treated as having the same value because when we do + // the same operation twice with two constant source registers, it can be optimized into a single operation + uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; + + bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT]; + bool is_rotation[V4_INSTRUCTION_COUNT]; + bool rotated[4]; + int rotate_count = 0; + + memset(latency, 0, sizeof(latency)); + memset(asic_latency, 0, sizeof(asic_latency)); + memset(alu_busy, 0, sizeof(alu_busy)); + memset(is_rotation, 0, sizeof(is_rotation)); + memset(rotated, 0, sizeof(rotated)); + is_rotation[ROR] = true; + is_rotation[ROL] = true; + + int num_retries = 0; + code_size = 0; + + int total_iterations = 0; + r8_used = false; + + // Generate random code to achieve minimal required latency for our abstract CPU + // Try to get this latency for all 4 registers + while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64)) + { + // Fail-safe to guarantee loop termination + ++total_iterations; + if (total_iterations > 256) + break; + + check_data(&data_index, 1, data, sizeof(data)); + + const uint8_t c = ((uint8_t*)data)[data_index++]; + + // MUL = opcodes 0-2 + // ADD = opcode 3 + // SUB = opcode 4 + // ROR/ROL = opcode 5, shift direction is selected randomly + // XOR = opcodes 6-7 + uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1); + if (opcode == 5) + { + check_data(&data_index, 1, data, sizeof(data)); + opcode = (data[data_index++] >= 0) ? ROR : ROL; + } + else if (opcode >= 6) + { + opcode = XOR; + } + else + { + opcode = (opcode <= 2) ? 
MUL : (opcode - 2); + } + + uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1); + uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1); + + const int a = dst_index; + int b = src_index; + + // Don't do ADD/SUB/XOR with the same register + if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) + { + // Use register R8 as source instead + b = 8; + src_index = 8; + } + + // Don't do rotation with the same destination twice because it's equal to a single rotation + if (is_rotation[opcode] && rotated[a]) + { + continue; + } + + // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized: + // 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations + // 2xXOR(a, b) = NOP + if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16))) + { + continue; + } + + // Find which ALU is available (and when) for this instruction + int next_latency = (latency[a] > latency[b]) ? 
latency[a] : latency[b]; + int alu_index = -1; + while (next_latency < TOTAL_LATENCY) + { + for (int i = op_ALUs[opcode] - 1; i >= 0; --i) + { + if (!alu_busy[next_latency][i]) + { + // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check + if ((opcode == ADD) && alu_busy[next_latency + 1][i]) + { + continue; + } + + // Rotation can only start when previous rotation is finished, so do an additional availability check + if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode])) + { + continue; + } + + alu_index = i; + break; + } + } + if (alu_index >= 0) + { + break; + } + ++next_latency; + } + + // Don't generate instructions that leave some register unchanged for more than 7 cycles + if (next_latency > latency[a] + 7) + { + continue; + } + + next_latency += op_latency[opcode]; + + if (next_latency <= TOTAL_LATENCY) + { + if (is_rotation[opcode]) + { + ++rotate_count; + } + + // Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined + alu_busy[next_latency - op_latency[opcode]][alu_index] = true; + latency[a] = next_latency; + + // ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple + asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? 
asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode]; + + rotated[a] = is_rotation[opcode]; + + inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16); + + code[code_size].opcode = opcode; + code[code_size].dst_index = dst_index; + code[code_size].src_index = src_index; + code[code_size].C = 0; + + if (src_index == 8) + { + r8_used = true; + } + + if (opcode == ADD) + { + // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too + alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true; + + // ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C" + check_data(&data_index, sizeof(uint32_t), data, sizeof(data)); + uint32_t t; + memcpy(&t, data + data_index, sizeof(uint32_t)); + code[code_size].C = SWAP32LE(t); + data_index += sizeof(uint32_t); + } + + ++code_size; + if (code_size >= NUM_INSTRUCTIONS_MIN) + { + break; + } + } + else + { + ++num_retries; + } + } + + // ASIC has more execution resources and can extract as much parallelism from the code as possible + // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC + // Get this latency for at least 1 of the 4 registers + const int prev_code_size = code_size; + while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) + { + int min_idx = 0; + int max_idx = 0; + for (int i = 1; i < 4; ++i) + { + if (asic_latency[i] < asic_latency[min_idx]) min_idx = i; + if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; + } + + const uint8_t pattern[3] = { ROR, MUL, MUL }; + const uint8_t opcode = pattern[(code_size - prev_code_size) % 3]; + latency[min_idx] = latency[max_idx] + op_latency[opcode]; + asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode]; + + code[code_size].opcode = opcode; + 
code[code_size].dst_index = min_idx; + code[code_size].src_index = max_idx; + code[code_size].C = 0; + ++code_size; + } + + // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time + // It never does more than 4 iterations for all block heights < 10,000,000 + } while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); + + // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here + // Add final instruction to stop the interpreter + code[code_size].opcode = RET; + code[code_size].dst_index = 0; + code[code_size].src_index = 0; + code[code_size].C = 0; + + return code_size; +} + +#endif \ No newline at end of file From 2fd0a125b5a3887a5f4f87158c5f38dcc857f806 Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 31 Mar 2019 13:32:16 +0200 Subject: [PATCH 02/18] Front-end simulation --- src/LightProgramGenerator.cpp | 698 +++++++++++++++++++++++++++++++--- src/LightProgramGenerator.hpp | 1 + src/main.cpp | 4 +- src/program.inc | 4 + 4 files changed, 661 insertions(+), 46 deletions(-) diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index dc8fa4e..eaf5efe 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -22,43 +22,29 @@ along with RandomX. If not, see. 
#include "Program.hpp" #include "blake2/endian.h"; #include +#include namespace RandomX { - - namespace LightInstruction { - constexpr int IADD_R = 0; - constexpr int IADD_RC = 1; - constexpr int ISUB_R = 2; - constexpr int IMUL_9C = 3; - constexpr int IMUL_R = 4; - constexpr int IMULH_R = 5; - constexpr int ISMULH_R = 6; - constexpr int IMUL_RCP = 7; - constexpr int IXOR_R = 8; - constexpr int IROR_R = 9; - constexpr int COND_R = 10; - constexpr int COUNT = 11; + // Intel Ivy Bridge reference + namespace LightInstructionType { //uOPs (decode) execution ports latency code size + constexpr int IADD_R = 0; //1 p015 1 3 + constexpr int IADD_C = 1; //1 p015 1 7 + constexpr int IADD_RC = 2; //1 p1 3 8 + constexpr int ISUB_R = 3; //1 p015 1 3 + constexpr int IMUL_9C = 4; //1 p1 3 8 + constexpr int IMUL_R = 5; //1 p1 3 4 + constexpr int IMUL_C = 6; //1 p1 3 7 + constexpr int IMULH_R = 7; //1+2+1 0+(p1,p5)+0 3 3+3+3 + constexpr int ISMULH_R = 8; //1+2+1 0+(p1,p5)+0 3 3+3+3 + constexpr int IMUL_RCP = 9; //1+1 p015+p1 4 10+4 + constexpr int IXOR_R = 10; //1 p015 1 3 + constexpr int IXOR_C = 11; //1 p015 1 7 + constexpr int IROR_R = 12; //1+2 0+(p0,p5) 1 3+3 + constexpr int IROR_C = 13; //1 p05 1 4 + constexpr int COND_R = 14; //1+1+1+1+1+1 p015+p5+0+p015+p05+p015 3 7+13+3+7+3+3 + constexpr int COUNT = 15; } - const int lightInstruction[] = { - LightInstruction::IADD_RC, - LightInstruction::IADD_RC, - LightInstruction::ISUB_R, - LightInstruction::ISUB_R, - LightInstruction::IMUL_9C, - LightInstruction::IMUL_R, - LightInstruction::IMUL_R, - LightInstruction::IMUL_R, - LightInstruction::IMULH_R, - LightInstruction::ISMULH_R, - LightInstruction::IMUL_RCP, - LightInstruction::IXOR_R, - LightInstruction::IXOR_R, - LightInstruction::IROR_R, - LightInstruction::IROR_R, - LightInstruction::COND_R - }; - namespace LightInstructionOpcode { constexpr int IADD_R = 0; constexpr int IADD_RC = RANDOMX_FREQ_IADD_R + RANDOMX_FREQ_IADD_M; @@ -67,26 +53,605 @@ namespace RandomX { constexpr int 
IMUL_R = IMUL_9C + RANDOMX_FREQ_IMUL_9C; constexpr int IMULH_R = IMUL_R + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M; constexpr int ISMULH_R = IMULH_R + RANDOMX_FREQ_IMULH_R + RANDOMX_FREQ_IMULH_M; - constexpr int IMUL_RCP = ISMULH_R + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M;; + constexpr int IMUL_RCP = ISMULH_R + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M; constexpr int IXOR_R = IMUL_RCP + RANDOMX_FREQ_IMUL_RCP + RANDOMX_FREQ_INEG_R; constexpr int IROR_R = IXOR_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M; constexpr int COND_R = IROR_R + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R + RANDOMX_FREQ_ISWAP_R + RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R; } const int lightInstructionOpcode[] = { + LightInstructionOpcode::IADD_R, LightInstructionOpcode::IADD_R, LightInstructionOpcode::IADD_RC, LightInstructionOpcode::ISUB_R, LightInstructionOpcode::IMUL_9C, LightInstructionOpcode::IMUL_R, + LightInstructionOpcode::IMUL_R, LightInstructionOpcode::IMULH_R, LightInstructionOpcode::ISMULH_R, LightInstructionOpcode::IMUL_RCP, LightInstructionOpcode::IXOR_R, + LightInstructionOpcode::IXOR_R, + LightInstructionOpcode::IROR_R, LightInstructionOpcode::IROR_R, LightInstructionOpcode::COND_R }; + const int lightInstruction[] = { + LightInstructionType::IADD_R, + LightInstructionType::IADD_C, + LightInstructionType::IADD_RC, + LightInstructionType::ISUB_R, + LightInstructionType::IMUL_9C, + LightInstructionType::IMUL_R, + LightInstructionType::IMUL_R, + LightInstructionType::IMUL_C, + LightInstructionType::IMULH_R, + LightInstructionType::ISMULH_R, + LightInstructionType::IMUL_RCP, + LightInstructionType::IXOR_R, + LightInstructionType::IXOR_C, + LightInstructionType::IROR_R, + LightInstructionType::IROR_C, + LightInstructionType::COND_R + }; + + namespace ExecutionPort { + using type = int; + constexpr type Null = 0; + 
constexpr type P0 = 1; + constexpr type P1 = 2; + constexpr type P5 = 4; + constexpr type P05 = 6; + constexpr type P015 = 7; + } + + class Blake2Generator { + public: + Blake2Generator(const void* seed) : dataIndex(sizeof(data)) { + memset(data, 0, sizeof(data)); + memcpy(data, seed, SeedSize); + data[60] = 39; + } + + uint8_t getByte() { + checkData(1); + return data[dataIndex++]; + } + + uint32_t getInt32() { + checkData(4); + auto ret = load32(&data[dataIndex]); + dataIndex += 4; + return ret; + } + + private: + uint8_t data[64]; + size_t dataIndex; + + void checkData(const size_t bytesNeeded) { + if (dataIndex + bytesNeeded > sizeof(data)) { + blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); + dataIndex = 0; + } + } + }; + + class MacroOp { + public: + MacroOp(const char* name, int size) + : name_(name), size_(size), latency_(0), uop1_(ExecutionPort::Null), uop2_(ExecutionPort::Null) {} + MacroOp(const char* name, int size, int latency, ExecutionPort::type uop) + : name_(name), size_(size), latency_(latency), uop1_(uop), uop2_(ExecutionPort::Null) {} + MacroOp(const char* name, int size, int latency, ExecutionPort::type uop1, ExecutionPort::type uop2) + : name_(name), size_(size), latency_(latency), uop1_(uop1), uop2_(uop2) {} + const char* getName() const { + return name_; + } + int getSize() const { + return size_; + } + int getLatency() const { + return latency_; + } + ExecutionPort::type getUop1() const { + return uop1_; + } + ExecutionPort::type getUop2() const { + return uop2_; + } + bool isSimple() const { + return uop2_ == ExecutionPort::Null; + } + bool isEliminated() const { + return uop1_ == ExecutionPort::Null; + } + static const MacroOp Add_rr; + static const MacroOp Add_ri; + static const MacroOp Lea_sib; + static const MacroOp Sub_rr; + static const MacroOp Imul_rr; + static const MacroOp Imul_rri; + static const MacroOp Imul_r; + static const MacroOp Mul_r; + static const MacroOp Mov_rr; + static const MacroOp Mov_ri64; + static 
const MacroOp Xor_rr; + static const MacroOp Xor_ri; + static const MacroOp Ror_rcl; + static const MacroOp Ror_ri; + static const MacroOp TestJmp_fused; + static const MacroOp Xor_self; + static const MacroOp Cmp_ri; + static const MacroOp Setcc_r; + private: + const char* name_; + int size_; + int latency_; + ExecutionPort::type uop1_; + ExecutionPort::type uop2_; + }; + + const MacroOp MacroOp::Add_rr = MacroOp("add r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Add_ri = MacroOp("add r,i", 7, 1, ExecutionPort::P015); + const MacroOp MacroOp::Lea_sib = MacroOp("lea r,m", 8, 3, ExecutionPort::P1); + const MacroOp MacroOp::Sub_rr = MacroOp("sub r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Imul_rr = MacroOp("imul r,r", 4, 3, ExecutionPort::P1); + const MacroOp MacroOp::Imul_rri = MacroOp("imul r,r,i", 7, 3, ExecutionPort::P1); + const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 3, ExecutionPort::P1, ExecutionPort::P5); + const MacroOp MacroOp::Mul_r = MacroOp("mul r", 3, 3, ExecutionPort::P1, ExecutionPort::P5); + const MacroOp MacroOp::Mov_rr = MacroOp("mov r,r", 3); + const MacroOp MacroOp::Mov_ri64 = MacroOp("mov rax,i64", 10, 1, ExecutionPort::P015); + const MacroOp MacroOp::Xor_rr = MacroOp("xor r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Xor_ri = MacroOp("xor r,i", 7, 1, ExecutionPort::P015); + const MacroOp MacroOp::Ror_rcl = MacroOp("ror r,cl", 3, 1, ExecutionPort::P0, ExecutionPort::P5); + const MacroOp MacroOp::Ror_ri = MacroOp("ror r,i", 4, 1, ExecutionPort::P05); + const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3); + const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015); + const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05); + const MacroOp MacroOp::TestJmp_fused = MacroOp("testjmp r,i", 13, 0, ExecutionPort::P5); + + template + T* begin(T(&arr)[N]) { return &arr[0]; } + template + T* end(T(&arr)[N]) { return &arr[0] + N; } + + const MacroOp* 
IMULH_R_ops_array[] = { &MacroOp::Mov_rr, &MacroOp::Mul_r, &MacroOp::Mov_rr }; + const MacroOp* ISMULH_R_ops_array[] = { &MacroOp::Mov_rr, &MacroOp::Imul_r, &MacroOp::Mov_rr }; + const MacroOp* IMUL_RCP_ops_array[] = { &MacroOp::Mov_ri64, &MacroOp::Imul_rr }; + const MacroOp* IROR_R_ops_array[] = { &MacroOp::Mov_rr, &MacroOp::Ror_rcl }; + const MacroOp* COND_R_ops_array[] = { &MacroOp::Add_ri, &MacroOp::TestJmp_fused, &MacroOp::Xor_self, &MacroOp::Cmp_ri, &MacroOp::Setcc_r, &MacroOp::Add_rr }; + + + class LightInstructionInfo { + public: + LightInstructionInfo(const char* name, const MacroOp* op) + : name_(name), op_(op), opsCount_(1), latency_(op->getLatency()) {} + template + LightInstructionInfo(const char* name, const MacroOp*(&arr)[N]) + : name_(name), ops_(arr), opsCount_(N), latency_(0) { + for (unsigned i = 0; i < N; ++i) { + latency_ += arr[i]->getLatency(); + } + static_assert(N > 1, "Invalid array size"); + } + template + LightInstructionInfo(const char* name, const MacroOp*(&arr)[N], int latency) + : name_(name), ops_(arr), opsCount_(N), latency_(latency) { + static_assert(N > 1, "Invalid array size"); + } + const char* getName() const { + return name_; + } + int getSize() const { + return opsCount_; + } + bool isSimple() const { + return opsCount_ == 1; + } + int getLatency() const { + return latency_; + } + const MacroOp* getOp(int index) const { + return opsCount_ > 1 ? 
ops_[index] : op_; + } + static const LightInstructionInfo IADD_R; + static const LightInstructionInfo IADD_C; + static const LightInstructionInfo IADD_RC; + static const LightInstructionInfo ISUB_R; + static const LightInstructionInfo IMUL_9C; + static const LightInstructionInfo IMUL_R; + static const LightInstructionInfo IMUL_C; + static const LightInstructionInfo IMULH_R; + static const LightInstructionInfo ISMULH_R; + static const LightInstructionInfo IMUL_RCP; + static const LightInstructionInfo IXOR_R; + static const LightInstructionInfo IXOR_C; + static const LightInstructionInfo IROR_R; + static const LightInstructionInfo IROR_C; + static const LightInstructionInfo COND_R; + static const LightInstructionInfo NOP; + private: + const char* name_; + union { + const MacroOp** ops_; + const MacroOp* op_; + }; + int opsCount_; + int latency_; + + LightInstructionInfo(const char* name) + : name_(name), opsCount_(0), latency_(0) {} + }; + + const LightInstructionInfo LightInstructionInfo::IADD_R = LightInstructionInfo("IADD_R", &MacroOp::Add_rr); + const LightInstructionInfo LightInstructionInfo::IADD_C = LightInstructionInfo("IADD_C", &MacroOp::Add_ri); + const LightInstructionInfo LightInstructionInfo::IADD_RC = LightInstructionInfo("IADD_RC", &MacroOp::Lea_sib); + const LightInstructionInfo LightInstructionInfo::ISUB_R = LightInstructionInfo("ISUB_R", &MacroOp::Sub_rr); + const LightInstructionInfo LightInstructionInfo::IMUL_9C = LightInstructionInfo("IMUL_9C", &MacroOp::Lea_sib); + const LightInstructionInfo LightInstructionInfo::IMUL_R = LightInstructionInfo("IMUL_R", &MacroOp::Imul_rr); + const LightInstructionInfo LightInstructionInfo::IMUL_C = LightInstructionInfo("IMUL_C", &MacroOp::Imul_rri); + const LightInstructionInfo LightInstructionInfo::IMULH_R = LightInstructionInfo("IMULH_R", IMULH_R_ops_array); + const LightInstructionInfo LightInstructionInfo::ISMULH_R = LightInstructionInfo("ISMULH_R", ISMULH_R_ops_array); + const LightInstructionInfo 
LightInstructionInfo::IMUL_RCP = LightInstructionInfo("IMUL_RCP", IMUL_RCP_ops_array); + const LightInstructionInfo LightInstructionInfo::IXOR_R = LightInstructionInfo("IXOR_R", &MacroOp::Xor_rr); + const LightInstructionInfo LightInstructionInfo::IXOR_C = LightInstructionInfo("IXOR_C", &MacroOp::Xor_ri); + const LightInstructionInfo LightInstructionInfo::IROR_R = LightInstructionInfo("IROR_R", IROR_R_ops_array); + const LightInstructionInfo LightInstructionInfo::IROR_C = LightInstructionInfo("IROR_C", &MacroOp::Ror_ri); + const LightInstructionInfo LightInstructionInfo::COND_R = LightInstructionInfo("COND_R", COND_R_ops_array); + const LightInstructionInfo LightInstructionInfo::NOP = LightInstructionInfo("NOP"); + + const int buffer0[] = { 3, 3, 10 }; + const int buffer1[] = { 7, 3, 3, 3 }; + const int buffer2[] = { 3, 3, 3, 7 }; + const int buffer3[] = { 4, 8, 4 }; + const int buffer4[] = { 4, 4, 4, 4 }; + const int buffer5[] = { 3, 7, 3, 3 }; + const int buffer6[] = { 3, 3, 7, 3 }; + const int buffer7[] = { 13, 3 }; + + class DecoderBuffer { + public: + static DecoderBuffer Default; + template + DecoderBuffer(const char* name, int index, const int(&arr)[N]) + : name_(name), index_(index), counts_(arr), opsCount_(N) {} + const int* getCounts() const { + return counts_; + } + int getSize() const { + return opsCount_; + } + int getIndex() const { + return index_; + } + const char* getName() const { + return name_; + } + const DecoderBuffer& fetchNext(int prevType, Blake2Generator& gen) { + if (prevType == LightInstructionType::IMULH_R || prevType == LightInstructionType::ISMULH_R) + return decodeBuffers[0]; + if (index_ == 0) { + if ((gen.getByte() % 2) == 0) + return decodeBuffers[3]; + else + return decodeBuffers[4]; + } + if (index_ == 2) { + return decodeBuffers[7]; + } + if (index_ == 7) { + return decodeBuffers[1]; + } + return fetchNextDefault(gen); + } + private: + const char* name_; + int index_; + const int* counts_; + int opsCount_; + DecoderBuffer() : 
index_(-1) {} + static const DecoderBuffer decodeBuffers[8]; + const DecoderBuffer& fetchNextDefault(Blake2Generator& gen) { + int select; + do { + select = gen.getByte() & 7; + } while (select == 7); + return decodeBuffers[select]; + } + }; + + const DecoderBuffer DecoderBuffer::decodeBuffers[8] = { + DecoderBuffer("3,3,10", 0, buffer0), + DecoderBuffer("7,3,3,3", 1, buffer1), + DecoderBuffer("3,3,3,7", 2, buffer2), + DecoderBuffer("4,8,4", 3, buffer3), + DecoderBuffer("4,4,4,4", 4, buffer4), + DecoderBuffer("3,7,3,3", 5, buffer5), + DecoderBuffer("3,3,7,3", 6, buffer6), + DecoderBuffer("13,3", 7, buffer7), + }; + + DecoderBuffer DecoderBuffer::Default = DecoderBuffer(); + + const int slot_3[] = { LightInstructionType::IADD_R, LightInstructionType::ISUB_R, LightInstructionType::IXOR_R, LightInstructionType::IADD_R }; + const int slot_3L[] = { LightInstructionType::IADD_R, LightInstructionType::ISUB_R, LightInstructionType::IXOR_R, LightInstructionType::IMULH_R, LightInstructionType::ISMULH_R, LightInstructionType::IXOR_R, LightInstructionType::IMULH_R, LightInstructionType::ISMULH_R }; + const int slot_3F[] = { LightInstructionType::IADD_R, LightInstructionType::ISUB_R, LightInstructionType::IXOR_R, LightInstructionType::IROR_R }; + const int slot_4[] = { LightInstructionType::IMUL_R, LightInstructionType::IROR_C }; + const int slot_7[] = { LightInstructionType::IADD_C, LightInstructionType::IMUL_C, LightInstructionType::IXOR_C, LightInstructionType::IXOR_C }; + const int slot_7L = LightInstructionType::COND_R; + const int slot_8[] = { LightInstructionType::IADD_RC, LightInstructionType::IMUL_9C }; + const int slot_10 = LightInstructionType::IMUL_RCP; + + class LightInstruction { + public: + Instruction toInstr() { + Instruction instr; + instr.opcode = lightInstructionOpcode[type_]; + instr.dst = dst_; + instr.src = src_ >= 0 ? 
src_ : dst_; + instr.mod = mod_; + instr.setImm32(imm32_); + return instr; + } + + static LightInstruction createForSlot(Blake2Generator& gen, int slotSize, bool isLast = false, bool isFirst = false) { + switch (slotSize) + { + case 3: + if (isLast) { + return create(slot_3L[gen.getByte() & 7], gen); + } + else if (isFirst) { + return create(slot_3F[gen.getByte() & 3], gen); + } + else { + return create(slot_3[gen.getByte() & 3], gen); + } + case 4: + return create(slot_4[gen.getByte() & 1], gen); + case 7: + if (isLast) { + return create(slot_7L, gen); + } + else { + return create(slot_7[gen.getByte() & 3], gen); + } + case 8: + return create(slot_8[gen.getByte() & 1], gen); + case 10: + return create(slot_10, gen); + default: + break; + } + } + + static LightInstruction create(int type, Blake2Generator& gen) { + LightInstruction li; + li.type_ = type; + li.opGroup_ = type; + switch (type) + { + case LightInstructionType::IADD_R: { + li.dst_ = gen.getByte() & 7; + do { + li.src_ = gen.getByte() & 7; + } while (li.dst_ == li.src_); + li.mod_ = 0; + li.imm32_ = 0; + li.info_ = &LightInstructionInfo::IADD_R; + li.opGroup_ = LightInstructionType::IADD_R; + li.opGroupPar_ = li.src_; + } break; + + case LightInstructionType::IADD_C: { + li.dst_ = gen.getByte() & 7; + li.src_ = -1; + li.mod_ = 0; + li.imm32_ = gen.getInt32(); + li.info_ = &LightInstructionInfo::IADD_C; + li.opGroup_ = LightInstructionType::IADD_R; + li.opGroupPar_ = li.src_; + } break; + + case LightInstructionType::IADD_RC: { + li.dst_ = gen.getByte() & 7; + do { + li.src_ = gen.getByte() & 7; + } while (li.dst_ == li.src_); + li.mod_ = 0; + li.imm32_ = gen.getInt32(); + li.info_ = &LightInstructionInfo::IADD_RC; + li.opGroup_ = LightInstructionType::IADD_R; + li.opGroupPar_ = li.src_; + } break; + + case LightInstructionType::ISUB_R: { + li.dst_ = gen.getByte() & 7; + do { + li.src_ = gen.getByte() & 7; + } while (li.dst_ == li.src_); + li.mod_ = 0; + li.imm32_ = 0; + li.info_ = 
&LightInstructionInfo::ISUB_R; + li.opGroup_ = LightInstructionType::IADD_R; + li.opGroupPar_ = li.src_; + } break; + + case LightInstructionType::IMUL_9C: { + li.dst_ = gen.getByte() & 7; + do { + li.src_ = gen.getByte() & 7; + } while (li.dst_ == li.src_); + li.mod_ = 0; + li.imm32_ = gen.getInt32(); + li.info_ = &LightInstructionInfo::IMUL_9C; + li.opGroup_ = LightInstructionType::IMUL_C; + li.opGroupPar_ = -1; + } break; + + case LightInstructionType::IMUL_R: { + li.dst_ = gen.getByte() & 7; + do { + li.src_ = gen.getByte() & 7; + } while (li.dst_ == li.src_); + li.mod_ = 0; + li.imm32_ = 0; + li.info_ = &LightInstructionInfo::IMUL_R; + li.opGroup_ = LightInstructionType::IMUL_R; + li.opGroupPar_ = gen.getInt32(); + } break; + + case LightInstructionType::IMUL_C: { + li.dst_ = gen.getByte() & 7; + li.src_ = -1; + li.mod_ = 0; + li.imm32_ = gen.getInt32(); + li.info_ = &LightInstructionInfo::IMUL_C; + li.opGroup_ = LightInstructionType::IMUL_C; + li.opGroupPar_ = li.src_; + } break; + + case LightInstructionType::IMULH_R: { + li.dst_ = gen.getByte() & 7; + li.src_ = gen.getByte() & 7; + li.mod_ = 0; + li.imm32_ = 0; + li.info_ = &LightInstructionInfo::IMULH_R; + li.opGroup_ = LightInstructionType::IMULH_R; + li.opGroupPar_ = gen.getInt32(); + } break; + + case LightInstructionType::ISMULH_R: { + li.dst_ = gen.getByte() & 7; + li.src_ = gen.getByte() & 7; + li.mod_ = 0; + li.imm32_ = 0; + li.info_ = &LightInstructionInfo::ISMULH_R; + li.opGroup_ = LightInstructionType::ISMULH_R; + li.opGroupPar_ = gen.getInt32(); + } break; + + case LightInstructionType::IMUL_RCP: { + li.dst_ = gen.getByte() & 7; + li.src_ = -1; + li.mod_ = 0; + li.imm32_ = gen.getInt32(); + li.info_ = &LightInstructionInfo::IMUL_RCP; + li.opGroup_ = LightInstructionType::IMUL_C; + li.opGroupPar_ = -1; + } break; + + case LightInstructionType::IXOR_R: { + li.dst_ = gen.getByte() & 7; + do { + li.src_ = gen.getByte() & 7; + } while (li.dst_ == li.src_); + li.mod_ = 0; + li.imm32_ = 0; + li.info_ = 
&LightInstructionInfo::IXOR_R; + li.opGroup_ = LightInstructionType::IXOR_R; + li.opGroupPar_ = li.src_; + } break; + + case LightInstructionType::IXOR_C: { + li.dst_ = gen.getByte() & 7; + li.src_ = -1; + li.mod_ = 0; + li.imm32_ = gen.getInt32(); + li.info_ = &LightInstructionInfo::IXOR_C; + li.opGroup_ = LightInstructionType::IXOR_R; + li.opGroupPar_ = li.src_; + } break; + + case LightInstructionType::IROR_R: { + li.dst_ = gen.getByte() & 7; + do { + li.src_ = gen.getByte() & 7; + } while (li.dst_ == li.src_); + li.mod_ = 0; + li.imm32_ = 0; + li.info_ = &LightInstructionInfo::IROR_R; + li.opGroup_ = LightInstructionType::IROR_R; + li.opGroupPar_ = -1; + } break; + + case LightInstructionType::IROR_C: { + li.dst_ = gen.getByte() & 7; + li.src_ = -1; + li.mod_ = 0; + li.imm32_ = gen.getByte(); + li.info_ = &LightInstructionInfo::IROR_C; + li.opGroup_ = LightInstructionType::IROR_R; + li.opGroupPar_ = -1; + } break; + + case LightInstructionType::COND_R: { + li.dst_ = gen.getByte() & 7; + li.src_ = gen.getByte() & 7; + li.mod_ = gen.getByte(); + li.imm32_ = gen.getInt32(); + li.info_ = &LightInstructionInfo::COND_R; + li.opGroup_ = LightInstructionType::COND_R; + li.opGroupPar_ = li.imm32_; + } break; + + default: + break; + } + + return li; + } + + int getType() { + return type_; + } + int getSource() { + return src_; + } + int getDestination() { + return dst_; + } + int getGroup() { + return opGroup_; + } + int getGroupPar() { + return opGroupPar_; + } + + const LightInstructionInfo* getInfo() { + return info_; + } + + static const LightInstruction Null; + + private: + int type_; + int src_; + int dst_; + int mod_; + uint32_t imm32_; + + const LightInstructionInfo* info_; + int opGroup_; + int opGroupPar_; + + LightInstruction() {} + LightInstruction(int type, const LightInstructionInfo* info) : type_(type), info_(info) {} + }; + + class RegisterInfo { + public: + RegisterInfo() : lastOpGroup(-1), source(-1), value(0), latency(0) {} + int lastOpGroup; + int 
source; + int value; + int latency; + }; + + const LightInstruction LightInstruction::Null = LightInstruction(-1, &LightInstructionInfo::NOP); + constexpr int ALU_COUNT_MUL = 1; constexpr int ALU_COUNT = 4; constexpr int LIGHT_OPCODE_BITS = 4; @@ -106,16 +671,61 @@ namespace RandomX { } } + void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister) { + + bool portBusy[RANDOMX_LPROG_LATENCY][3]; + RegisterInfo registers[8]; + bool decoderBusy[RANDOMX_LPROG_LATENCY][4]; + Blake2Generator gen(seed); + std::vector instructions; + + DecoderBuffer& fetchLine = DecoderBuffer::Default; + LightInstruction currentInstruction = LightInstruction::Null; + int instrIndex = 0; + int codeSize = 0; + int macroOpCount = 0; + int rxOpCount = 0; + + for (int cycle = 0; cycle < 170; ++cycle) { + fetchLine = fetchLine.fetchNext(currentInstruction.getType(), gen); + std::cout << "; cycle " << cycle << " buffer " << fetchLine.getName() << std::endl; + + int mopIndex = 0; + + while (mopIndex < fetchLine.getSize()) { + if (instrIndex >= currentInstruction.getInfo()->getSize()) { + currentInstruction = LightInstruction::createForSlot(gen, fetchLine.getCounts()[mopIndex], fetchLine.getSize() == mopIndex + 1, fetchLine.getIndex() == 0 && mopIndex == 0); + instrIndex = 0; + std::cout << "; " << currentInstruction.getInfo()->getName() << std::endl; + rxOpCount++; + } + if (fetchLine.getCounts()[mopIndex] != currentInstruction.getInfo()->getOp(instrIndex)->getSize()) { + std::cout << "ERROR instruction " << currentInstruction.getInfo()->getOp(instrIndex)->getName() << " doesn't fit into slot of size " << fetchLine.getCounts()[mopIndex] << std::endl; + return; + } + std::cout << currentInstruction.getInfo()->getOp(instrIndex)->getName() << std::endl; + codeSize += currentInstruction.getInfo()->getOp(instrIndex)->getSize(); + mopIndex++; + instrIndex++; + macroOpCount++; + } + } + + std::cout << "; code size " << codeSize << std::endl; + std::cout << "; x86 macro-ops: " << 
macroOpCount << std::endl; + std::cout << "; RandomX instructions: " << rxOpCount << std::endl; + } + void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister) { // Source: https://www.agner.org/optimize/instruction_tables.pdf - const int op_latency[LightInstruction::COUNT] = { 1, 2, 1, 2, 3, 5, 5, 4, 1, 2, 5 }; + const int op_latency[LightInstructionType::COUNT] = { 1, 2, 1, 2, 3, 5, 5, 4, 1, 2, 5 }; // Instruction latencies for theoretical ASIC implementation - const int asic_op_latency[LightInstruction::COUNT] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + const int asic_op_latency[LightInstructionType::COUNT] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; // Available ALUs for each instruction - const int op_ALUs[LightInstruction::COUNT] = { ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT }; + const int op_ALUs[LightInstructionType::COUNT] = { ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT }; uint8_t data[64]; memset(data, 0, sizeof(data)); @@ -147,7 +757,7 @@ namespace RandomX { uint64_t inst_data[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; bool alu_busy[RANDOMX_LPROG_LATENCY + 1][ALU_COUNT]; - bool is_rotation[LightInstruction::COUNT]; + bool is_rotation[LightInstructionType::COUNT]; bool rotated[8]; int rotate_count = 0; @@ -156,7 +766,7 @@ namespace RandomX { memset(alu_busy, 0, sizeof(alu_busy)); memset(is_rotation, 0, sizeof(is_rotation)); memset(rotated, 0, sizeof(rotated)); - is_rotation[LightInstruction::IROR_R] = true; + is_rotation[LightInstructionType::IROR_R] = true; int num_retries = 0; code_size = 0; @@ -201,12 +811,12 @@ namespace RandomX { // 2x IMUL_RCP(a, C) = a * (C * C) // 2x IXOR_R = NOP // 2x IROR_R(a, b) = IROR_R(a, 2*b) - if (instrType != LightInstruction::IMULH_R && instrType != LightInstruction::ISMULH_R && ((inst_data[a] & 0xFFFF00) == (instrType << 8) + 
((inst_data[b] & 255) << 16))) + if (instrType != LightInstructionType::IMULH_R && instrType != LightInstructionType::ISMULH_R && ((inst_data[a] & 0xFFFF00) == (instrType << 8) + ((inst_data[b] & 255) << 16))) { continue; } - if ((instrType == LightInstruction::IADD_RC) || (instrType == LightInstruction::IMUL_9C) || (instrType == LightInstruction::IMUL_RCP) || (instrType == LightInstruction::COND_R) || ((instrType != LightInstruction::IMULH_R) && (instrType != LightInstruction::ISMULH_R) && (a == b))) + if ((instrType == LightInstructionType::IADD_RC) || (instrType == LightInstructionType::IMUL_9C) || (instrType == LightInstructionType::IMUL_RCP) || (instrType == LightInstructionType::COND_R) || ((instrType != LightInstructionType::IMULH_R) && (instrType != LightInstructionType::ISMULH_R) && (a == b))) { check_data(data_index, 4, data, sizeof(data)); imm32 = load32(&data[data_index++]); @@ -222,7 +832,7 @@ namespace RandomX { if (!alu_busy[next_latency][i]) { // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check - if ((instrType == LightInstruction::IADD_RC || instrType == LightInstruction::IMUL_9C || instrType == LightInstruction::IMULH_R || instrType == LightInstruction::ISMULH_R) && alu_busy[next_latency + 1][i]) + if ((instrType == LightInstructionType::IADD_RC || instrType == LightInstructionType::IMUL_9C || instrType == LightInstructionType::IMULH_R || instrType == LightInstructionType::ISMULH_R) && alu_busy[next_latency + 1][i]) { continue; } @@ -275,7 +885,7 @@ namespace RandomX { prog(code_size).src = src_index; prog(code_size).setImm32(imm32); - if (instrType == LightInstruction::IADD_RC || instrType == LightInstruction::IMUL_9C || instrType == LightInstruction::IMULH_R || instrType == LightInstruction::ISMULH_R) + if (instrType == LightInstructionType::IADD_RC || instrType == LightInstructionType::IMUL_9C || instrType == LightInstructionType::IMULH_R || instrType == LightInstructionType::ISMULH_R) { // 
ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too alu_busy[next_latency - op_latency[instrType] + 1][alu_index] = true; @@ -308,7 +918,7 @@ namespace RandomX { if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; } - const int pattern[3] = { LightInstruction::IMUL_R, LightInstruction::IROR_R, LightInstruction::IMUL_R }; + const int pattern[3] = { LightInstructionType::IMUL_R, LightInstructionType::IROR_R, LightInstructionType::IMUL_R }; const int instrType = pattern[(code_size - prev_code_size) % 3]; latency[min_idx] = latency[max_idx] + op_latency[instrType]; asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[instrType]; diff --git a/src/LightProgramGenerator.hpp b/src/LightProgramGenerator.hpp index 71c4a7c..a7762b1 100644 --- a/src/LightProgramGenerator.hpp +++ b/src/LightProgramGenerator.hpp @@ -21,4 +21,5 @@ along with RandomX. If not, see. namespace RandomX { void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister); + void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister); } \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 61bb2ff..8c1f64a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -223,8 +223,8 @@ int main(int argc, char** argv) { if (genLight) { RandomX::LightProgram p; - RandomX::generateLightProgram(p, seed, 0); - std::cout << p << std::endl; + RandomX::generateLightProg2(p, seed, 0); + //std::cout << p << std::endl; return 0; } diff --git a/src/program.inc b/src/program.inc index 46d8093..97a8122 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,3 +1,5 @@ + mov ebx, 111 ; Start marker bytes + db 064h, 067h, 090h ; Start marker bytes randomx_isn_0: ; IROR_R r3, 30 ror r11, 30 @@ -1001,3 +1003,5 @@ randomx_isn_255: ; IROR_R r7, r3 mov ecx, r11d ror r15, cl + mov ebx, 222 ; End marker bytes + db 064h, 067h, 090h ; End marker bytes \ No newline at end of file From 
acef5ea0d7dc62b3f8e5b8b1b0703053177e1735 Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 31 Mar 2019 21:22:36 +0200 Subject: [PATCH 03/18] Port mapping --- src/LightProgramGenerator.cpp | 374 ++++++++++++++++++++++++---------- 1 file changed, 261 insertions(+), 113 deletions(-) diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index eaf5efe..db674ee 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -23,6 +23,8 @@ along with RandomX. If not, see. #include "blake2/endian.h"; #include #include +#include +#include namespace RandomX { // Intel Ivy Bridge reference @@ -101,9 +103,9 @@ namespace RandomX { constexpr type Null = 0; constexpr type P0 = 1; constexpr type P1 = 2; - constexpr type P5 = 4; - constexpr type P05 = 6; - constexpr type P015 = 7; + constexpr type P5 = 3; + constexpr type P05 = 4; + constexpr type P015 = 5; } class Blake2Generator { @@ -138,6 +140,15 @@ namespace RandomX { } }; + class RegisterInfo { + public: + RegisterInfo() : lastOpGroup(-1), source(-1), value(0), latency(0) {} + int lastOpGroup; + int source; + int value; + int latency; + }; + class MacroOp { public: MacroOp(const char* name, int size) @@ -146,6 +157,8 @@ namespace RandomX { : name_(name), size_(size), latency_(latency), uop1_(uop), uop2_(ExecutionPort::Null) {} MacroOp(const char* name, int size, int latency, ExecutionPort::type uop1, ExecutionPort::type uop2) : name_(name), size_(size), latency_(latency), uop1_(uop1), uop2_(uop2) {} + MacroOp(const MacroOp& parent, bool dependent) + : name_(parent.name_), size_(parent.size_), latency_(parent.latency_), uop1_(parent.uop1_), uop2_(parent.uop2_), dependent_(dependent) {} const char* getName() const { return name_; } @@ -167,6 +180,27 @@ namespace RandomX { bool isEliminated() const { return uop1_ == ExecutionPort::Null; } + bool isDependent() const { + return dependent_; + } + int getCycle() const { + return cycle_; + } + void setCycle(int cycle) { + cycle_ = cycle; + } + 
MacroOp* getSrcDep() const { + return depSrc_; + } + void setSrcDep(MacroOp* src) { + depSrc_ = src; + } + MacroOp* getDstDep() const { + return depDst_; + } + void setDstDep(MacroOp* dst) { + depDst_ = dst; + } static const MacroOp Add_rr; static const MacroOp Add_ri; static const MacroOp Lea_sib; @@ -191,6 +225,10 @@ namespace RandomX { int latency_; ExecutionPort::type uop1_; ExecutionPort::type uop2_; + int cycle_; + bool dependent_ = false; + MacroOp* depDst_ = nullptr; + MacroOp* depSrc_ = nullptr; }; const MacroOp MacroOp::Add_rr = MacroOp("add r,r", 3, 1, ExecutionPort::P015); @@ -212,49 +250,56 @@ namespace RandomX { const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05); const MacroOp MacroOp::TestJmp_fused = MacroOp("testjmp r,i", 13, 0, ExecutionPort::P5); - template - T* begin(T(&arr)[N]) { return &arr[0]; } - template - T* end(T(&arr)[N]) { return &arr[0] + N; } - - const MacroOp* IMULH_R_ops_array[] = { &MacroOp::Mov_rr, &MacroOp::Mul_r, &MacroOp::Mov_rr }; - const MacroOp* ISMULH_R_ops_array[] = { &MacroOp::Mov_rr, &MacroOp::Imul_r, &MacroOp::Mov_rr }; - const MacroOp* IMUL_RCP_ops_array[] = { &MacroOp::Mov_ri64, &MacroOp::Imul_rr }; - const MacroOp* IROR_R_ops_array[] = { &MacroOp::Mov_rr, &MacroOp::Ror_rcl }; - const MacroOp* COND_R_ops_array[] = { &MacroOp::Add_ri, &MacroOp::TestJmp_fused, &MacroOp::Xor_self, &MacroOp::Cmp_ri, &MacroOp::Setcc_r, &MacroOp::Add_rr }; + const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr }; + const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr }; + const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) }; + const MacroOp IROR_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Ror_rcl }; + const MacroOp COND_R_ops_array[] = { MacroOp::Add_ri, MacroOp(MacroOp::TestJmp_fused, true), MacroOp::Xor_self, MacroOp::Cmp_ri, MacroOp(MacroOp::Setcc_r, true), MacroOp(MacroOp::Add_rr, true) }; class 
LightInstructionInfo { public: - LightInstructionInfo(const char* name, const MacroOp* op) - : name_(name), op_(op), opsCount_(1), latency_(op->getLatency()) {} + LightInstructionInfo(const char* name, int type, const MacroOp& op) + : name_(name), type_(type), latency_(op.getLatency()) { + ops_.push_back(MacroOp(op)); + } template - LightInstructionInfo(const char* name, const MacroOp*(&arr)[N]) - : name_(name), ops_(arr), opsCount_(N), latency_(0) { + LightInstructionInfo(const char* name, int type, const MacroOp(&arr)[N]) + : name_(name), type_(type), latency_(0) { for (unsigned i = 0; i < N; ++i) { - latency_ += arr[i]->getLatency(); + ops_.push_back(MacroOp(arr[i])); + latency_ += ops_.back().getLatency(); } static_assert(N > 1, "Invalid array size"); } template - LightInstructionInfo(const char* name, const MacroOp*(&arr)[N], int latency) - : name_(name), ops_(arr), opsCount_(N), latency_(latency) { + LightInstructionInfo(const char* name, int type, const MacroOp*(&arr)[N], int latency) + : name_(name), type_(type), latency_(latency) { + for (unsigned i = 0; i < N; ++i) { + ops_.push_back(MacroOp(arr[i])); + if (arr[i].isDependent()) { + ops_[i].setSrcDep(&ops_[i - 1]); + } + } static_assert(N > 1, "Invalid array size"); } const char* getName() const { return name_; } int getSize() const { - return opsCount_; + return ops_.size(); } bool isSimple() const { - return opsCount_ == 1; + return getSize() == 1; } int getLatency() const { return latency_; } - const MacroOp* getOp(int index) const { - return opsCount_ > 1 ? 
ops_[index] : op_; + MacroOp& getOp(int index) { + return ops_[index]; + } + int getType() const { + return type_; } static const LightInstructionInfo IADD_R; static const LightInstructionInfo IADD_C; @@ -274,32 +319,29 @@ namespace RandomX { static const LightInstructionInfo NOP; private: const char* name_; - union { - const MacroOp** ops_; - const MacroOp* op_; - }; - int opsCount_; + int type_; + std::vector ops_; int latency_; LightInstructionInfo(const char* name) - : name_(name), opsCount_(0), latency_(0) {} + : name_(name), type_(-1), latency_(0) {} }; - const LightInstructionInfo LightInstructionInfo::IADD_R = LightInstructionInfo("IADD_R", &MacroOp::Add_rr); - const LightInstructionInfo LightInstructionInfo::IADD_C = LightInstructionInfo("IADD_C", &MacroOp::Add_ri); - const LightInstructionInfo LightInstructionInfo::IADD_RC = LightInstructionInfo("IADD_RC", &MacroOp::Lea_sib); - const LightInstructionInfo LightInstructionInfo::ISUB_R = LightInstructionInfo("ISUB_R", &MacroOp::Sub_rr); - const LightInstructionInfo LightInstructionInfo::IMUL_9C = LightInstructionInfo("IMUL_9C", &MacroOp::Lea_sib); - const LightInstructionInfo LightInstructionInfo::IMUL_R = LightInstructionInfo("IMUL_R", &MacroOp::Imul_rr); - const LightInstructionInfo LightInstructionInfo::IMUL_C = LightInstructionInfo("IMUL_C", &MacroOp::Imul_rri); - const LightInstructionInfo LightInstructionInfo::IMULH_R = LightInstructionInfo("IMULH_R", IMULH_R_ops_array); - const LightInstructionInfo LightInstructionInfo::ISMULH_R = LightInstructionInfo("ISMULH_R", ISMULH_R_ops_array); - const LightInstructionInfo LightInstructionInfo::IMUL_RCP = LightInstructionInfo("IMUL_RCP", IMUL_RCP_ops_array); - const LightInstructionInfo LightInstructionInfo::IXOR_R = LightInstructionInfo("IXOR_R", &MacroOp::Xor_rr); - const LightInstructionInfo LightInstructionInfo::IXOR_C = LightInstructionInfo("IXOR_C", &MacroOp::Xor_ri); - const LightInstructionInfo LightInstructionInfo::IROR_R = 
LightInstructionInfo("IROR_R", IROR_R_ops_array); - const LightInstructionInfo LightInstructionInfo::IROR_C = LightInstructionInfo("IROR_C", &MacroOp::Ror_ri); - const LightInstructionInfo LightInstructionInfo::COND_R = LightInstructionInfo("COND_R", COND_R_ops_array); + const LightInstructionInfo LightInstructionInfo::IADD_R = LightInstructionInfo("IADD_R", LightInstructionType::IADD_R, MacroOp::Add_rr); + const LightInstructionInfo LightInstructionInfo::IADD_C = LightInstructionInfo("IADD_C", LightInstructionType::IADD_C, MacroOp::Add_ri); + const LightInstructionInfo LightInstructionInfo::IADD_RC = LightInstructionInfo("IADD_RC", LightInstructionType::IADD_RC, MacroOp::Lea_sib); + const LightInstructionInfo LightInstructionInfo::ISUB_R = LightInstructionInfo("ISUB_R", LightInstructionType::ISUB_R, MacroOp::Sub_rr); + const LightInstructionInfo LightInstructionInfo::IMUL_9C = LightInstructionInfo("IMUL_9C", LightInstructionType::IMUL_9C, MacroOp::Lea_sib); + const LightInstructionInfo LightInstructionInfo::IMUL_R = LightInstructionInfo("IMUL_R", LightInstructionType::IMUL_R, MacroOp::Imul_rr); + const LightInstructionInfo LightInstructionInfo::IMUL_C = LightInstructionInfo("IMUL_C", LightInstructionType::IMUL_C, MacroOp::Imul_rri); + const LightInstructionInfo LightInstructionInfo::IMULH_R = LightInstructionInfo("IMULH_R", LightInstructionType::IMULH_R, IMULH_R_ops_array); + const LightInstructionInfo LightInstructionInfo::ISMULH_R = LightInstructionInfo("ISMULH_R", LightInstructionType::ISMULH_R, ISMULH_R_ops_array); + const LightInstructionInfo LightInstructionInfo::IMUL_RCP = LightInstructionInfo("IMUL_RCP", LightInstructionType::IMUL_RCP, IMUL_RCP_ops_array); + const LightInstructionInfo LightInstructionInfo::IXOR_R = LightInstructionInfo("IXOR_R", LightInstructionType::IXOR_R, MacroOp::Xor_rr); + const LightInstructionInfo LightInstructionInfo::IXOR_C = LightInstructionInfo("IXOR_C", LightInstructionType::IXOR_C, MacroOp::Xor_ri); + const 
LightInstructionInfo LightInstructionInfo::IROR_R = LightInstructionInfo("IROR_R", LightInstructionType::IROR_R, IROR_R_ops_array); + const LightInstructionInfo LightInstructionInfo::IROR_C = LightInstructionInfo("IROR_C", LightInstructionType::IROR_C, MacroOp::Ror_ri); + const LightInstructionInfo LightInstructionInfo::COND_R = LightInstructionInfo("COND_R", LightInstructionType::COND_R, COND_R_ops_array); const LightInstructionInfo LightInstructionInfo::NOP = LightInstructionInfo("NOP"); const int buffer0[] = { 3, 3, 10 }; @@ -375,20 +417,37 @@ namespace RandomX { DecoderBuffer DecoderBuffer::Default = DecoderBuffer(); - const int slot_3[] = { LightInstructionType::IADD_R, LightInstructionType::ISUB_R, LightInstructionType::IXOR_R, LightInstructionType::IADD_R }; - const int slot_3L[] = { LightInstructionType::IADD_R, LightInstructionType::ISUB_R, LightInstructionType::IXOR_R, LightInstructionType::IMULH_R, LightInstructionType::ISMULH_R, LightInstructionType::IXOR_R, LightInstructionType::IMULH_R, LightInstructionType::ISMULH_R }; - const int slot_3F[] = { LightInstructionType::IADD_R, LightInstructionType::ISUB_R, LightInstructionType::IXOR_R, LightInstructionType::IROR_R }; - const int slot_4[] = { LightInstructionType::IMUL_R, LightInstructionType::IROR_C }; - const int slot_7[] = { LightInstructionType::IADD_C, LightInstructionType::IMUL_C, LightInstructionType::IXOR_C, LightInstructionType::IXOR_C }; - const int slot_7L = LightInstructionType::COND_R; - const int slot_8[] = { LightInstructionType::IADD_RC, LightInstructionType::IMUL_9C }; - const int slot_10 = LightInstructionType::IMUL_RCP; + const LightInstructionInfo* slot_3[] = { &LightInstructionInfo::IADD_R, &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IADD_R }; + const LightInstructionInfo* slot_3L[] = { &LightInstructionInfo::IADD_R, &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R, 
&LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R }; + const LightInstructionInfo* slot_3F[] = { &LightInstructionInfo::IADD_R, &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IROR_R }; + const LightInstructionInfo* slot_4[] = { &LightInstructionInfo::IMUL_R, &LightInstructionInfo::IROR_C }; + const LightInstructionInfo* slot_7[] = { &LightInstructionInfo::IADD_C, &LightInstructionInfo::IMUL_C, &LightInstructionInfo::IXOR_C, &LightInstructionInfo::IXOR_C }; + const LightInstructionInfo* slot_7L = &LightInstructionInfo::COND_R; + const LightInstructionInfo* slot_8[] = { &LightInstructionInfo::IADD_RC, &LightInstructionInfo::IMUL_9C }; + const LightInstructionInfo* slot_10 = &LightInstructionInfo::IMUL_RCP; + + template + static int selectRegister(std::vector& availableRegisters, Blake2Generator& gen) { + if (availableRegisters.size() == 0) + throw std::runtime_error("No avialable registers"); + int index; + if (availableRegisters.size() > 1) { + index = gen.getInt32() % availableRegisters.size(); + } + else { + index = 0; + } + int select = availableRegisters[index]; + if (erase) + availableRegisters.erase(availableRegisters.begin() + index); + return select; + } class LightInstruction { public: Instruction toInstr() { Instruction instr; - instr.opcode = lightInstructionOpcode[type_]; + instr.opcode = lightInstructionOpcode[getType()]; instr.dst = dst_; instr.src = src_ >= 0 ? 
src_ : dst_; instr.mod = mod_; @@ -396,42 +455,40 @@ namespace RandomX { return instr; } - static LightInstruction createForSlot(Blake2Generator& gen, int slotSize, bool isLast = false, bool isFirst = false) { + static LightInstruction createForSlot(Blake2Generator& gen, int slotSize, std::vector& availableRegisters, bool isLast = false, bool isFirst = false) { switch (slotSize) { case 3: if (isLast) { - return create(slot_3L[gen.getByte() & 7], gen); + return create(slot_3L[gen.getByte() & 7], availableRegisters, gen); } else if (isFirst) { - return create(slot_3F[gen.getByte() & 3], gen); + return create(slot_3F[gen.getByte() & 3], availableRegisters, gen); } else { - return create(slot_3[gen.getByte() & 3], gen); + return create(slot_3[gen.getByte() & 3], availableRegisters, gen); } case 4: - return create(slot_4[gen.getByte() & 1], gen); + return create(slot_4[gen.getByte() & 1], availableRegisters, gen); case 7: if (isLast) { - return create(slot_7L, gen); + return create(slot_7L, availableRegisters, gen); } else { - return create(slot_7[gen.getByte() & 3], gen); + return create(slot_7[gen.getByte() & 3], availableRegisters, gen); } case 8: - return create(slot_8[gen.getByte() & 1], gen); + return create(slot_8[gen.getByte() & 1], availableRegisters, gen); case 10: - return create(slot_10, gen); + return create(slot_10, availableRegisters, gen); default: break; } } - static LightInstruction create(int type, Blake2Generator& gen) { - LightInstruction li; - li.type_ = type; - li.opGroup_ = type; - switch (type) + static LightInstruction create(const LightInstructionInfo* info, std::vector& availableRegisters, Blake2Generator& gen) { + LightInstruction li(info); + switch (info->getType()) { case LightInstructionType::IADD_R: { li.dst_ = gen.getByte() & 7; @@ -440,7 +497,6 @@ namespace RandomX { } while (li.dst_ == li.src_); li.mod_ = 0; li.imm32_ = 0; - li.info_ = &LightInstructionInfo::IADD_R; li.opGroup_ = LightInstructionType::IADD_R; li.opGroupPar_ = li.src_; 
} break; @@ -450,7 +506,6 @@ namespace RandomX { li.src_ = -1; li.mod_ = 0; li.imm32_ = gen.getInt32(); - li.info_ = &LightInstructionInfo::IADD_C; li.opGroup_ = LightInstructionType::IADD_R; li.opGroupPar_ = li.src_; } break; @@ -462,7 +517,6 @@ namespace RandomX { } while (li.dst_ == li.src_); li.mod_ = 0; li.imm32_ = gen.getInt32(); - li.info_ = &LightInstructionInfo::IADD_RC; li.opGroup_ = LightInstructionType::IADD_R; li.opGroupPar_ = li.src_; } break; @@ -474,7 +528,6 @@ namespace RandomX { } while (li.dst_ == li.src_); li.mod_ = 0; li.imm32_ = 0; - li.info_ = &LightInstructionInfo::ISUB_R; li.opGroup_ = LightInstructionType::IADD_R; li.opGroupPar_ = li.src_; } break; @@ -486,7 +539,6 @@ namespace RandomX { } while (li.dst_ == li.src_); li.mod_ = 0; li.imm32_ = gen.getInt32(); - li.info_ = &LightInstructionInfo::IMUL_9C; li.opGroup_ = LightInstructionType::IMUL_C; li.opGroupPar_ = -1; } break; @@ -498,7 +550,6 @@ namespace RandomX { } while (li.dst_ == li.src_); li.mod_ = 0; li.imm32_ = 0; - li.info_ = &LightInstructionInfo::IMUL_R; li.opGroup_ = LightInstructionType::IMUL_R; li.opGroupPar_ = gen.getInt32(); } break; @@ -508,7 +559,6 @@ namespace RandomX { li.src_ = -1; li.mod_ = 0; li.imm32_ = gen.getInt32(); - li.info_ = &LightInstructionInfo::IMUL_C; li.opGroup_ = LightInstructionType::IMUL_C; li.opGroupPar_ = li.src_; } break; @@ -518,7 +568,6 @@ namespace RandomX { li.src_ = gen.getByte() & 7; li.mod_ = 0; li.imm32_ = 0; - li.info_ = &LightInstructionInfo::IMULH_R; li.opGroup_ = LightInstructionType::IMULH_R; li.opGroupPar_ = gen.getInt32(); } break; @@ -528,7 +577,6 @@ namespace RandomX { li.src_ = gen.getByte() & 7; li.mod_ = 0; li.imm32_ = 0; - li.info_ = &LightInstructionInfo::ISMULH_R; li.opGroup_ = LightInstructionType::ISMULH_R; li.opGroupPar_ = gen.getInt32(); } break; @@ -538,7 +586,6 @@ namespace RandomX { li.src_ = -1; li.mod_ = 0; li.imm32_ = gen.getInt32(); - li.info_ = &LightInstructionInfo::IMUL_RCP; li.opGroup_ = 
LightInstructionType::IMUL_C; li.opGroupPar_ = -1; } break; @@ -550,7 +597,6 @@ namespace RandomX { } while (li.dst_ == li.src_); li.mod_ = 0; li.imm32_ = 0; - li.info_ = &LightInstructionInfo::IXOR_R; li.opGroup_ = LightInstructionType::IXOR_R; li.opGroupPar_ = li.src_; } break; @@ -560,7 +606,6 @@ namespace RandomX { li.src_ = -1; li.mod_ = 0; li.imm32_ = gen.getInt32(); - li.info_ = &LightInstructionInfo::IXOR_C; li.opGroup_ = LightInstructionType::IXOR_R; li.opGroupPar_ = li.src_; } break; @@ -572,7 +617,6 @@ namespace RandomX { } while (li.dst_ == li.src_); li.mod_ = 0; li.imm32_ = 0; - li.info_ = &LightInstructionInfo::IROR_R; li.opGroup_ = LightInstructionType::IROR_R; li.opGroupPar_ = -1; } break; @@ -582,7 +626,6 @@ namespace RandomX { li.src_ = -1; li.mod_ = 0; li.imm32_ = gen.getByte(); - li.info_ = &LightInstructionInfo::IROR_C; li.opGroup_ = LightInstructionType::IROR_R; li.opGroupPar_ = -1; } break; @@ -592,7 +635,6 @@ namespace RandomX { li.src_ = gen.getByte() & 7; li.mod_ = gen.getByte(); li.imm32_ = gen.getInt32(); - li.info_ = &LightInstructionInfo::COND_R; li.opGroup_ = LightInstructionType::COND_R; li.opGroupPar_ = li.imm32_; } break; @@ -605,7 +647,7 @@ namespace RandomX { } int getType() { - return type_; + return info_.getType(); } int getSource() { return src_; @@ -620,37 +662,32 @@ namespace RandomX { return opGroupPar_; } - const LightInstructionInfo* getInfo() { + LightInstructionInfo& getInfo() { return info_; } static const LightInstruction Null; private: - int type_; + LightInstructionInfo info_; int src_; int dst_; int mod_; uint32_t imm32_; - - const LightInstructionInfo* info_; int opGroup_; int opGroupPar_; - LightInstruction() {} - LightInstruction(int type, const LightInstructionInfo* info) : type_(type), info_(info) {} + LightInstruction(const LightInstructionInfo* info) : info_(*info) { + for (unsigned i = 0; i < info_.getSize(); ++i) { + MacroOp& mop = info_.getOp(i); + if (mop.isDependent()) { + mop.setSrcDep(&info_.getOp(i 
- 1)); + } + } + } }; - class RegisterInfo { - public: - RegisterInfo() : lastOpGroup(-1), source(-1), value(0), latency(0) {} - int lastOpGroup; - int source; - int value; - int latency; - }; - - const LightInstruction LightInstruction::Null = LightInstruction(-1, &LightInstructionInfo::NOP); + const LightInstruction LightInstruction::Null = LightInstruction(&LightInstructionInfo::NOP); constexpr int ALU_COUNT_MUL = 1; constexpr int ALU_COUNT = 4; @@ -660,6 +697,73 @@ namespace RandomX { static int blakeCounter = 0; + static int scheduleUop(const MacroOp& mop, ExecutionPort::type(&portBusy)[RANDOMX_LPROG_LATENCY + 1][3], int cycle, int depCycle) { + if (mop.isDependent()) { + cycle = std::max(cycle, depCycle); + } + if (mop.isEliminated()) { + std::cout << "; (eliminated)" << std::endl; + return cycle; + } + else if (mop.isSimple()) { + if (mop.getUop1() <= ExecutionPort::P5) { + for (; cycle <= RANDOMX_LPROG_LATENCY; ++cycle) { + if (!portBusy[cycle][mop.getUop1() - 1]) { + std::cout << "; P" << mop.getUop1() - 1 << " at cycle " << cycle << std::endl; + portBusy[cycle][mop.getUop1() - 1] = mop.getUop1(); + return cycle; + } + } + } + else if (mop.getUop1() == ExecutionPort::P05) { + for (; cycle <= RANDOMX_LPROG_LATENCY; ++cycle) { + if (!portBusy[cycle][0]) { + std::cout << "; P0 at cycle " << cycle << std::endl; + portBusy[cycle][0] = mop.getUop1(); + return cycle; + } + if (!portBusy[cycle][2]) { + std::cout << "; P2 at cycle " << cycle << std::endl; + portBusy[cycle][2] = mop.getUop1(); + return cycle; + } + } + } + else { + for (; cycle <= RANDOMX_LPROG_LATENCY; ++cycle) { + if (!portBusy[cycle][0]) { + std::cout << "; P0 at cycle " << cycle << std::endl; + portBusy[cycle][0] = mop.getUop1(); + return cycle; + } + if (!portBusy[cycle][2]) { + std::cout << "; P2 at cycle " << cycle << std::endl; + portBusy[cycle][2] = mop.getUop1(); + return cycle; + } + if (!portBusy[cycle][1]) { + std::cout << "; P1 at cycle " << cycle << std::endl; + portBusy[cycle][1] = 
mop.getUop1(); + return cycle; + } + } + } + } + else { + for (; cycle <= RANDOMX_LPROG_LATENCY; ++cycle) { + if (!portBusy[cycle][mop.getUop1() - 1] && !portBusy[cycle][mop.getUop2() - 1]) { + std::cout << "; P" << mop.getUop1() - 1 << " P" << mop.getUop2() - 1 << " at cycle " << cycle << std::endl; + portBusy[cycle][mop.getUop1() - 1] = mop.getUop1(); + portBusy[cycle][mop.getUop2() - 1] = mop.getUop2(); + return cycle; + } + } + } + + std::cout << "Unable to map operation '" << mop.getName() << "' to execution port"; + return -1; + } + // If we don't have enough data available, generate more static FORCE_INLINE void check_data(size_t& data_index, const size_t bytes_needed, uint8_t* data, const size_t data_size) { @@ -673,11 +777,12 @@ namespace RandomX { void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister) { - bool portBusy[RANDOMX_LPROG_LATENCY][3]; + ExecutionPort::type portBusy[RANDOMX_LPROG_LATENCY + 1][3]; + memset(portBusy, 0, sizeof(portBusy)); RegisterInfo registers[8]; - bool decoderBusy[RANDOMX_LPROG_LATENCY][4]; Blake2Generator gen(seed); std::vector instructions; + std::vector availableRegisters; DecoderBuffer& fetchLine = DecoderBuffer::Default; LightInstruction currentInstruction = LightInstruction::Null; @@ -685,35 +790,78 @@ namespace RandomX { int codeSize = 0; int macroOpCount = 0; int rxOpCount = 0; + int cycle = 0; + int depCycle = 0; + int mopIndex = 0; + bool portsSaturated = false; - for (int cycle = 0; cycle < 170; ++cycle) { + while(!portsSaturated) { fetchLine = fetchLine.fetchNext(currentInstruction.getType(), gen); - std::cout << "; cycle " << cycle << " buffer " << fetchLine.getName() << std::endl; + std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine.getName() << ")" << std::endl; - int mopIndex = 0; + availableRegisters.clear(); + for (unsigned i = 0; i < 8; ++i) { + if (registers[i].latency <= cycle) + availableRegisters.push_back(i); + } + + mopIndex = 0; - while (mopIndex < 
fetchLine.getSize()) { - if (instrIndex >= currentInstruction.getInfo()->getSize()) { - currentInstruction = LightInstruction::createForSlot(gen, fetchLine.getCounts()[mopIndex], fetchLine.getSize() == mopIndex + 1, fetchLine.getIndex() == 0 && mopIndex == 0); + while (!portsSaturated && mopIndex < fetchLine.getSize()) { + if (instrIndex >= currentInstruction.getInfo().getSize()) { + currentInstruction = LightInstruction::createForSlot(gen, fetchLine.getCounts()[mopIndex], availableRegisters, fetchLine.getSize() == mopIndex + 1, fetchLine.getIndex() == 0 && mopIndex == 0); instrIndex = 0; - std::cout << "; " << currentInstruction.getInfo()->getName() << std::endl; + std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; rxOpCount++; } - if (fetchLine.getCounts()[mopIndex] != currentInstruction.getInfo()->getOp(instrIndex)->getSize()) { - std::cout << "ERROR instruction " << currentInstruction.getInfo()->getOp(instrIndex)->getName() << " doesn't fit into slot of size " << fetchLine.getCounts()[mopIndex] << std::endl; + MacroOp& mop = currentInstruction.getInfo().getOp(instrIndex); + if (fetchLine.getCounts()[mopIndex] != mop.getSize()) { + std::cout << "ERROR instruction " << mop.getName() << " doesn't fit into slot of size " << fetchLine.getCounts()[mopIndex] << std::endl; return; } - std::cout << currentInstruction.getInfo()->getOp(instrIndex)->getName() << std::endl; - codeSize += currentInstruction.getInfo()->getOp(instrIndex)->getSize(); + + std::cout << mop.getName() << " "; + codeSize += mop.getSize(); mopIndex++; instrIndex++; macroOpCount++; + int scheduleCycle = scheduleUop(mop, portBusy, cycle, depCycle); + if (scheduleCycle >= RANDOMX_LPROG_LATENCY) { + portsSaturated = true; + } + mop.setCycle(scheduleCycle); + depCycle = scheduleCycle + mop.getLatency(); } + ++cycle; + } + + while (instrIndex < currentInstruction.getInfo().getSize()) { + if (mopIndex >= fetchLine.getSize()) { + fetchLine = 
fetchLine.fetchNext(currentInstruction.getType(), gen); + std::cout << "; cycle " << cycle++ << " buffer " << fetchLine.getName() << std::endl; + mopIndex = 0; + } + MacroOp& mop = currentInstruction.getInfo().getOp(instrIndex); + std::cout << mop.getName() << " "; + codeSize += mop.getSize(); + mopIndex++; + instrIndex++; + macroOpCount++; + int scheduleCycle = scheduleUop(mop, portBusy, cycle, depCycle); + mop.setCycle(scheduleCycle); + depCycle = scheduleCycle + mop.getLatency(); } std::cout << "; code size " << codeSize << std::endl; std::cout << "; x86 macro-ops: " << macroOpCount << std::endl; std::cout << "; RandomX instructions: " << rxOpCount << std::endl; + + for (int i = 0; i < RANDOMX_LPROG_LATENCY + 1; ++i) { + for (int j = 0; j < 3; ++j) { + std::cout << (portBusy[i][j] ? '*' : '_'); + } + std::cout << std::endl; + } } void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister) { From 2b9209346e1ee149a81633d8a7cce32796e1fef8 Mon Sep 17 00:00:00 2001 From: tevador Date: Mon, 1 Apr 2019 00:38:17 +0200 Subject: [PATCH 04/18] Operand allocation --- src/AssemblyGeneratorX86.cpp | 15 --- src/AssemblyGeneratorX86.hpp | 17 ++- src/LightProgramGenerator.cpp | 197 +++++++++++++++++++--------------- src/main.cpp | 5 +- 4 files changed, 128 insertions(+), 106 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index fd7ee06..8a4012a 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -58,21 +58,6 @@ namespace RandomX { return minIndex; } - void AssemblyGeneratorX86::generateProgram(Program& prog) { - for (unsigned i = 0; i < 8; ++i) { - registerUsage[i] = -1; - } - asmCode.str(std::string()); //clear - for (unsigned i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { - asmCode << "randomx_isn_" << i << ":" << std::endl; - Instruction& instr = prog(i); - instr.src %= RegistersCount; - instr.dst %= RegistersCount; - generateCode(instr, i); - //asmCode << std::endl; - } - } - void 
AssemblyGeneratorX86::traceint(Instruction& instr) { if (trace) { asmCode << "\tpush " << regR[instr.dst] << std::endl; diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index d2672a0..8ab638b 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -21,6 +21,7 @@ along with RandomX. If not, see. #include "Instruction.hpp" #include "configuration.h" +#include "common.hpp" #include namespace RandomX { @@ -32,7 +33,21 @@ namespace RandomX { class AssemblyGeneratorX86 { public: - void generateProgram(Program&); + template + void generateProgram(P& prog) { + for (unsigned i = 0; i < 8; ++i) { + registerUsage[i] = -1; + } + asmCode.str(std::string()); //clear + for (unsigned i = 0; i < prog.getSize(); ++i) { + asmCode << "randomx_isn_" << i << ":" << std::endl; + Instruction& instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + generateCode(instr, i); + //asmCode << std::endl; + } + } void printCode(std::ostream& os) { os << asmCode.rdbuf(); } diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index db674ee..cea05ae 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -25,6 +25,7 @@ along with RandomX. If not, see. 
#include #include #include +#include namespace RandomX { // Intel Ivy Bridge reference @@ -142,11 +143,11 @@ namespace RandomX { class RegisterInfo { public: - RegisterInfo() : lastOpGroup(-1), source(-1), value(0), latency(0) {} + RegisterInfo() : latency(0), lastOpGroup(-1), source(-1), value(0) {} + int latency; int lastOpGroup; int source; int value; - int latency; }; class MacroOp { @@ -264,8 +265,8 @@ namespace RandomX { ops_.push_back(MacroOp(op)); } template - LightInstructionInfo(const char* name, int type, const MacroOp(&arr)[N]) - : name_(name), type_(type), latency_(0) { + LightInstructionInfo(const char* name, int type, const MacroOp(&arr)[N], int resultOp, int dstOp, int srcOp) + : name_(name), type_(type), latency_(0), resultOp_(resultOp), dstOp_(dstOp), srcOp_(srcOp) { for (unsigned i = 0; i < N; ++i) { ops_.push_back(MacroOp(arr[i])); latency_ += ops_.back().getLatency(); @@ -273,8 +274,8 @@ namespace RandomX { static_assert(N > 1, "Invalid array size"); } template - LightInstructionInfo(const char* name, int type, const MacroOp*(&arr)[N], int latency) - : name_(name), type_(type), latency_(latency) { + LightInstructionInfo(const char* name, int type, const MacroOp*(&arr)[N], int latency, int resultOp, int dstOp, int srcOp) + : name_(name), type_(type), latency_(latency), resultOp_(resultOp), dstOp_(dstOp), srcOp_(srcOp) { for (unsigned i = 0; i < N; ++i) { ops_.push_back(MacroOp(arr[i])); if (arr[i].isDependent()) { @@ -301,6 +302,15 @@ namespace RandomX { int getType() const { return type_; } + int getResultOp() const { + return resultOp_; + } + int getDstOp() const { + return dstOp_; + } + int getSrcOp() const { + return srcOp_; + } static const LightInstructionInfo IADD_R; static const LightInstructionInfo IADD_C; static const LightInstructionInfo IADD_RC; @@ -322,6 +332,9 @@ namespace RandomX { int type_; std::vector ops_; int latency_; + int resultOp_ = 0; + int dstOp_ = 0; + int srcOp_ = 0; LightInstructionInfo(const char* name) : 
name_(name), type_(-1), latency_(0) {} @@ -334,14 +347,14 @@ namespace RandomX { const LightInstructionInfo LightInstructionInfo::IMUL_9C = LightInstructionInfo("IMUL_9C", LightInstructionType::IMUL_9C, MacroOp::Lea_sib); const LightInstructionInfo LightInstructionInfo::IMUL_R = LightInstructionInfo("IMUL_R", LightInstructionType::IMUL_R, MacroOp::Imul_rr); const LightInstructionInfo LightInstructionInfo::IMUL_C = LightInstructionInfo("IMUL_C", LightInstructionType::IMUL_C, MacroOp::Imul_rri); - const LightInstructionInfo LightInstructionInfo::IMULH_R = LightInstructionInfo("IMULH_R", LightInstructionType::IMULH_R, IMULH_R_ops_array); - const LightInstructionInfo LightInstructionInfo::ISMULH_R = LightInstructionInfo("ISMULH_R", LightInstructionType::ISMULH_R, ISMULH_R_ops_array); - const LightInstructionInfo LightInstructionInfo::IMUL_RCP = LightInstructionInfo("IMUL_RCP", LightInstructionType::IMUL_RCP, IMUL_RCP_ops_array); + const LightInstructionInfo LightInstructionInfo::IMULH_R = LightInstructionInfo("IMULH_R", LightInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1); + const LightInstructionInfo LightInstructionInfo::ISMULH_R = LightInstructionInfo("ISMULH_R", LightInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1); + const LightInstructionInfo LightInstructionInfo::IMUL_RCP = LightInstructionInfo("IMUL_RCP", LightInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1); const LightInstructionInfo LightInstructionInfo::IXOR_R = LightInstructionInfo("IXOR_R", LightInstructionType::IXOR_R, MacroOp::Xor_rr); const LightInstructionInfo LightInstructionInfo::IXOR_C = LightInstructionInfo("IXOR_C", LightInstructionType::IXOR_C, MacroOp::Xor_ri); - const LightInstructionInfo LightInstructionInfo::IROR_R = LightInstructionInfo("IROR_R", LightInstructionType::IROR_R, IROR_R_ops_array); + const LightInstructionInfo LightInstructionInfo::IROR_R = LightInstructionInfo("IROR_R", LightInstructionType::IROR_R, IROR_R_ops_array, 1, 1, 0); const LightInstructionInfo 
LightInstructionInfo::IROR_C = LightInstructionInfo("IROR_C", LightInstructionType::IROR_C, MacroOp::Ror_ri); - const LightInstructionInfo LightInstructionInfo::COND_R = LightInstructionInfo("COND_R", LightInstructionType::COND_R, COND_R_ops_array); + const LightInstructionInfo LightInstructionInfo::COND_R = LightInstructionInfo("COND_R", LightInstructionType::COND_R, COND_R_ops_array, 5, 5, 3); const LightInstructionInfo LightInstructionInfo::NOP = LightInstructionInfo("NOP"); const int buffer0[] = { 3, 3, 10 }; @@ -426,75 +439,68 @@ namespace RandomX { const LightInstructionInfo* slot_8[] = { &LightInstructionInfo::IADD_RC, &LightInstructionInfo::IMUL_9C }; const LightInstructionInfo* slot_10 = &LightInstructionInfo::IMUL_RCP; - template - static int selectRegister(std::vector& availableRegisters, Blake2Generator& gen) { - if (availableRegisters.size() == 0) - throw std::runtime_error("No avialable registers"); + static bool selectRegister(std::vector& availableRegisters, Blake2Generator& gen, int& reg) { int index; + if (availableRegisters.size() == 0) + return false; + //throw std::runtime_error("No available registers"); + if (availableRegisters.size() > 1) { index = gen.getInt32() % availableRegisters.size(); } else { index = 0; } - int select = availableRegisters[index]; - if (erase) - availableRegisters.erase(availableRegisters.begin() + index); - return select; + reg = availableRegisters[index]; + return true; } class LightInstruction { public: - Instruction toInstr() { - Instruction instr; + void toInstr(Instruction& instr) { instr.opcode = lightInstructionOpcode[getType()]; instr.dst = dst_; instr.src = src_ >= 0 ? 
src_ : dst_; instr.mod = mod_; instr.setImm32(imm32_); - return instr; } - static LightInstruction createForSlot(Blake2Generator& gen, int slotSize, std::vector& availableRegisters, bool isLast = false, bool isFirst = false) { + static LightInstruction createForSlot(Blake2Generator& gen, int slotSize, bool isLast = false, bool isFirst = false) { switch (slotSize) { case 3: if (isLast) { - return create(slot_3L[gen.getByte() & 7], availableRegisters, gen); + return create(slot_3L[gen.getByte() & 7], gen); } else if (isFirst) { - return create(slot_3F[gen.getByte() & 3], availableRegisters, gen); + return create(slot_3F[gen.getByte() & 3], gen); } else { - return create(slot_3[gen.getByte() & 3], availableRegisters, gen); + return create(slot_3[gen.getByte() & 3], gen); } case 4: - return create(slot_4[gen.getByte() & 1], availableRegisters, gen); + return create(slot_4[gen.getByte() & 1], gen); case 7: if (isLast) { - return create(slot_7L, availableRegisters, gen); + return create(slot_7L, gen); } else { - return create(slot_7[gen.getByte() & 3], availableRegisters, gen); + return create(slot_7[gen.getByte() & 3], gen); } case 8: - return create(slot_8[gen.getByte() & 1], availableRegisters, gen); + return create(slot_8[gen.getByte() & 1], gen); case 10: - return create(slot_10, availableRegisters, gen); + return create(slot_10, gen); default: break; } } - static LightInstruction create(const LightInstructionInfo* info, std::vector& availableRegisters, Blake2Generator& gen) { + static LightInstruction create(const LightInstructionInfo* info, Blake2Generator& gen) { LightInstruction li(info); switch (info->getType()) { case LightInstructionType::IADD_R: { - li.dst_ = gen.getByte() & 7; - do { - li.src_ = gen.getByte() & 7; - } while (li.dst_ == li.src_); li.mod_ = 0; li.imm32_ = 0; li.opGroup_ = LightInstructionType::IADD_R; @@ -502,8 +508,7 @@ namespace RandomX { } break; case LightInstructionType::IADD_C: { - li.dst_ = gen.getByte() & 7; - li.src_ = -1; + 
li.hasSource_ = false; li.mod_ = 0; li.imm32_ = gen.getInt32(); li.opGroup_ = LightInstructionType::IADD_R; @@ -511,10 +516,6 @@ namespace RandomX { } break; case LightInstructionType::IADD_RC: { - li.dst_ = gen.getByte() & 7; - do { - li.src_ = gen.getByte() & 7; - } while (li.dst_ == li.src_); li.mod_ = 0; li.imm32_ = gen.getInt32(); li.opGroup_ = LightInstructionType::IADD_R; @@ -522,10 +523,6 @@ namespace RandomX { } break; case LightInstructionType::ISUB_R: { - li.dst_ = gen.getByte() & 7; - do { - li.src_ = gen.getByte() & 7; - } while (li.dst_ == li.src_); li.mod_ = 0; li.imm32_ = 0; li.opGroup_ = LightInstructionType::IADD_R; @@ -533,10 +530,6 @@ namespace RandomX { } break; case LightInstructionType::IMUL_9C: { - li.dst_ = gen.getByte() & 7; - do { - li.src_ = gen.getByte() & 7; - } while (li.dst_ == li.src_); li.mod_ = 0; li.imm32_ = gen.getInt32(); li.opGroup_ = LightInstructionType::IMUL_C; @@ -544,10 +537,6 @@ namespace RandomX { } break; case LightInstructionType::IMUL_R: { - li.dst_ = gen.getByte() & 7; - do { - li.src_ = gen.getByte() & 7; - } while (li.dst_ == li.src_); li.mod_ = 0; li.imm32_ = 0; li.opGroup_ = LightInstructionType::IMUL_R; @@ -555,8 +544,7 @@ namespace RandomX { } break; case LightInstructionType::IMUL_C: { - li.dst_ = gen.getByte() & 7; - li.src_ = -1; + li.hasSource_ = false; li.mod_ = 0; li.imm32_ = gen.getInt32(); li.opGroup_ = LightInstructionType::IMUL_C; @@ -564,8 +552,7 @@ namespace RandomX { } break; case LightInstructionType::IMULH_R: { - li.dst_ = gen.getByte() & 7; - li.src_ = gen.getByte() & 7; + li.canReuse_ = true; li.mod_ = 0; li.imm32_ = 0; li.opGroup_ = LightInstructionType::IMULH_R; @@ -573,8 +560,7 @@ namespace RandomX { } break; case LightInstructionType::ISMULH_R: { - li.dst_ = gen.getByte() & 7; - li.src_ = gen.getByte() & 7; + li.canReuse_ = true; li.mod_ = 0; li.imm32_ = 0; li.opGroup_ = LightInstructionType::ISMULH_R; @@ -582,8 +568,7 @@ namespace RandomX { } break; case LightInstructionType::IMUL_RCP: { 
- li.dst_ = gen.getByte() & 7; - li.src_ = -1; + li.hasSource_ = false; li.mod_ = 0; li.imm32_ = gen.getInt32(); li.opGroup_ = LightInstructionType::IMUL_C; @@ -591,10 +576,6 @@ namespace RandomX { } break; case LightInstructionType::IXOR_R: { - li.dst_ = gen.getByte() & 7; - do { - li.src_ = gen.getByte() & 7; - } while (li.dst_ == li.src_); li.mod_ = 0; li.imm32_ = 0; li.opGroup_ = LightInstructionType::IXOR_R; @@ -602,8 +583,7 @@ namespace RandomX { } break; case LightInstructionType::IXOR_C: { - li.dst_ = gen.getByte() & 7; - li.src_ = -1; + li.hasSource_ = false; li.mod_ = 0; li.imm32_ = gen.getInt32(); li.opGroup_ = LightInstructionType::IXOR_R; @@ -611,10 +591,6 @@ namespace RandomX { } break; case LightInstructionType::IROR_R: { - li.dst_ = gen.getByte() & 7; - do { - li.src_ = gen.getByte() & 7; - } while (li.dst_ == li.src_); li.mod_ = 0; li.imm32_ = 0; li.opGroup_ = LightInstructionType::IROR_R; @@ -622,8 +598,7 @@ namespace RandomX { } break; case LightInstructionType::IROR_C: { - li.dst_ = gen.getByte() & 7; - li.src_ = -1; + li.hasSource_ = false; li.mod_ = 0; li.imm32_ = gen.getByte(); li.opGroup_ = LightInstructionType::IROR_R; @@ -631,8 +606,7 @@ namespace RandomX { } break; case LightInstructionType::COND_R: { - li.dst_ = gen.getByte() & 7; - li.src_ = gen.getByte() & 7; + li.canReuse_ = true; li.mod_ = gen.getByte(); li.imm32_ = gen.getInt32(); li.opGroup_ = LightInstructionType::COND_R; @@ -646,6 +620,24 @@ namespace RandomX { return li; } + bool selectDestination(int cycle, RegisterInfo (&registers)[8], Blake2Generator& gen) { + std::vector availableRegisters; + for (unsigned i = 0; i < 8; ++i) { + if (registers[i].latency <= cycle) + availableRegisters.push_back(i); + } + return selectRegister(availableRegisters, gen, dst_); + } + + bool selectSource(int cycle, RegisterInfo(&registers)[8], Blake2Generator& gen) { + std::vector availableRegisters; + for (unsigned i = 0; i < 8; ++i) { + if (registers[i].latency <= cycle && (canReuse_ || i != dst_)) +
availableRegisters.push_back(i); + } + return selectRegister(availableRegisters, gen, src_); + } + int getType() { return info_.getType(); } @@ -661,6 +653,9 @@ namespace RandomX { int getGroupPar() { return opGroupPar_; } + bool hasSource() { + return hasSource_; + } LightInstructionInfo& getInfo() { return info_; @@ -670,12 +665,14 @@ namespace RandomX { private: LightInstructionInfo info_; - int src_; - int dst_; + int src_ = -1; + int dst_ = -1; int mod_; uint32_t imm32_; int opGroup_; int opGroupPar_; + bool hasSource_ = true; + bool canReuse_ = false; LightInstruction(const LightInstructionInfo* info) : info_(*info) { for (unsigned i = 0; i < info_.getSize(); ++i) { @@ -782,7 +779,6 @@ namespace RandomX { RegisterInfo registers[8]; Blake2Generator gen(seed); std::vector instructions; - std::vector availableRegisters; DecoderBuffer& fetchLine = DecoderBuffer::Default; LightInstruction currentInstruction = LightInstruction::Null; @@ -794,22 +790,20 @@ namespace RandomX { int depCycle = 0; int mopIndex = 0; bool portsSaturated = false; + int outIndex = 0; while(!portsSaturated) { fetchLine = fetchLine.fetchNext(currentInstruction.getType(), gen); std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine.getName() << ")" << std::endl; - availableRegisters.clear(); - for (unsigned i = 0; i < 8; ++i) { - if (registers[i].latency <= cycle) - availableRegisters.push_back(i); - } - mopIndex = 0; while (!portsSaturated && mopIndex < fetchLine.getSize()) { if (instrIndex >= currentInstruction.getInfo().getSize()) { - currentInstruction = LightInstruction::createForSlot(gen, fetchLine.getCounts()[mopIndex], availableRegisters, fetchLine.getSize() == mopIndex + 1, fetchLine.getIndex() == 0 && mopIndex == 0); + if (currentInstruction.getType() >= 0) { + currentInstruction.toInstr(prog(outIndex++)); + } + currentInstruction = LightInstruction::createForSlot(gen, fetchLine.getCounts()[mopIndex], fetchLine.getSize() == mopIndex + 1, fetchLine.getIndex() == 0 
&& mopIndex == 0); instrIndex = 0; std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; rxOpCount++; @@ -821,16 +815,38 @@ namespace RandomX { } std::cout << mop.getName() << " "; + int scheduleCycle = scheduleUop(mop, portBusy, cycle, depCycle); + mop.setCycle(scheduleCycle); + + if (instrIndex == currentInstruction.getInfo().getDstOp()) { + while (!currentInstruction.selectDestination(scheduleCycle, registers, gen)) { + std::cout << "; dst STALL at cycle " << cycle << std::endl; + ++scheduleCycle; + ++cycle; + } + std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; + } + if (currentInstruction.hasSource() && instrIndex == currentInstruction.getInfo().getSrcOp()) { + while (!currentInstruction.selectSource(scheduleCycle, registers, gen)) { + std::cout << "; src STALL at cycle " << cycle << std::endl; + ++scheduleCycle; + ++cycle; + } + std::cout << "; src = r" << currentInstruction.getSource() << std::endl; + } + if (instrIndex == currentInstruction.getInfo().getResultOp()) { + int depCycle = scheduleCycle + mop.getLatency(); + registers[currentInstruction.getDestination()].latency = depCycle; + std::cout << "; RETIRED at cycle " << depCycle << std::endl; + } + codeSize += mop.getSize(); mopIndex++; instrIndex++; macroOpCount++; - int scheduleCycle = scheduleUop(mop, portBusy, cycle, depCycle); if (scheduleCycle >= RANDOMX_LPROG_LATENCY) { portsSaturated = true; } - mop.setCycle(scheduleCycle); - depCycle = scheduleCycle + mop.getLatency(); } ++cycle; } @@ -857,11 +873,14 @@ namespace RandomX { std::cout << "; RandomX instructions: " << rxOpCount << std::endl; for (int i = 0; i < RANDOMX_LPROG_LATENCY + 1; ++i) { + std::cout << std::setw(3) << i << " "; for (int j = 0; j < 3; ++j) { std::cout << (portBusy[i][j] ? 
'*' : '_'); } std::cout << std::endl; } + + prog.setSize(outIndex); } void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister) { diff --git a/src/main.cpp b/src/main.cpp index 8c1f64a..fdc198c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -224,7 +224,10 @@ int main(int argc, char** argv) { if (genLight) { RandomX::LightProgram p; RandomX::generateLightProg2(p, seed, 0); - //std::cout << p << std::endl; + RandomX::AssemblyGeneratorX86 asmX86; + asmX86.generateProgram(p); + std::cout << "-------------------------------------------------------" << std::endl; + asmX86.printCode(std::cout); return 0; } From 23a357db377ad9ab4c4e1481e46e005da28b0111 Mon Sep 17 00:00:00 2001 From: tevador Date: Mon, 1 Apr 2019 18:31:02 +0200 Subject: [PATCH 05/18] Removed optimizable instruction sequences --- src/LightProgramGenerator.cpp | 95 ++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index cea05ae..d4aa79d 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -143,10 +143,10 @@ namespace RandomX { class RegisterInfo { public: - RegisterInfo() : latency(0), lastOpGroup(-1), source(-1), value(0) {} + RegisterInfo() : latency(0), lastOpGroup(-1), lastOpPar(-1), value(0) {} int latency; int lastOpGroup; - int source; + int lastOpPar; int value; }; @@ -260,8 +260,8 @@ namespace RandomX { class LightInstructionInfo { public: - LightInstructionInfo(const char* name, int type, const MacroOp& op) - : name_(name), type_(type), latency_(op.getLatency()) { + LightInstructionInfo(const char* name, int type, const MacroOp& op, int srcOp) + : name_(name), type_(type), latency_(op.getLatency()), srcOp_(srcOp) { ops_.push_back(MacroOp(op)); } template @@ -334,26 +334,26 @@ namespace RandomX { int latency_; int resultOp_ = 0; int dstOp_ = 0; - int srcOp_ = 0; + int srcOp_; LightInstructionInfo(const char* name) : name_(name), 
type_(-1), latency_(0) {} }; - const LightInstructionInfo LightInstructionInfo::IADD_R = LightInstructionInfo("IADD_R", LightInstructionType::IADD_R, MacroOp::Add_rr); - const LightInstructionInfo LightInstructionInfo::IADD_C = LightInstructionInfo("IADD_C", LightInstructionType::IADD_C, MacroOp::Add_ri); - const LightInstructionInfo LightInstructionInfo::IADD_RC = LightInstructionInfo("IADD_RC", LightInstructionType::IADD_RC, MacroOp::Lea_sib); - const LightInstructionInfo LightInstructionInfo::ISUB_R = LightInstructionInfo("ISUB_R", LightInstructionType::ISUB_R, MacroOp::Sub_rr); - const LightInstructionInfo LightInstructionInfo::IMUL_9C = LightInstructionInfo("IMUL_9C", LightInstructionType::IMUL_9C, MacroOp::Lea_sib); - const LightInstructionInfo LightInstructionInfo::IMUL_R = LightInstructionInfo("IMUL_R", LightInstructionType::IMUL_R, MacroOp::Imul_rr); - const LightInstructionInfo LightInstructionInfo::IMUL_C = LightInstructionInfo("IMUL_C", LightInstructionType::IMUL_C, MacroOp::Imul_rri); + const LightInstructionInfo LightInstructionInfo::IADD_R = LightInstructionInfo("IADD_R", LightInstructionType::IADD_R, MacroOp::Add_rr, 0); + const LightInstructionInfo LightInstructionInfo::IADD_C = LightInstructionInfo("IADD_C", LightInstructionType::IADD_C, MacroOp::Add_ri, -1); + const LightInstructionInfo LightInstructionInfo::IADD_RC = LightInstructionInfo("IADD_RC", LightInstructionType::IADD_RC, MacroOp::Lea_sib, 0); + const LightInstructionInfo LightInstructionInfo::ISUB_R = LightInstructionInfo("ISUB_R", LightInstructionType::ISUB_R, MacroOp::Sub_rr, 0); + const LightInstructionInfo LightInstructionInfo::IMUL_9C = LightInstructionInfo("IMUL_9C", LightInstructionType::IMUL_9C, MacroOp::Lea_sib, 0); + const LightInstructionInfo LightInstructionInfo::IMUL_R = LightInstructionInfo("IMUL_R", LightInstructionType::IMUL_R, MacroOp::Imul_rr, 0); + const LightInstructionInfo LightInstructionInfo::IMUL_C = LightInstructionInfo("IMUL_C", LightInstructionType::IMUL_C, 
MacroOp::Imul_rri, -1); const LightInstructionInfo LightInstructionInfo::IMULH_R = LightInstructionInfo("IMULH_R", LightInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1); const LightInstructionInfo LightInstructionInfo::ISMULH_R = LightInstructionInfo("ISMULH_R", LightInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1); const LightInstructionInfo LightInstructionInfo::IMUL_RCP = LightInstructionInfo("IMUL_RCP", LightInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1); - const LightInstructionInfo LightInstructionInfo::IXOR_R = LightInstructionInfo("IXOR_R", LightInstructionType::IXOR_R, MacroOp::Xor_rr); - const LightInstructionInfo LightInstructionInfo::IXOR_C = LightInstructionInfo("IXOR_C", LightInstructionType::IXOR_C, MacroOp::Xor_ri); + const LightInstructionInfo LightInstructionInfo::IXOR_R = LightInstructionInfo("IXOR_R", LightInstructionType::IXOR_R, MacroOp::Xor_rr, 0); + const LightInstructionInfo LightInstructionInfo::IXOR_C = LightInstructionInfo("IXOR_C", LightInstructionType::IXOR_C, MacroOp::Xor_ri, -1); const LightInstructionInfo LightInstructionInfo::IROR_R = LightInstructionInfo("IROR_R", LightInstructionType::IROR_R, IROR_R_ops_array, 1, 1, 0); - const LightInstructionInfo LightInstructionInfo::IROR_C = LightInstructionInfo("IROR_C", LightInstructionType::IROR_C, MacroOp::Ror_ri); + const LightInstructionInfo LightInstructionInfo::IROR_C = LightInstructionInfo("IROR_C", LightInstructionType::IROR_C, MacroOp::Ror_ri, -1); const LightInstructionInfo LightInstructionInfo::COND_R = LightInstructionInfo("COND_R", LightInstructionType::COND_R, COND_R_ops_array, 5, 5, 3); const LightInstructionInfo LightInstructionInfo::NOP = LightInstructionInfo("NOP"); @@ -504,29 +504,28 @@ namespace RandomX { li.mod_ = 0; li.imm32_ = 0; li.opGroup_ = LightInstructionType::IADD_R; - li.opGroupPar_ = li.src_; + li.groupParIsSource_ = true; } break; case LightInstructionType::IADD_C: { - li.hasSource_ = false; li.mod_ = 0; li.imm32_ = gen.getInt32(); 
li.opGroup_ = LightInstructionType::IADD_R; - li.opGroupPar_ = li.src_; + li.groupParIsSource_ = true; } break; case LightInstructionType::IADD_RC: { li.mod_ = 0; li.imm32_ = gen.getInt32(); li.opGroup_ = LightInstructionType::IADD_R; - li.opGroupPar_ = li.src_; + li.groupParIsSource_ = true; } break; case LightInstructionType::ISUB_R: { li.mod_ = 0; li.imm32_ = 0; li.opGroup_ = LightInstructionType::IADD_R; - li.opGroupPar_ = li.src_; + li.groupParIsSource_ = true; } break; case LightInstructionType::IMUL_9C: { @@ -544,11 +543,10 @@ namespace RandomX { } break; case LightInstructionType::IMUL_C: { - li.hasSource_ = false; li.mod_ = 0; li.imm32_ = gen.getInt32(); li.opGroup_ = LightInstructionType::IMUL_C; - li.opGroupPar_ = li.src_; + li.opGroupPar_ = -1; } break; case LightInstructionType::IMULH_R: { @@ -568,7 +566,6 @@ namespace RandomX { } break; case LightInstructionType::IMUL_RCP: { - li.hasSource_ = false; li.mod_ = 0; li.imm32_ = gen.getInt32(); li.opGroup_ = LightInstructionType::IMUL_C; @@ -579,15 +576,14 @@ namespace RandomX { li.mod_ = 0; li.imm32_ = 0; li.opGroup_ = LightInstructionType::IXOR_R; - li.opGroupPar_ = li.src_; + li.groupParIsSource_ = true; } break; case LightInstructionType::IXOR_C: { - li.hasSource_ = false; li.mod_ = 0; li.imm32_ = gen.getInt32(); li.opGroup_ = LightInstructionType::IXOR_R; - li.opGroupPar_ = li.src_; + li.opGroupPar_ = -1; } break; case LightInstructionType::IROR_R: { @@ -598,9 +594,10 @@ namespace RandomX { } break; case LightInstructionType::IROR_C: { - li.hasSource_ = false; li.mod_ = 0; - li.imm32_ = gen.getByte(); + do { + li.imm32_ = gen.getByte(); + } while ((li.imm32_ & 63) == 0); li.opGroup_ = LightInstructionType::IROR_R; li.opGroupPar_ = -1; } break; @@ -623,7 +620,7 @@ namespace RandomX { bool selectDestination(int cycle, RegisterInfo (&registers)[8], Blake2Generator& gen) { std::vector availableRegisters; for (unsigned i = 0; i < 8; ++i) { - if (registers[i].latency <= cycle) + if (registers[i].latency <=
cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_)) availableRegisters.push_back(i); } return selectRegister(availableRegisters, gen, dst_); @@ -632,10 +629,15 @@ namespace RandomX { bool selectSource(int cycle, RegisterInfo(&registers)[8], Blake2Generator& gen) { std::vector availableRegisters; for (unsigned i = 0; i < 8; ++i) { - if (registers[i].latency <= cycle && (canReuse_ || i != dst_)) + if (registers[i].latency <= cycle) availableRegisters.push_back(i); } - return selectRegister(availableRegisters, gen, src_); + if (selectRegister(availableRegisters, gen, src_)) { + if (groupParIsSource_) + opGroupPar_ = src_; + return true; + } + return false; } int getType() { @@ -653,9 +655,6 @@ namespace RandomX { int getGroupPar() { return opGroupPar_; } - bool hasSource() { - return hasSource_; - } LightInstructionInfo& getInfo() { return info_; @@ -671,8 +670,8 @@ namespace RandomX { uint32_t imm32_; int opGroup_; int opGroupPar_; - bool hasSource_ = true; bool canReuse_ = false; + bool groupParIsSource_ = false; LightInstruction(const LightInstructionInfo* info) : info_(*info) { for (unsigned i = 0; i < info_.getSize(); ++i) { @@ -818,6 +817,14 @@ namespace RandomX { int scheduleCycle = scheduleUop(mop, portBusy, cycle, depCycle); mop.setCycle(scheduleCycle); + if (instrIndex == currentInstruction.getInfo().getSrcOp()) { + while (!currentInstruction.selectSource(scheduleCycle, registers, gen)) { + std::cout << "; src STALL at cycle " << cycle << std::endl; + ++scheduleCycle; + ++cycle; + } + std::cout << "; src = r" << currentInstruction.getSource() << std::endl; + } if (instrIndex == currentInstruction.getInfo().getDstOp()) { while (!currentInstruction.selectDestination(scheduleCycle, registers, gen)) { std::cout << "; dst STALL at cycle " << cycle << std::endl; @@ -826,20 +833,16 @@ namespace RandomX { } std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; } - if
(currentInstruction.hasSource() && instrIndex == currentInstruction.getInfo().getSrcOp()) { - while (!currentInstruction.selectSource(scheduleCycle, registers, gen)) { - std::cout << "; src STALL at cycle " << cycle << std::endl; - ++scheduleCycle; - ++cycle; - } - std::cout << "; src = r" << currentInstruction.getSource() << std::endl; - } + depCycle = scheduleCycle + mop.getLatency(); if (instrIndex == currentInstruction.getInfo().getResultOp()) { - int depCycle = scheduleCycle + mop.getLatency(); - registers[currentInstruction.getDestination()].latency = depCycle; + int dst = currentInstruction.getDestination(); + RegisterInfo& ri = registers[dst]; + + ri.latency = depCycle; + ri.lastOpGroup = currentInstruction.getGroup(); + ri.lastOpPar = currentInstruction.getGroupPar(); std::cout << "; RETIRED at cycle " << depCycle << std::endl; } - codeSize += mop.getSize(); mopIndex++; instrIndex++; From 428b845a3d62bd722e3ec18622c28f6f876c78ac Mon Sep 17 00:00:00 2001 From: tevador Date: Mon, 1 Apr 2019 19:04:08 +0200 Subject: [PATCH 06/18] Fixed an infinite loop bug --- src/AssemblyGeneratorX86.cpp | 4 ++-- src/LightProgramGenerator.cpp | 30 +++++++++++++++++++++++------- src/LightProgramGenerator.hpp | 2 +- src/main.cpp | 2 +- 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 8a4012a..8b5dbcf 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -475,7 +475,7 @@ namespace RandomX { //4 uOPs void AssemblyGeneratorX86::h_COND_R(Instruction& instr, int i) { handleCondition(instr, i); - asmCode << "\txor ecx, ecx" << std::endl; + asmCode << "\txor rcx, rcx" << std::endl; asmCode << "\tcmp " << regR32[instr.src] << ", " << (int32_t)instr.getImm32() << std::endl; asmCode << "\tset" << condition(instr) << " cl" << std::endl; asmCode << "\tadd " << regR[instr.dst] << ", rcx" << std::endl; @@ -485,7 +485,7 @@ namespace RandomX { //6 uOPs void 
AssemblyGeneratorX86::h_COND_M(Instruction& instr, int i) { handleCondition(instr, i); - asmCode << "\txor ecx, ecx" << std::endl; + asmCode << "\txor rcx, rcx" << std::endl; genAddressReg(instr); asmCode << "\tcmp dword ptr [rsi+rax], " << (int32_t)instr.getImm32() << std::endl; asmCode << "\tset" << condition(instr) << " cl" << std::endl; diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index d4aa79d..96207c0 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -111,10 +111,10 @@ namespace RandomX { class Blake2Generator { public: - Blake2Generator(const void* seed) : dataIndex(sizeof(data)) { + Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) { memset(data, 0, sizeof(data)); memcpy(data, seed, SeedSize); - data[60] = 39; + store32(&data[60], nonce); } uint8_t getByte() { @@ -434,7 +434,7 @@ namespace RandomX { const LightInstructionInfo* slot_3L[] = { &LightInstructionInfo::IADD_R, &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R }; const LightInstructionInfo* slot_3F[] = { &LightInstructionInfo::IADD_R, &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IROR_R }; const LightInstructionInfo* slot_4[] = { &LightInstructionInfo::IMUL_R, &LightInstructionInfo::IROR_C }; - const LightInstructionInfo* slot_7[] = { &LightInstructionInfo::IADD_C, &LightInstructionInfo::IMUL_C, &LightInstructionInfo::IXOR_C, &LightInstructionInfo::IXOR_C }; + const LightInstructionInfo* slot_7[] = { &LightInstructionInfo::IADD_C, &LightInstructionInfo::IMUL_C, &LightInstructionInfo::IXOR_C, &LightInstructionInfo::IMUL_C }; const LightInstructionInfo* slot_7L = &LightInstructionInfo::COND_R; const LightInstructionInfo* slot_8[] = { &LightInstructionInfo::IADD_RC, &LightInstructionInfo::IMUL_9C }; const 
LightInstructionInfo* slot_10 = &LightInstructionInfo::IMUL_RCP; @@ -771,12 +771,12 @@ namespace RandomX { } } - void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister) { + void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister, int nonce) { ExecutionPort::type portBusy[RANDOMX_LPROG_LATENCY + 1][3]; memset(portBusy, 0, sizeof(portBusy)); RegisterInfo registers[8]; - Blake2Generator gen(seed); + Blake2Generator gen(seed, nonce); std::vector instructions; DecoderBuffer& fetchLine = DecoderBuffer::Default; @@ -790,6 +790,8 @@ namespace RandomX { int mopIndex = 0; bool portsSaturated = false; int outIndex = 0; + int attempts = 0; + constexpr int MAX_ATTEMPTS = 4; while(!portsSaturated) { fetchLine = fetchLine.fetchNext(currentInstruction.getType(), gen); @@ -798,6 +800,7 @@ namespace RandomX { mopIndex = 0; while (!portsSaturated && mopIndex < fetchLine.getSize()) { + int topCycle = cycle; if (instrIndex >= currentInstruction.getInfo().getSize()) { if (currentInstruction.getType() >= 0) { currentInstruction.toInstr(prog(outIndex++)); @@ -818,19 +821,31 @@ namespace RandomX { mop.setCycle(scheduleCycle); if (instrIndex == currentInstruction.getInfo().getSrcOp()) { - while (!currentInstruction.selectSource(scheduleCycle, registers, gen)) { + for (attempts = 0; attempts < MAX_ATTEMPTS && !currentInstruction.selectSource(scheduleCycle, registers, gen); ++attempts) { std::cout << "; src STALL at cycle " << cycle << std::endl; ++scheduleCycle; ++cycle; } + if (attempts == MAX_ATTEMPTS) { //throw instruction away + cycle = topCycle; + instrIndex = currentInstruction.getInfo().getSize(); + std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + continue; + } std::cout << "; src = r" << currentInstruction.getSource() << std::endl; } if (instrIndex == currentInstruction.getInfo().getDstOp()) { - while (!currentInstruction.selectDestination(scheduleCycle, registers, gen)) { + for (attempts = 0; 
attempts < MAX_ATTEMPTS && !currentInstruction.selectDestination(scheduleCycle, registers, gen); ++attempts) { std::cout << "; dst STALL at cycle " << cycle << std::endl; ++scheduleCycle; ++cycle; } + if (attempts == MAX_ATTEMPTS) { //throw instruction away + cycle = topCycle; + instrIndex = currentInstruction.getInfo().getSize(); + std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + continue; + } std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; } depCycle = scheduleCycle + mop.getLatency(); @@ -850,6 +865,7 @@ namespace RandomX { if (scheduleCycle >= RANDOMX_LPROG_LATENCY) { portsSaturated = true; } + cycle = topCycle; } ++cycle; } diff --git a/src/LightProgramGenerator.hpp b/src/LightProgramGenerator.hpp index a7762b1..34688db 100644 --- a/src/LightProgramGenerator.hpp +++ b/src/LightProgramGenerator.hpp @@ -21,5 +21,5 @@ along with RandomX. If not, see. namespace RandomX { void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister); - void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister); + void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister, int nonce); } \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index fdc198c..e4f9407 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -223,7 +223,7 @@ int main(int argc, char** argv) { if (genLight) { RandomX::LightProgram p; - RandomX::generateLightProg2(p, seed, 0); + RandomX::generateLightProg2(p, seed, 0, programCount); RandomX::AssemblyGeneratorX86 asmX86; asmX86.generateProgram(p); std::cout << "-------------------------------------------------------" << std::endl; From 2aaec84931aad4e753df957b35c9cc0aba161924 Mon Sep 17 00:00:00 2001 From: tevador Date: Wed, 3 Apr 2019 09:53:25 +0200 Subject: [PATCH 07/18] Bug fixes, trace output --- src/LightProgramGenerator.cpp | 184 +++++++++++++++++++++------------- src/LightProgramGenerator.hpp | 2 +- src/main.cpp | 2 
+- 3 files changed, 115 insertions(+), 73 deletions(-) diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index 96207c0..9d35d67 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -62,6 +62,10 @@ namespace RandomX { constexpr int COND_R = IROR_R + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R + RANDOMX_FREQ_ISWAP_R + RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R; } + static bool isMul(uint8_t opcode) { + return opcode == LightInstructionOpcode::IMUL_R || opcode == LightInstructionOpcode::IMULH_R || opcode == LightInstructionOpcode::ISMULH_R || opcode == LightInstructionOpcode::IMUL_RCP; + } + const int lightInstructionOpcode[] = { LightInstructionOpcode::IADD_R, LightInstructionOpcode::IADD_R, @@ -344,7 +348,7 @@ namespace RandomX { const LightInstructionInfo LightInstructionInfo::IADD_C = LightInstructionInfo("IADD_C", LightInstructionType::IADD_C, MacroOp::Add_ri, -1); const LightInstructionInfo LightInstructionInfo::IADD_RC = LightInstructionInfo("IADD_RC", LightInstructionType::IADD_RC, MacroOp::Lea_sib, 0); const LightInstructionInfo LightInstructionInfo::ISUB_R = LightInstructionInfo("ISUB_R", LightInstructionType::ISUB_R, MacroOp::Sub_rr, 0); - const LightInstructionInfo LightInstructionInfo::IMUL_9C = LightInstructionInfo("IMUL_9C", LightInstructionType::IMUL_9C, MacroOp::Lea_sib, 0); + const LightInstructionInfo LightInstructionInfo::IMUL_9C = LightInstructionInfo("IMUL_9C", LightInstructionType::IMUL_9C, MacroOp::Lea_sib, -1); const LightInstructionInfo LightInstructionInfo::IMUL_R = LightInstructionInfo("IMUL_R", LightInstructionType::IMUL_R, MacroOp::Imul_rr, 0); const LightInstructionInfo LightInstructionInfo::IMUL_C = LightInstructionInfo("IMUL_C", LightInstructionType::IMUL_C, MacroOp::Imul_rri, -1); const LightInstructionInfo 
LightInstructionInfo::IMULH_R = LightInstructionInfo("IMULH_R", LightInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1); @@ -434,7 +438,7 @@ namespace RandomX { const LightInstructionInfo* slot_3L[] = { &LightInstructionInfo::IADD_R, &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R }; const LightInstructionInfo* slot_3F[] = { &LightInstructionInfo::IADD_R, &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IROR_R }; const LightInstructionInfo* slot_4[] = { &LightInstructionInfo::IMUL_R, &LightInstructionInfo::IROR_C }; - const LightInstructionInfo* slot_7[] = { &LightInstructionInfo::IADD_C, &LightInstructionInfo::IMUL_C, &LightInstructionInfo::IXOR_C, &LightInstructionInfo::IMUL_C }; + const LightInstructionInfo* slot_7[] = { &LightInstructionInfo::IADD_C, &LightInstructionInfo::IMUL_C, &LightInstructionInfo::IXOR_C, &LightInstructionInfo::IADD_C }; const LightInstructionInfo* slot_7L = &LightInstructionInfo::COND_R; const LightInstructionInfo* slot_8[] = { &LightInstructionInfo::IADD_RC, &LightInstructionInfo::IMUL_9C }; const LightInstructionInfo* slot_10 = &LightInstructionInfo::IMUL_RCP; @@ -686,77 +690,95 @@ namespace RandomX { const LightInstruction LightInstruction::Null = LightInstruction(&LightInstructionInfo::NOP); constexpr int ALU_COUNT_MUL = 1; - constexpr int ALU_COUNT = 4; + constexpr int ALU_COUNT = 3; constexpr int LIGHT_OPCODE_BITS = 4; constexpr int V4_SRC_INDEX_BITS = 3; constexpr int V4_DST_INDEX_BITS = 3; + constexpr int CYCLE_MAP_SIZE = RANDOMX_LPROG_LATENCY + 3; + constexpr bool TRACE = true; static int blakeCounter = 0; - static int scheduleUop(const MacroOp& mop, ExecutionPort::type(&portBusy)[RANDOMX_LPROG_LATENCY + 1][3], int cycle, int depCycle) { + template + static int scheduleUop(const MacroOp& mop, 
ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle, int depCycle) { if (mop.isDependent()) { cycle = std::max(cycle, depCycle); } if (mop.isEliminated()) { - std::cout << "; (eliminated)" << std::endl; + if (commit) + if (TRACE) std::cout << "; (eliminated)" << std::endl; return cycle; } else if (mop.isSimple()) { if (mop.getUop1() <= ExecutionPort::P5) { - for (; cycle <= RANDOMX_LPROG_LATENCY; ++cycle) { + for (; cycle < CYCLE_MAP_SIZE; ++cycle) { if (!portBusy[cycle][mop.getUop1() - 1]) { - std::cout << "; P" << mop.getUop1() - 1 << " at cycle " << cycle << std::endl; - portBusy[cycle][mop.getUop1() - 1] = mop.getUop1(); + if (commit) { + if (TRACE) std::cout << "; P" << mop.getUop1() - 1 << " at cycle " << cycle << std::endl; + portBusy[cycle][mop.getUop1() - 1] = mop.getUop1(); + } return cycle; } } } else if (mop.getUop1() == ExecutionPort::P05) { - for (; cycle <= RANDOMX_LPROG_LATENCY; ++cycle) { + for (; cycle < CYCLE_MAP_SIZE; ++cycle) { if (!portBusy[cycle][0]) { - std::cout << "; P0 at cycle " << cycle << std::endl; - portBusy[cycle][0] = mop.getUop1(); + if (commit) { + if (TRACE) std::cout << "; P0 at cycle " << cycle << std::endl; + portBusy[cycle][0] = mop.getUop1(); + } return cycle; } if (!portBusy[cycle][2]) { - std::cout << "; P2 at cycle " << cycle << std::endl; - portBusy[cycle][2] = mop.getUop1(); + if (commit) { + if (TRACE) std::cout << "; P2 at cycle " << cycle << std::endl; + portBusy[cycle][2] = mop.getUop1(); + } return cycle; } } } else { - for (; cycle <= RANDOMX_LPROG_LATENCY; ++cycle) { + for (; cycle < CYCLE_MAP_SIZE; ++cycle) { if (!portBusy[cycle][0]) { - std::cout << "; P0 at cycle " << cycle << std::endl; - portBusy[cycle][0] = mop.getUop1(); + if (commit) { + if (TRACE) std::cout << "; P0 at cycle " << cycle << std::endl; + portBusy[cycle][0] = mop.getUop1(); + } return cycle; } if (!portBusy[cycle][2]) { - std::cout << "; P2 at cycle " << cycle << std::endl; - portBusy[cycle][2] = mop.getUop1(); + if (commit) { + if 
(TRACE) std::cout << "; P2 at cycle " << cycle << std::endl; + portBusy[cycle][2] = mop.getUop1(); + } return cycle; } if (!portBusy[cycle][1]) { - std::cout << "; P1 at cycle " << cycle << std::endl; - portBusy[cycle][1] = mop.getUop1(); + if (commit) { + if (TRACE) std::cout << "; P1 at cycle " << cycle << std::endl; + portBusy[cycle][1] = mop.getUop1(); + } return cycle; } } } } else { - for (; cycle <= RANDOMX_LPROG_LATENCY; ++cycle) { + for (; cycle < CYCLE_MAP_SIZE; ++cycle) { if (!portBusy[cycle][mop.getUop1() - 1] && !portBusy[cycle][mop.getUop2() - 1]) { - std::cout << "; P" << mop.getUop1() - 1 << " P" << mop.getUop2() - 1 << " at cycle " << cycle << std::endl; - portBusy[cycle][mop.getUop1() - 1] = mop.getUop1(); - portBusy[cycle][mop.getUop2() - 1] = mop.getUop2(); + if (commit) { + if (TRACE) std::cout << "; P" << mop.getUop1() - 1 << " P" << mop.getUop2() - 1 << " at cycle " << cycle << std::endl; + portBusy[cycle][mop.getUop1() - 1] = mop.getUop1(); + portBusy[cycle][mop.getUop2() - 1] = mop.getUop2(); + } return cycle; } } } - std::cout << "Unable to map operation '" << mop.getName() << "' to execution port"; + if (TRACE) std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << cycle << ")" << std::endl; return -1; } @@ -773,7 +795,7 @@ namespace RandomX { void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister, int nonce) { - ExecutionPort::type portBusy[RANDOMX_LPROG_LATENCY + 1][3]; + ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3]; memset(portBusy, 0, sizeof(portBusy)); RegisterInfo registers[8]; Blake2Generator gen(seed, nonce); @@ -784,9 +806,9 @@ namespace RandomX { int instrIndex = 0; int codeSize = 0; int macroOpCount = 0; - int rxOpCount = 0; int cycle = 0; int depCycle = 0; + int retireCycle = 0; int mopIndex = 0; bool portsSaturated = false; int outIndex = 0; @@ -795,69 +817,72 @@ namespace RandomX { while(!portsSaturated) { fetchLine = 
fetchLine.fetchNext(currentInstruction.getType(), gen); - std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine.getName() << ")" << std::endl; + if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine.getName() << ")" << std::endl; mopIndex = 0; - while (!portsSaturated && mopIndex < fetchLine.getSize()) { + while (mopIndex < fetchLine.getSize()) { int topCycle = cycle; if (instrIndex >= currentInstruction.getInfo().getSize()) { - if (currentInstruction.getType() >= 0) { - currentInstruction.toInstr(prog(outIndex++)); - } + if (portsSaturated) + break; currentInstruction = LightInstruction::createForSlot(gen, fetchLine.getCounts()[mopIndex], fetchLine.getSize() == mopIndex + 1, fetchLine.getIndex() == 0 && mopIndex == 0); instrIndex = 0; - std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; - rxOpCount++; + if (TRACE) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; } MacroOp& mop = currentInstruction.getInfo().getOp(instrIndex); if (fetchLine.getCounts()[mopIndex] != mop.getSize()) { - std::cout << "ERROR instruction " << mop.getName() << " doesn't fit into slot of size " << fetchLine.getCounts()[mopIndex] << std::endl; + if (TRACE) std::cout << "ERROR instruction " << mop.getName() << " doesn't fit into slot of size " << fetchLine.getCounts()[mopIndex] << std::endl; return; } - std::cout << mop.getName() << " "; - int scheduleCycle = scheduleUop(mop, portBusy, cycle, depCycle); + if (TRACE) std::cout << mop.getName() << " "; + int scheduleCycle = scheduleUop(mop, portBusy, cycle, depCycle); mop.setCycle(scheduleCycle); + if (scheduleCycle < 0) { + if (TRACE) std::cout << "; Failed at cycle " << cycle << std::endl; + return; + } if (instrIndex == currentInstruction.getInfo().getSrcOp()) { for (attempts = 0; attempts < MAX_ATTEMPTS && !currentInstruction.selectSource(scheduleCycle, registers, gen); ++attempts) { - std::cout << "; src STALL at cycle " << cycle << 
std::endl; + if (TRACE) std::cout << "; src STALL at cycle " << cycle << std::endl; ++scheduleCycle; ++cycle; } if (attempts == MAX_ATTEMPTS) { //throw instruction away - cycle = topCycle; + //cycle = topCycle; instrIndex = currentInstruction.getInfo().getSize(); - std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; continue; } - std::cout << "; src = r" << currentInstruction.getSource() << std::endl; + if (TRACE) std::cout << "; src = r" << currentInstruction.getSource() << std::endl; } if (instrIndex == currentInstruction.getInfo().getDstOp()) { for (attempts = 0; attempts < MAX_ATTEMPTS && !currentInstruction.selectDestination(scheduleCycle, registers, gen); ++attempts) { - std::cout << "; dst STALL at cycle " << cycle << std::endl; + if (TRACE) std::cout << "; dst STALL at cycle " << cycle << std::endl; ++scheduleCycle; ++cycle; } if (attempts == MAX_ATTEMPTS) { //throw instruction away - cycle = topCycle; + //cycle = topCycle; instrIndex = currentInstruction.getInfo().getSize(); - std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; continue; } - std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; + if (TRACE) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; } depCycle = scheduleCycle + mop.getLatency(); if (instrIndex == currentInstruction.getInfo().getResultOp()) { int dst = currentInstruction.getDestination(); RegisterInfo& ri = registers[dst]; - - ri.latency = depCycle; + retireCycle = depCycle; + ri.latency = retireCycle; ri.lastOpGroup = currentInstruction.getGroup(); ri.lastOpPar = currentInstruction.getGroupPar(); - std::cout << "; RETIRED at cycle " << depCycle << std::endl; + if (TRACE) std::cout << "; RETIRED at cycle " << retireCycle << 
std::endl; } + scheduleUop(mop, portBusy, scheduleCycle, scheduleCycle); codeSize += mop.getSize(); mopIndex++; instrIndex++; @@ -866,43 +891,60 @@ namespace RandomX { portsSaturated = true; } cycle = topCycle; + if (instrIndex >= currentInstruction.getInfo().getSize()) { + currentInstruction.toInstr(prog(outIndex++)); + } } ++cycle; } - while (instrIndex < currentInstruction.getInfo().getSize()) { - if (mopIndex >= fetchLine.getSize()) { - fetchLine = fetchLine.fetchNext(currentInstruction.getType(), gen); - std::cout << "; cycle " << cycle++ << " buffer " << fetchLine.getName() << std::endl; - mopIndex = 0; - } - MacroOp& mop = currentInstruction.getInfo().getOp(instrIndex); - std::cout << mop.getName() << " "; - codeSize += mop.getSize(); - mopIndex++; - instrIndex++; - macroOpCount++; - int scheduleCycle = scheduleUop(mop, portBusy, cycle, depCycle); - mop.setCycle(scheduleCycle); - depCycle = scheduleCycle + mop.getLatency(); - } + std::cout << "; ALU port utilization:" << std::endl; + std::cout << "; (*= in use, _ = idle)" << std::endl; - std::cout << "; code size " << codeSize << std::endl; - std::cout << "; x86 macro-ops: " << macroOpCount << std::endl; - std::cout << "; RandomX instructions: " << rxOpCount << std::endl; - - for (int i = 0; i < RANDOMX_LPROG_LATENCY + 1; ++i) { - std::cout << std::setw(3) << i << " "; + int portCycles = 0; + for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { + std::cout << "; " << std::setw(3) << i << " "; for (int j = 0; j < 3; ++j) { std::cout << (portBusy[i][j] ? 
'*' : '_'); + portCycles += !!portBusy[i][j]; } std::cout << std::endl; } + std::cout << "; code size " << codeSize << " bytes" << std::endl; + std::cout << "; x86 macro-ops: " << macroOpCount << std::endl; + std::cout << "; RandomX instructions: " << outIndex << std::endl; + std::cout << "; Execution time: " << retireCycle << " cycles" << std::endl; + std::cout << "; IPC = " << (macroOpCount / (double)retireCycle) << std::endl; + std::cout << "; Port-cycles: " << portCycles << std::endl; + + int asicLatency[8]; + memset(asicLatency, 0, sizeof(asicLatency)); + int mulCount = 0; + + for (int i = 0; i < outIndex; ++i) { + Instruction& instr = prog(i); + int latDst = asicLatency[instr.dst] + 1; + int latSrc = instr.dst != instr.src ? asicLatency[instr.src] + 1 : 0; + asicLatency[instr.dst] = std::max(latDst, latSrc); + mulCount += isMul(instr.opcode); + } + + std::cout << "; Multiplications: " << mulCount << std::endl; + + std::cout << "; ASIC latency:" << std::endl; + for (int i = 0; i < 8; ++i) { + std::cout << "; r" << i << " = " << asicLatency[i] << std::endl; + } + std::cout << "; CPU latency:" << std::endl; + for (int i = 0; i < 8; ++i) { + std::cout << "; r" << i << " = " << registers[i].latency << std::endl; + } + prog.setSize(outIndex); } - void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister) { + void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister, int nonce) { // Source: https://www.agner.org/optimize/instruction_tables.pdf const int op_latency[LightInstructionType::COUNT] = { 1, 2, 1, 2, 3, 5, 5, 4, 1, 2, 5 }; diff --git a/src/LightProgramGenerator.hpp b/src/LightProgramGenerator.hpp index 34688db..8027aab 100644 --- a/src/LightProgramGenerator.hpp +++ b/src/LightProgramGenerator.hpp @@ -20,6 +20,6 @@ along with RandomX. If not, see. 
#include "Program.hpp" namespace RandomX { - void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister); + void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister, int nonce); void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister, int nonce); } \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index e4f9407..d22b4f4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -226,7 +226,7 @@ int main(int argc, char** argv) { RandomX::generateLightProg2(p, seed, 0, programCount); RandomX::AssemblyGeneratorX86 asmX86; asmX86.generateProgram(p); - std::cout << "-------------------------------------------------------" << std::endl; + //std::ofstream file("lightProg2.asm"); asmX86.printCode(std::cout); return 0; } From 690707ef4914094f428ed5933bd402227918da7d Mon Sep 17 00:00:00 2001 From: tevador Date: Wed, 3 Apr 2019 14:06:59 +0200 Subject: [PATCH 08/18] Reworked addition instructions Some bug fixes --- src/LightProgramGenerator.cpp | 453 ++++++++-------------------------- src/main.cpp | 6 +- 2 files changed, 107 insertions(+), 352 deletions(-) diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index 9d35d67..5825808 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -30,22 +30,20 @@ along with RandomX. If not, see. 
namespace RandomX { // Intel Ivy Bridge reference namespace LightInstructionType { //uOPs (decode) execution ports latency code size - constexpr int IADD_R = 0; //1 p015 1 3 - constexpr int IADD_C = 1; //1 p015 1 7 - constexpr int IADD_RC = 2; //1 p1 3 8 - constexpr int ISUB_R = 3; //1 p015 1 3 - constexpr int IMUL_9C = 4; //1 p1 3 8 - constexpr int IMUL_R = 5; //1 p1 3 4 - constexpr int IMUL_C = 6; //1 p1 3 7 - constexpr int IMULH_R = 7; //1+2+1 0+(p1,p5)+0 3 3+3+3 - constexpr int ISMULH_R = 8; //1+2+1 0+(p1,p5)+0 3 3+3+3 - constexpr int IMUL_RCP = 9; //1+1 p015+p1 4 10+4 - constexpr int IXOR_R = 10; //1 p015 1 3 - constexpr int IXOR_C = 11; //1 p015 1 7 - constexpr int IROR_R = 12; //1+2 0+(p0,p5) 1 3+3 - constexpr int IROR_C = 13; //1 p05 1 4 - constexpr int COND_R = 14; //1+1+1+1+1+1 p015+p5+0+p015+p05+p015 3 7+13+3+7+3+3 - constexpr int COUNT = 15; + constexpr int IADD_RS = 0; //1 p01 1 4 + constexpr int ISUB_R = 1; //1 p015 1 3 + constexpr int ISUB_C = 2; //1 p015 3 7 + constexpr int IMUL_R = 3; //1 p1 3 4 + constexpr int IMUL_C = 4; //1 p1 3 7 + constexpr int IMULH_R = 5; //1+2+1 0+(p1,p5)+0 3 3+3+3 + constexpr int ISMULH_R = 6; //1+2+1 0+(p1,p5)+0 3 3+3+3 + constexpr int IMUL_RCP = 7; //1+1 p015+p1 4 10+4 + constexpr int IXOR_R = 8; //1 p015 1 3 + constexpr int IXOR_C = 9; //1 p015 1 7 + constexpr int IROR_R = 10; //1+2 0+(p0,p5) 1 3+3 + constexpr int IROR_C = 11; //1 p05 1 4 + constexpr int COND_R = 12; //1+1+1+1+1+1 p015+p5+0+p015+p05+p015 3 7+13+3+7+3+3 + constexpr int COUNT = 13; } namespace LightInstructionOpcode { @@ -62,8 +60,8 @@ namespace RandomX { constexpr int COND_R = IROR_R + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R + RANDOMX_FREQ_ISWAP_R + RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R; } - static bool isMul(uint8_t opcode) { - return opcode == LightInstructionOpcode::IMUL_R || opcode == 
LightInstructionOpcode::IMULH_R || opcode == LightInstructionOpcode::ISMULH_R || opcode == LightInstructionOpcode::IMUL_RCP; + static bool isMul(int type) { + return type == LightInstructionType::IMUL_R || type == LightInstructionType::IMUL_C || type == LightInstructionType::IMULH_R || type == LightInstructionType::ISMULH_R || type == LightInstructionType::IMUL_RCP; } const int lightInstructionOpcode[] = { @@ -84,33 +82,15 @@ namespace RandomX { LightInstructionOpcode::COND_R }; - const int lightInstruction[] = { - LightInstructionType::IADD_R, - LightInstructionType::IADD_C, - LightInstructionType::IADD_RC, - LightInstructionType::ISUB_R, - LightInstructionType::IMUL_9C, - LightInstructionType::IMUL_R, - LightInstructionType::IMUL_R, - LightInstructionType::IMUL_C, - LightInstructionType::IMULH_R, - LightInstructionType::ISMULH_R, - LightInstructionType::IMUL_RCP, - LightInstructionType::IXOR_R, - LightInstructionType::IXOR_C, - LightInstructionType::IROR_R, - LightInstructionType::IROR_C, - LightInstructionType::COND_R - }; - namespace ExecutionPort { using type = int; constexpr type Null = 0; constexpr type P0 = 1; constexpr type P1 = 2; constexpr type P5 = 3; - constexpr type P05 = 4; - constexpr type P015 = 5; + constexpr type P01 = 4; + constexpr type P05 = 5; + constexpr type P015 = 6; } class Blake2Generator { @@ -210,6 +190,7 @@ namespace RandomX { static const MacroOp Add_ri; static const MacroOp Lea_sib; static const MacroOp Sub_rr; + static const MacroOp Sub_ri; static const MacroOp Imul_rr; static const MacroOp Imul_rri; static const MacroOp Imul_r; @@ -238,8 +219,9 @@ namespace RandomX { const MacroOp MacroOp::Add_rr = MacroOp("add r,r", 3, 1, ExecutionPort::P015); const MacroOp MacroOp::Add_ri = MacroOp("add r,i", 7, 1, ExecutionPort::P015); - const MacroOp MacroOp::Lea_sib = MacroOp("lea r,m", 8, 3, ExecutionPort::P1); + const MacroOp MacroOp::Lea_sib = MacroOp("lea r,r+r*s", 4, 1, ExecutionPort::P01); const MacroOp MacroOp::Sub_rr = MacroOp("sub 
r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Sub_ri = MacroOp("sub r,i", 7, 1, ExecutionPort::P015); const MacroOp MacroOp::Imul_rr = MacroOp("imul r,r", 4, 3, ExecutionPort::P1); const MacroOp MacroOp::Imul_rri = MacroOp("imul r,r,i", 7, 3, ExecutionPort::P1); const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 3, ExecutionPort::P1, ExecutionPort::P5); @@ -253,7 +235,7 @@ namespace RandomX { const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3); const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015); const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05); - const MacroOp MacroOp::TestJmp_fused = MacroOp("testjmp r,i", 13, 0, ExecutionPort::P5); + const MacroOp MacroOp::TestJmp_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5); const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr }; const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr }; @@ -315,11 +297,9 @@ namespace RandomX { int getSrcOp() const { return srcOp_; } - static const LightInstructionInfo IADD_R; - static const LightInstructionInfo IADD_C; - static const LightInstructionInfo IADD_RC; + static const LightInstructionInfo IADD_RS; static const LightInstructionInfo ISUB_R; - static const LightInstructionInfo IMUL_9C; + static const LightInstructionInfo ISUB_C; static const LightInstructionInfo IMUL_R; static const LightInstructionInfo IMUL_C; static const LightInstructionInfo IMULH_R; @@ -344,11 +324,9 @@ namespace RandomX { : name_(name), type_(-1), latency_(0) {} }; - const LightInstructionInfo LightInstructionInfo::IADD_R = LightInstructionInfo("IADD_R", LightInstructionType::IADD_R, MacroOp::Add_rr, 0); - const LightInstructionInfo LightInstructionInfo::IADD_C = LightInstructionInfo("IADD_C", LightInstructionType::IADD_C, MacroOp::Add_ri, -1); - const LightInstructionInfo LightInstructionInfo::IADD_RC = LightInstructionInfo("IADD_RC", 
LightInstructionType::IADD_RC, MacroOp::Lea_sib, 0); + const LightInstructionInfo LightInstructionInfo::IADD_RS = LightInstructionInfo("IADD_RS", LightInstructionType::IADD_RS, MacroOp::Lea_sib, 0); const LightInstructionInfo LightInstructionInfo::ISUB_R = LightInstructionInfo("ISUB_R", LightInstructionType::ISUB_R, MacroOp::Sub_rr, 0); - const LightInstructionInfo LightInstructionInfo::IMUL_9C = LightInstructionInfo("IMUL_9C", LightInstructionType::IMUL_9C, MacroOp::Lea_sib, -1); + const LightInstructionInfo LightInstructionInfo::ISUB_C = LightInstructionInfo("ISUB_C", LightInstructionType::ISUB_C, MacroOp::Sub_ri, -1); const LightInstructionInfo LightInstructionInfo::IMUL_R = LightInstructionInfo("IMUL_R", LightInstructionType::IMUL_R, MacroOp::Imul_rr, 0); const LightInstructionInfo LightInstructionInfo::IMUL_C = LightInstructionInfo("IMUL_C", LightInstructionType::IMUL_C, MacroOp::Imul_rri, -1); const LightInstructionInfo LightInstructionInfo::IMULH_R = LightInstructionInfo("IMULH_R", LightInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1); @@ -364,7 +342,6 @@ namespace RandomX { const int buffer0[] = { 3, 3, 10 }; const int buffer1[] = { 7, 3, 3, 3 }; const int buffer2[] = { 3, 3, 3, 7 }; - const int buffer3[] = { 4, 8, 4 }; const int buffer4[] = { 4, 4, 4, 4 }; const int buffer5[] = { 3, 7, 3, 3 }; const int buffer6[] = { 3, 3, 7, 3 }; @@ -390,18 +367,15 @@ namespace RandomX { } const DecoderBuffer& fetchNext(int prevType, Blake2Generator& gen) { if (prevType == LightInstructionType::IMULH_R || prevType == LightInstructionType::ISMULH_R) - return decodeBuffers[0]; + return decodeBuffer3310; //2-1-1 decode if (index_ == 0) { - if ((gen.getByte() % 2) == 0) - return decodeBuffers[3]; - else - return decodeBuffers[4]; + return decodeBuffer4444; //IMUL_RCP end } if (index_ == 2) { - return decodeBuffers[7]; + return decodeBuffer133; //COND_R middle } if (index_ == 7) { - return decodeBuffers[1]; + return decodeBuffer7333; //COND_R end } return 
fetchNextDefault(gen); } @@ -411,36 +385,49 @@ namespace RandomX { const int* counts_; int opsCount_; DecoderBuffer() : index_(-1) {} - static const DecoderBuffer decodeBuffers[8]; + static const DecoderBuffer decodeBuffer3310; + static const DecoderBuffer decodeBuffer7333; + static const DecoderBuffer decodeBuffer3337; + static const DecoderBuffer decodeBuffer4444; + static const DecoderBuffer decodeBuffer3733; + static const DecoderBuffer decodeBuffer3373; + static const DecoderBuffer decodeBuffer133; + static const DecoderBuffer* decodeBuffers[7]; const DecoderBuffer& fetchNextDefault(Blake2Generator& gen) { int select; do { select = gen.getByte() & 7; } while (select == 7); - return decodeBuffers[select]; + return *decodeBuffers[select]; } }; - const DecoderBuffer DecoderBuffer::decodeBuffers[8] = { - DecoderBuffer("3,3,10", 0, buffer0), - DecoderBuffer("7,3,3,3", 1, buffer1), - DecoderBuffer("3,3,3,7", 2, buffer2), - DecoderBuffer("4,8,4", 3, buffer3), - DecoderBuffer("4,4,4,4", 4, buffer4), - DecoderBuffer("3,7,3,3", 5, buffer5), - DecoderBuffer("3,3,7,3", 6, buffer6), - DecoderBuffer("13,3", 7, buffer7), + const DecoderBuffer DecoderBuffer::decodeBuffer3310 = DecoderBuffer("3,3,10", 0, buffer0); + const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer("7,3,3,3", 1, buffer1); + const DecoderBuffer DecoderBuffer::decodeBuffer3337 = DecoderBuffer("3,3,3,7", 2, buffer2); + const DecoderBuffer DecoderBuffer::decodeBuffer4444 = DecoderBuffer("4,4,4,4", 4, buffer4); + const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 5, buffer5); + const DecoderBuffer DecoderBuffer::decodeBuffer3373 = DecoderBuffer("3,3,7,3", 6, buffer6); + const DecoderBuffer DecoderBuffer::decodeBuffer133 = DecoderBuffer("13,3", 7, buffer7); + + const DecoderBuffer* DecoderBuffer::decodeBuffers[7] = { + &DecoderBuffer::decodeBuffer3310, + &DecoderBuffer::decodeBuffer7333, + &DecoderBuffer::decodeBuffer3337, + &DecoderBuffer::decodeBuffer4444, + 
&DecoderBuffer::decodeBuffer4444, + &DecoderBuffer::decodeBuffer3733, + &DecoderBuffer::decodeBuffer3373, }; DecoderBuffer DecoderBuffer::Default = DecoderBuffer(); - const LightInstructionInfo* slot_3[] = { &LightInstructionInfo::IADD_R, &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IADD_R }; - const LightInstructionInfo* slot_3L[] = { &LightInstructionInfo::IADD_R, &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R }; - const LightInstructionInfo* slot_3F[] = { &LightInstructionInfo::IADD_R, &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IROR_R }; - const LightInstructionInfo* slot_4[] = { &LightInstructionInfo::IMUL_R, &LightInstructionInfo::IROR_C }; - const LightInstructionInfo* slot_7[] = { &LightInstructionInfo::IADD_C, &LightInstructionInfo::IMUL_C, &LightInstructionInfo::IXOR_C, &LightInstructionInfo::IADD_C }; + const LightInstructionInfo* slot_3[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R }; + const LightInstructionInfo* slot_3L[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R }; + const LightInstructionInfo* slot_3C[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IROR_R, &LightInstructionInfo::IXOR_R }; + const LightInstructionInfo* slot_4[] = { &LightInstructionInfo::IMUL_R, &LightInstructionInfo::IROR_C, &LightInstructionInfo::IADD_RS, &LightInstructionInfo::IMUL_R }; + const LightInstructionInfo* slot_7[] = { &LightInstructionInfo::ISUB_C, &LightInstructionInfo::IMUL_C, &LightInstructionInfo::IXOR_C, &LightInstructionInfo::ISUB_C }; const LightInstructionInfo* slot_7L = &LightInstructionInfo::COND_R; - const LightInstructionInfo* slot_8[] = { 
&LightInstructionInfo::IADD_RC, &LightInstructionInfo::IMUL_9C }; const LightInstructionInfo* slot_10 = &LightInstructionInfo::IMUL_RCP; static bool selectRegister(std::vector& availableRegisters, Blake2Generator& gen, int& reg) { @@ -469,21 +456,21 @@ namespace RandomX { instr.setImm32(imm32_); } - static LightInstruction createForSlot(Blake2Generator& gen, int slotSize, bool isLast = false, bool isFirst = false) { + static LightInstruction createForSlot(Blake2Generator& gen, int slotSize, bool isLast = false, bool complex = false) { switch (slotSize) { case 3: if (isLast) { - return create(slot_3L[gen.getByte() & 7], gen); + return create(slot_3L[gen.getByte() & 3], gen); } - else if (isFirst) { - return create(slot_3F[gen.getByte() & 3], gen); + else if (complex) { + return create(slot_3C[gen.getByte() & 3], gen); } else { - return create(slot_3[gen.getByte() & 3], gen); + return create(slot_3[gen.getByte() & 1], gen); } case 4: - return create(slot_4[gen.getByte() & 1], gen); + return create(slot_4[gen.getByte() & 3], gen); case 7: if (isLast) { return create(slot_7L, gen); @@ -491,12 +478,10 @@ namespace RandomX { else { return create(slot_7[gen.getByte() & 3], gen); } - case 8: - return create(slot_8[gen.getByte() & 1], gen); case 10: return create(slot_10, gen); default: - break; + throw std::runtime_error("Invalid slot"); } } @@ -504,38 +489,24 @@ namespace RandomX { LightInstruction li(info); switch (info->getType()) { - case LightInstructionType::IADD_R: { - li.mod_ = 0; + case LightInstructionType::IADD_RS: { + li.mod_ = gen.getByte(); li.imm32_ = 0; - li.opGroup_ = LightInstructionType::IADD_R; - li.groupParIsSource_ = true; - } break; - - case LightInstructionType::IADD_C: { - li.mod_ = 0; - li.imm32_ = gen.getInt32(); - li.opGroup_ = LightInstructionType::IADD_R; - li.groupParIsSource_ = true; - } break; - - case LightInstructionType::IADD_RC: { - li.mod_ = 0; - li.imm32_ = gen.getInt32(); - li.opGroup_ = LightInstructionType::IADD_R; + li.opGroup_ = 
LightInstructionType::IADD_RS; li.groupParIsSource_ = true; } break; case LightInstructionType::ISUB_R: { li.mod_ = 0; li.imm32_ = 0; - li.opGroup_ = LightInstructionType::IADD_R; + li.opGroup_ = LightInstructionType::IADD_RS; li.groupParIsSource_ = true; } break; - case LightInstructionType::IMUL_9C: { + case LightInstructionType::ISUB_C: { li.mod_ = 0; li.imm32_ = gen.getInt32(); - li.opGroup_ = LightInstructionType::IMUL_C; + li.opGroup_ = LightInstructionType::ISUB_C; li.opGroupPar_ = -1; } break; @@ -721,7 +692,7 @@ namespace RandomX { } } } - else if (mop.getUop1() == ExecutionPort::P05) { + else if (mop.getUop1() == ExecutionPort::P01) { for (; cycle < CYCLE_MAP_SIZE; ++cycle) { if (!portBusy[cycle][0]) { if (commit) { @@ -730,6 +701,17 @@ namespace RandomX { } return cycle; } + if (!portBusy[cycle][1]) { + if (commit) { + if (TRACE) std::cout << "; P1 at cycle " << cycle << std::endl; + portBusy[cycle][1] = mop.getUop1(); + } + return cycle; + } + } + } + else if (mop.getUop1() == ExecutionPort::P05) { + for (; cycle < CYCLE_MAP_SIZE; ++cycle) { if (!portBusy[cycle][2]) { if (commit) { if (TRACE) std::cout << "; P2 at cycle " << cycle << std::endl; @@ -737,17 +719,17 @@ namespace RandomX { } return cycle; } + if (!portBusy[cycle][0]) { + if (commit) { + if (TRACE) std::cout << "; P0 at cycle " << cycle << std::endl; + portBusy[cycle][0] = mop.getUop1(); + } + return cycle; + } } } else { for (; cycle < CYCLE_MAP_SIZE; ++cycle) { - if (!portBusy[cycle][0]) { - if (commit) { - if (TRACE) std::cout << "; P0 at cycle " << cycle << std::endl; - portBusy[cycle][0] = mop.getUop1(); - } - return cycle; - } if (!portBusy[cycle][2]) { if (commit) { if (TRACE) std::cout << "; P2 at cycle " << cycle << std::endl; @@ -755,6 +737,13 @@ namespace RandomX { } return cycle; } + if (!portBusy[cycle][0]) { + if (commit) { + if (TRACE) std::cout << "; P0 at cycle " << cycle << std::endl; + portBusy[cycle][0] = mop.getUop1(); + } + return cycle; + } if (!portBusy[cycle][1]) { 
if (commit) { if (TRACE) std::cout << "; P1 at cycle " << cycle << std::endl; @@ -813,6 +802,7 @@ namespace RandomX { bool portsSaturated = false; int outIndex = 0; int attempts = 0; + int mulCount = 0; constexpr int MAX_ATTEMPTS = 4; while(!portsSaturated) { @@ -872,6 +862,7 @@ namespace RandomX { } if (TRACE) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; } + scheduleCycle = scheduleUop(mop, portBusy, scheduleCycle, scheduleCycle); depCycle = scheduleCycle + mop.getLatency(); if (instrIndex == currentInstruction.getInfo().getResultOp()) { int dst = currentInstruction.getDestination(); @@ -882,7 +873,6 @@ namespace RandomX { ri.lastOpPar = currentInstruction.getGroupPar(); if (TRACE) std::cout << "; RETIRED at cycle " << retireCycle << std::endl; } - scheduleUop(mop, portBusy, scheduleCycle, scheduleCycle); codeSize += mop.getSize(); mopIndex++; instrIndex++; @@ -893,13 +883,14 @@ namespace RandomX { cycle = topCycle; if (instrIndex >= currentInstruction.getInfo().getSize()) { currentInstruction.toInstr(prog(outIndex++)); + mulCount += isMul(currentInstruction.getType()); } } ++cycle; } std::cout << "; ALU port utilization:" << std::endl; - std::cout << "; (*= in use, _ = idle)" << std::endl; + std::cout << "; (* = in use, _ = idle)" << std::endl; int portCycles = 0; for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { @@ -920,14 +911,12 @@ namespace RandomX { int asicLatency[8]; memset(asicLatency, 0, sizeof(asicLatency)); - int mulCount = 0; for (int i = 0; i < outIndex; ++i) { Instruction& instr = prog(i); int latDst = asicLatency[instr.dst] + 1; int latSrc = instr.dst != instr.src ? 
asicLatency[instr.src] + 1 : 0; asicLatency[instr.dst] = std::max(latDst, latSrc); - mulCount += isMul(instr.opcode); } std::cout << "; Multiplications: " << mulCount << std::endl; @@ -943,238 +932,4 @@ namespace RandomX { prog.setSize(outIndex); } - - void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister, int nonce) { - - // Source: https://www.agner.org/optimize/instruction_tables.pdf - const int op_latency[LightInstructionType::COUNT] = { 1, 2, 1, 2, 3, 5, 5, 4, 1, 2, 5 }; - - // Instruction latencies for theoretical ASIC implementation - const int asic_op_latency[LightInstructionType::COUNT] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; - - // Available ALUs for each instruction - const int op_ALUs[LightInstructionType::COUNT] = { ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT }; - - uint8_t data[64]; - memset(data, 0, sizeof(data)); - memcpy(data, seed, SeedSize); - - // Set data_index past the last byte in data - // to trigger full data update with blake hash - // before we start using it - size_t data_index = sizeof(data); - - int code_size; - - do { - uint8_t opcode; - uint8_t dst_index; - uint8_t src_index; - uint32_t imm32 = 0; - - int latency[8]; - int asic_latency[9]; - - // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution - // byte 0: current value of the destination register - // byte 1: instruction opcode - // byte 2: current value of the source register - // - // Registers R4-R8 are constant and are treated as having the same value because when we do - // the same operation twice with two constant source registers, it can be optimized into a single operation - uint64_t inst_data[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; - - bool alu_busy[RANDOMX_LPROG_LATENCY + 1][ALU_COUNT]; - bool is_rotation[LightInstructionType::COUNT]; - bool rotated[8]; - int rotate_count = 0; - - memset(latency, 0, 
sizeof(latency)); - memset(asic_latency, 0, sizeof(asic_latency)); - memset(alu_busy, 0, sizeof(alu_busy)); - memset(is_rotation, 0, sizeof(is_rotation)); - memset(rotated, 0, sizeof(rotated)); - is_rotation[LightInstructionType::IROR_R] = true; - - int num_retries = 0; - code_size = 0; - - int total_iterations = 0; - - // Generate random code to achieve minimal required latency for our abstract CPU - // Try to get this latency for all 4 registers - while (((latency[0] < RANDOMX_LPROG_LATENCY) || (latency[1] < RANDOMX_LPROG_LATENCY) || (latency[2] < RANDOMX_LPROG_LATENCY) || (latency[3] < RANDOMX_LPROG_LATENCY) - || (latency[4] < RANDOMX_LPROG_LATENCY) || (latency[5] < RANDOMX_LPROG_LATENCY) || (latency[6] < RANDOMX_LPROG_LATENCY) || (latency[7] < RANDOMX_LPROG_LATENCY)) && (num_retries < 64)) - { - // Fail-safe to guarantee loop termination - ++total_iterations; - if (total_iterations > 1024) { - std::cout << "total_iterations = " << total_iterations << std::endl; - break; - } - - check_data(data_index, 1, data, sizeof(data)); - const uint8_t b1 = data[data_index++]; - int instrType = lightInstruction[b1 & ((1 << LIGHT_OPCODE_BITS) - 1)]; - - check_data(data_index, 1, data, sizeof(data)); - const uint8_t b2 = data[data_index++]; - dst_index = b2 & ((1 << V4_DST_INDEX_BITS) - 1); - src_index = (b2 >> (V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1); - - const int a = dst_index; - int b = src_index; - - // Don't do rotation with the same destination twice because it's equal to a single rotation - if (is_rotation[instrType] && rotated[a]) - { - continue; - } - - // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized: - // 2x IADD_RC(a, b, C) = IADD_RC(a, b*2, C1+C2) - // 2x ISUB_R(a, b) = ISUB_R(a, 2*b) - // 2x IMUL_R(a, b) = IMUL_R(a, b*b) - // 2x IMUL_9C(a, C) = 9 * (9 * a + C1) + C2 = 81 * a + (9 * C1 + C2) - // 2x IMUL_RCP(a, C) = a * (C * C) - // 2x IXOR_R = NOP - // 2x IROR_R(a, b) = 
IROR_R(a, 2*b) - if (instrType != LightInstructionType::IMULH_R && instrType != LightInstructionType::ISMULH_R && ((inst_data[a] & 0xFFFF00) == (instrType << 8) + ((inst_data[b] & 255) << 16))) - { - continue; - } - - if ((instrType == LightInstructionType::IADD_RC) || (instrType == LightInstructionType::IMUL_9C) || (instrType == LightInstructionType::IMUL_RCP) || (instrType == LightInstructionType::COND_R) || ((instrType != LightInstructionType::IMULH_R) && (instrType != LightInstructionType::ISMULH_R) && (a == b))) - { - check_data(data_index, 4, data, sizeof(data)); - imm32 = load32(&data[data_index++]); - } - - // Find which ALU is available (and when) for this instruction - int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b]; - int alu_index = -1; - while (next_latency < RANDOMX_LPROG_LATENCY) - { - for (int i = op_ALUs[instrType] - 1; i >= 0; --i) - { - if (!alu_busy[next_latency][i]) - { - // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check - if ((instrType == LightInstructionType::IADD_RC || instrType == LightInstructionType::IMUL_9C || instrType == LightInstructionType::IMULH_R || instrType == LightInstructionType::ISMULH_R) && alu_busy[next_latency + 1][i]) - { - continue; - } - - // Rotation can only start when previous rotation is finished, so do an additional availability check - if (is_rotation[instrType] && (next_latency < rotate_count * op_latency[instrType])) - { - continue; - } - - alu_index = i; - break; - } - } - if (alu_index >= 0) - { - break; - } - ++next_latency; - } - - // Don't generate instructions that leave some register unchanged for more than 15 cycles - if (next_latency > latency[a] + 15) - { - continue; - } - - next_latency += op_latency[instrType]; - - if (next_latency <= RANDOMX_LPROG_LATENCY) - { - if (is_rotation[instrType]) - { - ++rotate_count; - } - - // Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are 
fully pipelined - alu_busy[next_latency - op_latency[instrType]][alu_index] = true; - latency[a] = next_latency; - - // ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple - asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[instrType]; - - rotated[a] = is_rotation[instrType]; - - inst_data[a] = code_size + (instrType << 8) + ((inst_data[b] & 255) << 16); - - prog(code_size).opcode = lightInstructionOpcode[instrType]; - prog(code_size).dst = dst_index; - prog(code_size).src = src_index; - prog(code_size).setImm32(imm32); - - if (instrType == LightInstructionType::IADD_RC || instrType == LightInstructionType::IMUL_9C || instrType == LightInstructionType::IMULH_R || instrType == LightInstructionType::ISMULH_R) - { - // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too - alu_busy[next_latency - op_latency[instrType] + 1][alu_index] = true; - } - - ++code_size; - if (code_size >= RANDOMX_LPROG_MIN_SIZE) - { - break; - } - } - else - { - ++num_retries; - std::cout << "Retry " << num_retries << " with code_size = " << code_size << ", next_latency = " << next_latency << std::endl; - } - } - - // ASIC has more execution resources and can extract as much parallelism from the code as possible - // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC - // Get this latency for at least 1 of the 4 registers - const int prev_code_size = code_size; - if ((code_size < RANDOMX_LPROG_MAX_SIZE) && (asic_latency[indexRegister] < RANDOMX_LPROG_ASIC_LATENCY)) - { - int min_idx = indexRegister; - int max_idx = 0; - for (int i = 1; i < 8; ++i) - { - //if (asic_latency[i] < asic_latency[min_idx]) min_idx = i; - if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; - } - - const int pattern[3] = { LightInstructionType::IMUL_R, 
LightInstructionType::IROR_R, LightInstructionType::IMUL_R }; - const int instrType = pattern[(code_size - prev_code_size) % 3]; - latency[min_idx] = latency[max_idx] + op_latency[instrType]; - asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[instrType]; - - prog(code_size).opcode = lightInstructionOpcode[instrType]; - prog(code_size).dst = min_idx; - prog(code_size).src = max_idx; - - ++code_size; - } - - for (int i = 0; i < 8; ++i) { - std::cout << "Latency " << i << " = " << latency[i] << std::endl; - } - - std::cout << "Code size = " << code_size << std::endl; - std::cout << "ALUs:" << std::endl; - for (int i = 0; i < RANDOMX_LPROG_LATENCY + 1; ++i) { - for (int j = 0; j < ALU_COUNT; ++j) { - std::cout << (alu_busy[i][j] ? '*' : '_'); - } - std::cout << std::endl; - } - - // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time - // It never does more than 4 iterations for all block heights < 10,000,000 - } while ((code_size < RANDOMX_LPROG_MIN_SIZE) || (code_size > RANDOMX_LPROG_MAX_SIZE)); - - prog.setSize(code_size); - } } \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index d22b4f4..7f37a37 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -224,10 +224,10 @@ int main(int argc, char** argv) { if (genLight) { RandomX::LightProgram p; RandomX::generateLightProg2(p, seed, 0, programCount); - RandomX::AssemblyGeneratorX86 asmX86; - asmX86.generateProgram(p); + //RandomX::AssemblyGeneratorX86 asmX86; + //asmX86.generateProgram(p); //std::ofstream file("lightProg2.asm"); - asmX86.printCode(std::cout); + //asmX86.printCode(std::cout); return 0; } From 77dbe14658dfca2f7cc4aa803e6090d275a518f7 Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 6 Apr 2019 12:00:56 +0200 Subject: [PATCH 09/18] SuperscalarHash JIT compiler (unfinished) --- src/AssemblyGeneratorX86.cpp | 14 +-- src/AssemblyGeneratorX86.hpp | 2 +- src/Instruction.cpp | 12 +- src/Instruction.hpp | 2 +- 
src/InterpretedVirtualMachine.cpp | 2 +- src/JitCompilerX86-static.asm | 87 +++++++++++++ src/JitCompilerX86-static.hpp | 5 + src/JitCompilerX86.cpp | 176 ++++++++++++++++++++++++--- src/JitCompilerX86.hpp | 22 +++- src/LightProgramGenerator.cpp | 156 +++++++++++++----------- src/LightProgramGenerator.hpp | 16 ++- src/Program.hpp | 7 ++ src/asm/program_sshash_constants.inc | 16 +++ src/asm/program_sshash_load.inc | 8 ++ src/asm/program_sshash_prefetch.inc | 4 + src/common.hpp | 3 +- src/configuration.h | 10 +- src/main.cpp | 46 ++++--- 18 files changed, 453 insertions(+), 135 deletions(-) create mode 100644 src/asm/program_sshash_constants.inc create mode 100644 src/asm/program_sshash_load.inc create mode 100644 src/asm/program_sshash_prefetch.inc diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index 8b5dbcf..dc4cea2 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -97,14 +97,12 @@ namespace RandomX { } //1 uOP - void AssemblyGeneratorX86::h_IADD_R(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_IADD_RS(Instruction& instr, int i) { registerUsage[instr.dst] = i; - if (instr.src != instr.dst) { - asmCode << "\tadd " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; - } - else { - asmCode << "\tadd " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; - } + if(instr.dst == 5) + asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << std::showpos << (int32_t)instr.getImm32() << std::noshowpos << "]" << std::endl; + else + asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << "]" << std::endl; traceint(instr); } @@ -517,7 +515,7 @@ namespace RandomX { InstructionGenerator AssemblyGeneratorX86::engine[256] = { //Integer - INST_HANDLE(IADD_R) + INST_HANDLE(IADD_RS) INST_HANDLE(IADD_M) INST_HANDLE(IADD_RC) INST_HANDLE(ISUB_R) diff 
--git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 8ab638b..601d278 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -68,7 +68,7 @@ namespace RandomX { void traceflt(Instruction&); void tracenop(Instruction&); - void h_IADD_R(Instruction&, int); + void h_IADD_RS(Instruction&, int); void h_IADD_M(Instruction&, int); void h_IADD_RC(Instruction&, int); void h_ISUB_R(Instruction&, int); diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 7069926..e8ddc64 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -40,9 +40,9 @@ namespace RandomX { os << "L3" << "[" << (getImm32() & ScratchpadL3Mask) << "]"; } - void Instruction::h_IADD_R(std::ostream& os) const { + void Instruction::h_IADD_RS(std::ostream& os) const { if (src != dst) { - os << "r" << (int)dst << ", r" << (int)src << std::endl; + os << "r" << (int)dst << ", r" << (int)src << ", LSH " << (int)(mod % 4) << std::endl; } else { os << "r" << (int)dst << ", " << (int32_t)getImm32() << std::endl; @@ -302,13 +302,13 @@ namespace RandomX { } void Instruction::h_COND_R(std::ostream& os) const { - os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << (int32_t)getImm32() << "), " << (int)(mod >> 5) << std::endl; + os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "(r" << (int)src << ", " << (int32_t)getImm32() << "), LSH " << (int)(mod >> 5) << std::endl; } void Instruction::h_COND_M(std::ostream& os) const { os << "r" << (int)dst << ", " << condition((mod >> 2) & 7) << "("; genAddressReg(os); - os << ", " << (int32_t)getImm32() << "), " << (int)(mod >> 5) << std::endl; + os << ", " << (int32_t)getImm32() << "), LSH " << (int)(mod >> 5) << std::endl; } void Instruction::h_ISTORE(std::ostream& os) const { @@ -333,7 +333,7 @@ namespace RandomX { const char* Instruction::names[256] = { //Integer - INST_NAME(IADD_R) + INST_NAME(IADD_RS) INST_NAME(IADD_M) INST_NAME(IADD_RC) INST_NAME(ISUB_R) @@ -379,7 +379,7 
@@ namespace RandomX { InstructionVisualizer Instruction::engine[256] = { //Integer - INST_HANDLE(IADD_R) + INST_HANDLE(IADD_RS) INST_HANDLE(IADD_M) INST_HANDLE(IADD_RC) INST_HANDLE(ISUB_R) diff --git a/src/Instruction.hpp b/src/Instruction.hpp index d10575f..65d1c8a 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -98,7 +98,7 @@ namespace RandomX { void genAddressImm(std::ostream& os) const; void genAddressRegDst(std::ostream&) const; - void h_IADD_R(std::ostream&) const; + void h_IADD_RS(std::ostream&) const; void h_IADD_M(std::ostream&) const; void h_IADD_RC(std::ostream&) const; void h_ISUB_R(std::ostream&) const; diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 15a5049..ebb3571 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -442,7 +442,7 @@ namespace RandomX { auto& instr = program(i); auto& ibc = byteCode[i]; switch (instr.opcode) { - CASE_REP(IADD_R) { + CASE_REP(IADD_RS) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; ibc.type = InstructionType::IADD_R; diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index ffac80c..d16cab7 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -26,9 +26,14 @@ PUBLIC randomx_program_start PUBLIC randomx_program_read_dataset PUBLIC randomx_program_read_dataset_light PUBLIC randomx_program_read_dataset_light_sub +PUBLIC randomx_dataset_init PUBLIC randomx_program_loop_store PUBLIC randomx_program_loop_end PUBLIC randomx_program_epilogue +PUBLIC randomx_sshash_load +PUBLIC randomx_sshash_prefetch +PUBLIC randomx_sshash_end +PUBLIC randomx_sshash_init PUBLIC randomx_program_end ALIGN 64 @@ -75,11 +80,93 @@ randomx_program_read_dataset_light_sub PROC include asm/squareHash.inc randomx_program_read_dataset_light_sub ENDP +ALIGN 64 +randomx_dataset_init PROC + push rbx + push rbp + push rdi + push rsi + push r12 + push r13 + push r14 + push r15 + mov 
rdi, rcx ;# cache + mov rsi, rdx ;# dataset + mov rbp, r8 ;# block index + push r9 ;# max. block index +init_block_loop: + prefetchw byte ptr [rsi] + mov rbx, rbp + db 232 ;# 0xE8 = call + dd 32768 - distance + distance equ $ - offset randomx_dataset_init + mov qword ptr [rsi+0], r8 + mov qword ptr [rsi+8], r9 + mov qword ptr [rsi+16], r10 + mov qword ptr [rsi+24], r11 + mov qword ptr [rsi+32], r12 + mov qword ptr [rsi+40], r13 + mov qword ptr [rsi+48], r14 + mov qword ptr [rsi+56], r15 + add rbp, 1 + add rsi, 64 + cmp rbp, qword ptr [rsp] + jb init_block_loop + pop r9 + pop r15 + pop r14 + pop r13 + pop r12 + pop rsi + pop rdi + pop rbp + pop rbx + ret +randomx_dataset_init ENDP + ALIGN 64 randomx_program_epilogue PROC include asm/program_epilogue_win64.inc randomx_program_epilogue ENDP +ALIGN 64 +randomx_sshash_load PROC + include asm/program_sshash_load.inc +randomx_sshash_load ENDP + +randomx_sshash_prefetch PROC + include asm/program_sshash_prefetch.inc +randomx_sshash_prefetch ENDP + +randomx_sshash_end PROC + nop +randomx_sshash_end ENDP + +ALIGN 64 +randomx_sshash_init PROC + lea r8, [rbx+1] + include asm/program_sshash_prefetch.inc + imul r8, qword ptr [r0_mul] + mov r9, qword ptr [r1_add] + xor r9, r8 + mov r10, qword ptr [r2_add] + xor r10, r8 + mov r11, qword ptr [r3_add] + xor r11, r8 + mov r12, qword ptr [r4_add] + xor r12, r8 + mov r13, qword ptr [r5_add] + xor r13, r8 + mov r14, qword ptr [r6_add] + xor r14, r8 + mov r15, qword ptr [r7_add] + xor r15, r8 + jmp randomx_program_end +randomx_sshash_init ENDP + +ALIGN 64 + include asm/program_sshash_constants.inc + ALIGN 64 randomx_program_end PROC nop diff --git a/src/JitCompilerX86-static.hpp b/src/JitCompilerX86-static.hpp index 3d835b6..cf250c2 100644 --- a/src/JitCompilerX86-static.hpp +++ b/src/JitCompilerX86-static.hpp @@ -27,6 +27,11 @@ extern "C" { void randomx_program_loop_store(); void randomx_program_loop_end(); void randomx_program_read_dataset_light_sub(); + void randomx_dataset_init(); 
void randomx_program_epilogue(); + void randomx_sshash_load(); + void randomx_sshash_prefetch(); + void randomx_sshash_end(); + void randomx_sshash_init(); void randomx_program_end(); } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 6c58a88..8c49326 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -88,29 +88,40 @@ namespace RandomX { #include "JitCompilerX86-static.hpp" +#define NOP_TEST true + const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin; const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load; const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset; const uint8_t* codeReadDatasetLight = (uint8_t*)&randomx_program_read_dataset_light; + const uint8_t* codeDatasetInit = (uint8_t*)&randomx_dataset_init; const uint8_t* codeLoopStore = (uint8_t*)&randomx_program_loop_store; const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end; const uint8_t* codeReadDatasetLightSub = (uint8_t*)&randomx_program_read_dataset_light_sub; const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue; const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; + const uint8_t* codeShhLoad = (uint8_t*)&randomx_sshash_load; + const uint8_t* codeShhPrefetch = (uint8_t*)&randomx_sshash_prefetch; + const uint8_t* codeShhEnd = (uint8_t*)&randomx_sshash_end; + const uint8_t* codeShhInit = (uint8_t*)&randomx_sshash_init; const int32_t prologueSize = codeLoopBegin - codePrologue; - const int32_t epilogueSize = codeProgramEnd - codeEpilogue; - const int32_t loopLoadSize = codeProgamStart - codeLoopLoad; const int32_t readDatasetSize = codeReadDatasetLight - codeReadDataset; const int32_t readDatasetLightSize = codeLoopStore - codeReadDatasetLight; const int32_t loopStoreSize = codeLoopEnd - codeLoopStore; - const int32_t 
readDatasetLightSubSize = codeEpilogue - codeReadDatasetLightSub; + const int32_t readDatasetLightSubSize = codeDatasetInit - codeReadDatasetLightSub; + const int32_t datasetInitSize = codeEpilogue - codeDatasetInit; + const int32_t epilogueSize = codeShhLoad - codeEpilogue; + const int32_t codeSshLoadSize = codeShhPrefetch - codeShhLoad; + const int32_t codeSshPrefetchSize = codeShhEnd - codeShhPrefetch; + const int32_t codeSshInitSize = codeProgramEnd - codeShhInit; const int32_t epilogueOffset = CodeSize - epilogueSize; const int32_t readDatasetLightSubOffset = epilogueOffset - readDatasetLightSubSize; + constexpr int32_t superScalarHashOffset = 32768; static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 }; static const uint8_t REX_ADD_RM[] = { 0x4c, 0x03 }; @@ -166,7 +177,7 @@ namespace RandomX { static const uint8_t SQRTPD[] = { 0x66, 0x0f, 0x51 }; static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x89, 0x44, 0x24, 0xF8, 0x0F, 0xAE, 0x54, 0x24, 0xF8 }; static const uint8_t ROL_RAX[] = { 0x48, 0xc1, 0xc0 }; - static const uint8_t XOR_ECX_ECX[] = { 0x33, 0xC9 }; + static const uint8_t XOR_RCX_RCX[] = { 0x48, 0x33, 0xC9 }; static const uint8_t REX_CMP_R32I[] = { 0x41, 0x81 }; static const uint8_t REX_CMP_M32I[] = { 0x81, 0x3c, 0x06 }; static const uint8_t MOVAPD[] = { 0x66, 0x0f, 0x29 }; @@ -184,6 +195,18 @@ namespace RandomX { static const uint8_t REX_ADD_I[] = { 0x49, 0x81 }; static const uint8_t REX_TEST[] = { 0x49, 0xF7 }; static const uint8_t JZ[] = { 0x0f, 0x84 }; + static const uint8_t RET = 0xc3; + + static const uint8_t NOP1[] = { 0x90 }; + static const uint8_t NOP2[] = { 0x66, 0x90 }; + static const uint8_t NOP3[] = { 0x0F, 0x1F, 0x00 }; + static const uint8_t NOP4[] = { 0x0F, 0x1F, 0x40, 0x00 }; + static const uint8_t NOP5[] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 }; + static const uint8_t NOP6[] = { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 }; + static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 
0x00 }; + static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }; + + static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 }; size_t JitCompilerX86::getCodeSize() { return codePos - prologueSize; @@ -196,6 +219,10 @@ namespace RandomX { memcpy(code + readDatasetLightSubOffset, codeReadDatasetLightSub, readDatasetLightSubSize); } + JitCompilerX86::~JitCompilerX86() { + freePagedMemory(code, CodeSize); + } + void JitCompilerX86::generateProgram(Program& prog) { generateProgramPrologue(prog); memcpy(code + codePos, codeReadDataset, readDatasetSize); @@ -216,6 +243,42 @@ namespace RandomX { generateProgramEpilogue(prog); } + template + void JitCompilerX86::generateSuperScalarHash(LightProgram(&programs)[N]) { + memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize); + codePos = superScalarHashOffset + codeSshInitSize; + for (unsigned j = 0; j < N; ++j) { + LightProgram& prog = programs[j]; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction& instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + generateCode(instr, i); + } + emit(codeShhLoad, codeSshLoadSize); + if (j < N - 1) { + emit(REX_MOV_RR64); + emitByte(0xd8 + prog.getAddressRegister()); + emit(codeShhPrefetch, codeSshPrefetchSize); + int align = (codePos % 16); + while (align != 0) { + int nopSize = 16 - align; + if (nopSize > 8) nopSize = 8; + emit(NOPX[nopSize - 1], nopSize); + align = (codePos % 16); + } + } + } + emitByte(RET); + } + + template + void JitCompilerX86::generateSuperScalarHash(LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + + void JitCompilerX86::generateDatasetInitCode() { + memcpy(code, codeDatasetInit, datasetInitSize); + } + void JitCompilerX86::generateProgramPrologue(Program& prog) { #ifdef RANDOMX_JUMP instructionOffsets.clear(); @@ -253,7 +316,6 @@ namespace RandomX { emit32(prologueSize - codePos - 4); emitByte(JMP); emit32(epilogueOffset - codePos - 4); - emitByte(0x90); } void 
JitCompilerX86::generateCode(Instruction& instr, int i) { @@ -287,9 +349,9 @@ namespace RandomX { emit32(instr.getImm32() & ScratchpadL3Mask); } - void JitCompilerX86::h_IADD_R(Instruction& instr, int i) { + void JitCompilerX86::h_IADD_RS(Instruction& instr, int i) { registerUsage[instr.dst] = i; - if (instr.src != instr.dst) { + /*if (instr.src != instr.dst) { emit(REX_ADD_RR); emitByte(0xc0 + 8 * instr.dst + instr.src); } @@ -297,7 +359,19 @@ namespace RandomX { emit(REX_81); emitByte(0xc0 + instr.dst); emit32(instr.getImm32()); + }*/ + if (false && NOP_TEST) { + emit(NOP4); + return; } + emit(REX_LEA); + if (instr.dst == 5) //rbp,r13 cannot be the base register without offset + emitByte(0xac); + else + emitByte(0x04 + 8 * instr.dst); + genSIB(instr.mod % 4, instr.src, instr.dst); + if (instr.dst == 5) + emit32(instr.getImm32()); } void JitCompilerX86::h_IADD_M(Instruction& instr, int i) { @@ -330,10 +404,18 @@ namespace RandomX { void JitCompilerX86::h_ISUB_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { + if (false && NOP_TEST) { + emit(NOP3); + return; + } emit(REX_SUB_RR); emitByte(0xc0 + 8 * instr.dst + instr.src); } else { + if (false && NOP_TEST) { + emit(NOP7); + return; + } emit(REX_81); emitByte(0xe8 + instr.dst); emit32(instr.getImm32()); @@ -366,10 +448,18 @@ namespace RandomX { void JitCompilerX86::h_IMUL_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { + if (false && NOP_TEST) { + emit(NOP4); + return; + } emit(REX_IMUL_RR); emitByte(0xc0 + 8 * instr.dst + instr.src); } else { + if (false && NOP_TEST) { + emit(NOP7); + return; + } emit(REX_IMUL_RRI); emitByte(0xc0 + 9 * instr.dst); emit32(instr.getImm32()); @@ -393,6 +483,12 @@ namespace RandomX { void JitCompilerX86::h_IMULH_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; + if (false && NOP_TEST) { + emit(NOP3); + emit(NOP3); + emit(NOP3); + return; + } emit(REX_MOV_RR64); emitByte(0xc0 + instr.dst); 
emit(REX_MUL_R); @@ -422,6 +518,12 @@ namespace RandomX { void JitCompilerX86::h_ISMULH_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; + if (false && NOP_TEST) { + emit(NOP3); + emit(NOP3); + emit(NOP3); + return; + } emit(REX_MOV_RR64); emitByte(0xc0 + instr.dst); emit(REX_MUL_R); @@ -451,6 +553,13 @@ namespace RandomX { void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { if (instr.getImm32() != 0) { + if (false && NOP_TEST) { + emitByte(0x66); + emitByte(0x66); + emit(NOP8); + emit(NOP4); + return; + } registerUsage[instr.dst] = i; emit(MOV_RAX_I); emit64(reciprocal(instr.getImm32())); @@ -472,10 +581,18 @@ namespace RandomX { void JitCompilerX86::h_IXOR_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { + if (false && NOP_TEST) { + emit(NOP3); + return; + } emit(REX_XOR_RR); emitByte(0xc0 + 8 * instr.dst + instr.src); } else { + if (false && NOP_TEST) { + emit(NOP7); + return; + } emit(REX_XOR_RI); emitByte(0xf0 + instr.dst); emit32(instr.getImm32()); @@ -500,12 +617,21 @@ namespace RandomX { void JitCompilerX86::h_IROR_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { + if (false && NOP_TEST) { + emit(NOP3); + emit(NOP3); + return; + } emit(REX_MOV_RR); emitByte(0xc8 + instr.src); emit(REX_ROT_CL); emitByte(0xc8 + instr.dst); } else { + if (NOP_TEST) { + emit(NOP4); + return; + } emit(REX_ROT_I8); emitByte(0xc8 + instr.dst); emitByte(instr.getImm32() & 63); @@ -700,14 +826,21 @@ namespace RandomX { const int conditionMask = ((1 << RANDOMX_CONDITION_BITS) - 1) << shift; int reg = getConditionRegister(); int target = registerUsage[reg] + 1; - emit(REX_ADD_I); - emitByte(0xc0 + reg); - emit32(1 << shift); - emit(REX_TEST); - emitByte(0xc0 + reg); - emit32(conditionMask); - emit(JZ); - emit32(instructionOffsets[target] - (codePos + 4)); + if (false && NOP_TEST) { + emit(NOP7); + emit(NOP7); + emit(NOP6); + } + else { + emit(REX_ADD_I); + emitByte(0xc0 + reg); + 
emit32(1 << shift); + emit(REX_TEST); + emitByte(0xc0 + reg); + emit32(conditionMask); + emit(JZ); + emit32(instructionOffsets[target] - (codePos + 4)); + } for (unsigned j = 0; j < 8; ++j) { //mark all registers as used registerUsage[j] = i; } @@ -717,7 +850,14 @@ namespace RandomX { #ifdef RANDOMX_JUMP handleCondition(instr, i); #endif - emit(XOR_ECX_ECX); + if (false && NOP_TEST) { + emit(NOP3); + emit(NOP7); + emit(NOP3); + emit(NOP3); + return; + } + emit(XOR_RCX_RCX); emit(REX_CMP_R32I); emitByte(0xf8 + instr.src); emit32(instr.getImm32()); @@ -732,7 +872,7 @@ namespace RandomX { #ifdef RANDOMX_JUMP handleCondition(instr, i); #endif - emit(XOR_ECX_ECX); + emit(XOR_RCX_RCX); genAddressReg(instr); emit(REX_CMP_M32I); emit32(instr.getImm32()); @@ -765,7 +905,7 @@ namespace RandomX { #define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x)) InstructionGeneratorX86 JitCompilerX86::engine[256] = { - INST_HANDLE(IADD_R) + INST_HANDLE(IADD_RS) INST_HANDLE(IADD_M) INST_HANDLE(IADD_RC) INST_HANDLE(ISUB_R) diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index f2fd330..16fe26d 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -27,6 +27,7 @@ along with RandomX. If not, see. 
namespace RandomX { class Program; + class LightProgram; class JitCompilerX86; typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int); @@ -36,11 +37,18 @@ namespace RandomX { class JitCompilerX86 { public: JitCompilerX86(); + ~JitCompilerX86(); void generateProgram(Program&); void generateProgramLight(Program&); + template + void generateSuperScalarHash(LightProgram (&programs)[N]); ProgramFunc getProgramFunc() { return (ProgramFunc)code; } + DatasetInitFunc getDatasetInitFunc() { + generateDatasetInitCode(); + return (DatasetInitFunc)code; + } uint8_t* getCode() { return code; } @@ -62,6 +70,8 @@ namespace RandomX { } } + void generateDatasetInitCode(); + void generateProgramPrologue(Program&); void generateProgramEpilogue(Program&); int getConditionRegister(); @@ -100,13 +110,15 @@ namespace RandomX { template void emit(const uint8_t (&src)[N]) { - for (unsigned i = 0; i < N; ++i) { - code[codePos + i] = src[i]; - } - codePos += N; + emit(src, N); } - void h_IADD_R(Instruction&, int); + void emit(const uint8_t* src, size_t count) { + memcpy(code + codePos, src, count); + codePos += count; + } + + void h_IADD_RS(Instruction&, int); void h_IADD_M(Instruction&, int); void h_IADD_RC(Instruction&, int); void h_ISUB_R(Instruction&, int); diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index 5825808..900e2ae 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -26,6 +26,7 @@ along with RandomX. If not, see. 
#include #include #include +#include "LightProgramGenerator.hpp" namespace RandomX { // Intel Ivy Bridge reference @@ -47,8 +48,8 @@ namespace RandomX { } namespace LightInstructionOpcode { - constexpr int IADD_R = 0; - constexpr int IADD_RC = RANDOMX_FREQ_IADD_R + RANDOMX_FREQ_IADD_M; + constexpr int IADD_RS = 0; + constexpr int IADD_RC = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M; constexpr int ISUB_R = IADD_RC + RANDOMX_FREQ_IADD_RC; constexpr int IMUL_9C = ISUB_R + RANDOMX_FREQ_ISUB_R + RANDOMX_FREQ_ISUB_M; constexpr int IMUL_R = IMUL_9C + RANDOMX_FREQ_IMUL_9C; @@ -65,20 +66,18 @@ namespace RandomX { } const int lightInstructionOpcode[] = { - LightInstructionOpcode::IADD_R, - LightInstructionOpcode::IADD_R, - LightInstructionOpcode::IADD_RC, - LightInstructionOpcode::ISUB_R, - LightInstructionOpcode::IMUL_9C, - LightInstructionOpcode::IMUL_R, - LightInstructionOpcode::IMUL_R, + LightInstructionOpcode::IADD_RS, + LightInstructionOpcode::ISUB_R, //ISUB_R + LightInstructionOpcode::ISUB_R, //ISUB_R + LightInstructionOpcode::IMUL_R, //IMUL_R + LightInstructionOpcode::IMUL_R, //IMUL_C LightInstructionOpcode::IMULH_R, LightInstructionOpcode::ISMULH_R, LightInstructionOpcode::IMUL_RCP, - LightInstructionOpcode::IXOR_R, - LightInstructionOpcode::IXOR_R, - LightInstructionOpcode::IROR_R, - LightInstructionOpcode::IROR_R, + LightInstructionOpcode::IXOR_R, //IXOR_R + LightInstructionOpcode::IXOR_R, //IXOR_C + LightInstructionOpcode::IROR_R, //IROR_R + LightInstructionOpcode::IROR_R, //IROR_C LightInstructionOpcode::COND_R }; @@ -93,37 +92,30 @@ namespace RandomX { constexpr type P015 = 6; } - class Blake2Generator { - public: - Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) { - memset(data, 0, sizeof(data)); - memcpy(data, seed, SeedSize); - store32(&data[60], nonce); - } + Blake2Generator::Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) { + memset(data, 0, sizeof(data)); + memcpy(data, seed, SeedSize); + store32(&data[60], 
nonce); + } - uint8_t getByte() { - checkData(1); - return data[dataIndex++]; - } + uint8_t Blake2Generator::getByte() { + checkData(1); + return data[dataIndex++]; + } - uint32_t getInt32() { - checkData(4); - auto ret = load32(&data[dataIndex]); - dataIndex += 4; - return ret; - } + uint32_t Blake2Generator::getInt32() { + checkData(4); + auto ret = load32(&data[dataIndex]); + dataIndex += 4; + return ret; + } - private: - uint8_t data[64]; - size_t dataIndex; - - void checkData(const size_t bytesNeeded) { - if (dataIndex + bytesNeeded > sizeof(data)) { - blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); - dataIndex = 0; - } + void Blake2Generator::checkData(const size_t bytesNeeded) { + if (dataIndex + bytesNeeded > sizeof(data)) { + blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); + dataIndex = 0; } - }; + } class RegisterInfo { public: @@ -201,7 +193,7 @@ namespace RandomX { static const MacroOp Xor_ri; static const MacroOp Ror_rcl; static const MacroOp Ror_ri; - static const MacroOp TestJmp_fused; + static const MacroOp TestJz_fused; static const MacroOp Xor_self; static const MacroOp Cmp_ri; static const MacroOp Setcc_r; @@ -235,13 +227,13 @@ namespace RandomX { const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3); const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015); const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05); - const MacroOp MacroOp::TestJmp_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5); + const MacroOp MacroOp::TestJz_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5); const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr }; const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr }; const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) }; const MacroOp IROR_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Ror_rcl }; - const MacroOp COND_R_ops_array[] = { 
MacroOp::Add_ri, MacroOp(MacroOp::TestJmp_fused, true), MacroOp::Xor_self, MacroOp::Cmp_ri, MacroOp(MacroOp::Setcc_r, true), MacroOp(MacroOp::Add_rr, true) }; + const MacroOp COND_R_ops_array[] = { MacroOp::Add_ri, MacroOp(MacroOp::TestJz_fused, true), MacroOp::Xor_self, MacroOp::Cmp_ri, MacroOp(MacroOp::Setcc_r, true), MacroOp(MacroOp::Add_rr, true) }; class LightInstructionInfo { @@ -349,7 +341,7 @@ namespace RandomX { class DecoderBuffer { public: - static DecoderBuffer Default; + static const DecoderBuffer Default; template DecoderBuffer(const char* name, int index, const int(&arr)[N]) : name_(name), index_(index), counts_(arr), opsCount_(N) {} @@ -365,17 +357,17 @@ namespace RandomX { const char* getName() const { return name_; } - const DecoderBuffer& fetchNext(int prevType, Blake2Generator& gen) { + const DecoderBuffer* fetchNext(int prevType, Blake2Generator& gen) const { if (prevType == LightInstructionType::IMULH_R || prevType == LightInstructionType::ISMULH_R) - return decodeBuffer3310; //2-1-1 decode + return &decodeBuffer3310; //2-1-1 decode if (index_ == 0) { - return decodeBuffer4444; //IMUL_RCP end - } - if (index_ == 2) { - return decodeBuffer133; //COND_R middle + return &decodeBuffer4444; //IMUL_RCP end } + /*if (index_ == 2) { + return &decodeBuffer133; //COND_R middle + }*/ if (index_ == 7) { - return decodeBuffer7333; //COND_R end + return &decodeBuffer7333; //COND_R end } return fetchNextDefault(gen); } @@ -393,12 +385,12 @@ namespace RandomX { static const DecoderBuffer decodeBuffer3373; static const DecoderBuffer decodeBuffer133; static const DecoderBuffer* decodeBuffers[7]; - const DecoderBuffer& fetchNextDefault(Blake2Generator& gen) { + const DecoderBuffer* fetchNextDefault(Blake2Generator& gen) const { int select; do { select = gen.getByte() & 7; } while (select == 7); - return *decodeBuffers[select]; + return decodeBuffers[select]; } }; @@ -420,7 +412,7 @@ namespace RandomX { &DecoderBuffer::decodeBuffer3373, }; - DecoderBuffer 
DecoderBuffer::Default = DecoderBuffer(); + const DecoderBuffer DecoderBuffer::Default = DecoderBuffer(); const LightInstructionInfo* slot_3[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R }; const LightInstructionInfo* slot_3L[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R }; @@ -472,7 +464,7 @@ namespace RandomX { case 4: return create(slot_4[gen.getByte() & 3], gen); case 7: - if (isLast) { + if (false && isLast) { return create(slot_7L, gen); } else { @@ -595,7 +587,7 @@ namespace RandomX { bool selectDestination(int cycle, RegisterInfo (®isters)[8], Blake2Generator& gen) { std::vector availableRegisters; for (unsigned i = 0; i < 8; ++i) { - if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_)) + if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_.getType() != LightInstructionType::IADD_RS || i != 5)) availableRegisters.push_back(i); } return selectRegister(availableRegisters, gen, dst_); @@ -607,6 +599,12 @@ namespace RandomX { if (registers[i].latency <= cycle) availableRegisters.push_back(i); } + if (availableRegisters.size() == 2 && info_.getType() == LightInstructionType::IADD_RS) { + if (availableRegisters[0] == 5 || availableRegisters[1] == 5) { + opGroupPar_ = src_ = 5; + return true; + } + } if (selectRegister(availableRegisters, gen, src_)) { if (groupParIsSource_) opGroupPar_ = src_; @@ -666,7 +664,7 @@ namespace RandomX { constexpr int V4_SRC_INDEX_BITS = 3; constexpr int V4_DST_INDEX_BITS = 3; constexpr int CYCLE_MAP_SIZE = RANDOMX_LPROG_LATENCY + 3; - constexpr bool TRACE = true; + constexpr bool TRACE = false; static int blakeCounter = 0; @@ -782,15 +780,14 @@ namespace RandomX { } } - void generateLightProg2(LightProgram& prog, const void* seed, int 
indexRegister, int nonce) { + double generateLightProg2(LightProgram& prog, Blake2Generator& gen) { ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3]; memset(portBusy, 0, sizeof(portBusy)); RegisterInfo registers[8]; - Blake2Generator gen(seed, nonce); std::vector instructions; - DecoderBuffer& fetchLine = DecoderBuffer::Default; + const DecoderBuffer* fetchLine = &DecoderBuffer::Default; LightInstruction currentInstruction = LightInstruction::Null; int instrIndex = 0; int codeSize = 0; @@ -806,24 +803,24 @@ namespace RandomX { constexpr int MAX_ATTEMPTS = 4; while(!portsSaturated) { - fetchLine = fetchLine.fetchNext(currentInstruction.getType(), gen); - if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine.getName() << ")" << std::endl; + fetchLine = fetchLine->fetchNext(currentInstruction.getType(), gen); + if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine->getName() << ")" << std::endl; mopIndex = 0; - while (mopIndex < fetchLine.getSize()) { + while (mopIndex < fetchLine->getSize()) { int topCycle = cycle; if (instrIndex >= currentInstruction.getInfo().getSize()) { if (portsSaturated) break; - currentInstruction = LightInstruction::createForSlot(gen, fetchLine.getCounts()[mopIndex], fetchLine.getSize() == mopIndex + 1, fetchLine.getIndex() == 0 && mopIndex == 0); + currentInstruction = LightInstruction::createForSlot(gen, fetchLine->getCounts()[mopIndex], fetchLine->getSize() == mopIndex + 1, fetchLine->getIndex() == 0 && mopIndex == 0); instrIndex = 0; if (TRACE) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; } MacroOp& mop = currentInstruction.getInfo().getOp(instrIndex); - if (fetchLine.getCounts()[mopIndex] != mop.getSize()) { - if (TRACE) std::cout << "ERROR instruction " << mop.getName() << " doesn't fit into slot of size " << fetchLine.getCounts()[mopIndex] << std::endl; - return; + if (fetchLine->getCounts()[mopIndex] != mop.getSize()) { + if (TRACE) std::cout << 
"ERROR instruction " << mop.getName() << " doesn't fit into slot of size " << fetchLine->getCounts()[mopIndex] << std::endl; + return DBL_MIN; } if (TRACE) std::cout << mop.getName() << " "; @@ -831,7 +828,7 @@ namespace RandomX { mop.setCycle(scheduleCycle); if (scheduleCycle < 0) { if (TRACE) std::cout << "; Failed at cycle " << cycle << std::endl; - return; + return DBL_MIN; } if (instrIndex == currentInstruction.getInfo().getSrcOp()) { @@ -893,25 +890,29 @@ namespace RandomX { std::cout << "; (* = in use, _ = idle)" << std::endl; int portCycles = 0; - for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { + /*for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { std::cout << "; " << std::setw(3) << i << " "; for (int j = 0; j < 3; ++j) { std::cout << (portBusy[i][j] ? '*' : '_'); portCycles += !!portBusy[i][j]; } std::cout << std::endl; - } + }*/ + + double ipc = (macroOpCount / (double)retireCycle); std::cout << "; code size " << codeSize << " bytes" << std::endl; std::cout << "; x86 macro-ops: " << macroOpCount << std::endl; std::cout << "; RandomX instructions: " << outIndex << std::endl; std::cout << "; Execution time: " << retireCycle << " cycles" << std::endl; - std::cout << "; IPC = " << (macroOpCount / (double)retireCycle) << std::endl; + std::cout << "; IPC = " << ipc << std::endl; std::cout << "; Port-cycles: " << portCycles << std::endl; + std::cout << "; Multiplications: " << mulCount << std::endl; int asicLatency[8]; memset(asicLatency, 0, sizeof(asicLatency)); + for (int i = 0; i < outIndex; ++i) { Instruction& instr = prog(i); int latDst = asicLatency[instr.dst] + 1; @@ -919,7 +920,16 @@ namespace RandomX { asicLatency[instr.dst] = std::max(latDst, latSrc); } - std::cout << "; Multiplications: " << mulCount << std::endl; + int asicLatencyFinal = 0; + int addressReg = 0; + for (int i = 0; i < 8; ++i) { + if (asicLatency[i] > asicLatencyFinal) { + asicLatencyFinal = asicLatency[i]; + addressReg = i; + } + } + + std::cout << "; ASIC latency: " << asicLatencyFinal << 
std::endl; std::cout << "; ASIC latency:" << std::endl; for (int i = 0; i < 8; ++i) { @@ -931,5 +941,7 @@ namespace RandomX { } prog.setSize(outIndex); + prog.setAddressRegister(addressReg); + return addressReg; } } \ No newline at end of file diff --git a/src/LightProgramGenerator.hpp b/src/LightProgramGenerator.hpp index 8027aab..e7b1bda 100644 --- a/src/LightProgramGenerator.hpp +++ b/src/LightProgramGenerator.hpp @@ -20,6 +20,18 @@ along with RandomX. If not, see. #include "Program.hpp" namespace RandomX { - void generateLightProgram(LightProgram& prog, const void* seed, int indexRegister, int nonce); - void generateLightProg2(LightProgram& prog, const void* seed, int indexRegister, int nonce); + + class Blake2Generator { + public: + Blake2Generator(const void* seed, int nonce); + uint8_t getByte(); + uint32_t getInt32(); + private: + uint8_t data[64]; + size_t dataIndex; + + void checkData(const size_t); + }; + + double generateLightProg2(LightProgram& prog, Blake2Generator& gen); } \ No newline at end of file diff --git a/src/Program.hpp b/src/Program.hpp index 53c973b..2b81435 100644 --- a/src/Program.hpp +++ b/src/Program.hpp @@ -68,6 +68,12 @@ namespace RandomX { void setSize(uint32_t val) { size = val; } + int getAddressRegister() { + return addrReg; + } + void setAddressRegister(uint32_t val) { + addrReg = val; + } private: void print(std::ostream& os) const { for (unsigned i = 0; i < size; ++i) { @@ -77,6 +83,7 @@ namespace RandomX { } Instruction programBuffer[RANDOMX_LPROG_MAX_SIZE]; uint32_t size; + int addrReg; }; static_assert(sizeof(Program) % 64 == 0, "Invalid size of class Program"); diff --git a/src/asm/program_sshash_constants.inc b/src/asm/program_sshash_constants.inc new file mode 100644 index 0000000..a25a90e --- /dev/null +++ b/src/asm/program_sshash_constants.inc @@ -0,0 +1,16 @@ +r0_mul: ;# 6364136223846793005 + db 45, 127, 149, 76, 45, 244, 81, 88 +r1_add: ;# 9298410992540426048 + db 64, 159, 245, 89, 136, 151, 10, 129 +r2_add: ;# 
12065312585734608966 + db 70, 216, 194, 56, 223, 153, 112, 167 +r3_add: ;# 9306329213124610396 + db 92, 9, 34, 191, 28, 185, 38, 129 +r4_add: ;# 5281919268842080866 + db 98, 138, 159, 23, 151, 37, 77, 73 +r5_add: ;# 10536153434571861004 + db 12, 236, 170, 206, 185, 239, 55, 146 +r6_add: ;# 3398623926847679864 + db 120, 45, 230, 108, 116, 86, 42, 47 +r7_add: ;# 9549104520008361294 + db 78, 229, 44, 182, 247, 59, 133, 132 \ No newline at end of file diff --git a/src/asm/program_sshash_load.inc b/src/asm/program_sshash_load.inc new file mode 100644 index 0000000..a9ae9a2 --- /dev/null +++ b/src/asm/program_sshash_load.inc @@ -0,0 +1,8 @@ + ;xor r8, qword ptr [rbx+0] + ;xor r9, qword ptr [rbx+8] + ;xor r10, qword ptr [rbx+16] + ;xor r11, qword ptr [rbx+24] + ;xor r12, qword ptr [rbx+32] + ;xor r13, qword ptr [rbx+40] + ;xor r14, qword ptr [rbx+48] + ;xor r15, qword ptr [rbx+56] \ No newline at end of file diff --git a/src/asm/program_sshash_prefetch.inc b/src/asm/program_sshash_prefetch.inc new file mode 100644 index 0000000..78faba4 --- /dev/null +++ b/src/asm/program_sshash_prefetch.inc @@ -0,0 +1,4 @@ + and rbx, 4194303 + shl rbx, 6 + add rbx, rdi + ; prefetchnta byte ptr [rbx] \ No newline at end of file diff --git a/src/common.hpp b/src/common.hpp index 118f053..83a9bc7 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -41,7 +41,7 @@ namespace RandomX { static_assert((RANDOMX_SCRATCHPAD_L1 & (RANDOMX_SCRATCHPAD_L1 - 1)) == 0, "RANDOMX_SCRATCHPAD_L1 must be a power of 2."); static_assert(RANDOMX_CACHE_ACCESSES > 1, "RANDOMX_CACHE_ACCESSES must be greater than 1"); - constexpr int wtSum = RANDOMX_FREQ_IADD_R + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_IADD_RC + RANDOMX_FREQ_ISUB_R + \ + constexpr int wtSum = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_IADD_RC + RANDOMX_FREQ_ISUB_R + \ RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_9C + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + \ RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + 
RANDOMX_FREQ_ISMULH_M + RANDOMX_FREQ_IMUL_RCP + \ RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_ISWAP_R + \ @@ -141,6 +141,7 @@ namespace RandomX { typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, int_reg_t(®)[RegistersCount]); typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, uint8_t* /* scratchpad */, uint64_t); + typedef void(*DatasetInitFunc)(uint8_t* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock); } std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf); diff --git a/src/configuration.h b/src/configuration.h index 95c1412..72e44a4 100644 --- a/src/configuration.h +++ b/src/configuration.h @@ -37,7 +37,7 @@ along with RandomX. If not, see. //Number of random Cache accesses per Dataset block. Minimum is 2. #define RANDOMX_CACHE_ACCESSES 8 -#define RANDOMX_LPROG_LATENCY 168 +#define RANDOMX_LPROG_LATENCY 130 #define RANDOMX_LPROG_ASIC_LATENCY 84 #define RANDOMX_LPROG_MIN_SIZE 225 #define RANDOMX_LPROG_MAX_SIZE 512 @@ -80,12 +80,12 @@ Instruction frequencies (per 256 opcodes) Total sum of frequencies must be 256 */ -#define RANDOMX_FREQ_IADD_R 12 +#define RANDOMX_FREQ_IADD_RS 32 #define RANDOMX_FREQ_IADD_M 7 -#define RANDOMX_FREQ_IADD_RC 16 -#define RANDOMX_FREQ_ISUB_R 12 +#define RANDOMX_FREQ_IADD_RC 0 +#define RANDOMX_FREQ_ISUB_R 17 #define RANDOMX_FREQ_ISUB_M 7 -#define RANDOMX_FREQ_IMUL_9C 9 +#define RANDOMX_FREQ_IMUL_9C 0 #define RANDOMX_FREQ_IMUL_R 16 #define RANDOMX_FREQ_IMUL_M 4 #define RANDOMX_FREQ_IMULH_R 4 diff --git a/src/main.cpp b/src/main.cpp index 7f37a37..d5e4657 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -37,6 +37,7 @@ along with RandomX. If not, see. 
#include "Cache.hpp" #include "hashAes1Rx4.hpp" #include "LightProgramGenerator.hpp" +#include "JitCompilerX86.hpp" const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; @@ -204,7 +205,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, Atomi } int main(int argc, char** argv) { - bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit, genLight; + bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit, genLight, useSuperscalar; int programCount, threadCount, initThreadCount, epoch; readOption("--softAes", argc, argv, softAes); @@ -220,14 +221,16 @@ int main(int argc, char** argv) { readOption("--genNative", argc, argv, genNative); readOption("--help", argc, argv, help); readOption("--genLight", argc, argv, genLight); + readOption("--useSuperscalar", argc, argv, useSuperscalar); if (genLight) { RandomX::LightProgram p; - RandomX::generateLightProg2(p, seed, 0, programCount); - //RandomX::AssemblyGeneratorX86 asmX86; - //asmX86.generateProgram(p); + RandomX::Blake2Generator gen(seed, programCount); + RandomX::generateLightProg2(p, gen); + RandomX::AssemblyGeneratorX86 asmX86; + asmX86.generateProgram(p); //std::ofstream file("lightProg2.asm"); - //asmX86.printCode(std::cout); + asmX86.printCode(std::cout); return 0; } @@ -287,24 +290,37 @@ int main(int argc, char** argv) { dataset.dataset.size = datasetSize; RandomX::datasetAlloc(dataset, largePages); const uint64_t datasetBlockCount = datasetSize / RandomX::CacheLineSize; - if (initThreadCount > 1) { - auto perThread = datasetBlockCount / initThreadCount; - auto remainder = datasetBlockCount % initThreadCount; - for (int i = 0; i < initThreadCount; ++i) { - auto count = perThread + (i == initThreadCount - 1 ? 
remainder : 0); - threads.push_back(std::thread(&RandomX::datasetInit, std::ref(cache), std::ref(dataset.dataset), i * perThread, count)); - } - for (unsigned i = 0; i < threads.size(); ++i) { - threads[i].join(); + if (useSuperscalar) { + RandomX::Blake2Generator gen(seed, programCount); + RandomX::LightProgram programs[RANDOMX_CACHE_ACCESSES]; + for (int i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { + RandomX::generateLightProg2(programs[i], gen); } + RandomX::JitCompilerX86 jit86; + jit86.generateSuperScalarHash(programs); + jit86.getDatasetInitFunc()(cache.memory, dataset.dataset.memory, 0, datasetBlockCount); } else { - RandomX::datasetInit(cache, dataset.dataset, 0, datasetBlockCount); + if (initThreadCount > 1) { + auto perThread = datasetBlockCount / initThreadCount; + auto remainder = datasetBlockCount % initThreadCount; + for (int i = 0; i < initThreadCount; ++i) { + auto count = perThread + (i == initThreadCount - 1 ? remainder : 0); + threads.push_back(std::thread(&RandomX::datasetInit, std::ref(cache), std::ref(dataset.dataset), i * perThread, count)); + } + for (unsigned i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + } + else { + RandomX::datasetInit(cache, dataset.dataset, 0, datasetBlockCount); + } } RandomX::deallocCache(cache, largePages); threads.clear(); std::cout << "Dataset (" << datasetSize << " bytes) initialized in " << sw.getElapsed() << " s" << std::endl; } + return 0; std::cout << "Initializing " << threadCount << " virtual machine(s) ..." 
<< std::endl; for (int i = 0; i < threadCount; ++i) { RandomX::VirtualMachine* vm; From 6e3136b37fd9771d6683994c8767018d55257a5c Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 6 Apr 2019 17:07:40 +0200 Subject: [PATCH 10/18] Fixed cache alignment Performance tuning --- src/Cache.hpp | 2 +- src/JitCompilerX86.cpp | 2 +- src/LightProgramGenerator.cpp | 49 ++++++++++++++++++----------- src/asm/program_sshash_load.inc | 16 +++++----- src/asm/program_sshash_prefetch.inc | 2 +- src/configuration.h | 2 +- 6 files changed, 43 insertions(+), 30 deletions(-) diff --git a/src/Cache.hpp b/src/Cache.hpp index 5656baf..bfc7ddf 100644 --- a/src/Cache.hpp +++ b/src/Cache.hpp @@ -34,7 +34,7 @@ namespace RandomX { return (uint8_t*)allocLargePagesMemory(size); } else { - void* ptr = _mm_malloc(size, sizeof(__m128i)); + void* ptr = _mm_malloc(size, CacheLineSize); if (ptr == nullptr) throw std::bad_alloc(); return (uint8_t*)ptr; diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 8c49326..d6e27f1 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -628,7 +628,7 @@ namespace RandomX { emitByte(0xc8 + instr.dst); } else { - if (NOP_TEST) { + if (false && NOP_TEST) { emit(NOP4); return; } diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index 900e2ae..d5ebadf 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -216,7 +216,7 @@ namespace RandomX { const MacroOp MacroOp::Sub_ri = MacroOp("sub r,i", 7, 1, ExecutionPort::P015); const MacroOp MacroOp::Imul_rr = MacroOp("imul r,r", 4, 3, ExecutionPort::P1); const MacroOp MacroOp::Imul_rri = MacroOp("imul r,r,i", 7, 3, ExecutionPort::P1); - const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 3, ExecutionPort::P1, ExecutionPort::P5); + const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 4, ExecutionPort::P1, ExecutionPort::P5); const MacroOp MacroOp::Mul_r = MacroOp("mul r", 3, 3, ExecutionPort::P1, ExecutionPort::P5); const MacroOp MacroOp::Mov_rr = 
MacroOp("mov r,r", 3); const MacroOp MacroOp::Mov_ri64 = MacroOp("mov rax,i64", 10, 1, ExecutionPort::P015); @@ -357,9 +357,11 @@ namespace RandomX { const char* getName() const { return name_; } - const DecoderBuffer* fetchNext(int prevType, Blake2Generator& gen) const { - if (prevType == LightInstructionType::IMULH_R || prevType == LightInstructionType::ISMULH_R) + const DecoderBuffer* fetchNext(int instrType, int cycle, int mulCount, Blake2Generator& gen) const { + if (instrType == LightInstructionType::IMULH_R || instrType == LightInstructionType::ISMULH_R) return &decodeBuffer3310; //2-1-1 decode + if (mulCount < cycle) + return &decodeBuffer4444_mul; if (index_ == 0) { return &decodeBuffer4444; //IMUL_RCP end } @@ -381,15 +383,16 @@ namespace RandomX { static const DecoderBuffer decodeBuffer7333; static const DecoderBuffer decodeBuffer3337; static const DecoderBuffer decodeBuffer4444; + static const DecoderBuffer decodeBuffer4444_mul; static const DecoderBuffer decodeBuffer3733; static const DecoderBuffer decodeBuffer3373; static const DecoderBuffer decodeBuffer133; static const DecoderBuffer* decodeBuffers[7]; const DecoderBuffer* fetchNextDefault(Blake2Generator& gen) const { int select; - do { - select = gen.getByte() & 7; - } while (select == 7); + //do { + select = gen.getByte() & 3; + //} while (select == 7); return decodeBuffers[select]; } }; @@ -397,17 +400,16 @@ namespace RandomX { const DecoderBuffer DecoderBuffer::decodeBuffer3310 = DecoderBuffer("3,3,10", 0, buffer0); const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer("7,3,3,3", 1, buffer1); const DecoderBuffer DecoderBuffer::decodeBuffer3337 = DecoderBuffer("3,3,3,7", 2, buffer2); + const DecoderBuffer DecoderBuffer::decodeBuffer4444_mul = DecoderBuffer("4,4,4,4-MUL", 3, buffer4); const DecoderBuffer DecoderBuffer::decodeBuffer4444 = DecoderBuffer("4,4,4,4", 4, buffer4); + const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 5, buffer5); const 
DecoderBuffer DecoderBuffer::decodeBuffer3373 = DecoderBuffer("3,3,7,3", 6, buffer6); const DecoderBuffer DecoderBuffer::decodeBuffer133 = DecoderBuffer("13,3", 7, buffer7); const DecoderBuffer* DecoderBuffer::decodeBuffers[7] = { &DecoderBuffer::decodeBuffer3310, - &DecoderBuffer::decodeBuffer7333, &DecoderBuffer::decodeBuffer3337, - &DecoderBuffer::decodeBuffer4444, - &DecoderBuffer::decodeBuffer4444, &DecoderBuffer::decodeBuffer3733, &DecoderBuffer::decodeBuffer3373, }; @@ -417,8 +419,8 @@ namespace RandomX { const LightInstructionInfo* slot_3[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R }; const LightInstructionInfo* slot_3L[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R }; const LightInstructionInfo* slot_3C[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IROR_R, &LightInstructionInfo::IXOR_R }; - const LightInstructionInfo* slot_4[] = { &LightInstructionInfo::IMUL_R, &LightInstructionInfo::IROR_C, &LightInstructionInfo::IADD_RS, &LightInstructionInfo::IMUL_R }; - const LightInstructionInfo* slot_7[] = { &LightInstructionInfo::ISUB_C, &LightInstructionInfo::IMUL_C, &LightInstructionInfo::IXOR_C, &LightInstructionInfo::ISUB_C }; + const LightInstructionInfo* slot_4[] = { &LightInstructionInfo::IROR_C, &LightInstructionInfo::IADD_RS }; + const LightInstructionInfo* slot_7[] = { &LightInstructionInfo::IXOR_C, &LightInstructionInfo::ISUB_C }; const LightInstructionInfo* slot_7L = &LightInstructionInfo::COND_R; const LightInstructionInfo* slot_10 = &LightInstructionInfo::IMUL_RCP; @@ -448,27 +450,34 @@ namespace RandomX { instr.setImm32(imm32_); } - static LightInstruction createForSlot(Blake2Generator& gen, int slotSize, bool isLast = false, bool complex = false) { + static LightInstruction createForSlot(Blake2Generator& gen, int slotSize, int fetchType, bool isLast, bool isFirst) { switch (slotSize) { case 3: if 
(isLast) { return create(slot_3L[gen.getByte() & 3], gen); } - else if (complex) { + else if (false && isFirst && fetchType == 0) { return create(slot_3C[gen.getByte() & 3], gen); } else { return create(slot_3[gen.getByte() & 1], gen); } case 4: - return create(slot_4[gen.getByte() & 3], gen); + if (fetchType == 3 && !isLast) { + return create(&LightInstructionInfo::IMUL_R, gen); + } + else { + return create(slot_4[gen.getByte() & 1], gen); + } case 7: if (false && isLast) { return create(slot_7L, gen); } - else { - return create(slot_7[gen.getByte() & 3], gen); + if (false && isFirst) { + return create(&LightInstructionInfo::IMUL_C, gen); + } else { + return create(slot_7[gen.getByte() & 1], gen); } case 10: return create(slot_10, gen); @@ -664,7 +673,11 @@ namespace RandomX { constexpr int V4_SRC_INDEX_BITS = 3; constexpr int V4_DST_INDEX_BITS = 3; constexpr int CYCLE_MAP_SIZE = RANDOMX_LPROG_LATENCY + 3; +#ifndef _DEBUG constexpr bool TRACE = false; +#else + constexpr bool TRACE = true; +#endif static int blakeCounter = 0; @@ -803,7 +816,7 @@ namespace RandomX { constexpr int MAX_ATTEMPTS = 4; while(!portsSaturated) { - fetchLine = fetchLine->fetchNext(currentInstruction.getType(), gen); + fetchLine = fetchLine->fetchNext(currentInstruction.getType(), cycle, mulCount, gen); if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine->getName() << ")" << std::endl; mopIndex = 0; @@ -813,7 +826,7 @@ namespace RandomX { if (instrIndex >= currentInstruction.getInfo().getSize()) { if (portsSaturated) break; - currentInstruction = LightInstruction::createForSlot(gen, fetchLine->getCounts()[mopIndex], fetchLine->getSize() == mopIndex + 1, fetchLine->getIndex() == 0 && mopIndex == 0); + currentInstruction = LightInstruction::createForSlot(gen, fetchLine->getCounts()[mopIndex], fetchLine->getIndex(), fetchLine->getSize() == mopIndex + 1, mopIndex == 0); instrIndex = 0; if (TRACE) std::cout << "; " << currentInstruction.getInfo().getName() << 
std::endl; } diff --git a/src/asm/program_sshash_load.inc b/src/asm/program_sshash_load.inc index a9ae9a2..5351356 100644 --- a/src/asm/program_sshash_load.inc +++ b/src/asm/program_sshash_load.inc @@ -1,8 +1,8 @@ - ;xor r8, qword ptr [rbx+0] - ;xor r9, qword ptr [rbx+8] - ;xor r10, qword ptr [rbx+16] - ;xor r11, qword ptr [rbx+24] - ;xor r12, qword ptr [rbx+32] - ;xor r13, qword ptr [rbx+40] - ;xor r14, qword ptr [rbx+48] - ;xor r15, qword ptr [rbx+56] \ No newline at end of file + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] \ No newline at end of file diff --git a/src/asm/program_sshash_prefetch.inc b/src/asm/program_sshash_prefetch.inc index 78faba4..96ec35a 100644 --- a/src/asm/program_sshash_prefetch.inc +++ b/src/asm/program_sshash_prefetch.inc @@ -1,4 +1,4 @@ and rbx, 4194303 shl rbx, 6 add rbx, rdi - ; prefetchnta byte ptr [rbx] \ No newline at end of file + prefetchnta byte ptr [rbx] \ No newline at end of file diff --git a/src/configuration.h b/src/configuration.h index 72e44a4..6d9912d 100644 --- a/src/configuration.h +++ b/src/configuration.h @@ -37,7 +37,7 @@ along with RandomX. If not, see. //Number of random Cache accesses per Dataset block. Minimum is 2. 
#define RANDOMX_CACHE_ACCESSES 8 -#define RANDOMX_LPROG_LATENCY 130 +#define RANDOMX_LPROG_LATENCY 170 #define RANDOMX_LPROG_ASIC_LATENCY 84 #define RANDOMX_LPROG_MIN_SIZE 225 #define RANDOMX_LPROG_MAX_SIZE 512 From b4c02051fa45b1542afaea2a66814d234cf7d338 Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 7 Apr 2019 15:38:51 +0200 Subject: [PATCH 11/18] Reworked SuperscalarHash instruction set ASM and C code generator for SuperscalarHash Support for Superscalar hash in the light mode --- src/AssemblyGeneratorX86.cpp | 174 ++++++++++++++++++ src/AssemblyGeneratorX86.hpp | 19 +- src/CompiledLightVirtualMachine.cpp | 19 +- src/CompiledLightVirtualMachine.hpp | 5 +- src/CompiledVirtualMachine.cpp | 2 +- src/CompiledVirtualMachine.hpp | 2 +- src/InterpretedVirtualMachine.cpp | 2 +- src/InterpretedVirtualMachine.hpp | 2 +- src/JitCompilerX86-static.asm | 34 ++++ src/JitCompilerX86-static.hpp | 2 + src/JitCompilerX86.cpp | 121 +++++++++++- src/JitCompilerX86.hpp | 5 +- src/LightProgramGenerator.cpp | 276 ++++++++++++---------------- src/LightProgramGenerator.hpp | 21 +++ src/VirtualMachine.hpp | 4 +- src/main.cpp | 29 +-- 16 files changed, 505 insertions(+), 212 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index dc4cea2..a25a377 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -23,6 +23,7 @@ along with RandomX. If not, see. 
#include "common.hpp" #include "reciprocal.h" #include "Program.hpp" +#include "./LightProgramGenerator.hpp" namespace RandomX { @@ -46,6 +47,179 @@ namespace RandomX { static const char* regDatasetAddr = "rdi"; static const char* regScratchpadAddr = "rsi"; + void AssemblyGeneratorX86::generateProgram(Program& prog) { + for (unsigned i = 0; i < 8; ++i) { + registerUsage[i] = -1; + } + asmCode.str(std::string()); //clear + for (unsigned i = 0; i < prog.getSize(); ++i) { + asmCode << "randomx_isn_" << i << ":" << std::endl; + Instruction& instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + generateCode(instr, i); + //asmCode << std::endl; + } + } + + void AssemblyGeneratorX86::generateAsm(LightProgram& prog) { + asmCode.str(std::string()); //clear + asmCode << "ALIGN 16" << std::endl; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction& instr = prog(i); + switch (instr.opcode) + { + case RandomX::LightInstructionType::ISUB_R: + asmCode << "sub " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + break; + case RandomX::LightInstructionType::IXOR_R: + asmCode << "xor " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + break; + case RandomX::LightInstructionType::IADD_RS: + asmCode << "lea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << "]" << std::endl; + break; + case RandomX::LightInstructionType::IMUL_R: + asmCode << "imul " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; + break; + case RandomX::LightInstructionType::IROR_C: + asmCode << "ror " << regR[instr.dst] << ", " << instr.getImm32() << std::endl; + break; + case RandomX::LightInstructionType::IADD_C7: + asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + break; + case RandomX::LightInstructionType::IXOR_C7: + asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + break; + case 
RandomX::LightInstructionType::IADD_C8: + asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + asmCode << "nop" << std::endl; + break; + case RandomX::LightInstructionType::IXOR_C8: + asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + asmCode << "nop" << std::endl; + break; + case RandomX::LightInstructionType::IADD_C9: + asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + asmCode << "xchg ax, ax ;nop" << std::endl; + break; + case RandomX::LightInstructionType::IXOR_C9: + asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; + asmCode << "xchg ax, ax ;nop" << std::endl; + break; + case RandomX::LightInstructionType::IMULH_R: + asmCode << "mov rax, " << regR[instr.dst] << std::endl; + asmCode << "mul " << regR[instr.src] << std::endl; + asmCode << "mov " << regR[instr.dst] << ", rdx" << std::endl; + break; + case RandomX::LightInstructionType::ISMULH_R: + asmCode << "mov rax, " << regR[instr.dst] << std::endl; + asmCode << "imul " << regR[instr.src] << std::endl; + asmCode << "mov " << regR[instr.dst] << ", rdx" << std::endl; + break; + case RandomX::LightInstructionType::IMUL_RCP: + asmCode << "mov rax, " << (int64_t)reciprocal(instr.getImm32()) << std::endl; + asmCode << "imul " << regR[instr.dst] << ", rax" << std::endl; + break; + default: + UNREACHABLE; + } + } + } + + void AssemblyGeneratorX86::generateC(LightProgram& prog) { + asmCode.str(std::string()); //clear + asmCode << "#include " << std::endl; + asmCode << "#if defined(__SIZEOF_INT128__)" << std::endl; + asmCode << " static inline uint64_t mulh(uint64_t a, uint64_t b) {" << std::endl; + asmCode << " return ((unsigned __int128)a * b) >> 64;" << std::endl; + asmCode << " }" << std::endl; + asmCode << " static inline int64_t smulh(int64_t a, int64_t b) {" << std::endl; + asmCode << " return ((__int128)a * b) >> 64;" << std::endl; + asmCode << " }" << 
std::endl; + asmCode << " #define HAVE_MULH" << std::endl; + asmCode << " #define HAVE_SMULH" << std::endl; + asmCode << "#endif" << std::endl; + asmCode << "#if defined(_MSC_VER)" << std::endl; + asmCode << " #define HAS_VALUE(X) X ## 0" << std::endl; + asmCode << " #define EVAL_DEFINE(X) HAS_VALUE(X)" << std::endl; + asmCode << " #include " << std::endl; + asmCode << " #include " << std::endl; + asmCode << " static __inline uint64_t rotr(uint64_t x , int c) {" << std::endl; + asmCode << " return _rotr64(x, c);" << std::endl; + asmCode << " }" << std::endl; + asmCode << " #define HAVE_ROTR" << std::endl; + asmCode << " #if EVAL_DEFINE(__MACHINEARM64_X64(1))" << std::endl; + asmCode << " static __inline uint64_t mulh(uint64_t a, uint64_t b) {" << std::endl; + asmCode << " return __umulh(a, b);" << std::endl; + asmCode << " }" << std::endl; + asmCode << " #define HAVE_MULH" << std::endl; + asmCode << " #endif" << std::endl; + asmCode << " #if EVAL_DEFINE(__MACHINEX64(1))" << std::endl; + asmCode << " static __inline int64_t smulh(int64_t a, int64_t b) {" << std::endl; + asmCode << " int64_t hi;" << std::endl; + asmCode << " _mul128(a, b, &hi);" << std::endl; + asmCode << " return hi;" << std::endl; + asmCode << " }" << std::endl; + asmCode << " #define HAVE_SMULH" << std::endl; + asmCode << " #endif" << std::endl; + asmCode << "#endif" << std::endl; + asmCode << "#ifndef HAVE_ROTR" << std::endl; + asmCode << " static inline uint64_t rotr(uint64_t a, int b) {" << std::endl; + asmCode << " return (a >> b) | (a << (64 - b));" << std::endl; + asmCode << " }" << std::endl; + asmCode << " #define HAVE_ROTR" << std::endl; + asmCode << "#endif" << std::endl; + asmCode << "#if !defined(HAVE_MULH) || !defined(HAVE_SMULH) || !defined(HAVE_ROTR)" << std::endl; + asmCode << " #error \"Required functions are not defined\"" << std::endl; + asmCode << "#endif" << std::endl; + asmCode << "void superScalar(uint64_t r[8]) {" << std::endl; + asmCode << "uint64_t r8 = r[0], r9 = r[1], 
r10 = r[2], r11 = r[3], r12 = r[4], r13 = r[5], r14 = r[6], r15 = r[7];" << std::endl; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction& instr = prog(i); + switch (instr.opcode) + { + case RandomX::LightInstructionType::ISUB_R: + asmCode << regR[instr.dst] << " -= " << regR[instr.src] << ";" << std::endl; + break; + case RandomX::LightInstructionType::IXOR_R: + asmCode << regR[instr.dst] << " ^= " << regR[instr.src] << ";" << std::endl; + break; + case RandomX::LightInstructionType::IADD_RS: + asmCode << regR[instr.dst] << " += " << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << ";" << std::endl; + break; + case RandomX::LightInstructionType::IMUL_R: + asmCode << regR[instr.dst] << " *= " << regR[instr.src] << ";" << std::endl; + break; + case RandomX::LightInstructionType::IROR_C: + asmCode << regR[instr.dst] << " = rotr(" << regR[instr.dst] << ", " << instr.getImm32() << ");" << std::endl; + break; + case RandomX::LightInstructionType::IADD_C7: + case RandomX::LightInstructionType::IADD_C8: + case RandomX::LightInstructionType::IADD_C9: + asmCode << regR[instr.dst] << " += " << (int32_t)instr.getImm32() << ";" << std::endl; + break; + case RandomX::LightInstructionType::IXOR_C7: + case RandomX::LightInstructionType::IXOR_C8: + case RandomX::LightInstructionType::IXOR_C9: + asmCode << regR[instr.dst] << " ^= " << (int32_t)instr.getImm32() << ";" << std::endl; + break; + case RandomX::LightInstructionType::IMULH_R: + asmCode << regR[instr.dst] << " = mulh(" << regR[instr.dst] << ", " << regR[instr.src] << ");" << std::endl; + break; + case RandomX::LightInstructionType::ISMULH_R: + asmCode << regR[instr.dst] << " = smulh(" << regR[instr.dst] << ", " << regR[instr.src] << ");" << std::endl; + break; + case RandomX::LightInstructionType::IMUL_RCP: + asmCode << regR[instr.dst] << " *= " << (int64_t)reciprocal(instr.getImm32()) << ";" << std::endl; + break; + default: + UNREACHABLE; + } + } + asmCode << "r[0] = r8; r[1] = r9; r[2] = r10; r[3] = r11; 
r[4] = r12; r[5] = r13; r[6] = r14; r[7] = r15;" << std::endl; + asmCode << "}" << std::endl; + } + int AssemblyGeneratorX86::getConditionRegister() { int min = INT_MAX; int minIndex; diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 601d278..8688cd4 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -27,27 +27,16 @@ along with RandomX. If not, see. namespace RandomX { class Program; + class LightProgram; class AssemblyGeneratorX86; typedef void(AssemblyGeneratorX86::*InstructionGenerator)(Instruction&, int); class AssemblyGeneratorX86 { public: - template - void generateProgram(P& prog) { - for (unsigned i = 0; i < 8; ++i) { - registerUsage[i] = -1; - } - asmCode.str(std::string()); //clear - for (unsigned i = 0; i < prog.getSize(); ++i) { - asmCode << "randomx_isn_" << i << ":" << std::endl; - Instruction& instr = prog(i); - instr.src %= RegistersCount; - instr.dst %= RegistersCount; - generateCode(instr, i); - //asmCode << std::endl; - } - } + void generateProgram(Program& prog); + void generateAsm(LightProgram& prog); + void generateC(LightProgram& prog); void printCode(std::ostream& os) { os << asmCode.rdbuf(); } diff --git a/src/CompiledLightVirtualMachine.cpp b/src/CompiledLightVirtualMachine.cpp index 49e593c..760842a 100644 --- a/src/CompiledLightVirtualMachine.cpp +++ b/src/CompiledLightVirtualMachine.cpp @@ -23,18 +23,25 @@ along with RandomX. If not, see. 
namespace RandomX { - CompiledLightVirtualMachine::CompiledLightVirtualMachine() { - } - - void CompiledLightVirtualMachine::setDataset(dataset_t ds, uint64_t size) { + template + void CompiledLightVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { mem.ds = ds; datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize; + if(superscalar) + compiler.generateSuperScalarHash(programs); //datasetBasePtr = ds.dataset.memory; } - void CompiledLightVirtualMachine::initialize() { + template void CompiledLightVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + template void CompiledLightVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + + template + void CompiledLightVirtualMachine::initialize() { VirtualMachine::initialize(); - compiler.generateProgramLight(program); + compiler.generateProgramLight(program); //mem.ds.dataset.memory = datasetBasePtr + (datasetBase * CacheLineSize); } + + template void CompiledLightVirtualMachine::initialize(); + template void CompiledLightVirtualMachine::initialize(); } \ No newline at end of file diff --git a/src/CompiledLightVirtualMachine.hpp b/src/CompiledLightVirtualMachine.hpp index 9ac52be..9493c58 100644 --- a/src/CompiledLightVirtualMachine.hpp +++ b/src/CompiledLightVirtualMachine.hpp @@ -26,6 +26,7 @@ along with RandomX. If not, see. 
namespace RandomX { + template class CompiledLightVirtualMachine : public CompiledVirtualMachine { public: void* operator new(size_t size) { @@ -37,8 +38,8 @@ namespace RandomX { void operator delete(void* ptr) { _mm_free(ptr); } - CompiledLightVirtualMachine(); - void setDataset(dataset_t ds, uint64_t size) override; + CompiledLightVirtualMachine() {} + void setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; void initialize() override; }; } \ No newline at end of file diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index c313209..4984938 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -29,7 +29,7 @@ namespace RandomX { CompiledVirtualMachine::CompiledVirtualMachine() { } - void CompiledVirtualMachine::setDataset(dataset_t ds, uint64_t size) { + void CompiledVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { mem.ds = ds; datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize; datasetBasePtr = ds.dataset.memory; diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp index 9deb621..65b1885 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -42,7 +42,7 @@ namespace RandomX { _mm_free(ptr); } CompiledVirtualMachine(); - void setDataset(dataset_t ds, uint64_t size) override; + void setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; void initialize() override; virtual void execute() override; void* getProgram() { diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index ebb3571..636b95b 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -49,7 +49,7 @@ namespace RandomX { } - void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size) { + void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t 
size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { mem.ds = ds; readDataset = &datasetReadLight; datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize; diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index d6da7e3..49178bc 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -75,7 +75,7 @@ namespace RandomX { } InterpretedVirtualMachine(bool soft) : softAes(soft) {} ~InterpretedVirtualMachine(); - void setDataset(dataset_t ds, uint64_t size) override; + void setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; void initialize() override; void execute() override; private: diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index d16cab7..f149655 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -25,6 +25,8 @@ PUBLIC randomx_program_loop_load PUBLIC randomx_program_start PUBLIC randomx_program_read_dataset PUBLIC randomx_program_read_dataset_light +PUBLIC randomx_program_read_dataset_sshash_init +PUBLIC randomx_program_read_dataset_sshash_fin PUBLIC randomx_program_read_dataset_light_sub PUBLIC randomx_dataset_init PUBLIC randomx_program_loop_store @@ -65,6 +67,38 @@ randomx_program_read_dataset_light PROC include asm/program_read_dataset_light.inc randomx_program_read_dataset_light ENDP +randomx_program_read_dataset_sshash_init PROC + sub rsp, 72 + mov qword ptr [rsp+64], rbx + mov qword ptr [rsp+56], r8 + mov qword ptr [rsp+48], r9 + mov qword ptr [rsp+40], r10 + mov qword ptr [rsp+32], r11 + mov qword ptr [rsp+24], r12 + mov qword ptr [rsp+16], r13 + mov qword ptr [rsp+8], r14 + mov qword ptr [rsp+0], r15 + xor rbp, rax ;# modify "mx" + ror rbp, 32 ;# swap "ma" and "mx" + mov ebx, ebp ;# ecx = ma + and ebx, 2147483584 ;# align "ma" to the start of a cache line + shr ebx, 6 ;# ebx = Dataset block number + ;# call 32768 +randomx_program_read_dataset_sshash_init ENDP + 
+randomx_program_read_dataset_sshash_fin PROC + mov rbx, qword ptr [rsp+64] + xor r8, qword ptr [rsp+56] + xor r9, qword ptr [rsp+48] + xor r10, qword ptr [rsp+40] + xor r11, qword ptr [rsp+32] + xor r12, qword ptr [rsp+24] + xor r13, qword ptr [rsp+16] + xor r14, qword ptr [rsp+8] + xor r15, qword ptr [rsp+0] + add rsp, 72 +randomx_program_read_dataset_sshash_fin ENDP + randomx_program_loop_store PROC include asm/program_loop_store.inc randomx_program_loop_store ENDP diff --git a/src/JitCompilerX86-static.hpp b/src/JitCompilerX86-static.hpp index cf250c2..3bb56ac 100644 --- a/src/JitCompilerX86-static.hpp +++ b/src/JitCompilerX86-static.hpp @@ -24,6 +24,8 @@ extern "C" { void randomx_program_start(); void randomx_program_read_dataset(); void randomx_program_read_dataset_light(); + void randomx_program_read_dataset_sshash_init(); + void randomx_program_read_dataset_sshash_fin(); void randomx_program_loop_store(); void randomx_program_loop_end(); void randomx_program_read_dataset_light_sub(); diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index d6e27f1..c4b8ea8 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -87,6 +87,7 @@ namespace RandomX { */ #include "JitCompilerX86-static.hpp" +#include "LightProgramGenerator.hpp" #define NOP_TEST true @@ -96,6 +97,8 @@ namespace RandomX { const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset; const uint8_t* codeReadDatasetLight = (uint8_t*)&randomx_program_read_dataset_light; + const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init; + const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin; const uint8_t* codeDatasetInit = (uint8_t*)&randomx_dataset_init; const uint8_t* codeLoopStore = (uint8_t*)&randomx_program_loop_store; const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end; @@ -110,7 +113,9 @@ namespace RandomX { 
const int32_t prologueSize = codeLoopBegin - codePrologue; const int32_t loopLoadSize = codeProgamStart - codeLoopLoad; const int32_t readDatasetSize = codeReadDatasetLight - codeReadDataset; - const int32_t readDatasetLightSize = codeLoopStore - codeReadDatasetLight; + const int32_t readDatasetLightSize = codeReadDatasetLightSshInit - codeReadDatasetLight; + const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit; + const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin; const int32_t loopStoreSize = codeLoopEnd - codeLoopStore; const int32_t readDatasetLightSubSize = codeDatasetInit - codeReadDatasetLightSub; const int32_t datasetInitSize = codeEpilogue - codeDatasetInit; @@ -199,7 +204,7 @@ namespace RandomX { static const uint8_t NOP1[] = { 0x90 }; static const uint8_t NOP2[] = { 0x66, 0x90 }; - static const uint8_t NOP3[] = { 0x0F, 0x1F, 0x00 }; + static const uint8_t NOP3[] = { 0x66, 0x66, 0x90 }; static const uint8_t NOP4[] = { 0x0F, 0x1F, 0x40, 0x00 }; static const uint8_t NOP5[] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 }; static const uint8_t NOP6[] = { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 }; @@ -230,19 +235,31 @@ namespace RandomX { generateProgramEpilogue(prog); } + template void JitCompilerX86::generateProgramLight(Program& prog) { if (RANDOMX_CACHE_ACCESSES != 8) throw std::runtime_error("JIT compiler: Unsupported value of RANDOMX_CACHE_ACCESSES"); if (RANDOMX_ARGON_GROWTH != 0) throw std::runtime_error("JIT compiler: Unsupported value of RANDOMX_ARGON_GROWTH"); generateProgramPrologue(prog); - memcpy(code + codePos, codeReadDatasetLight, readDatasetLightSize); - codePos += readDatasetLightSize; - emitByte(CALL); - emit32(readDatasetLightSubOffset - (codePos + 4)); + if (superscalar) { + emit(codeReadDatasetLightSshInit, readDatasetLightInitSize); + emitByte(CALL); + emit32(superScalarHashOffset - (codePos + 4)); + emit(codeReadDatasetLightSshFin, readDatasetLightFinSize); + } + else { + 
memcpy(code + codePos, codeReadDatasetLight, readDatasetLightSize); + codePos += readDatasetLightSize; + emitByte(CALL); + emit32(readDatasetLightSubOffset - (codePos + 4)); + } generateProgramEpilogue(prog); } + template void JitCompilerX86::generateProgramLight(Program& prog); + template void JitCompilerX86::generateProgramLight(Program& prog); + template void JitCompilerX86::generateSuperScalarHash(LightProgram(&programs)[N]) { memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize); @@ -253,7 +270,7 @@ namespace RandomX { Instruction& instr = prog(i); instr.src %= RegistersCount; instr.dst %= RegistersCount; - generateCode(instr, i); + generateCode(instr, i); } emit(codeShhLoad, codeSshLoadSize); if (j < N - 1) { @@ -318,6 +335,7 @@ namespace RandomX { emit32(epilogueOffset - codePos - 4); } + template void JitCompilerX86::generateCode(Instruction& instr, int i) { #ifdef RANDOMX_JUMP instructionOffsets.push_back(codePos); @@ -326,6 +344,95 @@ namespace RandomX { (this->*generator)(instr, i); } + template<> + void JitCompilerX86::generateCode(Instruction& instr, int i) { + switch (instr.opcode) + { + case RandomX::LightInstructionType::ISUB_R: + emit(REX_SUB_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + break; + case RandomX::LightInstructionType::IXOR_R: + emit(REX_XOR_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + break; + case RandomX::LightInstructionType::IADD_RS: + emit(REX_LEA); + emitByte(0x04 + 8 * instr.dst); + genSIB(instr.mod % 4, instr.src, instr.dst); + break; + case RandomX::LightInstructionType::IMUL_R: + emit(REX_IMUL_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + break; + case RandomX::LightInstructionType::IROR_C: + emit(REX_ROT_I8); + emitByte(0xc8 + instr.dst); + emitByte(instr.getImm32() & 63); + break; + case RandomX::LightInstructionType::IADD_C7: + emit(REX_81); + emitByte(0xc0 + instr.dst); + emit32(instr.getImm32()); + break; + case RandomX::LightInstructionType::IXOR_C7: + emit(REX_XOR_RI); + 
emitByte(0xf0 + instr.dst); + emit32(instr.getImm32()); + break; + case RandomX::LightInstructionType::IADD_C8: + emit(REX_81); + emitByte(0xc0 + instr.dst); + emit32(instr.getImm32()); + emit(NOP1); + break; + case RandomX::LightInstructionType::IXOR_C8: + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.getImm32()); + emit(NOP1); + break; + case RandomX::LightInstructionType::IADD_C9: + emit(REX_81); + emitByte(0xc0 + instr.dst); + emit32(instr.getImm32()); + emit(NOP2); + break; + case RandomX::LightInstructionType::IXOR_C9: + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.getImm32()); + emit(NOP2); + break; + case RandomX::LightInstructionType::IMULH_R: + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe0 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + break; + case RandomX::LightInstructionType::ISMULH_R: + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe8 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + break; + case RandomX::LightInstructionType::IMUL_RCP: + emit(MOV_RAX_I); + emit64(reciprocal(instr.getImm32())); + emit(REX_IMUL_RM); + emitByte(0xc0 + 8 * instr.dst); + break; + default: + UNREACHABLE; + } + } + + template void JitCompilerX86::generateCode(Instruction& instr, int i); + void JitCompilerX86::genAddressReg(Instruction& instr, bool rax = true) { emit(REX_MOV_RR); emitByte((rax ? 
0xc0 : 0xc8) + instr.src); diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index 16fe26d..9240cfe 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -39,6 +39,7 @@ namespace RandomX { JitCompilerX86(); ~JitCompilerX86(); void generateProgram(Program&); + template void generateProgramLight(Program&); template void generateSuperScalarHash(LightProgram (&programs)[N]); @@ -66,7 +67,7 @@ namespace RandomX { Instruction& instr = prog(i); instr.src %= RegistersCount; instr.dst %= RegistersCount; - generateCode(instr, i); + generateCode

(instr, i); } } @@ -81,6 +82,8 @@ namespace RandomX { void genSIB(int scale, int index, int base); void handleCondition(Instruction&, int); + + template void generateCode(Instruction&, int); void emitByte(uint8_t val) { diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index d5ebadf..eeb09de 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -29,23 +29,6 @@ along with RandomX. If not, see. #include "LightProgramGenerator.hpp" namespace RandomX { - // Intel Ivy Bridge reference - namespace LightInstructionType { //uOPs (decode) execution ports latency code size - constexpr int IADD_RS = 0; //1 p01 1 4 - constexpr int ISUB_R = 1; //1 p015 1 3 - constexpr int ISUB_C = 2; //1 p015 3 7 - constexpr int IMUL_R = 3; //1 p1 3 4 - constexpr int IMUL_C = 4; //1 p1 3 7 - constexpr int IMULH_R = 5; //1+2+1 0+(p1,p5)+0 3 3+3+3 - constexpr int ISMULH_R = 6; //1+2+1 0+(p1,p5)+0 3 3+3+3 - constexpr int IMUL_RCP = 7; //1+1 p015+p1 4 10+4 - constexpr int IXOR_R = 8; //1 p015 1 3 - constexpr int IXOR_C = 9; //1 p015 1 7 - constexpr int IROR_R = 10; //1+2 0+(p0,p5) 1 3+3 - constexpr int IROR_C = 11; //1 p05 1 4 - constexpr int COND_R = 12; //1+1+1+1+1+1 p015+p5+0+p015+p05+p015 3 7+13+3+7+3+3 - constexpr int COUNT = 13; - } namespace LightInstructionOpcode { constexpr int IADD_RS = 0; @@ -62,7 +45,7 @@ namespace RandomX { } static bool isMul(int type) { - return type == LightInstructionType::IMUL_R || type == LightInstructionType::IMUL_C || type == LightInstructionType::IMULH_R || type == LightInstructionType::ISMULH_R || type == LightInstructionType::IMUL_RCP; + return type == LightInstructionType::IMUL_R || type == LightInstructionType::IMULH_R || type == LightInstructionType::ISMULH_R || type == LightInstructionType::IMUL_RCP; } const int lightInstructionOpcode[] = { @@ -289,19 +272,20 @@ namespace RandomX { int getSrcOp() const { return srcOp_; } - static const LightInstructionInfo IADD_RS; static const LightInstructionInfo 
ISUB_R; - static const LightInstructionInfo ISUB_C; + static const LightInstructionInfo IXOR_R; + static const LightInstructionInfo IADD_RS; static const LightInstructionInfo IMUL_R; - static const LightInstructionInfo IMUL_C; + static const LightInstructionInfo IROR_C; + static const LightInstructionInfo IADD_C7; + static const LightInstructionInfo IXOR_C7; + static const LightInstructionInfo IADD_C8; + static const LightInstructionInfo IXOR_C8; + static const LightInstructionInfo IADD_C9; + static const LightInstructionInfo IXOR_C9; static const LightInstructionInfo IMULH_R; static const LightInstructionInfo ISMULH_R; static const LightInstructionInfo IMUL_RCP; - static const LightInstructionInfo IXOR_R; - static const LightInstructionInfo IXOR_C; - static const LightInstructionInfo IROR_R; - static const LightInstructionInfo IROR_C; - static const LightInstructionInfo COND_R; static const LightInstructionInfo NOP; private: const char* name_; @@ -316,28 +300,31 @@ namespace RandomX { : name_(name), type_(-1), latency_(0) {} }; - const LightInstructionInfo LightInstructionInfo::IADD_RS = LightInstructionInfo("IADD_RS", LightInstructionType::IADD_RS, MacroOp::Lea_sib, 0); const LightInstructionInfo LightInstructionInfo::ISUB_R = LightInstructionInfo("ISUB_R", LightInstructionType::ISUB_R, MacroOp::Sub_rr, 0); - const LightInstructionInfo LightInstructionInfo::ISUB_C = LightInstructionInfo("ISUB_C", LightInstructionType::ISUB_C, MacroOp::Sub_ri, -1); + const LightInstructionInfo LightInstructionInfo::IXOR_R = LightInstructionInfo("IXOR_R", LightInstructionType::IXOR_R, MacroOp::Xor_rr, 0); + const LightInstructionInfo LightInstructionInfo::IADD_RS = LightInstructionInfo("IADD_RS", LightInstructionType::IADD_RS, MacroOp::Lea_sib, 0); const LightInstructionInfo LightInstructionInfo::IMUL_R = LightInstructionInfo("IMUL_R", LightInstructionType::IMUL_R, MacroOp::Imul_rr, 0); - const LightInstructionInfo LightInstructionInfo::IMUL_C = LightInstructionInfo("IMUL_C", 
LightInstructionType::IMUL_C, MacroOp::Imul_rri, -1); + const LightInstructionInfo LightInstructionInfo::IROR_C = LightInstructionInfo("IROR_C", LightInstructionType::IROR_C, MacroOp::Ror_ri, -1); + + const LightInstructionInfo LightInstructionInfo::IADD_C7 = LightInstructionInfo("IADD_C7", LightInstructionType::IADD_C7, MacroOp::Add_ri, -1); + const LightInstructionInfo LightInstructionInfo::IXOR_C7 = LightInstructionInfo("IXOR_C7", LightInstructionType::IXOR_C7, MacroOp::Xor_ri, -1); + const LightInstructionInfo LightInstructionInfo::IADD_C8 = LightInstructionInfo("IADD_C8", LightInstructionType::IADD_C8, MacroOp::Add_ri, -1); + const LightInstructionInfo LightInstructionInfo::IXOR_C8 = LightInstructionInfo("IXOR_C8", LightInstructionType::IXOR_C8, MacroOp::Xor_ri, -1); + const LightInstructionInfo LightInstructionInfo::IADD_C9 = LightInstructionInfo("IADD_C9", LightInstructionType::IADD_C9, MacroOp::Add_ri, -1); + const LightInstructionInfo LightInstructionInfo::IXOR_C9 = LightInstructionInfo("IXOR_C9", LightInstructionType::IXOR_C9, MacroOp::Xor_ri, -1); + const LightInstructionInfo LightInstructionInfo::IMULH_R = LightInstructionInfo("IMULH_R", LightInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1); const LightInstructionInfo LightInstructionInfo::ISMULH_R = LightInstructionInfo("ISMULH_R", LightInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1); const LightInstructionInfo LightInstructionInfo::IMUL_RCP = LightInstructionInfo("IMUL_RCP", LightInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1); - const LightInstructionInfo LightInstructionInfo::IXOR_R = LightInstructionInfo("IXOR_R", LightInstructionType::IXOR_R, MacroOp::Xor_rr, 0); - const LightInstructionInfo LightInstructionInfo::IXOR_C = LightInstructionInfo("IXOR_C", LightInstructionType::IXOR_C, MacroOp::Xor_ri, -1); - const LightInstructionInfo LightInstructionInfo::IROR_R = LightInstructionInfo("IROR_R", LightInstructionType::IROR_R, IROR_R_ops_array, 1, 1, 0); - const 
LightInstructionInfo LightInstructionInfo::IROR_C = LightInstructionInfo("IROR_C", LightInstructionType::IROR_C, MacroOp::Ror_ri, -1); - const LightInstructionInfo LightInstructionInfo::COND_R = LightInstructionInfo("COND_R", LightInstructionType::COND_R, COND_R_ops_array, 5, 5, 3); + const LightInstructionInfo LightInstructionInfo::NOP = LightInstructionInfo("NOP"); - const int buffer0[] = { 3, 3, 10 }; + const int buffer0[] = { 4, 8, 4 }; const int buffer1[] = { 7, 3, 3, 3 }; - const int buffer2[] = { 3, 3, 3, 7 }; + const int buffer2[] = { 3, 7, 3, 3 }; + const int buffer3[] = { 4, 9, 3 }; const int buffer4[] = { 4, 4, 4, 4 }; - const int buffer5[] = { 3, 7, 3, 3 }; - const int buffer6[] = { 3, 3, 7, 3 }; - const int buffer7[] = { 13, 3 }; + const int buffer5[] = { 3, 3, 10 }; class DecoderBuffer { public: @@ -360,16 +347,10 @@ namespace RandomX { const DecoderBuffer* fetchNext(int instrType, int cycle, int mulCount, Blake2Generator& gen) const { if (instrType == LightInstructionType::IMULH_R || instrType == LightInstructionType::ISMULH_R) return &decodeBuffer3310; //2-1-1 decode - if (mulCount < cycle) - return &decodeBuffer4444_mul; - if (index_ == 0) { - return &decodeBuffer4444; //IMUL_RCP end - } - /*if (index_ == 2) { - return &decodeBuffer133; //COND_R middle - }*/ - if (index_ == 7) { - return &decodeBuffer7333; //COND_R end + if (mulCount < cycle + 1) + return &decodeBuffer4444; + if (index_ == 5) { //IMUL_RCP end + return (gen.getByte() & 1) ? 
&decodeBuffer484 : &decodeBuffer493; } return fetchNextDefault(gen); } @@ -379,49 +360,40 @@ namespace RandomX { const int* counts_; int opsCount_; DecoderBuffer() : index_(-1) {} - static const DecoderBuffer decodeBuffer3310; + static const DecoderBuffer decodeBuffer484; static const DecoderBuffer decodeBuffer7333; - static const DecoderBuffer decodeBuffer3337; - static const DecoderBuffer decodeBuffer4444; - static const DecoderBuffer decodeBuffer4444_mul; static const DecoderBuffer decodeBuffer3733; - static const DecoderBuffer decodeBuffer3373; - static const DecoderBuffer decodeBuffer133; - static const DecoderBuffer* decodeBuffers[7]; + static const DecoderBuffer decodeBuffer493; + static const DecoderBuffer decodeBuffer4444; + static const DecoderBuffer decodeBuffer3310; + static const DecoderBuffer* decodeBuffers[4]; const DecoderBuffer* fetchNextDefault(Blake2Generator& gen) const { - int select; - //do { - select = gen.getByte() & 3; - //} while (select == 7); - return decodeBuffers[select]; + return decodeBuffers[gen.getByte() & 3]; } }; - const DecoderBuffer DecoderBuffer::decodeBuffer3310 = DecoderBuffer("3,3,10", 0, buffer0); + const DecoderBuffer DecoderBuffer::decodeBuffer484 = DecoderBuffer("4,8,4", 0, buffer0); const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer("7,3,3,3", 1, buffer1); - const DecoderBuffer DecoderBuffer::decodeBuffer3337 = DecoderBuffer("3,3,3,7", 2, buffer2); - const DecoderBuffer DecoderBuffer::decodeBuffer4444_mul = DecoderBuffer("4,4,4,4-MUL", 3, buffer4); + const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 2, buffer2); + const DecoderBuffer DecoderBuffer::decodeBuffer493 = DecoderBuffer("4,9,3", 3, buffer3); const DecoderBuffer DecoderBuffer::decodeBuffer4444 = DecoderBuffer("4,4,4,4", 4, buffer4); - - const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 5, buffer5); - const DecoderBuffer DecoderBuffer::decodeBuffer3373 = DecoderBuffer("3,3,7,3", 6, 
buffer6); - const DecoderBuffer DecoderBuffer::decodeBuffer133 = DecoderBuffer("13,3", 7, buffer7); + const DecoderBuffer DecoderBuffer::decodeBuffer3310 = DecoderBuffer("3,3,10", 5, buffer5); - const DecoderBuffer* DecoderBuffer::decodeBuffers[7] = { - &DecoderBuffer::decodeBuffer3310, - &DecoderBuffer::decodeBuffer3337, + const DecoderBuffer* DecoderBuffer::decodeBuffers[4] = { + &DecoderBuffer::decodeBuffer484, + &DecoderBuffer::decodeBuffer7333, &DecoderBuffer::decodeBuffer3733, - &DecoderBuffer::decodeBuffer3373, + &DecoderBuffer::decodeBuffer493, }; const DecoderBuffer DecoderBuffer::Default = DecoderBuffer(); const LightInstructionInfo* slot_3[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R }; const LightInstructionInfo* slot_3L[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R }; - const LightInstructionInfo* slot_3C[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IROR_R, &LightInstructionInfo::IXOR_R }; const LightInstructionInfo* slot_4[] = { &LightInstructionInfo::IROR_C, &LightInstructionInfo::IADD_RS }; - const LightInstructionInfo* slot_7[] = { &LightInstructionInfo::IXOR_C, &LightInstructionInfo::ISUB_C }; - const LightInstructionInfo* slot_7L = &LightInstructionInfo::COND_R; + const LightInstructionInfo* slot_7[] = { &LightInstructionInfo::IXOR_C7, &LightInstructionInfo::IADD_C7 }; + const LightInstructionInfo* slot_8[] = { &LightInstructionInfo::IXOR_C8, &LightInstructionInfo::IADD_C8 }; + const LightInstructionInfo* slot_9[] = { &LightInstructionInfo::IXOR_C9, &LightInstructionInfo::IADD_C9 }; const LightInstructionInfo* slot_10 = &LightInstructionInfo::IMUL_RCP; static bool selectRegister(std::vector& availableRegisters, Blake2Generator& gen, int& reg) { @@ -443,7 +415,7 @@ namespace RandomX { class LightInstruction { public: void toInstr(Instruction& instr) { - instr.opcode = 
lightInstructionOpcode[getType()]; + instr.opcode = getType(); instr.dst = dst_; instr.src = src_ >= 0 ? src_ : dst_; instr.mod = mod_; @@ -457,28 +429,22 @@ namespace RandomX { if (isLast) { return create(slot_3L[gen.getByte() & 3], gen); } - else if (false && isFirst && fetchType == 0) { - return create(slot_3C[gen.getByte() & 3], gen); - } else { return create(slot_3[gen.getByte() & 1], gen); } case 4: - if (fetchType == 3 && !isLast) { + if (fetchType == 4 && !isLast) { return create(&LightInstructionInfo::IMUL_R, gen); } else { return create(slot_4[gen.getByte() & 1], gen); } case 7: - if (false && isLast) { - return create(slot_7L, gen); - } - if (false && isFirst) { - return create(&LightInstructionInfo::IMUL_C, gen); - } else { - return create(slot_7[gen.getByte() & 1], gen); - } + return create(slot_7[gen.getByte() & 1], gen); + case 8: + return create(slot_8[gen.getByte() & 1], gen); + case 9: + return create(slot_9[gen.getByte() & 1], gen); case 10: return create(slot_10, gen); default: @@ -490,13 +456,6 @@ namespace RandomX { LightInstruction li(info); switch (info->getType()) { - case LightInstructionType::IADD_RS: { - li.mod_ = gen.getByte(); - li.imm32_ = 0; - li.opGroup_ = LightInstructionType::IADD_RS; - li.groupParIsSource_ = true; - } break; - case LightInstructionType::ISUB_R: { li.mod_ = 0; li.imm32_ = 0; @@ -504,24 +463,51 @@ namespace RandomX { li.groupParIsSource_ = true; } break; - case LightInstructionType::ISUB_C: { + case LightInstructionType::IXOR_R: { li.mod_ = 0; - li.imm32_ = gen.getInt32(); - li.opGroup_ = LightInstructionType::ISUB_C; - li.opGroupPar_ = -1; + li.imm32_ = 0; + li.opGroup_ = LightInstructionType::IXOR_R; + li.groupParIsSource_ = true; + } break; + + case LightInstructionType::IADD_RS: { + li.mod_ = gen.getByte(); + li.imm32_ = 0; + li.opGroup_ = LightInstructionType::IADD_RS; + li.groupParIsSource_ = true; } break; case LightInstructionType::IMUL_R: { li.mod_ = 0; li.imm32_ = 0; li.opGroup_ = 
LightInstructionType::IMUL_R; - li.opGroupPar_ = gen.getInt32(); + li.opGroupPar_ = -1; //TODO } break; - case LightInstructionType::IMUL_C: { + case LightInstructionType::IROR_C: { + li.mod_ = 0; + do { + li.imm32_ = gen.getByte() & 63; + } while (li.imm32_ == 0); + li.opGroup_ = LightInstructionType::IROR_C; + li.opGroupPar_ = -1; + } break; + + case LightInstructionType::IADD_C7: + case LightInstructionType::IADD_C8: + case LightInstructionType::IADD_C9: { li.mod_ = 0; li.imm32_ = gen.getInt32(); - li.opGroup_ = LightInstructionType::IMUL_C; + li.opGroup_ = LightInstructionType::IADD_C7; + li.opGroupPar_ = -1; + } break; + + case LightInstructionType::IXOR_C7: + case LightInstructionType::IXOR_C8: + case LightInstructionType::IXOR_C9: { + li.mod_ = 0; + li.imm32_ = gen.getInt32(); + li.opGroup_ = LightInstructionType::IXOR_C7; li.opGroupPar_ = -1; } break; @@ -542,50 +528,14 @@ namespace RandomX { } break; case LightInstructionType::IMUL_RCP: { - li.mod_ = 0; - li.imm32_ = gen.getInt32(); - li.opGroup_ = LightInstructionType::IMUL_C; - li.opGroupPar_ = -1; - } break; - - case LightInstructionType::IXOR_R: { - li.mod_ = 0; - li.imm32_ = 0; - li.opGroup_ = LightInstructionType::IXOR_R; - li.groupParIsSource_ = true; - } break; - - case LightInstructionType::IXOR_C: { - li.mod_ = 0; - li.imm32_ = gen.getInt32(); - li.opGroup_ = LightInstructionType::IXOR_R; - li.opGroupPar_ = -1; - } break; - - case LightInstructionType::IROR_R: { - li.mod_ = 0; - li.imm32_ = 0; - li.opGroup_ = LightInstructionType::IROR_R; - li.opGroupPar_ = -1; - } break; - - case LightInstructionType::IROR_C: { li.mod_ = 0; do { - li.imm32_ = gen.getByte(); - } while ((li.imm32_ & 63) == 0); - li.opGroup_ = LightInstructionType::IROR_R; + li.imm32_ = gen.getInt32(); + } while ((li.imm32_ & (li.imm32_ - 1)) == 0); + li.opGroup_ = LightInstructionType::IMUL_RCP; li.opGroupPar_ = -1; } break; - case LightInstructionType::COND_R: { - li.canReuse_ = true; - li.mod_ = gen.getByte(); - li.imm32_ = 
gen.getInt32(); - li.opGroup_ = LightInstructionType::COND_R; - li.opGroupPar_ = li.imm32_; - } break; - default: break; } @@ -675,8 +625,10 @@ namespace RandomX { constexpr int CYCLE_MAP_SIZE = RANDOMX_LPROG_LATENCY + 3; #ifndef _DEBUG constexpr bool TRACE = false; + constexpr bool INFO = false; #else constexpr bool TRACE = true; + constexpr bool INFO = true; #endif static int blakeCounter = 0; @@ -806,6 +758,7 @@ namespace RandomX { int codeSize = 0; int macroOpCount = 0; int cycle = 0; + int fetchCycle = 0; int depCycle = 0; int retireCycle = 0; int mopIndex = 0; @@ -816,7 +769,7 @@ namespace RandomX { constexpr int MAX_ATTEMPTS = 4; while(!portsSaturated) { - fetchLine = fetchLine->fetchNext(currentInstruction.getType(), cycle, mulCount, gen); + fetchLine = fetchLine->fetchNext(currentInstruction.getType(), fetchCycle++, mulCount, gen); if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine->getName() << ")" << std::endl; mopIndex = 0; @@ -833,7 +786,6 @@ namespace RandomX { MacroOp& mop = currentInstruction.getInfo().getOp(instrIndex); if (fetchLine->getCounts()[mopIndex] != mop.getSize()) { if (TRACE) std::cout << "ERROR instruction " << mop.getName() << " doesn't fit into slot of size " << fetchLine->getCounts()[mopIndex] << std::endl; - return DBL_MIN; } if (TRACE) std::cout << mop.getName() << " "; @@ -899,8 +851,8 @@ namespace RandomX { ++cycle; } - std::cout << "; ALU port utilization:" << std::endl; - std::cout << "; (* = in use, _ = idle)" << std::endl; + if(INFO) std::cout << "; ALU port utilization:" << std::endl; + if (INFO) std::cout << "; (* = in use, _ = idle)" << std::endl; int portCycles = 0; /*for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { @@ -914,13 +866,13 @@ namespace RandomX { double ipc = (macroOpCount / (double)retireCycle); - std::cout << "; code size " << codeSize << " bytes" << std::endl; - std::cout << "; x86 macro-ops: " << macroOpCount << std::endl; - std::cout << "; RandomX instructions: " << outIndex << 
std::endl; - std::cout << "; Execution time: " << retireCycle << " cycles" << std::endl; - std::cout << "; IPC = " << ipc << std::endl; - std::cout << "; Port-cycles: " << portCycles << std::endl; - std::cout << "; Multiplications: " << mulCount << std::endl; + if (INFO) std::cout << "; code size " << codeSize << " bytes" << std::endl; + if (INFO) std::cout << "; x86 macro-ops: " << macroOpCount << std::endl; + if (INFO) std::cout << "; RandomX instructions: " << outIndex << std::endl; + if (INFO) std::cout << "; Execution time: " << retireCycle << " cycles" << std::endl; + if (INFO) std::cout << "; IPC = " << ipc << std::endl; + if (INFO) std::cout << "; Port-cycles: " << portCycles << std::endl; + if (INFO) std::cout << "; Multiplications: " << mulCount << std::endl; int asicLatency[8]; memset(asicLatency, 0, sizeof(asicLatency)); @@ -942,19 +894,21 @@ namespace RandomX { } } - std::cout << "; ASIC latency: " << asicLatencyFinal << std::endl; + if (INFO) std::cout << "; ASIC latency: " << asicLatencyFinal << std::endl; - std::cout << "; ASIC latency:" << std::endl; - for (int i = 0; i < 8; ++i) { - std::cout << "; r" << i << " = " << asicLatency[i] << std::endl; - } - std::cout << "; CPU latency:" << std::endl; - for (int i = 0; i < 8; ++i) { - std::cout << "; r" << i << " = " << registers[i].latency << std::endl; + if (INFO) { + std::cout << "; ASIC latency:" << std::endl; + for (int i = 0; i < 8; ++i) { + std::cout << "; r" << i << " = " << asicLatency[i] << std::endl; + } + if (INFO) std::cout << "; CPU latency:" << std::endl; + for (int i = 0; i < 8; ++i) { + std::cout << "; r" << i << " = " << registers[i].latency << std::endl; + } } prog.setSize(outIndex); prog.setAddressRegister(addressReg); - return addressReg; + return outIndex; } } \ No newline at end of file diff --git a/src/LightProgramGenerator.hpp b/src/LightProgramGenerator.hpp index e7b1bda..d920dd0 100644 --- a/src/LightProgramGenerator.hpp +++ b/src/LightProgramGenerator.hpp @@ -21,6 +21,27 @@ 
along with RandomX. If not, see. namespace RandomX { + // Intel Ivy Bridge reference + namespace LightInstructionType { //uOPs (decode) execution ports latency code size + constexpr int ISUB_R = 0; //1 p015 1 3 + constexpr int IXOR_R = 1; //1 p015 1 3 + constexpr int IADD_RS = 2; //1 p01 1 4 + constexpr int IMUL_R = 3; //1 p1 3 4 + constexpr int IROR_C = 4; //1 p05 1 4 + constexpr int IADD_C7 = 5; //1 p015 1 7 + constexpr int IXOR_C7 = 6; //1 p015 1 7 + constexpr int IADD_C8 = 7; //1+0 p015 1 8 + constexpr int IXOR_C8 = 8; //1+0 p015 1 8 + constexpr int IADD_C9 = 9; //1+0 p015 1 9 + constexpr int IXOR_C9 = 10; //1+0 p015 1 9 + constexpr int IMULH_R = 11; //1+2+1 0+(p1,p5)+0 3 3+3+3 + constexpr int ISMULH_R = 12; //1+2+1 0+(p1,p5)+0 3 3+3+3 + constexpr int IMUL_RCP = 13; //1+1 p015+p1 4 10+4 + + constexpr int COUNT = 14; + constexpr int INVALID = -1; + } + class Blake2Generator { public: Blake2Generator(const void* seed, int nonce); diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp index 00a14de..1edacdb 100644 --- a/src/VirtualMachine.hpp +++ b/src/VirtualMachine.hpp @@ -24,13 +24,11 @@ along with RandomX. If not, see. 
namespace RandomX { - - class VirtualMachine { public: VirtualMachine(); virtual ~VirtualMachine() {} - virtual void setDataset(dataset_t ds, uint64_t size) = 0; + virtual void setDataset(dataset_t ds, uint64_t size, LightProgram (&programs)[RANDOMX_CACHE_ACCESSES]) = 0; void setScratchpad(void* ptr) { scratchpad = (uint8_t*)ptr; } diff --git a/src/main.cpp b/src/main.cpp index d5e4657..9410881 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -205,7 +205,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, Atomi } int main(int argc, char** argv) { - bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit, genLight, useSuperscalar; + bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit, genSuperscalar, useSuperscalar; int programCount, threadCount, initThreadCount, epoch; readOption("--softAes", argc, argv, softAes); @@ -220,15 +220,15 @@ int main(int argc, char** argv) { readOption("--jit", argc, argv, jit); readOption("--genNative", argc, argv, genNative); readOption("--help", argc, argv, help); - readOption("--genLight", argc, argv, genLight); + readOption("--genSuperscalar", argc, argv, genSuperscalar); readOption("--useSuperscalar", argc, argv, useSuperscalar); - if (genLight) { + if (genSuperscalar) { RandomX::LightProgram p; RandomX::Blake2Generator gen(seed, programCount); RandomX::generateLightProg2(p, gen); RandomX::AssemblyGeneratorX86 asmX86; - asmX86.generateProgram(p); + asmX86.generateAsm(p); //std::ofstream file("lightProg2.asm"); asmX86.printCode(std::cout); return 0; @@ -266,6 +266,7 @@ int main(int argc, char** argv) { const uint64_t cacheSize = (RANDOMX_ARGON_MEMORY + RANDOMX_ARGON_GROWTH * epoch) * RandomX::ArgonBlockSize; const uint64_t datasetSize = (RANDOMX_DATASET_SIZE + RANDOMX_DS_GROWTH * epoch); dataset.cache.size = cacheSize; + RandomX::LightProgram programs[RANDOMX_CACHE_ACCESSES]; std::cout << "RandomX - " << (miningMode ? 
"mining" : "verification") << " mode" << std::endl; @@ -282,6 +283,12 @@ int main(int argc, char** argv) { outputHex(std::cout, (char*)dataset.cache.memory, sizeof(__m128i)); std::cout << std::endl; } + if (useSuperscalar) { + RandomX::Blake2Generator gen(seed, programCount); + for (int i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { + RandomX::generateLightProg2(programs[i], gen); + } + } if (!miningMode) { std::cout << "Cache (" << cacheSize << " bytes) initialized in " << sw.getElapsed() << " s" << std::endl; } @@ -291,11 +298,6 @@ int main(int argc, char** argv) { RandomX::datasetAlloc(dataset, largePages); const uint64_t datasetBlockCount = datasetSize / RandomX::CacheLineSize; if (useSuperscalar) { - RandomX::Blake2Generator gen(seed, programCount); - RandomX::LightProgram programs[RANDOMX_CACHE_ACCESSES]; - for (int i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { - RandomX::generateLightProg2(programs[i], gen); - } RandomX::JitCompilerX86 jit86; jit86.generateSuperScalarHash(programs); jit86.getDatasetInitFunc()(cache.memory, dataset.dataset.memory, 0, datasetBlockCount); @@ -320,7 +322,6 @@ int main(int argc, char** argv) { threads.clear(); std::cout << "Dataset (" << datasetSize << " bytes) initialized in " << sw.getElapsed() << " s" << std::endl; } - return 0; std::cout << "Initializing " << threadCount << " virtual machine(s) ..." 
<< std::endl; for (int i = 0; i < threadCount; ++i) { RandomX::VirtualMachine* vm; @@ -328,12 +329,14 @@ int main(int argc, char** argv) { vm = new RandomX::CompiledVirtualMachine(); } else { - if (jit) - vm = new RandomX::CompiledLightVirtualMachine(); + if (jit && useSuperscalar) + vm = new RandomX::CompiledLightVirtualMachine(); + else if(jit) + vm = new RandomX::CompiledLightVirtualMachine(); else vm = new RandomX::InterpretedVirtualMachine(softAes); } - vm->setDataset(dataset, datasetSize); + vm->setDataset(dataset, datasetSize, programs); vms.push_back(vm); } uint8_t* scratchpadMem; From 2132e5fef5a47b3870aa4e655750ef46ce9d5f82 Mon Sep 17 00:00:00 2001 From: tevador Date: Thu, 11 Apr 2019 00:01:22 +0200 Subject: [PATCH 12/18] SuperscalarHash interpreter Linux assembly code --- makefile | 5 +- src/Instruction.hpp | 2 +- src/InterpretedVirtualMachine.cpp | 173 ++++++++++++++++--- src/InterpretedVirtualMachine.hpp | 26 ++- src/JitCompilerX86-static.S | 88 ++++++++++ src/JitCompilerX86-static.asm | 28 +-- src/LightProgramGenerator.cpp | 5 +- src/asm/program_read_dataset_sshash_fin.inc | 10 ++ src/asm/program_read_dataset_sshash_init.inc | 16 ++ src/asm/program_sshash_constants.inc | 24 ++- src/main.cpp | 7 +- 11 files changed, 310 insertions(+), 74 deletions(-) create mode 100644 src/asm/program_read_dataset_sshash_fin.inc create mode 100644 src/asm/program_read_dataset_sshash_init.inc diff --git a/makefile b/makefile index cd49f88..7dde5ae 100644 --- a/makefile +++ b/makefile @@ -9,7 +9,7 @@ OBJDIR=obj LDFLAGS=-lpthread CPPSRC=src/argon2_core.c src/Cache.cpp src/divideByConstantCodegen.c src/Instruction.cpp src/JitCompilerX86.cpp src/Program.cpp src/VirtualMachine.cpp src/argon2_ref.c src/CompiledVirtualMachine.cpp src/executeProgram-linux.cpp src/instructionsPortable.cpp src/LightClientAsyncWorker.cpp src/softAes.cpp src/virtualMemory.cpp src/AssemblyGeneratorX86.cpp src/dataset.cpp src/hashAes1Rx4.cpp src/InterpretedVirtualMachine.cpp src/main.cpp 
src/TestAluFpu.cpp src/blake2/blake2b.c TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o) -ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o CompiledLightVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o Cache.o virtualMemory.o reciprocal.o LightClientAsyncWorker.o hashAes1Rx4.o) +ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o CompiledLightVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o softAes.o VirtualMachine.o Cache.o virtualMemory.o reciprocal.o LightClientAsyncWorker.o hashAes1Rx4.o LightProgramGenerator.o) ifeq ($(PLATFORM),amd64) ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o CXXFLAGS += -maes @@ -99,6 +99,9 @@ $(OBJDIR)/InterpretedVirtualMachine.o: $(addprefix $(SRCDIR)/,InterpretedVirtual $(OBJDIR)/LightClientAsyncWorker.o: $(addprefix $(SRCDIR)/,LightClientAsyncWorker.cpp LightClientAsyncWorker.hpp common.hpp) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/LightClientAsyncWorker.cpp -o $@ + +$(OBJDIR)/LightProgramGenerator.o: $(addprefix $(SRCDIR)/,LightProgramGenerator.cpp LightProgramGenerator.hpp Program.hpp blake2/blake2.h blake2/endian.h configuration.h) | $(OBJDIR) + $(CXX) $(CXXFLAGS) -c $(SRCDIR)/LightProgramGenerator.cpp -o $@ $(OBJDIR)/main.o: $(addprefix $(SRCDIR)/,main.cpp InterpretedVirtualMachine.hpp Stopwatch.hpp blake2/blake2.h VirtualMachine.hpp common.hpp blake2/endian.h Program.hpp Instruction.hpp intrinPortable.h CompiledVirtualMachine.hpp JitCompilerX86.hpp AssemblyGeneratorX86.hpp dataset.hpp Cache.hpp virtualMemory.hpp hashAes1Rx4.hpp softAes.h configuration.h) | $(OBJDIR) $(CXX) $(CXXFLAGS) -c $(SRCDIR)/main.cpp -o $@ diff --git a/src/Instruction.hpp b/src/Instruction.hpp index 65d1c8a..9baf8ce 100644 
--- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -30,7 +30,7 @@ namespace RandomX { typedef void(Instruction::*InstructionVisualizer)(std::ostream&) const; namespace InstructionType { - constexpr int IADD_R = 0; + constexpr int IADD_RS = 0; constexpr int IADD_M = 1; constexpr int IADD_RC = 2; constexpr int ISUB_R = 3; diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 636b95b..7ee00ba 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -36,6 +36,7 @@ along with RandomX. If not, see. #ifdef STATS #include #endif +#include "LightProgramGenerator.hpp" #ifdef FPUCHECK constexpr bool fpuCheck = true; @@ -45,17 +46,20 @@ constexpr bool fpuCheck = false; namespace RandomX { - InterpretedVirtualMachine::~InterpretedVirtualMachine() { - - } - - void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { + template + void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { mem.ds = ds; readDataset = &datasetReadLight; datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize; + if(superscalar) + precompileSuperscalar(programs); } - void InterpretedVirtualMachine::initialize() { + template void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + template void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + + template + void InterpretedVirtualMachine::initialize() { VirtualMachine::initialize(); for (unsigned i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) { program(i).src %= RegistersCount; @@ -63,12 +67,19 @@ namespace RandomX { } } - void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + template void InterpretedVirtualMachine::initialize(); + template void 
InterpretedVirtualMachine::initialize(); + + template + void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { for (int ic = 0; ic < RANDOMX_PROGRAM_SIZE; ++ic) { executeBytecode(ic, r, f, e, a); } } + template void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + template void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + static void print(int_reg_t r) { std::cout << std::hex << std::setw(16) << std::setfill('0') << r << std::endl; } @@ -98,14 +109,15 @@ namespace RandomX { return std::fpclassify(x) == FP_SUBNORMAL; } - FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int& ic, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + template + FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int& ic, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { auto& ibc = byteCode[ic]; if (trace) std::cout << std::dec << std::setw(3) << ic << " " << program(ic); //if(trace) printState(r, f, e, a); switch (ibc.type) { - case InstructionType::IADD_R: { - *ibc.idst += *ibc.isrc; + case InstructionType::IADD_RS: { + *ibc.idst += (*ibc.isrc << ibc.shift) + ibc.imm; } break; case InstructionType::IADD_M: { @@ -289,7 +301,8 @@ namespace RandomX { #endif } - void InterpretedVirtualMachine::execute() { + template + void InterpretedVirtualMachine::execute() { int_reg_t r[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; __m128d f[4]; __m128d e[4]; @@ -350,11 +363,16 @@ namespace RandomX { mem.mx ^= r[readReg2] ^ r[readReg3]; mem.mx &= CacheLineAlignMask; - Cache& cache = mem.ds.cache; - uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; - initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8); - for (int i = 0; i < RegistersCount; ++i) - r[i] ^= datasetLine[i]; + if (superscalar) { + 
executeSuperscalar(datasetBase + mem.ma / CacheLineSize, r); + } + else { + Cache& cache = mem.ds.cache; + uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; + initBlock(cache, (uint8_t*)datasetLine, datasetBase + mem.ma / CacheLineSize, RANDOMX_CACHE_ACCESSES / 8); + for (int i = 0; i < RegistersCount; ++i) + r[i] ^= datasetLine[i]; + } std::swap(mem.mx, mem.ma); if (trace) { @@ -419,6 +437,9 @@ namespace RandomX { _mm_store_pd(®.e[3].lo, e[3]); } + template void InterpretedVirtualMachine::execute(); + template void InterpretedVirtualMachine::execute(); + static int getConditionRegister(int(®isterUsage)[8]) { int min = INT_MAX; int minIndex; @@ -431,9 +452,118 @@ namespace RandomX { return minIndex; } + constexpr uint64_t superscalarMul0 = 6364136223846793005ULL; + constexpr uint64_t superscalarAdd1 = 9298410992540426048ULL; + constexpr uint64_t superscalarAdd2 = 12065312585734608966ULL; + constexpr uint64_t superscalarAdd3 = 9306329213124610396ULL; + constexpr uint64_t superscalarAdd4 = 5281919268842080866ULL; + constexpr uint64_t superscalarAdd5 = 10536153434571861004ULL; + constexpr uint64_t superscalarAdd6 = 3398623926847679864ULL; + constexpr uint64_t superscalarAdd7 = 9549104520008361294ULL; + + static uint8_t* getMixBlock(uint64_t registerValue, Cache& cache) { + uint8_t* mixBlock; + if (RANDOMX_ARGON_GROWTH == 0) { + constexpr uint32_t mask = (RANDOMX_ARGON_MEMORY * ArgonBlockSize / CacheLineSize - 1); + mixBlock = cache.memory + (registerValue & mask) * CacheLineSize; + } + else { + const uint32_t modulus = cache.size / CacheLineSize; + mixBlock = cache.memory + (registerValue % modulus) * CacheLineSize; + } + return mixBlock; + } + + template + void InterpretedVirtualMachine::executeSuperscalar(uint32_t blockNumber, int_reg_t(&r)[8]) { + int_reg_t rl[8]; + uint8_t* mixBlock; + uint64_t registerValue = blockNumber; + rl[0] = (blockNumber + 1) * superscalarMul0; + rl[1] = rl[0] ^ superscalarAdd1; + rl[2] = rl[0] ^ superscalarAdd2; + rl[3] = rl[0] ^ 
superscalarAdd3; + rl[4] = rl[0] ^ superscalarAdd4; + rl[5] = rl[0] ^ superscalarAdd5; + rl[6] = rl[0] ^ superscalarAdd6; + rl[7] = rl[0] ^ superscalarAdd7; + Cache& cache = mem.ds.cache; + for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { + mixBlock = getMixBlock(registerValue, cache); + LightProgram& prog = superScalarPrograms[i]; + for (unsigned j = 0; j < prog.getSize(); ++j) { + Instruction& instr = prog(j); + switch (instr.opcode) + { + case RandomX::LightInstructionType::ISUB_R: + rl[instr.dst] -= rl[instr.src]; + break; + case RandomX::LightInstructionType::IXOR_R: + rl[instr.dst] ^= rl[instr.src]; + break; + case RandomX::LightInstructionType::IADD_RS: + rl[instr.dst] += rl[instr.src] << (instr.mod % 4); + break; + case RandomX::LightInstructionType::IMUL_R: + rl[instr.dst] *= rl[instr.src]; + break; + case RandomX::LightInstructionType::IROR_C: + rl[instr.dst] = rotr(rl[instr.dst], instr.getImm32()); + break; + case RandomX::LightInstructionType::IADD_C7: + case RandomX::LightInstructionType::IADD_C8: + case RandomX::LightInstructionType::IADD_C9: + rl[instr.dst] += signExtend2sCompl(instr.getImm32()); + break; + case RandomX::LightInstructionType::IXOR_C7: + case RandomX::LightInstructionType::IXOR_C8: + case RandomX::LightInstructionType::IXOR_C9: + rl[instr.dst] ^= signExtend2sCompl(instr.getImm32()); + break; + case RandomX::LightInstructionType::IMULH_R: + rl[instr.dst] = mulh(rl[instr.dst], rl[instr.src]); + break; + case RandomX::LightInstructionType::ISMULH_R: + rl[instr.dst] = smulh(rl[instr.dst], rl[instr.src]); + break; + case RandomX::LightInstructionType::IMUL_RCP: + rl[instr.dst] *= reciprocals[instr.getImm32()]; + break; + default: + UNREACHABLE; + } + } + + for(unsigned q = 0; q < 8; ++q) + rl[q] ^= load64(mixBlock + 8 * q); + + registerValue = rl[prog.getAddressRegister()]; + } + + for (unsigned q = 0; q < 8; ++q) + r[q] ^= rl[q]; + } + + template + void InterpretedVirtualMachine::precompileSuperscalar(LightProgram* programs) { + 
memcpy(superScalarPrograms, programs, sizeof(superScalarPrograms)); + reciprocals.clear(); + for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { + for (unsigned j = 0; j < superScalarPrograms[i].getSize(); ++j) { + Instruction& instr = superScalarPrograms[i](j); + if (instr.opcode == LightInstructionType::IMUL_RCP) { + auto rcp = reciprocal(instr.getImm32()); + instr.setImm32(reciprocals.size()); + reciprocals.push_back(rcp); + } + } + } + } + #include "instructionWeights.hpp" - void InterpretedVirtualMachine::precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + template + void InterpretedVirtualMachine::precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { int registerUsage[8]; for (unsigned i = 0; i < 8; ++i) { registerUsage[i] = -1; @@ -445,14 +575,17 @@ namespace RandomX { CASE_REP(IADD_RS) { auto dst = instr.dst % RegistersCount; auto src = instr.src % RegistersCount; - ibc.type = InstructionType::IADD_R; + ibc.type = InstructionType::IADD_RS; ibc.idst = &r[dst]; - if (src != dst) { + if (dst != 5) { ibc.isrc = &r[src]; + ibc.shift = instr.mod % 4; + ibc.imm = 0; } else { + ibc.isrc = &r[src]; + ibc.shift = instr.mod % 4; ibc.imm = signExtend2sCompl(instr.getImm32()); - ibc.isrc = &ibc.imm; } registerUsage[instr.dst] = i; } break; diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index 49178bc..24bb9c6 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -23,23 +23,17 @@ along with RandomX. If not, see. 
#include "VirtualMachine.hpp" #include "Program.hpp" #include "intrinPortable.h" +#include namespace RandomX { - class ITransform { - public: - virtual int32_t apply(int32_t) const = 0; - virtual const char* getName() const = 0; - virtual std::ostream& printAsm(std::ostream&) const = 0; - virtual std::ostream& printCxx(std::ostream&) const = 0; - }; - struct InstructionByteCode; - class InterpretedVirtualMachine; + template class InterpretedVirtualMachine; - typedef void(InterpretedVirtualMachine::*InstructionHandler)(Instruction&); + template + using InstructionHandler = void(InterpretedVirtualMachine::*)(Instruction&); - struct alignas(8) InstructionByteCode { + struct InstructionByteCode { union { int_reg_t* idst; __m128d* fdst; @@ -62,6 +56,7 @@ namespace RandomX { constexpr int asedwfagdewsa = sizeof(InstructionByteCode); + template class InterpretedVirtualMachine : public VirtualMachine { public: void* operator new(size_t size) { @@ -74,16 +69,17 @@ namespace RandomX { _mm_free(ptr); } InterpretedVirtualMachine(bool soft) : softAes(soft) {} - ~InterpretedVirtualMachine(); + ~InterpretedVirtualMachine() {} void setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; void initialize() override; void execute() override; private: - static InstructionHandler engine[256]; + static InstructionHandler engine[256]; DatasetReadFunc readDataset; bool softAes; InstructionByteCode byteCode[RANDOMX_PROGRAM_SIZE]; - + std::vector reciprocals; + alignas(64) LightProgram superScalarPrograms[RANDOMX_CACHE_ACCESSES]; #ifdef STATS int count_ADD_64 = 0; int count_ADD_32 = 0; @@ -131,7 +127,9 @@ namespace RandomX { int datasetAccess[256] = { 0 }; #endif void precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + void precompileSuperscalar(LightProgram*); void executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); void executeBytecode(int& i, int_reg_t(&r)[8], __m128d 
(&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + void executeSuperscalar(uint32_t blockNumber, int_reg_t(&r)[8]); }; } \ No newline at end of file diff --git a/src/JitCompilerX86-static.S b/src/JitCompilerX86-static.S index 9ccdb16..e78dbe7 100644 --- a/src/JitCompilerX86-static.S +++ b/src/JitCompilerX86-static.S @@ -32,10 +32,18 @@ .global DECL(randomx_program_start) .global DECL(randomx_program_read_dataset) .global DECL(randomx_program_read_dataset_light) +.global DECL(randomx_program_read_dataset_sshash_init) +.global DECL(randomx_program_read_dataset_sshash_fin) +.global DECL(randomx_program_read_dataset_light_sub) +.global DECL(randomx_dataset_init) .global DECL(randomx_program_loop_store) .global DECL(randomx_program_loop_end) .global DECL(randomx_program_read_dataset_light_sub) .global DECL(randomx_program_epilogue) +.global DECL(randomx_sshash_load) +.global DECL(randomx_sshash_prefetch) +.global DECL(randomx_sshash_end) +.global DECL(randomx_sshash_init) .global DECL(randomx_program_end) #define db .byte @@ -63,6 +71,12 @@ DECL(randomx_program_read_dataset): DECL(randomx_program_read_dataset_light): #include "asm/program_read_dataset_light.inc" +DECL(randomx_program_read_dataset_sshash_init): + #include "asm/program_read_dataset_sshash_init.inc" + +DECL(randomx_program_read_dataset_sshash_fin): + #include "asm/program_read_dataset_sshash_fin.inc" + DECL(randomx_program_loop_store): #include "asm/program_loop_store.inc" @@ -75,10 +89,84 @@ DECL(randomx_program_read_dataset_light_sub): squareHashSub: #include "asm/squareHash.inc" +.balign 64 +DECL(randomx_dataset_init): + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + ;# cache in rdi + ;# dataset in rsi + mov rbp, rdx ;# block index + push rcx ;# max. 
block index +init_block_loop: + prefetchw byte ptr [rsi] + mov rbx, rbp + .byte 232 ;# 0xE8 = call + ;# .set CALL_LOC, + .int 32768 - (call_offset - DECL(randomx_dataset_init)) +call_offset: + mov qword ptr [rsi+0], r8 + mov qword ptr [rsi+8], r9 + mov qword ptr [rsi+16], r10 + mov qword ptr [rsi+24], r11 + mov qword ptr [rsi+32], r12 + mov qword ptr [rsi+40], r13 + mov qword ptr [rsi+48], r14 + mov qword ptr [rsi+56], r15 + add rbp, 1 + add rsi, 64 + cmp rbp, qword ptr [rsp] + jb init_block_loop + pop rcx + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + ret + .balign 64 DECL(randomx_program_epilogue): #include "asm/program_epilogue_linux.inc" +.balign 64 +DECL(randomx_sshash_load): + #include "asm/program_sshash_load.inc" + +DECL(randomx_sshash_prefetch): + #include "asm/program_sshash_prefetch.inc" + +DECL(randomx_sshash_end): + nop + +.balign 64 +DECL(randomx_sshash_init): + lea r8, [rbx+1] + #include "asm/program_sshash_prefetch.inc" + imul r8, qword ptr r0_mul[rip] + mov r9, qword ptr r1_add[rip] + xor r9, r8 + mov r10, qword ptr r2_add[rip] + xor r10, r8 + mov r11, qword ptr r3_add[rip] + xor r11, r8 + mov r12, qword ptr r4_add[rip] + xor r12, r8 + mov r13, qword ptr r5_add[rip] + xor r13, r8 + mov r14, qword ptr r6_add[rip] + xor r14, r8 + mov r15, qword ptr r7_add[rip] + xor r15, r8 + jmp DECL(randomx_program_end) + +.balign 64 + #include "asm/program_sshash_constants.inc" + .balign 64 DECL(randomx_program_end): nop diff --git a/src/JitCompilerX86-static.asm b/src/JitCompilerX86-static.asm index f149655..ab29312 100644 --- a/src/JitCompilerX86-static.asm +++ b/src/JitCompilerX86-static.asm @@ -68,35 +68,11 @@ randomx_program_read_dataset_light PROC randomx_program_read_dataset_light ENDP randomx_program_read_dataset_sshash_init PROC - sub rsp, 72 - mov qword ptr [rsp+64], rbx - mov qword ptr [rsp+56], r8 - mov qword ptr [rsp+48], r9 - mov qword ptr [rsp+40], r10 - mov qword ptr [rsp+32], r11 - mov qword ptr [rsp+24], r12 - mov qword ptr 
[rsp+16], r13 - mov qword ptr [rsp+8], r14 - mov qword ptr [rsp+0], r15 - xor rbp, rax ;# modify "mx" - ror rbp, 32 ;# swap "ma" and "mx" - mov ebx, ebp ;# ecx = ma - and ebx, 2147483584 ;# align "ma" to the start of a cache line - shr ebx, 6 ;# ebx = Dataset block number - ;# call 32768 + include asm/program_read_dataset_sshash_init.inc randomx_program_read_dataset_sshash_init ENDP randomx_program_read_dataset_sshash_fin PROC - mov rbx, qword ptr [rsp+64] - xor r8, qword ptr [rsp+56] - xor r9, qword ptr [rsp+48] - xor r10, qword ptr [rsp+40] - xor r11, qword ptr [rsp+32] - xor r12, qword ptr [rsp+24] - xor r13, qword ptr [rsp+16] - xor r14, qword ptr [rsp+8] - xor r15, qword ptr [rsp+0] - add rsp, 72 + include asm/program_read_dataset_sshash_fin.inc randomx_program_read_dataset_sshash_fin ENDP randomx_program_loop_store PROC diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index eeb09de..97fbb91 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -17,10 +17,11 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. 
*/ +#include #include "blake2/blake2.h" #include "configuration.h" #include "Program.hpp" -#include "blake2/endian.h"; +#include "blake2/endian.h" #include #include #include @@ -793,7 +794,7 @@ namespace RandomX { mop.setCycle(scheduleCycle); if (scheduleCycle < 0) { if (TRACE) std::cout << "; Failed at cycle " << cycle << std::endl; - return DBL_MIN; + return 0; } if (instrIndex == currentInstruction.getInfo().getSrcOp()) { diff --git a/src/asm/program_read_dataset_sshash_fin.inc b/src/asm/program_read_dataset_sshash_fin.inc new file mode 100644 index 0000000..f5a067d --- /dev/null +++ b/src/asm/program_read_dataset_sshash_fin.inc @@ -0,0 +1,10 @@ + mov rbx, qword ptr [rsp+64] + xor r8, qword ptr [rsp+56] + xor r9, qword ptr [rsp+48] + xor r10, qword ptr [rsp+40] + xor r11, qword ptr [rsp+32] + xor r12, qword ptr [rsp+24] + xor r13, qword ptr [rsp+16] + xor r14, qword ptr [rsp+8] + xor r15, qword ptr [rsp+0] + add rsp, 72 \ No newline at end of file diff --git a/src/asm/program_read_dataset_sshash_init.inc b/src/asm/program_read_dataset_sshash_init.inc new file mode 100644 index 0000000..a186d2e --- /dev/null +++ b/src/asm/program_read_dataset_sshash_init.inc @@ -0,0 +1,16 @@ + sub rsp, 72 + mov qword ptr [rsp+64], rbx + mov qword ptr [rsp+56], r8 + mov qword ptr [rsp+48], r9 + mov qword ptr [rsp+40], r10 + mov qword ptr [rsp+32], r11 + mov qword ptr [rsp+24], r12 + mov qword ptr [rsp+16], r13 + mov qword ptr [rsp+8], r14 + mov qword ptr [rsp+0], r15 + xor rbp, rax ;# modify "mx" + ror rbp, 32 ;# swap "ma" and "mx" + mov ebx, ebp ;# ecx = ma + and ebx, 2147483584 ;# align "ma" to the start of a cache line + shr ebx, 6 ;# ebx = Dataset block number + ;# call 32768 \ No newline at end of file diff --git a/src/asm/program_sshash_constants.inc b/src/asm/program_sshash_constants.inc index a25a90e..77b4ecd 100644 --- a/src/asm/program_sshash_constants.inc +++ b/src/asm/program_sshash_constants.inc @@ -1,16 +1,24 @@ -r0_mul: ;# 6364136223846793005 +r0_mul: + ;#/ 
6364136223846793005 db 45, 127, 149, 76, 45, 244, 81, 88 -r1_add: ;# 9298410992540426048 +r1_add: + ;#/ 9298410992540426048 db 64, 159, 245, 89, 136, 151, 10, 129 -r2_add: ;# 12065312585734608966 +r2_add: + ;#/ 12065312585734608966 db 70, 216, 194, 56, 223, 153, 112, 167 -r3_add: ;# 9306329213124610396 +r3_add: + ;#/ 9306329213124610396 db 92, 9, 34, 191, 28, 185, 38, 129 -r4_add: ;# 5281919268842080866 +r4_add: + ;#/ 5281919268842080866 db 98, 138, 159, 23, 151, 37, 77, 73 -r5_add: ;# 10536153434571861004 +r5_add: + ;#/ 10536153434571861004 db 12, 236, 170, 206, 185, 239, 55, 146 -r6_add: ;# 3398623926847679864 +r6_add: + ;#/ 3398623926847679864 db 120, 45, 230, 108, 116, 86, 42, 47 -r7_add: ;# 9549104520008361294 +r7_add: + ;#/ 9549104520008361294 db 78, 229, 44, 182, 247, 59, 133, 132 \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 9410881..36cd800 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -301,6 +301,7 @@ int main(int argc, char** argv) { RandomX::JitCompilerX86 jit86; jit86.generateSuperScalarHash(programs); jit86.getDatasetInitFunc()(cache.memory, dataset.dataset.memory, 0, datasetBlockCount); + //dump((const char*)dataset.dataset.memory, RANDOMX_DATASET_SIZE, "dataset.dat"); } else { if (initThreadCount > 1) { @@ -331,10 +332,12 @@ int main(int argc, char** argv) { else { if (jit && useSuperscalar) vm = new RandomX::CompiledLightVirtualMachine(); - else if(jit) + else if (jit) vm = new RandomX::CompiledLightVirtualMachine(); + else if (useSuperscalar) + vm = new RandomX::InterpretedVirtualMachine(softAes); else - vm = new RandomX::InterpretedVirtualMachine(softAes); + vm = new RandomX::InterpretedVirtualMachine(softAes); } vm->setDataset(dataset, datasetSize, programs); vms.push_back(vm); From 2e68c89740072e6cc003966726136974d9b41073 Mon Sep 17 00:00:00 2001 From: tevador Date: Thu, 11 Apr 2019 18:31:13 +0200 Subject: [PATCH 13/18] Separate executeSuperscalar function Tweaked superscalar hash constants --- 
src/InterpretedVirtualMachine.cpp | 95 +++++++++++++++------------- src/InterpretedVirtualMachine.hpp | 1 + src/asm/program_sshash_constants.inc | 4 +- src/main.cpp | 20 +++--- 4 files changed, 66 insertions(+), 54 deletions(-) diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 7ee00ba..423cefc 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -453,7 +453,7 @@ namespace RandomX { } constexpr uint64_t superscalarMul0 = 6364136223846793005ULL; - constexpr uint64_t superscalarAdd1 = 9298410992540426048ULL; + constexpr uint64_t superscalarAdd1 = 9298410992540426748ULL; constexpr uint64_t superscalarAdd2 = 12065312585734608966ULL; constexpr uint64_t superscalarAdd3 = 9306329213124610396ULL; constexpr uint64_t superscalarAdd4 = 5281919268842080866ULL; @@ -474,6 +474,55 @@ namespace RandomX { return mixBlock; } + template + void InterpretedVirtualMachine::executeSuperscalar(int_reg_t(&r)[8], LightProgram& prog, std::vector& reciprocals) { + for (unsigned j = 0; j < prog.getSize(); ++j) { + Instruction& instr = prog(j); + switch (instr.opcode) + { + case RandomX::LightInstructionType::ISUB_R: + r[instr.dst] -= r[instr.src]; + break; + case RandomX::LightInstructionType::IXOR_R: + r[instr.dst] ^= r[instr.src]; + break; + case RandomX::LightInstructionType::IADD_RS: + r[instr.dst] += r[instr.src] << (instr.mod % 4); + break; + case RandomX::LightInstructionType::IMUL_R: + r[instr.dst] *= r[instr.src]; + break; + case RandomX::LightInstructionType::IROR_C: + r[instr.dst] = rotr(r[instr.dst], instr.getImm32()); + break; + case RandomX::LightInstructionType::IADD_C7: + case RandomX::LightInstructionType::IADD_C8: + case RandomX::LightInstructionType::IADD_C9: + r[instr.dst] += signExtend2sCompl(instr.getImm32()); + break; + case RandomX::LightInstructionType::IXOR_C7: + case RandomX::LightInstructionType::IXOR_C8: + case RandomX::LightInstructionType::IXOR_C9: + r[instr.dst] ^= 
signExtend2sCompl(instr.getImm32()); + break; + case RandomX::LightInstructionType::IMULH_R: + r[instr.dst] = mulh(r[instr.dst], r[instr.src]); + break; + case RandomX::LightInstructionType::ISMULH_R: + r[instr.dst] = smulh(r[instr.dst], r[instr.src]); + break; + case RandomX::LightInstructionType::IMUL_RCP: + if(superscalar) + r[instr.dst] *= reciprocals[instr.getImm32()]; + else + r[instr.dst] *= reciprocal(instr.getImm32()); + break; + default: + UNREACHABLE; + } + } + } + template void InterpretedVirtualMachine::executeSuperscalar(uint32_t blockNumber, int_reg_t(&r)[8]) { int_reg_t rl[8]; @@ -491,49 +540,9 @@ namespace RandomX { for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { mixBlock = getMixBlock(registerValue, cache); LightProgram& prog = superScalarPrograms[i]; - for (unsigned j = 0; j < prog.getSize(); ++j) { - Instruction& instr = prog(j); - switch (instr.opcode) - { - case RandomX::LightInstructionType::ISUB_R: - rl[instr.dst] -= rl[instr.src]; - break; - case RandomX::LightInstructionType::IXOR_R: - rl[instr.dst] ^= rl[instr.src]; - break; - case RandomX::LightInstructionType::IADD_RS: - rl[instr.dst] += rl[instr.src] << (instr.mod % 4); - break; - case RandomX::LightInstructionType::IMUL_R: - rl[instr.dst] *= rl[instr.src]; - break; - case RandomX::LightInstructionType::IROR_C: - rl[instr.dst] = rotr(rl[instr.dst], instr.getImm32()); - break; - case RandomX::LightInstructionType::IADD_C7: - case RandomX::LightInstructionType::IADD_C8: - case RandomX::LightInstructionType::IADD_C9: - rl[instr.dst] += signExtend2sCompl(instr.getImm32()); - break; - case RandomX::LightInstructionType::IXOR_C7: - case RandomX::LightInstructionType::IXOR_C8: - case RandomX::LightInstructionType::IXOR_C9: - rl[instr.dst] ^= signExtend2sCompl(instr.getImm32()); - break; - case RandomX::LightInstructionType::IMULH_R: - rl[instr.dst] = mulh(rl[instr.dst], rl[instr.src]); - break; - case RandomX::LightInstructionType::ISMULH_R: - rl[instr.dst] = smulh(rl[instr.dst], 
rl[instr.src]); - break; - case RandomX::LightInstructionType::IMUL_RCP: - rl[instr.dst] *= reciprocals[instr.getImm32()]; - break; - default: - UNREACHABLE; - } - } + executeSuperscalar(rl, prog, reciprocals); + for(unsigned q = 0; q < 8; ++q) rl[q] ^= load64(mixBlock + 8 * q); diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index 24bb9c6..ddefa67 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -73,6 +73,7 @@ namespace RandomX { void setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; void initialize() override; void execute() override; + static void executeSuperscalar(int_reg_t(&r)[8], LightProgram& prog, std::vector& reciprocals); private: static InstructionHandler engine[256]; DatasetReadFunc readDataset; diff --git a/src/asm/program_sshash_constants.inc b/src/asm/program_sshash_constants.inc index 77b4ecd..2044a0e 100644 --- a/src/asm/program_sshash_constants.inc +++ b/src/asm/program_sshash_constants.inc @@ -2,8 +2,8 @@ r0_mul: ;#/ 6364136223846793005 db 45, 127, 149, 76, 45, 244, 81, 88 r1_add: - ;#/ 9298410992540426048 - db 64, 159, 245, 89, 136, 151, 10, 129 + ;#/ 9298410992540426748 + db 252, 161, 245, 89, 136, 151, 10, 129 r2_add: ;#/ 12065312585734608966 db 70, 216, 194, 56, 223, 153, 112, 167 diff --git a/src/main.cpp b/src/main.cpp index 36cd800..4866804 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -177,7 +177,6 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, Atomi fillAes1Rx4((void*)hash, RANDOMX_SCRATCHPAD_L3, scratchpad); vm->resetRoundingMode(); vm->setScratchpad(scratchpad); - //dump((char*)scratchpad, RandomX::ScratchpadSize, "spad-before.txt"); for (int chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) { fillAes1Rx4((void*)hash, sizeof(RandomX::Program), vm->getProgramBuffer()); vm->initialize(); @@ -194,6 +193,7 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, Atomi } }*/ 
vm->getResult(scratchpad, RANDOMX_SCRATCHPAD_L3, hash); + //dump((char*)scratchpad, RANDOMX_SCRATCHPAD_L3, "spad.txt"); result.xorWith(hash); if (RandomX::trace) { std::cout << "Nonce: " << nonce << " "; @@ -204,8 +204,10 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, Atomi } } + + int main(int argc, char** argv) { - bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit, genSuperscalar, useSuperscalar; + bool softAes, genAsm, miningMode, verificationMode, help, largePages, async, genNative, jit, genSuperscalar, legacy; int programCount, threadCount, initThreadCount, epoch; readOption("--softAes", argc, argv, softAes); @@ -221,7 +223,7 @@ int main(int argc, char** argv) { readOption("--genNative", argc, argv, genNative); readOption("--help", argc, argv, help); readOption("--genSuperscalar", argc, argv, genSuperscalar); - readOption("--useSuperscalar", argc, argv, useSuperscalar); + readOption("--legacy", argc, argv, legacy); if (genSuperscalar) { RandomX::LightProgram p; @@ -283,7 +285,7 @@ int main(int argc, char** argv) { outputHex(std::cout, (char*)dataset.cache.memory, sizeof(__m128i)); std::cout << std::endl; } - if (useSuperscalar) { + if (!legacy) { RandomX::Blake2Generator gen(seed, programCount); for (int i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { RandomX::generateLightProg2(programs[i], gen); @@ -297,7 +299,7 @@ int main(int argc, char** argv) { dataset.dataset.size = datasetSize; RandomX::datasetAlloc(dataset, largePages); const uint64_t datasetBlockCount = datasetSize / RandomX::CacheLineSize; - if (useSuperscalar) { + if (!legacy) { RandomX::JitCompilerX86 jit86; jit86.generateSuperScalarHash(programs); jit86.getDatasetInitFunc()(cache.memory, dataset.dataset.memory, 0, datasetBlockCount); @@ -330,11 +332,11 @@ int main(int argc, char** argv) { vm = new RandomX::CompiledVirtualMachine(); } else { - if (jit && useSuperscalar) + if (jit && !legacy) vm = new RandomX::CompiledLightVirtualMachine(); 
else if (jit) vm = new RandomX::CompiledLightVirtualMachine(); - else if (useSuperscalar) + else if (!legacy) vm = new RandomX::InterpretedVirtualMachine(softAes); else vm = new RandomX::InterpretedVirtualMachine(softAes); @@ -373,8 +375,8 @@ int main(int argc, char** argv) { double elapsed = sw.getElapsed(); std::cout << "Calculated result: "; result.print(std::cout); - if(programCount == 1000) - std::cout << "Reference result: 83875c55fb9ff4a75205a744b82926ebbe23219c6291889c9ee91603c845c597" << std::endl; + if(!legacy && programCount == 1000) + std::cout << "Reference result: 4a74a376d490c8b41d42887e86d4addb5a95572e0c663d1e81aec928e4e094e1" << std::endl; if (!miningMode) { std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per hash" << std::endl; } From 37ff37cd11629f79856b09eafac6e916a43f42b9 Mon Sep 17 00:00:00 2001 From: tevador Date: Thu, 11 Apr 2019 20:46:35 +0200 Subject: [PATCH 14/18] msvc solution --- .gitignore | 2 +- randomx.sln | 57 +++++++ src/tests/superscalar-avalanche.cpp | 68 ++++++++ src/tests/superscalar-init.cpp | 78 +++++++++ vcxproj/randomx.vcxproj | 156 ++++++++++++++++++ vcxproj/randomx.vcxproj.filters | 87 ++++++++++ vcxproj/superscalar-avalanche.vcxproj | 142 ++++++++++++++++ vcxproj/superscalar-avalanche.vcxproj.filters | 69 ++++++++ vcxproj/superscalar-init.vcxproj | 142 ++++++++++++++++ vcxproj/superscalar-init.vcxproj.filters | 69 ++++++++ 10 files changed, 869 insertions(+), 1 deletion(-) create mode 100644 randomx.sln create mode 100644 src/tests/superscalar-avalanche.cpp create mode 100644 src/tests/superscalar-init.cpp create mode 100644 vcxproj/randomx.vcxproj create mode 100644 vcxproj/randomx.vcxproj.filters create mode 100644 vcxproj/superscalar-avalanche.vcxproj create mode 100644 vcxproj/superscalar-avalanche.vcxproj.filters create mode 100644 vcxproj/superscalar-init.vcxproj create mode 100644 vcxproj/superscalar-init.vcxproj.filters diff --git a/.gitignore b/.gitignore index 0f69877..35c1e9a 100644 --- 
a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ obj/ *.user *.suo .vs - +x64 diff --git a/randomx.sln b/randomx.sln new file mode 100644 index 0000000..c4d5a2a --- /dev/null +++ b/randomx.sln @@ -0,0 +1,57 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.572 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "randomx", "vcxproj\randomx.vcxproj", "{3346A4AD-C438-4324-8B77-47A16452954B}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{4A4A689F-86AF-41C0-A974-1080506D0923}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "superscalar-avalanche", "vcxproj\superscalar-avalanche.vcxproj", "{CF34A7EF-7DC9-4077-94A5-76F5425EA938}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "superscalar-init", "vcxproj\superscalar-init.vcxproj", "{E59DC709-9B12-4A53-BAF3-79398821C376}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x64.ActiveCfg = Debug|x64 + {3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x64.Build.0 = Debug|x64 + {3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x86.ActiveCfg = Debug|Win32 + {3346A4AD-C438-4324-8B77-47A16452954B}.Debug|x86.Build.0 = Debug|Win32 + {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x64.ActiveCfg = Release|x64 + {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x64.Build.0 = Release|x64 + {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x86.ActiveCfg = Release|Win32 + {3346A4AD-C438-4324-8B77-47A16452954B}.Release|x86.Build.0 = Release|Win32 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Debug|x64.ActiveCfg = Debug|x64 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Debug|x64.Build.0 = Debug|x64 + 
{CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Debug|x86.ActiveCfg = Debug|Win32 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Debug|x86.Build.0 = Debug|Win32 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Release|x64.ActiveCfg = Release|x64 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Release|x64.Build.0 = Release|x64 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Release|x86.ActiveCfg = Release|Win32 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938}.Release|x86.Build.0 = Release|Win32 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Debug|x64.ActiveCfg = Debug|x64 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Debug|x64.Build.0 = Debug|x64 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Debug|x86.ActiveCfg = Debug|Win32 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Debug|x86.Build.0 = Debug|Win32 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Release|x64.ActiveCfg = Release|x64 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Release|x64.Build.0 = Release|x64 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Release|x86.ActiveCfg = Release|Win32 + {E59DC709-9B12-4A53-BAF3-79398821C376}.Release|x86.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {CF34A7EF-7DC9-4077-94A5-76F5425EA938} = {4A4A689F-86AF-41C0-A974-1080506D0923} + {E59DC709-9B12-4A53-BAF3-79398821C376} = {4A4A689F-86AF-41C0-A974-1080506D0923} + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {4EBC03DB-AE37-4141-8147-692F16E0ED02} + EndGlobalSection +EndGlobal diff --git a/src/tests/superscalar-avalanche.cpp b/src/tests/superscalar-avalanche.cpp new file mode 100644 index 0000000..9c91a88 --- /dev/null +++ b/src/tests/superscalar-avalanche.cpp @@ -0,0 +1,68 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. 
+ +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. +*/ + +#include +#include +#include +#include "../LightProgramGenerator.hpp" +#include "../InterpretedVirtualMachine.hpp" +#include "../intrinPortable.h" + +const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; + +int main() { + + int insensitiveProgCount[64] = { 0 }; + std::vector dummy; + for (int bit = 0; bit < 64; ++bit) { + for (int i = 0; i < 10000; ++i) { + uint64_t ra[8] = { + 6364136223846793005ULL, + 9298410992540426048ULL, + 12065312585734608966ULL, + 9306329213124610396ULL, + 5281919268842080866ULL, + 10536153434571861004ULL, + 3398623926847679864ULL, + 9549104520008361294ULL, + }; + uint64_t rb[8]; + memcpy(rb, ra, sizeof rb); + rb[0] ^= (1ULL << bit); + RandomX::LightProgram p; + RandomX::Blake2Generator gen(seed, i); + RandomX::generateLightProg2(p, gen); + RandomX::InterpretedVirtualMachine::executeSuperscalar(ra, p, dummy); + RandomX::InterpretedVirtualMachine::executeSuperscalar(rb, p, dummy); + uint64_t diff = 0; + for (int j = 0; j < 8; ++j) { + diff += __popcnt64(ra[j] ^ rb[j]); + } + if (diff < 192 || diff > 320) { + std::cout << "Seed: " << i << " diff = " << diff << std::endl; + insensitiveProgCount[bit]++; + } + } + } + for (int bit = 0; bit < 64; ++bit) { + std::cout << bit << " " << insensitiveProgCount[bit] << std::endl; + } + + return 0; +} \ No 
newline at end of file diff --git a/src/tests/superscalar-init.cpp b/src/tests/superscalar-init.cpp new file mode 100644 index 0000000..b366355 --- /dev/null +++ b/src/tests/superscalar-init.cpp @@ -0,0 +1,78 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. +*/ + +#include +#include +#include +#include +#include "../LightProgramGenerator.hpp" +#include "../InterpretedVirtualMachine.hpp" +#include "../intrinPortable.h" +#include "../configuration.h" + +const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; + +int main() { + std::cout << "THIS PROGRAM REQUIRES MORE THAN 10 GB OF RAM TO COMPLETE" << std::endl; + std::vector dummy; + constexpr uint64_t superscalarMul0 = 6364136223846793005ULL; + constexpr uint64_t superscalarAdd1 = 9298410992540426748ULL; //9298410992540426048ULL + constexpr uint64_t superscalarAdd2 = 12065312585734608966ULL; + constexpr uint64_t superscalarAdd3 = 9306329213124610396ULL; + constexpr uint64_t superscalarAdd4 = 5281919268842080866ULL; + constexpr uint64_t superscalarAdd5 = 10536153434571861004ULL; + constexpr uint64_t superscalarAdd6 = 3398623926847679864ULL; + constexpr uint64_t superscalarAdd7 = 9549104520008361294ULL; + constexpr uint32_t totalBlocks = RANDOMX_DATASET_SIZE / RandomX::CacheLineSize; + std::unordered_set 
registerValues; + registerValues.reserve(totalBlocks); + registerValues.rehash(totalBlocks); + int collisionCount[9] = { 0 }; + for (uint32_t blockNumber = 0; blockNumber < totalBlocks; ++blockNumber) { + uint64_t rl[8]; + rl[0] = (blockNumber + 1) * superscalarMul0; + rl[1] = rl[0] ^ superscalarAdd1; + rl[2] = rl[0] ^ superscalarAdd2; + rl[3] = rl[0] ^ superscalarAdd3; + rl[4] = rl[0] ^ superscalarAdd4; + rl[5] = rl[0] ^ superscalarAdd5; + rl[6] = rl[0] ^ superscalarAdd6; + rl[7] = rl[0] ^ superscalarAdd7; + int blockCollisions = 0; + for (int i = 0; i < 8; ++i) { + uint64_t reducedValue = rl[i] & 0x3FFFFFFFFFFFF8; //bits 3-53 only + if (registerValues.find(reducedValue) != registerValues.end()) { + blockCollisions++; + std::cout << "Block " << blockNumber << ": collision of register r" << i << std::endl; + } + else { + registerValues.insert(reducedValue); + } + } + collisionCount[blockCollisions]++; + if ((blockNumber % (320 * 1024)) == 0) + std::cout << "Block " << blockNumber << " processed" << std::endl; + } + + for (int i = 0; i < 9; ++i) { + std::cout << i << " register(s) collide in " << collisionCount[i] << " blocks" << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/vcxproj/randomx.vcxproj b/vcxproj/randomx.vcxproj new file mode 100644 index 0000000..3dc09c8 --- /dev/null +++ b/vcxproj/randomx.vcxproj @@ -0,0 +1,156 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {3346A4AD-C438-4324-8B77-47A16452954B} + randomx + 10.0.17763.0 + + + + Application + true + v141 + MultiByte + + + Application + false + v141 + true + MultiByte + + + Application + true + v141 + MultiByte + + + Application + false + v141 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + + + + Level3 + Disabled + false + true + + + + + Level4 + Disabled + false + true + + + + + Level3 + MaxSpeed + true + true + false + true + + + true + true + UseLinkTimeCodeGeneration + false + + + + + Level3 + MaxSpeed + 
true + true + false + true + AssemblyCode + + + true + true + UseLinkTimeCodeGeneration + false + + + 4194304 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/vcxproj/randomx.vcxproj.filters b/vcxproj/randomx.vcxproj.filters new file mode 100644 index 0000000..9f33e02 --- /dev/null +++ b/vcxproj/randomx.vcxproj.filters @@ -0,0 +1,87 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Source Files + + + Source Files + + + \ No newline at end of file diff --git a/vcxproj/superscalar-avalanche.vcxproj b/vcxproj/superscalar-avalanche.vcxproj new file mode 100644 index 0000000..dab0311 --- /dev/null +++ b/vcxproj/superscalar-avalanche.vcxproj @@ -0,0 +1,142 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {CF34A7EF-7DC9-4077-94A5-76F5425EA938} + superscalaravalanche + 10.0.17763.0 + + + + Application + true + v141 + MultiByte + + + Application + false + v141 + true + MultiByte + + + Application + true + v141 + MultiByte + + + Application + false + v141 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + + + + Level3 + MaxSpeed + true + true + false + true + + + true + true + + + + + Level3 + Disabled + true + true + + + + + Level3 + Disabled + true + true + + + + + Level3 + MaxSpeed + true + true + true + true + + 
+ true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/vcxproj/superscalar-avalanche.vcxproj.filters b/vcxproj/superscalar-avalanche.vcxproj.filters new file mode 100644 index 0000000..9984ed1 --- /dev/null +++ b/vcxproj/superscalar-avalanche.vcxproj.filters @@ -0,0 +1,69 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Source Files + + + \ No newline at end of file diff --git a/vcxproj/superscalar-init.vcxproj b/vcxproj/superscalar-init.vcxproj new file mode 100644 index 0000000..4c4794c --- /dev/null +++ b/vcxproj/superscalar-init.vcxproj @@ -0,0 +1,142 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {E59DC709-9B12-4A53-BAF3-79398821C376} + superscalarinit + 10.0.17763.0 + + + + Application + true + v141 + MultiByte + + + Application + false + v141 + true + MultiByte + + + Application + true + v141 + MultiByte + + + Application + false + v141 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + + + + Level3 + MaxSpeed + true + true + false + true + + + true + true + + + + + Level3 + Disabled + false + true + + + + + Level3 + Disabled + false + true + + + + + Level3 + MaxSpeed + true + true + false + true + + + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/vcxproj/superscalar-init.vcxproj.filters b/vcxproj/superscalar-init.vcxproj.filters new 
file mode 100644 index 0000000..4666d07 --- /dev/null +++ b/vcxproj/superscalar-init.vcxproj.filters @@ -0,0 +1,69 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Source Files + + + \ No newline at end of file From 24a22c6b54c115f93d1ef904238e5ad7aa1df4aa Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 12 Apr 2019 00:02:22 +0200 Subject: [PATCH 15/18] Code generator refactoring --- src/LightProgramGenerator.cpp | 402 ++++++++++++++++------------------ src/Program.hpp | 2 +- src/configuration.h | 6 +- 3 files changed, 187 insertions(+), 223 deletions(-) diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index 97fbb91..092e220 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -31,40 +31,10 @@ along with RandomX. If not, see. 
namespace RandomX { - namespace LightInstructionOpcode { - constexpr int IADD_RS = 0; - constexpr int IADD_RC = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M; - constexpr int ISUB_R = IADD_RC + RANDOMX_FREQ_IADD_RC; - constexpr int IMUL_9C = ISUB_R + RANDOMX_FREQ_ISUB_R + RANDOMX_FREQ_ISUB_M; - constexpr int IMUL_R = IMUL_9C + RANDOMX_FREQ_IMUL_9C; - constexpr int IMULH_R = IMUL_R + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M; - constexpr int ISMULH_R = IMULH_R + RANDOMX_FREQ_IMULH_R + RANDOMX_FREQ_IMULH_M; - constexpr int IMUL_RCP = ISMULH_R + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M; - constexpr int IXOR_R = IMUL_RCP + RANDOMX_FREQ_IMUL_RCP + RANDOMX_FREQ_INEG_R; - constexpr int IROR_R = IXOR_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M; - constexpr int COND_R = IROR_R + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R + RANDOMX_FREQ_ISWAP_R + RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R; - } - static bool isMul(int type) { return type == LightInstructionType::IMUL_R || type == LightInstructionType::IMULH_R || type == LightInstructionType::ISMULH_R || type == LightInstructionType::IMUL_RCP; } - const int lightInstructionOpcode[] = { - LightInstructionOpcode::IADD_RS, - LightInstructionOpcode::ISUB_R, //ISUB_R - LightInstructionOpcode::ISUB_R, //ISUB_R - LightInstructionOpcode::IMUL_R, //IMUL_R - LightInstructionOpcode::IMUL_R, //IMUL_C - LightInstructionOpcode::IMULH_R, - LightInstructionOpcode::ISMULH_R, - LightInstructionOpcode::IMUL_RCP, - LightInstructionOpcode::IXOR_R, //IXOR_R - LightInstructionOpcode::IXOR_R, //IXOR_C - LightInstructionOpcode::IROR_R, //IROR_R - LightInstructionOpcode::IROR_R, //IROR_C - LightInstructionOpcode::COND_R - }; - namespace ExecutionPort { using type = int; constexpr type Null = 0; @@ -144,31 +114,11 @@ namespace RandomX { bool isDependent() const { return dependent_; } - int 
getCycle() const { - return cycle_; - } - void setCycle(int cycle) { - cycle_ = cycle; - } - MacroOp* getSrcDep() const { - return depSrc_; - } - void setSrcDep(MacroOp* src) { - depSrc_ = src; - } - MacroOp* getDstDep() const { - return depDst_; - } - void setDstDep(MacroOp* dst) { - depDst_ = dst; - } static const MacroOp Add_rr; static const MacroOp Add_ri; static const MacroOp Lea_sib; static const MacroOp Sub_rr; - static const MacroOp Sub_ri; static const MacroOp Imul_rr; - static const MacroOp Imul_rri; static const MacroOp Imul_r; static const MacroOp Mul_r; static const MacroOp Mov_rr; @@ -193,21 +143,28 @@ namespace RandomX { MacroOp* depSrc_ = nullptr; }; + //Size: 3 bytes const MacroOp MacroOp::Add_rr = MacroOp("add r,r", 3, 1, ExecutionPort::P015); - const MacroOp MacroOp::Add_ri = MacroOp("add r,i", 7, 1, ExecutionPort::P015); - const MacroOp MacroOp::Lea_sib = MacroOp("lea r,r+r*s", 4, 1, ExecutionPort::P01); const MacroOp MacroOp::Sub_rr = MacroOp("sub r,r", 3, 1, ExecutionPort::P015); - const MacroOp MacroOp::Sub_ri = MacroOp("sub r,i", 7, 1, ExecutionPort::P015); - const MacroOp MacroOp::Imul_rr = MacroOp("imul r,r", 4, 3, ExecutionPort::P1); - const MacroOp MacroOp::Imul_rri = MacroOp("imul r,r,i", 7, 3, ExecutionPort::P1); + const MacroOp MacroOp::Xor_rr = MacroOp("xor r,r", 3, 1, ExecutionPort::P015); const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 4, ExecutionPort::P1, ExecutionPort::P5); const MacroOp MacroOp::Mul_r = MacroOp("mul r", 3, 3, ExecutionPort::P1, ExecutionPort::P5); const MacroOp MacroOp::Mov_rr = MacroOp("mov r,r", 3); - const MacroOp MacroOp::Mov_ri64 = MacroOp("mov rax,i64", 10, 1, ExecutionPort::P015); - const MacroOp MacroOp::Xor_rr = MacroOp("xor r,r", 3, 1, ExecutionPort::P015); - const MacroOp MacroOp::Xor_ri = MacroOp("xor r,i", 7, 1, ExecutionPort::P015); - const MacroOp MacroOp::Ror_rcl = MacroOp("ror r,cl", 3, 1, ExecutionPort::P0, ExecutionPort::P5); + + //Size: 4 bytes + const MacroOp MacroOp::Lea_sib = 
MacroOp("lea r,r+r*s", 4, 1, ExecutionPort::P01); + const MacroOp MacroOp::Imul_rr = MacroOp("imul r,r", 4, 3, ExecutionPort::P1); const MacroOp MacroOp::Ror_ri = MacroOp("ror r,i", 4, 1, ExecutionPort::P05); + + //Size: 7 bytes (can be optionally padded with nop to 8 or 9 bytes) + const MacroOp MacroOp::Add_ri = MacroOp("add r,i", 7, 1, ExecutionPort::P015); + const MacroOp MacroOp::Xor_ri = MacroOp("xor r,i", 7, 1, ExecutionPort::P015); + + //Size: 10 bytes + const MacroOp MacroOp::Mov_ri64 = MacroOp("mov rax,i64", 10, 1, ExecutionPort::P015); + + //Unused: + const MacroOp MacroOp::Ror_rcl = MacroOp("ror r,cl", 3, 1, ExecutionPort::P0, ExecutionPort::P5); const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3); const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015); const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05); @@ -216,36 +173,9 @@ namespace RandomX { const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr }; const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr }; const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) }; - const MacroOp IROR_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Ror_rcl }; - const MacroOp COND_R_ops_array[] = { MacroOp::Add_ri, MacroOp(MacroOp::TestJz_fused, true), MacroOp::Xor_self, MacroOp::Cmp_ri, MacroOp(MacroOp::Setcc_r, true), MacroOp(MacroOp::Add_rr, true) }; - class LightInstructionInfo { public: - LightInstructionInfo(const char* name, int type, const MacroOp& op, int srcOp) - : name_(name), type_(type), latency_(op.getLatency()), srcOp_(srcOp) { - ops_.push_back(MacroOp(op)); - } - template - LightInstructionInfo(const char* name, int type, const MacroOp(&arr)[N], int resultOp, int dstOp, int srcOp) - : name_(name), type_(type), latency_(0), resultOp_(resultOp), dstOp_(dstOp), srcOp_(srcOp) { - for (unsigned i = 0; i < N; ++i) { - ops_.push_back(MacroOp(arr[i])); - 
latency_ += ops_.back().getLatency(); - } - static_assert(N > 1, "Invalid array size"); - } - template - LightInstructionInfo(const char* name, int type, const MacroOp*(&arr)[N], int latency, int resultOp, int dstOp, int srcOp) - : name_(name), type_(type), latency_(latency), resultOp_(resultOp), dstOp_(dstOp), srcOp_(srcOp) { - for (unsigned i = 0; i < N; ++i) { - ops_.push_back(MacroOp(arr[i])); - if (arr[i].isDependent()) { - ops_[i].setSrcDep(&ops_[i - 1]); - } - } - static_assert(N > 1, "Invalid array size"); - } const char* getName() const { return name_; } @@ -258,7 +188,7 @@ namespace RandomX { int getLatency() const { return latency_; } - MacroOp& getOp(int index) { + const MacroOp& getOp(int index) const { return ops_[index]; } int getType() const { @@ -299,6 +229,19 @@ namespace RandomX { LightInstructionInfo(const char* name) : name_(name), type_(-1), latency_(0) {} + LightInstructionInfo(const char* name, int type, const MacroOp& op, int srcOp) + : name_(name), type_(type), latency_(op.getLatency()), srcOp_(srcOp) { + ops_.push_back(MacroOp(op)); + } + template + LightInstructionInfo(const char* name, int type, const MacroOp(&arr)[N], int resultOp, int dstOp, int srcOp) + : name_(name), type_(type), latency_(0), resultOp_(resultOp), dstOp_(dstOp), srcOp_(srcOp) { + for (unsigned i = 0; i < N; ++i) { + ops_.push_back(MacroOp(arr[i])); + latency_ += ops_.back().getLatency(); + } + static_assert(N > 1, "Invalid array size"); + } }; const LightInstructionInfo LightInstructionInfo::ISUB_R = LightInstructionInfo("ISUB_R", LightInstructionType::ISUB_R, MacroOp::Sub_rr, 0); @@ -320,13 +263,6 @@ namespace RandomX { const LightInstructionInfo LightInstructionInfo::NOP = LightInstructionInfo("NOP"); - const int buffer0[] = { 4, 8, 4 }; - const int buffer1[] = { 7, 3, 3, 3 }; - const int buffer2[] = { 3, 7, 3, 3 }; - const int buffer3[] = { 4, 9, 3 }; - const int buffer4[] = { 4, 4, 4, 4 }; - const int buffer5[] = { 3, 3, 10 }; - class DecoderBuffer { public: 
static const DecoderBuffer Default; @@ -346,13 +282,22 @@ namespace RandomX { return name_; } const DecoderBuffer* fetchNext(int instrType, int cycle, int mulCount, Blake2Generator& gen) const { + //If the current RandomX instruction is "IMULH", the next fetch configuration must be 3-3-10 + //because the full 128-bit multiplication instruction is 3 bytes long and decodes to 2 uOPs on Intel CPUs. + //Intel CPUs can decode at most 4 uOPs per cycle, so this requires a 2-1-1 configuration for a total of 3 macro ops. if (instrType == LightInstructionType::IMULH_R || instrType == LightInstructionType::ISMULH_R) - return &decodeBuffer3310; //2-1-1 decode + return &decodeBuffer3310; + + //To make sure that the multiplication port is saturated, a 4-4-4-4 configuration is generated if the number of multiplications + //is lower than the number of cycles. if (mulCount < cycle + 1) return &decodeBuffer4444; - if (index_ == 5) { //IMUL_RCP end + + //If the current RandomX instruction is "IMUL_RCP", the next buffer must begin with a 4-byte slot for multiplication. + if(instrType == LightInstructionType::IMUL_RCP) return (gen.getByte() & 1) ? &decodeBuffer484 : &decodeBuffer493; - } + + //Default: select a random fetch configuration. return fetchNextDefault(gen); } private: @@ -373,6 +318,16 @@ namespace RandomX { } }; + //these are some of the options how to split a 16-byte window into 3 or 4 x86 instructions. + //RandomX uses instructions with a native size of 3 (sub, xor, mul, mov), 4 (lea, mul), 7 (xor, add immediate) or 10 bytes (mov 64-bit immediate). + //Slots with sizes of 8 or 9 bytes need to be padded with a nop instruction. 
+ const int buffer0[] = { 4, 8, 4 }; + const int buffer1[] = { 7, 3, 3, 3 }; + const int buffer2[] = { 3, 7, 3, 3 }; + const int buffer3[] = { 4, 9, 3 }; + const int buffer4[] = { 4, 4, 4, 4 }; + const int buffer5[] = { 3, 3, 10 }; + const DecoderBuffer DecoderBuffer::decodeBuffer484 = DecoderBuffer("4,8,4", 0, buffer0); const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer("7,3,3,3", 1, buffer1); const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 2, buffer2); @@ -401,7 +356,6 @@ namespace RandomX { int index; if (availableRegisters.size() == 0) return false; - //throw std::runtime_error("No available registers"); if (availableRegisters.size() > 1) { index = gen.getInt32() % availableRegisters.size(); @@ -423,131 +377,136 @@ namespace RandomX { instr.setImm32(imm32_); } - static LightInstruction createForSlot(Blake2Generator& gen, int slotSize, int fetchType, bool isLast, bool isFirst) { + void createForSlot(Blake2Generator& gen, int slotSize, int fetchType, bool isLast, bool isFirst) { switch (slotSize) { case 3: if (isLast) { - return create(slot_3L[gen.getByte() & 3], gen); + create(slot_3L[gen.getByte() & 3], gen); } else { - return create(slot_3[gen.getByte() & 1], gen); + create(slot_3[gen.getByte() & 1], gen); } + break; case 4: if (fetchType == 4 && !isLast) { - return create(&LightInstructionInfo::IMUL_R, gen); + create(&LightInstructionInfo::IMUL_R, gen); } else { - return create(slot_4[gen.getByte() & 1], gen); + create(slot_4[gen.getByte() & 1], gen); } + break; case 7: - return create(slot_7[gen.getByte() & 1], gen); + create(slot_7[gen.getByte() & 1], gen); + break; case 8: - return create(slot_8[gen.getByte() & 1], gen); + create(slot_8[gen.getByte() & 1], gen); + break; case 9: - return create(slot_9[gen.getByte() & 1], gen); + create(slot_9[gen.getByte() & 1], gen); + break; case 10: - return create(slot_10, gen); + create(slot_10, gen); + break; default: - throw std::runtime_error("Invalid slot"); + 
UNREACHABLE; } } - static LightInstruction create(const LightInstructionInfo* info, Blake2Generator& gen) { - LightInstruction li(info); + void create(const LightInstructionInfo* info, Blake2Generator& gen) { + info_ = info; + reset(); switch (info->getType()) { case LightInstructionType::ISUB_R: { - li.mod_ = 0; - li.imm32_ = 0; - li.opGroup_ = LightInstructionType::IADD_RS; - li.groupParIsSource_ = true; + mod_ = 0; + imm32_ = 0; + opGroup_ = LightInstructionType::IADD_RS; + groupParIsSource_ = true; } break; case LightInstructionType::IXOR_R: { - li.mod_ = 0; - li.imm32_ = 0; - li.opGroup_ = LightInstructionType::IXOR_R; - li.groupParIsSource_ = true; + mod_ = 0; + imm32_ = 0; + opGroup_ = LightInstructionType::IXOR_R; + groupParIsSource_ = true; } break; case LightInstructionType::IADD_RS: { - li.mod_ = gen.getByte(); - li.imm32_ = 0; - li.opGroup_ = LightInstructionType::IADD_RS; - li.groupParIsSource_ = true; + mod_ = gen.getByte(); + imm32_ = 0; + opGroup_ = LightInstructionType::IADD_RS; + groupParIsSource_ = true; } break; case LightInstructionType::IMUL_R: { - li.mod_ = 0; - li.imm32_ = 0; - li.opGroup_ = LightInstructionType::IMUL_R; - li.opGroupPar_ = -1; //TODO + mod_ = 0; + imm32_ = 0; + opGroup_ = LightInstructionType::IMUL_R; + opGroupPar_ = -1; } break; case LightInstructionType::IROR_C: { - li.mod_ = 0; + mod_ = 0; do { - li.imm32_ = gen.getByte() & 63; - } while (li.imm32_ == 0); - li.opGroup_ = LightInstructionType::IROR_C; - li.opGroupPar_ = -1; + imm32_ = gen.getByte() & 63; + } while (imm32_ == 0); + opGroup_ = LightInstructionType::IROR_C; + opGroupPar_ = -1; } break; case LightInstructionType::IADD_C7: case LightInstructionType::IADD_C8: case LightInstructionType::IADD_C9: { - li.mod_ = 0; - li.imm32_ = gen.getInt32(); - li.opGroup_ = LightInstructionType::IADD_C7; - li.opGroupPar_ = -1; + mod_ = 0; + imm32_ = gen.getInt32(); + opGroup_ = LightInstructionType::IADD_C7; + opGroupPar_ = -1; } break; case LightInstructionType::IXOR_C7: case 
LightInstructionType::IXOR_C8: case LightInstructionType::IXOR_C9: { - li.mod_ = 0; - li.imm32_ = gen.getInt32(); - li.opGroup_ = LightInstructionType::IXOR_C7; - li.opGroupPar_ = -1; + mod_ = 0; + imm32_ = gen.getInt32(); + opGroup_ = LightInstructionType::IXOR_C7; + opGroupPar_ = -1; } break; case LightInstructionType::IMULH_R: { - li.canReuse_ = true; - li.mod_ = 0; - li.imm32_ = 0; - li.opGroup_ = LightInstructionType::IMULH_R; - li.opGroupPar_ = gen.getInt32(); + canReuse_ = true; + mod_ = 0; + imm32_ = 0; + opGroup_ = LightInstructionType::IMULH_R; + opGroupPar_ = gen.getInt32(); } break; case LightInstructionType::ISMULH_R: { - li.canReuse_ = true; - li.mod_ = 0; - li.imm32_ = 0; - li.opGroup_ = LightInstructionType::ISMULH_R; - li.opGroupPar_ = gen.getInt32(); + canReuse_ = true; + mod_ = 0; + imm32_ = 0; + opGroup_ = LightInstructionType::ISMULH_R; + opGroupPar_ = gen.getInt32(); } break; case LightInstructionType::IMUL_RCP: { - li.mod_ = 0; + mod_ = 0; do { - li.imm32_ = gen.getInt32(); - } while ((li.imm32_ & (li.imm32_ - 1)) == 0); - li.opGroup_ = LightInstructionType::IMUL_RCP; - li.opGroupPar_ = -1; + imm32_ = gen.getInt32(); + } while ((imm32_ & (imm32_ - 1)) == 0); + opGroup_ = LightInstructionType::IMUL_RCP; + opGroupPar_ = -1; } break; default: break; } - - return li; } bool selectDestination(int cycle, RegisterInfo (®isters)[8], Blake2Generator& gen) { std::vector availableRegisters; for (unsigned i = 0; i < 8; ++i) { - if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_.getType() != LightInstructionType::IADD_RS || i != 5)) + if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != LightInstructionType::IADD_RS || i != 5)) availableRegisters.push_back(i); } return selectRegister(availableRegisters, gen, dst_); @@ -559,7 +518,7 @@ 
namespace RandomX { if (registers[i].latency <= cycle) availableRegisters.push_back(i); } - if (availableRegisters.size() == 2 && info_.getType() == LightInstructionType::IADD_RS) { + if (availableRegisters.size() == 2 && info_->getType() == LightInstructionType::IADD_RS) { if (availableRegisters[0] == 5 || availableRegisters[1] == 5) { opGroupPar_ = src_ = 5; return true; @@ -574,7 +533,7 @@ namespace RandomX { } int getType() { - return info_.getType(); + return info_->getType(); } int getSource() { return src_; @@ -589,14 +548,14 @@ namespace RandomX { return opGroupPar_; } - LightInstructionInfo& getInfo() { - return info_; + const LightInstructionInfo& getInfo() const { + return *info_; } static const LightInstruction Null; private: - LightInstructionInfo info_; + const LightInstructionInfo* info_; int src_ = -1; int dst_ = -1; int mod_; @@ -606,24 +565,20 @@ namespace RandomX { bool canReuse_ = false; bool groupParIsSource_ = false; - LightInstruction(const LightInstructionInfo* info) : info_(*info) { - for (unsigned i = 0; i < info_.getSize(); ++i) { - MacroOp& mop = info_.getOp(i); - if (mop.isDependent()) { - mop.setSrcDep(&info_.getOp(i - 1)); - } - } + void reset() { + src_ = dst_ = -1; + canReuse_ = groupParIsSource_ = false; + } + + LightInstruction(const LightInstructionInfo* info) : info_(info) { } }; const LightInstruction LightInstruction::Null = LightInstruction(&LightInstructionInfo::NOP); - constexpr int ALU_COUNT_MUL = 1; - constexpr int ALU_COUNT = 3; - constexpr int LIGHT_OPCODE_BITS = 4; - constexpr int V4_SRC_INDEX_BITS = 3; - constexpr int V4_DST_INDEX_BITS = 3; - constexpr int CYCLE_MAP_SIZE = RANDOMX_LPROG_LATENCY + 3; + constexpr int CYCLE_MAP_SIZE = RANDOMX_SUPERSCALAR_LATENCY + 3; + constexpr int LOOK_FORWARD_CYCLES = 4; + constexpr int MAX_THROWAWAY_COUNT = 256; #ifndef _DEBUG constexpr bool TRACE = false; constexpr bool INFO = false; @@ -735,98 +690,102 @@ namespace RandomX { return -1; } - // If we don't have enough data available, 
generate more - static FORCE_INLINE void check_data(size_t& data_index, const size_t bytes_needed, uint8_t* data, const size_t data_size) - { - if (data_index + bytes_needed > data_size) - { - std::cout << "Calling Blake " << (++blakeCounter) << std::endl; - blake2b(data, data_size, data, data_size, nullptr, 0); - data_index = 0; - } - } - double generateLightProg2(LightProgram& prog, Blake2Generator& gen) { ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3]; memset(portBusy, 0, sizeof(portBusy)); RegisterInfo registers[8]; - std::vector instructions; - const DecoderBuffer* fetchLine = &DecoderBuffer::Default; + const DecoderBuffer* decodeBuffer = &DecoderBuffer::Default; LightInstruction currentInstruction = LightInstruction::Null; int instrIndex = 0; int codeSize = 0; int macroOpCount = 0; int cycle = 0; - int fetchCycle = 0; int depCycle = 0; int retireCycle = 0; - int mopIndex = 0; bool portsSaturated = false; int outIndex = 0; - int attempts = 0; int mulCount = 0; - constexpr int MAX_ATTEMPTS = 4; + int decodeCycle; - while(!portsSaturated) { - fetchLine = fetchLine->fetchNext(currentInstruction.getType(), fetchCycle++, mulCount, gen); - if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << fetchLine->getName() << ")" << std::endl; + //decode instructions for RANDOMX_SUPERSCALAR_LATENCY cycles or until an execution port is saturated. + //Each decode cycle decodes 16 bytes of x86 code. + //Since a decode cycle produces on average 3.45 macro-ops and there are only 3 ALU ports, execution ports are always + //saturated first. The cycle limit is present only to guarantee loop termination. 
+ for (decodeCycle = 0; decodeCycle < RANDOMX_SUPERSCALAR_LATENCY && !portsSaturated && outIndex < RANDOMX_SUPERSCALAR_MAX_SIZE; ++decodeCycle) { - mopIndex = 0; + //select a fetch/decode configuration + decodeBuffer = decodeBuffer->fetchNext(currentInstruction.getType(), decodeCycle, mulCount, gen); + if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << decodeBuffer->getName() << ")" << std::endl; + + int bufferIndex = 0; - while (mopIndex < fetchLine->getSize()) { + //fill all instruction slots in the current fetch/decode buffer + while (bufferIndex < decodeBuffer->getSize()) { int topCycle = cycle; + + //if we have created all macro-ops for the current RandomX instruction, create a new instruction if (instrIndex >= currentInstruction.getInfo().getSize()) { if (portsSaturated) break; - currentInstruction = LightInstruction::createForSlot(gen, fetchLine->getCounts()[mopIndex], fetchLine->getIndex(), fetchLine->getSize() == mopIndex + 1, mopIndex == 0); + currentInstruction.createForSlot(gen, decodeBuffer->getCounts()[bufferIndex], decodeBuffer->getIndex(), decodeBuffer->getSize() == bufferIndex + 1, bufferIndex == 0); instrIndex = 0; if (TRACE) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; } - MacroOp& mop = currentInstruction.getInfo().getOp(instrIndex); - if (fetchLine->getCounts()[mopIndex] != mop.getSize()) { - if (TRACE) std::cout << "ERROR instruction " << mop.getName() << " doesn't fit into slot of size " << fetchLine->getCounts()[mopIndex] << std::endl; - } + const MacroOp& mop = currentInstruction.getInfo().getOp(instrIndex); if (TRACE) std::cout << mop.getName() << " "; + + //calculate the earliest cycle when this macro-op (all of its uOPs) can be scheduled for execution int scheduleCycle = scheduleUop(mop, portBusy, cycle, depCycle); - mop.setCycle(scheduleCycle); if (scheduleCycle < 0) { if (TRACE) std::cout << "; Failed at cycle " << cycle << std::endl; return 0; } + //find a source register (if 
applicable) that will be ready when this instruction executes if (instrIndex == currentInstruction.getInfo().getSrcOp()) { - for (attempts = 0; attempts < MAX_ATTEMPTS && !currentInstruction.selectSource(scheduleCycle, registers, gen); ++attempts) { + int forward; + //if no suitable operand is ready, look up to LOOK_FORWARD_CYCLES forward + for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectSource(scheduleCycle, registers, gen); ++forward) { if (TRACE) std::cout << "; src STALL at cycle " << cycle << std::endl; ++scheduleCycle; ++cycle; } - if (attempts == MAX_ATTEMPTS) { //throw instruction away - //cycle = topCycle; + //if no register was found, throw the instruction away and try another one + if (forward == LOOK_FORWARD_CYCLES) { instrIndex = currentInstruction.getInfo().getSize(); if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; continue; } if (TRACE) std::cout << "; src = r" << currentInstruction.getSource() << std::endl; } + //find a destination register that will be ready when this instruction executes if (instrIndex == currentInstruction.getInfo().getDstOp()) { - for (attempts = 0; attempts < MAX_ATTEMPTS && !currentInstruction.selectDestination(scheduleCycle, registers, gen); ++attempts) { + int forward; + for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectDestination(scheduleCycle, registers, gen); ++forward) { if (TRACE) std::cout << "; dst STALL at cycle " << cycle << std::endl; ++scheduleCycle; ++cycle; } - if (attempts == MAX_ATTEMPTS) { //throw instruction away - //cycle = topCycle; + if (forward == LOOK_FORWARD_CYCLES) { //throw instruction away instrIndex = currentInstruction.getInfo().getSize(); if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; continue; } if (TRACE) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; } + //recalculate when the instruction can be scheduled for 
execution based on operand availability scheduleCycle = scheduleUop(mop, portBusy, scheduleCycle, scheduleCycle); + + //calculate when the result will be ready depCycle = scheduleCycle + mop.getLatency(); + + //if this instruction writes the result, modify register information + // RegisterInfo.latency - which cycle the register will be ready + // RegisterInfo.lastOpGroup - the last operation that was applied to the register + // RegisterInfo.lastOpPar - the last operation parameter if (instrIndex == currentInstruction.getInfo().getResultOp()) { int dst = currentInstruction.getDestination(); RegisterInfo& ri = registers[dst]; @@ -837,13 +796,17 @@ namespace RandomX { if (TRACE) std::cout << "; RETIRED at cycle " << retireCycle << std::endl; } codeSize += mop.getSize(); - mopIndex++; + bufferIndex++; instrIndex++; macroOpCount++; - if (scheduleCycle >= RANDOMX_LPROG_LATENCY) { + + //terminating condition + if (scheduleCycle >= RANDOMX_SUPERSCALAR_LATENCY) { portsSaturated = true; } cycle = topCycle; + + //when all macro-ops of the current instruction have been issued, add the instruction into the program if (instrIndex >= currentInstruction.getInfo().getSize()) { currentInstruction.toInstr(prog(outIndex++)); mulCount += isMul(currentInstruction.getType()); @@ -856,19 +819,20 @@ namespace RandomX { if (INFO) std::cout << "; (* = in use, _ = idle)" << std::endl; int portCycles = 0; - /*for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { + for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { std::cout << "; " << std::setw(3) << i << " "; for (int j = 0; j < 3; ++j) { std::cout << (portBusy[i][j] ? 
'*' : '_'); portCycles += !!portBusy[i][j]; } std::cout << std::endl; - }*/ + } double ipc = (macroOpCount / (double)retireCycle); if (INFO) std::cout << "; code size " << codeSize << " bytes" << std::endl; if (INFO) std::cout << "; x86 macro-ops: " << macroOpCount << std::endl; + if (INFO) std::cout << "; fetch cycles: " << decodeCycle << std::endl; if (INFO) std::cout << "; RandomX instructions: " << outIndex << std::endl; if (INFO) std::cout << "; Execution time: " << retireCycle << " cycles" << std::endl; if (INFO) std::cout << "; IPC = " << ipc << std::endl; @@ -878,7 +842,8 @@ namespace RandomX { int asicLatency[8]; memset(asicLatency, 0, sizeof(asicLatency)); - + //Calculate ASIC latency: + //Assumes 1 cycle latency for all operations and unlimited parallelization. for (int i = 0; i < outIndex; ++i) { Instruction& instr = prog(i); int latDst = asicLatency[instr.dst] + 1; @@ -886,16 +851,17 @@ namespace RandomX { asicLatency[instr.dst] = std::max(latDst, latSrc); } - int asicLatencyFinal = 0; + //address register is the register with the highest ASIC latency + int asicLatencyMax = 0; int addressReg = 0; for (int i = 0; i < 8; ++i) { - if (asicLatency[i] > asicLatencyFinal) { - asicLatencyFinal = asicLatency[i]; + if (asicLatency[i] > asicLatencyMax) { + asicLatencyMax = asicLatency[i]; addressReg = i; } } - if (INFO) std::cout << "; ASIC latency: " << asicLatencyFinal << std::endl; + if (INFO) std::cout << "; ASIC latency: " << asicLatencyMax << std::endl; if (INFO) { std::cout << "; ASIC latency:" << std::endl; diff --git a/src/Program.hpp b/src/Program.hpp index 2b81435..37c8303 100644 --- a/src/Program.hpp +++ b/src/Program.hpp @@ -81,7 +81,7 @@ namespace RandomX { os << instr; } } - Instruction programBuffer[RANDOMX_LPROG_MAX_SIZE]; + Instruction programBuffer[RANDOMX_SUPERSCALAR_MAX_SIZE]; uint32_t size; int addrReg; }; diff --git a/src/configuration.h b/src/configuration.h index 6d9912d..80cf0c4 100644 --- a/src/configuration.h +++ b/src/configuration.h 
@@ -37,10 +37,8 @@ along with RandomX. If not, see. //Number of random Cache accesses per Dataset block. Minimum is 2. #define RANDOMX_CACHE_ACCESSES 8 -#define RANDOMX_LPROG_LATENCY 170 -#define RANDOMX_LPROG_ASIC_LATENCY 84 -#define RANDOMX_LPROG_MIN_SIZE 225 -#define RANDOMX_LPROG_MAX_SIZE 512 +#define RANDOMX_SUPERSCALAR_LATENCY 170 +#define RANDOMX_SUPERSCALAR_MAX_SIZE 512 //Dataset size in bytes. Must be a power of 2. #define RANDOMX_DATASET_SIZE (2ULL * 1024 * 1024 * 1024) From d49302561f78706f3323fb286840240918aae161 Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 12 Apr 2019 13:32:22 +0200 Subject: [PATCH 16/18] Refactoring + comments --- src/AssemblyGeneratorX86.cpp | 56 +++--- src/InterpretedVirtualMachine.cpp | 30 ++-- src/JitCompilerX86.cpp | 28 +-- src/LightProgramGenerator.cpp | 283 +++++++++++++----------------- src/LightProgramGenerator.hpp | 2 +- 5 files changed, 183 insertions(+), 216 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index a25a377..c4e009c 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -69,54 +69,54 @@ namespace RandomX { Instruction& instr = prog(i); switch (instr.opcode) { - case RandomX::LightInstructionType::ISUB_R: + case RandomX::SuperscalarInstructionType::ISUB_R: asmCode << "sub " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; break; - case RandomX::LightInstructionType::IXOR_R: + case RandomX::SuperscalarInstructionType::IXOR_R: asmCode << "xor " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; break; - case RandomX::LightInstructionType::IADD_RS: + case RandomX::SuperscalarInstructionType::IADD_RS: asmCode << "lea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << "]" << std::endl; break; - case RandomX::LightInstructionType::IMUL_R: + case RandomX::SuperscalarInstructionType::IMUL_R: asmCode << "imul " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; 
break; - case RandomX::LightInstructionType::IROR_C: + case RandomX::SuperscalarInstructionType::IROR_C: asmCode << "ror " << regR[instr.dst] << ", " << instr.getImm32() << std::endl; break; - case RandomX::LightInstructionType::IADD_C7: + case RandomX::SuperscalarInstructionType::IADD_C7: asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; break; - case RandomX::LightInstructionType::IXOR_C7: + case RandomX::SuperscalarInstructionType::IXOR_C7: asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; break; - case RandomX::LightInstructionType::IADD_C8: + case RandomX::SuperscalarInstructionType::IADD_C8: asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; asmCode << "nop" << std::endl; break; - case RandomX::LightInstructionType::IXOR_C8: + case RandomX::SuperscalarInstructionType::IXOR_C8: asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; asmCode << "nop" << std::endl; break; - case RandomX::LightInstructionType::IADD_C9: + case RandomX::SuperscalarInstructionType::IADD_C9: asmCode << "add " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; asmCode << "xchg ax, ax ;nop" << std::endl; break; - case RandomX::LightInstructionType::IXOR_C9: + case RandomX::SuperscalarInstructionType::IXOR_C9: asmCode << "xor " << regR[instr.dst] << ", " << (int32_t)instr.getImm32() << std::endl; asmCode << "xchg ax, ax ;nop" << std::endl; break; - case RandomX::LightInstructionType::IMULH_R: + case RandomX::SuperscalarInstructionType::IMULH_R: asmCode << "mov rax, " << regR[instr.dst] << std::endl; asmCode << "mul " << regR[instr.src] << std::endl; asmCode << "mov " << regR[instr.dst] << ", rdx" << std::endl; break; - case RandomX::LightInstructionType::ISMULH_R: + case RandomX::SuperscalarInstructionType::ISMULH_R: asmCode << "mov rax, " << regR[instr.dst] << std::endl; asmCode << "imul " << regR[instr.src] << std::endl; 
asmCode << "mov " << regR[instr.dst] << ", rdx" << std::endl; break; - case RandomX::LightInstructionType::IMUL_RCP: + case RandomX::SuperscalarInstructionType::IMUL_RCP: asmCode << "mov rax, " << (int64_t)reciprocal(instr.getImm32()) << std::endl; asmCode << "imul " << regR[instr.dst] << ", rax" << std::endl; break; @@ -178,38 +178,38 @@ namespace RandomX { Instruction& instr = prog(i); switch (instr.opcode) { - case RandomX::LightInstructionType::ISUB_R: + case RandomX::SuperscalarInstructionType::ISUB_R: asmCode << regR[instr.dst] << " -= " << regR[instr.src] << ";" << std::endl; break; - case RandomX::LightInstructionType::IXOR_R: + case RandomX::SuperscalarInstructionType::IXOR_R: asmCode << regR[instr.dst] << " ^= " << regR[instr.src] << ";" << std::endl; break; - case RandomX::LightInstructionType::IADD_RS: + case RandomX::SuperscalarInstructionType::IADD_RS: asmCode << regR[instr.dst] << " += " << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << ";" << std::endl; break; - case RandomX::LightInstructionType::IMUL_R: + case RandomX::SuperscalarInstructionType::IMUL_R: asmCode << regR[instr.dst] << " *= " << regR[instr.src] << ";" << std::endl; break; - case RandomX::LightInstructionType::IROR_C: + case RandomX::SuperscalarInstructionType::IROR_C: asmCode << regR[instr.dst] << " = rotr(" << regR[instr.dst] << ", " << instr.getImm32() << ");" << std::endl; break; - case RandomX::LightInstructionType::IADD_C7: - case RandomX::LightInstructionType::IADD_C8: - case RandomX::LightInstructionType::IADD_C9: + case RandomX::SuperscalarInstructionType::IADD_C7: + case RandomX::SuperscalarInstructionType::IADD_C8: + case RandomX::SuperscalarInstructionType::IADD_C9: asmCode << regR[instr.dst] << " += " << (int32_t)instr.getImm32() << ";" << std::endl; break; - case RandomX::LightInstructionType::IXOR_C7: - case RandomX::LightInstructionType::IXOR_C8: - case RandomX::LightInstructionType::IXOR_C9: + case RandomX::SuperscalarInstructionType::IXOR_C7: + case 
RandomX::SuperscalarInstructionType::IXOR_C8: + case RandomX::SuperscalarInstructionType::IXOR_C9: asmCode << regR[instr.dst] << " ^= " << (int32_t)instr.getImm32() << ";" << std::endl; break; - case RandomX::LightInstructionType::IMULH_R: + case RandomX::SuperscalarInstructionType::IMULH_R: asmCode << regR[instr.dst] << " = mulh(" << regR[instr.dst] << ", " << regR[instr.src] << ");" << std::endl; break; - case RandomX::LightInstructionType::ISMULH_R: + case RandomX::SuperscalarInstructionType::ISMULH_R: asmCode << regR[instr.dst] << " = smulh(" << regR[instr.dst] << ", " << regR[instr.src] << ");" << std::endl; break; - case RandomX::LightInstructionType::IMUL_RCP: + case RandomX::SuperscalarInstructionType::IMUL_RCP: asmCode << regR[instr.dst] << " *= " << (int64_t)reciprocal(instr.getImm32()) << ";" << std::endl; break; default: diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 423cefc..673fecf 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -480,38 +480,38 @@ namespace RandomX { Instruction& instr = prog(j); switch (instr.opcode) { - case RandomX::LightInstructionType::ISUB_R: + case RandomX::SuperscalarInstructionType::ISUB_R: r[instr.dst] -= r[instr.src]; break; - case RandomX::LightInstructionType::IXOR_R: + case RandomX::SuperscalarInstructionType::IXOR_R: r[instr.dst] ^= r[instr.src]; break; - case RandomX::LightInstructionType::IADD_RS: + case RandomX::SuperscalarInstructionType::IADD_RS: r[instr.dst] += r[instr.src] << (instr.mod % 4); break; - case RandomX::LightInstructionType::IMUL_R: + case RandomX::SuperscalarInstructionType::IMUL_R: r[instr.dst] *= r[instr.src]; break; - case RandomX::LightInstructionType::IROR_C: + case RandomX::SuperscalarInstructionType::IROR_C: r[instr.dst] = rotr(r[instr.dst], instr.getImm32()); break; - case RandomX::LightInstructionType::IADD_C7: - case RandomX::LightInstructionType::IADD_C8: - case RandomX::LightInstructionType::IADD_C9: + 
case RandomX::SuperscalarInstructionType::IADD_C7: + case RandomX::SuperscalarInstructionType::IADD_C8: + case RandomX::SuperscalarInstructionType::IADD_C9: r[instr.dst] += signExtend2sCompl(instr.getImm32()); break; - case RandomX::LightInstructionType::IXOR_C7: - case RandomX::LightInstructionType::IXOR_C8: - case RandomX::LightInstructionType::IXOR_C9: + case RandomX::SuperscalarInstructionType::IXOR_C7: + case RandomX::SuperscalarInstructionType::IXOR_C8: + case RandomX::SuperscalarInstructionType::IXOR_C9: r[instr.dst] ^= signExtend2sCompl(instr.getImm32()); break; - case RandomX::LightInstructionType::IMULH_R: + case RandomX::SuperscalarInstructionType::IMULH_R: r[instr.dst] = mulh(r[instr.dst], r[instr.src]); break; - case RandomX::LightInstructionType::ISMULH_R: + case RandomX::SuperscalarInstructionType::ISMULH_R: r[instr.dst] = smulh(r[instr.dst], r[instr.src]); break; - case RandomX::LightInstructionType::IMUL_RCP: + case RandomX::SuperscalarInstructionType::IMUL_RCP: if(superscalar) r[instr.dst] *= reciprocals[instr.getImm32()]; else @@ -560,7 +560,7 @@ namespace RandomX { for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { for (unsigned j = 0; j < superScalarPrograms[i].getSize(); ++j) { Instruction& instr = superScalarPrograms[i](j); - if (instr.opcode == LightInstructionType::IMUL_RCP) { + if (instr.opcode == SuperscalarInstructionType::IMUL_RCP) { auto rcp = reciprocal(instr.getImm32()); instr.setImm32(reciprocals.size()); reciprocals.push_back(rcp); diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index c4b8ea8..8e15e15 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -348,63 +348,63 @@ namespace RandomX { void JitCompilerX86::generateCode(Instruction& instr, int i) { switch (instr.opcode) { - case RandomX::LightInstructionType::ISUB_R: + case RandomX::SuperscalarInstructionType::ISUB_R: emit(REX_SUB_RR); emitByte(0xc0 + 8 * instr.dst + instr.src); break; - case RandomX::LightInstructionType::IXOR_R: + case 
RandomX::SuperscalarInstructionType::IXOR_R: emit(REX_XOR_RR); emitByte(0xc0 + 8 * instr.dst + instr.src); break; - case RandomX::LightInstructionType::IADD_RS: + case RandomX::SuperscalarInstructionType::IADD_RS: emit(REX_LEA); emitByte(0x04 + 8 * instr.dst); genSIB(instr.mod % 4, instr.src, instr.dst); break; - case RandomX::LightInstructionType::IMUL_R: + case RandomX::SuperscalarInstructionType::IMUL_R: emit(REX_IMUL_RR); emitByte(0xc0 + 8 * instr.dst + instr.src); break; - case RandomX::LightInstructionType::IROR_C: + case RandomX::SuperscalarInstructionType::IROR_C: emit(REX_ROT_I8); emitByte(0xc8 + instr.dst); emitByte(instr.getImm32() & 63); break; - case RandomX::LightInstructionType::IADD_C7: + case RandomX::SuperscalarInstructionType::IADD_C7: emit(REX_81); emitByte(0xc0 + instr.dst); emit32(instr.getImm32()); break; - case RandomX::LightInstructionType::IXOR_C7: + case RandomX::SuperscalarInstructionType::IXOR_C7: emit(REX_XOR_RI); emitByte(0xf0 + instr.dst); emit32(instr.getImm32()); break; - case RandomX::LightInstructionType::IADD_C8: + case RandomX::SuperscalarInstructionType::IADD_C8: emit(REX_81); emitByte(0xc0 + instr.dst); emit32(instr.getImm32()); emit(NOP1); break; - case RandomX::LightInstructionType::IXOR_C8: + case RandomX::SuperscalarInstructionType::IXOR_C8: emit(REX_XOR_RI); emitByte(0xf0 + instr.dst); emit32(instr.getImm32()); emit(NOP1); break; - case RandomX::LightInstructionType::IADD_C9: + case RandomX::SuperscalarInstructionType::IADD_C9: emit(REX_81); emitByte(0xc0 + instr.dst); emit32(instr.getImm32()); emit(NOP2); break; - case RandomX::LightInstructionType::IXOR_C9: + case RandomX::SuperscalarInstructionType::IXOR_C9: emit(REX_XOR_RI); emitByte(0xf0 + instr.dst); emit32(instr.getImm32()); emit(NOP2); break; - case RandomX::LightInstructionType::IMULH_R: + case RandomX::SuperscalarInstructionType::IMULH_R: emit(REX_MOV_RR64); emitByte(0xc0 + instr.dst); emit(REX_MUL_R); @@ -412,7 +412,7 @@ namespace RandomX { emit(REX_MOV_R64R); 
emitByte(0xc2 + 8 * instr.dst); break; - case RandomX::LightInstructionType::ISMULH_R: + case RandomX::SuperscalarInstructionType::ISMULH_R: emit(REX_MOV_RR64); emitByte(0xc0 + instr.dst); emit(REX_MUL_R); @@ -420,7 +420,7 @@ namespace RandomX { emit(REX_MOV_R64R); emitByte(0xc2 + 8 * instr.dst); break; - case RandomX::LightInstructionType::IMUL_RCP: + case RandomX::SuperscalarInstructionType::IMUL_RCP: emit(MOV_RAX_I); emit64(reciprocal(instr.getImm32())); emit(REX_IMUL_RM); diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index 092e220..8692dc3 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -31,8 +31,8 @@ along with RandomX. If not, see. namespace RandomX { - static bool isMul(int type) { - return type == LightInstructionType::IMUL_R || type == LightInstructionType::IMULH_R || type == LightInstructionType::ISMULH_R || type == LightInstructionType::IMUL_RCP; + static bool isMultiplication(int type) { + return type == SuperscalarInstructionType::IMUL_R || type == SuperscalarInstructionType::IMULH_R || type == SuperscalarInstructionType::ISMULH_R || type == SuperscalarInstructionType::IMUL_RCP; } namespace ExecutionPort { @@ -40,10 +40,10 @@ namespace RandomX { constexpr type Null = 0; constexpr type P0 = 1; constexpr type P1 = 2; - constexpr type P5 = 3; - constexpr type P01 = 4; - constexpr type P05 = 5; - constexpr type P015 = 6; + constexpr type P5 = 4; + constexpr type P01 = P0 | P1; + constexpr type P05 = P0 | P5; + constexpr type P015 = P0 | P1 | P5; } Blake2Generator::Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) { @@ -244,22 +244,22 @@ namespace RandomX { } }; - const LightInstructionInfo LightInstructionInfo::ISUB_R = LightInstructionInfo("ISUB_R", LightInstructionType::ISUB_R, MacroOp::Sub_rr, 0); - const LightInstructionInfo LightInstructionInfo::IXOR_R = LightInstructionInfo("IXOR_R", LightInstructionType::IXOR_R, MacroOp::Xor_rr, 0); - const LightInstructionInfo 
LightInstructionInfo::IADD_RS = LightInstructionInfo("IADD_RS", LightInstructionType::IADD_RS, MacroOp::Lea_sib, 0); - const LightInstructionInfo LightInstructionInfo::IMUL_R = LightInstructionInfo("IMUL_R", LightInstructionType::IMUL_R, MacroOp::Imul_rr, 0); - const LightInstructionInfo LightInstructionInfo::IROR_C = LightInstructionInfo("IROR_C", LightInstructionType::IROR_C, MacroOp::Ror_ri, -1); + const LightInstructionInfo LightInstructionInfo::ISUB_R = LightInstructionInfo("ISUB_R", SuperscalarInstructionType::ISUB_R, MacroOp::Sub_rr, 0); + const LightInstructionInfo LightInstructionInfo::IXOR_R = LightInstructionInfo("IXOR_R", SuperscalarInstructionType::IXOR_R, MacroOp::Xor_rr, 0); + const LightInstructionInfo LightInstructionInfo::IADD_RS = LightInstructionInfo("IADD_RS", SuperscalarInstructionType::IADD_RS, MacroOp::Lea_sib, 0); + const LightInstructionInfo LightInstructionInfo::IMUL_R = LightInstructionInfo("IMUL_R", SuperscalarInstructionType::IMUL_R, MacroOp::Imul_rr, 0); + const LightInstructionInfo LightInstructionInfo::IROR_C = LightInstructionInfo("IROR_C", SuperscalarInstructionType::IROR_C, MacroOp::Ror_ri, -1); - const LightInstructionInfo LightInstructionInfo::IADD_C7 = LightInstructionInfo("IADD_C7", LightInstructionType::IADD_C7, MacroOp::Add_ri, -1); - const LightInstructionInfo LightInstructionInfo::IXOR_C7 = LightInstructionInfo("IXOR_C7", LightInstructionType::IXOR_C7, MacroOp::Xor_ri, -1); - const LightInstructionInfo LightInstructionInfo::IADD_C8 = LightInstructionInfo("IADD_C8", LightInstructionType::IADD_C8, MacroOp::Add_ri, -1); - const LightInstructionInfo LightInstructionInfo::IXOR_C8 = LightInstructionInfo("IXOR_C8", LightInstructionType::IXOR_C8, MacroOp::Xor_ri, -1); - const LightInstructionInfo LightInstructionInfo::IADD_C9 = LightInstructionInfo("IADD_C9", LightInstructionType::IADD_C9, MacroOp::Add_ri, -1); - const LightInstructionInfo LightInstructionInfo::IXOR_C9 = LightInstructionInfo("IXOR_C9", 
LightInstructionType::IXOR_C9, MacroOp::Xor_ri, -1); + const LightInstructionInfo LightInstructionInfo::IADD_C7 = LightInstructionInfo("IADD_C7", SuperscalarInstructionType::IADD_C7, MacroOp::Add_ri, -1); + const LightInstructionInfo LightInstructionInfo::IXOR_C7 = LightInstructionInfo("IXOR_C7", SuperscalarInstructionType::IXOR_C7, MacroOp::Xor_ri, -1); + const LightInstructionInfo LightInstructionInfo::IADD_C8 = LightInstructionInfo("IADD_C8", SuperscalarInstructionType::IADD_C8, MacroOp::Add_ri, -1); + const LightInstructionInfo LightInstructionInfo::IXOR_C8 = LightInstructionInfo("IXOR_C8", SuperscalarInstructionType::IXOR_C8, MacroOp::Xor_ri, -1); + const LightInstructionInfo LightInstructionInfo::IADD_C9 = LightInstructionInfo("IADD_C9", SuperscalarInstructionType::IADD_C9, MacroOp::Add_ri, -1); + const LightInstructionInfo LightInstructionInfo::IXOR_C9 = LightInstructionInfo("IXOR_C9", SuperscalarInstructionType::IXOR_C9, MacroOp::Xor_ri, -1); - const LightInstructionInfo LightInstructionInfo::IMULH_R = LightInstructionInfo("IMULH_R", LightInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1); - const LightInstructionInfo LightInstructionInfo::ISMULH_R = LightInstructionInfo("ISMULH_R", LightInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1); - const LightInstructionInfo LightInstructionInfo::IMUL_RCP = LightInstructionInfo("IMUL_RCP", LightInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1); + const LightInstructionInfo LightInstructionInfo::IMULH_R = LightInstructionInfo("IMULH_R", SuperscalarInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1); + const LightInstructionInfo LightInstructionInfo::ISMULH_R = LightInstructionInfo("ISMULH_R", SuperscalarInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1); + const LightInstructionInfo LightInstructionInfo::IMUL_RCP = LightInstructionInfo("IMUL_RCP", SuperscalarInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1); const LightInstructionInfo LightInstructionInfo::NOP = 
LightInstructionInfo("NOP"); @@ -285,7 +285,7 @@ namespace RandomX { //If the current RandomX instruction is "IMULH", the next fetch configuration must be 3-3-10 //because the full 128-bit multiplication instruction is 3 bytes long and decodes to 2 uOPs on Intel CPUs. //Intel CPUs can decode at most 4 uOPs per cycle, so this requires a 2-1-1 configuration for a total of 3 macro ops. - if (instrType == LightInstructionType::IMULH_R || instrType == LightInstructionType::ISMULH_R) + if (instrType == SuperscalarInstructionType::IMULH_R || instrType == SuperscalarInstructionType::ISMULH_R) return &decodeBuffer3310; //To make sure that the multiplication port is saturated, a 4-4-4-4 configuration is generated if the number of multiplications @@ -294,7 +294,7 @@ namespace RandomX { return &decodeBuffer4444; //If the current RandomX instruction is "IMUL_RCP", the next buffer must begin with a 4-byte slot for multiplication. - if(instrType == LightInstructionType::IMUL_RCP) + if(instrType == SuperscalarInstructionType::IMUL_RCP) return (gen.getByte() & 1) ? &decodeBuffer484 : &decodeBuffer493; //Default: select a random fetch configuration. 
@@ -381,6 +381,7 @@ namespace RandomX { switch (slotSize) { case 3: + //if this is the last slot, we can also select "IMULH" instructions if (isLast) { create(slot_3L[gen.getByte() & 3], gen); } @@ -389,6 +390,7 @@ namespace RandomX { } break; case 4: + //if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions if (fetchType == 4 && !isLast) { create(&LightInstructionInfo::IMUL_R, gen); } @@ -418,83 +420,83 @@ namespace RandomX { reset(); switch (info->getType()) { - case LightInstructionType::ISUB_R: { + case SuperscalarInstructionType::ISUB_R: { mod_ = 0; imm32_ = 0; - opGroup_ = LightInstructionType::IADD_RS; + opGroup_ = SuperscalarInstructionType::IADD_RS; groupParIsSource_ = true; } break; - case LightInstructionType::IXOR_R: { + case SuperscalarInstructionType::IXOR_R: { mod_ = 0; imm32_ = 0; - opGroup_ = LightInstructionType::IXOR_R; + opGroup_ = SuperscalarInstructionType::IXOR_R; groupParIsSource_ = true; } break; - case LightInstructionType::IADD_RS: { + case SuperscalarInstructionType::IADD_RS: { mod_ = gen.getByte(); imm32_ = 0; - opGroup_ = LightInstructionType::IADD_RS; + opGroup_ = SuperscalarInstructionType::IADD_RS; groupParIsSource_ = true; } break; - case LightInstructionType::IMUL_R: { + case SuperscalarInstructionType::IMUL_R: { mod_ = 0; imm32_ = 0; - opGroup_ = LightInstructionType::IMUL_R; + opGroup_ = SuperscalarInstructionType::IMUL_R; opGroupPar_ = -1; } break; - case LightInstructionType::IROR_C: { + case SuperscalarInstructionType::IROR_C: { mod_ = 0; do { imm32_ = gen.getByte() & 63; } while (imm32_ == 0); - opGroup_ = LightInstructionType::IROR_C; + opGroup_ = SuperscalarInstructionType::IROR_C; opGroupPar_ = -1; } break; - case LightInstructionType::IADD_C7: - case LightInstructionType::IADD_C8: - case LightInstructionType::IADD_C9: { + case SuperscalarInstructionType::IADD_C7: + case SuperscalarInstructionType::IADD_C8: + case SuperscalarInstructionType::IADD_C9: { mod_ = 0; imm32_ = gen.getInt32(); - 
opGroup_ = LightInstructionType::IADD_C7; + opGroup_ = SuperscalarInstructionType::IADD_C7; opGroupPar_ = -1; } break; - case LightInstructionType::IXOR_C7: - case LightInstructionType::IXOR_C8: - case LightInstructionType::IXOR_C9: { + case SuperscalarInstructionType::IXOR_C7: + case SuperscalarInstructionType::IXOR_C8: + case SuperscalarInstructionType::IXOR_C9: { mod_ = 0; imm32_ = gen.getInt32(); - opGroup_ = LightInstructionType::IXOR_C7; + opGroup_ = SuperscalarInstructionType::IXOR_C7; opGroupPar_ = -1; } break; - case LightInstructionType::IMULH_R: { + case SuperscalarInstructionType::IMULH_R: { canReuse_ = true; mod_ = 0; imm32_ = 0; - opGroup_ = LightInstructionType::IMULH_R; + opGroup_ = SuperscalarInstructionType::IMULH_R; opGroupPar_ = gen.getInt32(); } break; - case LightInstructionType::ISMULH_R: { + case SuperscalarInstructionType::ISMULH_R: { canReuse_ = true; mod_ = 0; imm32_ = 0; - opGroup_ = LightInstructionType::ISMULH_R; + opGroup_ = SuperscalarInstructionType::ISMULH_R; opGroupPar_ = gen.getInt32(); } break; - case LightInstructionType::IMUL_RCP: { + case SuperscalarInstructionType::IMUL_RCP: { mod_ = 0; do { imm32_ = gen.getInt32(); } while ((imm32_ & (imm32_ - 1)) == 0); - opGroup_ = LightInstructionType::IMUL_RCP; + opGroup_ = SuperscalarInstructionType::IMUL_RCP; opGroupPar_ = -1; } break; @@ -506,7 +508,7 @@ namespace RandomX { bool selectDestination(int cycle, RegisterInfo (®isters)[8], Blake2Generator& gen) { std::vector availableRegisters; for (unsigned i = 0; i < 8; ++i) { - if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != LightInstructionType::IADD_RS || i != 5)) + if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != 5)) availableRegisters.push_back(i); } 
return selectRegister(availableRegisters, gen, dst_); @@ -518,7 +520,7 @@ namespace RandomX { if (registers[i].latency <= cycle) availableRegisters.push_back(i); } - if (availableRegisters.size() == 2 && info_->getType() == LightInstructionType::IADD_RS) { + if (availableRegisters.size() == 2 && info_->getType() == SuperscalarInstructionType::IADD_RS) { if (availableRegisters[0] == 5 || availableRegisters[1] == 5) { opGroupPar_ = src_ = 5; return true; @@ -587,106 +589,70 @@ namespace RandomX { constexpr bool INFO = true; #endif - static int blakeCounter = 0; + template + static int scheduleUop(ExecutionPort::type uop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle) { + //The scheduling here is done optimistically by checking port availability in order P5 -> P0 -> P1 to not overload + //P1 (multiplication port) by instructions that can go to any port. + for (; cycle < CYCLE_MAP_SIZE; ++cycle) { + if ((uop & ExecutionPort::P5) != 0 && !portBusy[cycle][2]) { + if (commit) { + if (TRACE) std::cout << "; P5 at cycle " << cycle << std::endl; + portBusy[cycle][2] = uop; + } + return cycle; + } + if ((uop & ExecutionPort::P0) != 0 && !portBusy[cycle][0]) { + if (commit) { + if (TRACE) std::cout << "; P0 at cycle " << cycle << std::endl; + portBusy[cycle][0] = uop; + } + return cycle; + } + if ((uop & ExecutionPort::P1) != 0 && !portBusy[cycle][1]) { + if (commit) { + if (TRACE) std::cout << "; P1 at cycle " << cycle << std::endl; + portBusy[cycle][1] = uop; + } + return cycle; + } + } + return -1; + } template - static int scheduleUop(const MacroOp& mop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle, int depCycle) { + static int scheduleMop(const MacroOp& mop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle, int depCycle) { + //if this macro-op depends on the previous one, increase the starting cycle if needed + //this handles an explicit dependency chain in IMUL_RCP if (mop.isDependent()) { cycle = std::max(cycle, depCycle); } + 
//move instructions are eliminated and don't need an execution unit if (mop.isEliminated()) { if (commit) if (TRACE) std::cout << "; (eliminated)" << std::endl; return cycle; } else if (mop.isSimple()) { - if (mop.getUop1() <= ExecutionPort::P5) { - for (; cycle < CYCLE_MAP_SIZE; ++cycle) { - if (!portBusy[cycle][mop.getUop1() - 1]) { - if (commit) { - if (TRACE) std::cout << "; P" << mop.getUop1() - 1 << " at cycle " << cycle << std::endl; - portBusy[cycle][mop.getUop1() - 1] = mop.getUop1(); - } - return cycle; - } - } - } - else if (mop.getUop1() == ExecutionPort::P01) { - for (; cycle < CYCLE_MAP_SIZE; ++cycle) { - if (!portBusy[cycle][0]) { - if (commit) { - if (TRACE) std::cout << "; P0 at cycle " << cycle << std::endl; - portBusy[cycle][0] = mop.getUop1(); - } - return cycle; - } - if (!portBusy[cycle][1]) { - if (commit) { - if (TRACE) std::cout << "; P1 at cycle " << cycle << std::endl; - portBusy[cycle][1] = mop.getUop1(); - } - return cycle; - } - } - } - else if (mop.getUop1() == ExecutionPort::P05) { - for (; cycle < CYCLE_MAP_SIZE; ++cycle) { - if (!portBusy[cycle][2]) { - if (commit) { - if (TRACE) std::cout << "; P2 at cycle " << cycle << std::endl; - portBusy[cycle][2] = mop.getUop1(); - } - return cycle; - } - if (!portBusy[cycle][0]) { - if (commit) { - if (TRACE) std::cout << "; P0 at cycle " << cycle << std::endl; - portBusy[cycle][0] = mop.getUop1(); - } - return cycle; - } - } - } - else { - for (; cycle < CYCLE_MAP_SIZE; ++cycle) { - if (!portBusy[cycle][2]) { - if (commit) { - if (TRACE) std::cout << "; P2 at cycle " << cycle << std::endl; - portBusy[cycle][2] = mop.getUop1(); - } - return cycle; - } - if (!portBusy[cycle][0]) { - if (commit) { - if (TRACE) std::cout << "; P0 at cycle " << cycle << std::endl; - portBusy[cycle][0] = mop.getUop1(); - } - return cycle; - } - if (!portBusy[cycle][1]) { - if (commit) { - if (TRACE) std::cout << "; P1 at cycle " << cycle << std::endl; - portBusy[cycle][1] = mop.getUop1(); - } - return cycle; - } 
- } - } + //this macro-op has only one uOP + return scheduleUop(mop.getUop1(), portBusy, cycle); } else { + //macro-ops with 2 uOPs are scheduled conservatively by requiring both uOPs to execute in the same cycle for (; cycle < CYCLE_MAP_SIZE; ++cycle) { - if (!portBusy[cycle][mop.getUop1() - 1] && !portBusy[cycle][mop.getUop2() - 1]) { + + int cycle1 = scheduleUop(mop.getUop1(), portBusy, cycle); + int cycle2 = scheduleUop(mop.getUop2(), portBusy, cycle); + + if (cycle1 == cycle2) { if (commit) { - if (TRACE) std::cout << "; P" << mop.getUop1() - 1 << " P" << mop.getUop2() - 1 << " at cycle " << cycle << std::endl; - portBusy[cycle][mop.getUop1() - 1] = mop.getUop1(); - portBusy[cycle][mop.getUop2() - 1] = mop.getUop2(); + scheduleUop(mop.getUop1(), portBusy, cycle1); + scheduleUop(mop.getUop2(), portBusy, cycle2); } - return cycle; + return cycle1; } } } - if (TRACE) std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << cycle << ")" << std::endl; return -1; } @@ -698,14 +664,14 @@ namespace RandomX { const DecoderBuffer* decodeBuffer = &DecoderBuffer::Default; LightInstruction currentInstruction = LightInstruction::Null; - int instrIndex = 0; + int macroOpIndex = 0; int codeSize = 0; int macroOpCount = 0; int cycle = 0; int depCycle = 0; int retireCycle = 0; bool portsSaturated = false; - int outIndex = 0; + int programSize = 0; int mulCount = 0; int decodeCycle; @@ -713,39 +679,40 @@ namespace RandomX { //Each decode cycle decodes 16 bytes of x86 code. //Since a decode cycle produces on average 3.45 macro-ops and there are only 3 ALU ports, execution ports are always //saturated first. The cycle limit is present only to guarantee loop termination. - for (decodeCycle = 0; decodeCycle < RANDOMX_SUPERSCALAR_LATENCY && !portsSaturated && outIndex < RANDOMX_SUPERSCALAR_MAX_SIZE; ++decodeCycle) { + //Program size is limited to RANDOMX_SUPERSCALAR_MAX_SIZE instructions. 
+ for (decodeCycle = 0; decodeCycle < RANDOMX_SUPERSCALAR_LATENCY && !portsSaturated && programSize < RANDOMX_SUPERSCALAR_MAX_SIZE; ++decodeCycle) { - //select a fetch/decode configuration + //select a decode configuration decodeBuffer = decodeBuffer->fetchNext(currentInstruction.getType(), decodeCycle, mulCount, gen); if (TRACE) std::cout << "; ------------- fetch cycle " << cycle << " (" << decodeBuffer->getName() << ")" << std::endl; int bufferIndex = 0; - //fill all instruction slots in the current fetch/decode buffer + //fill all instruction slots in the current decode buffer while (bufferIndex < decodeBuffer->getSize()) { int topCycle = cycle; - //if we have created all macro-ops for the current RandomX instruction, create a new instruction - if (instrIndex >= currentInstruction.getInfo().getSize()) { + //if we have issued all macro-ops for the current RandomX instruction, create a new instruction + if (macroOpIndex >= currentInstruction.getInfo().getSize()) { if (portsSaturated) break; + //select an instruction so that the first macro-op fits into the current slot currentInstruction.createForSlot(gen, decodeBuffer->getCounts()[bufferIndex], decodeBuffer->getIndex(), decodeBuffer->getSize() == bufferIndex + 1, bufferIndex == 0); - instrIndex = 0; + macroOpIndex = 0; if (TRACE) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; } - const MacroOp& mop = currentInstruction.getInfo().getOp(instrIndex); - + const MacroOp& mop = currentInstruction.getInfo().getOp(macroOpIndex); if (TRACE) std::cout << mop.getName() << " "; //calculate the earliest cycle when this macro-op (all of its uOPs) can be scheduled for execution - int scheduleCycle = scheduleUop(mop, portBusy, cycle, depCycle); + int scheduleCycle = scheduleMop(mop, portBusy, cycle, depCycle); if (scheduleCycle < 0) { - if (TRACE) std::cout << "; Failed at cycle " << cycle << std::endl; + /*if (TRACE)*/ std::cout << "Unable to map operation '" << mop.getName() << "' to execution port 
(cycle " << cycle << ")" << std::endl; return 0; } //find a source register (if applicable) that will be ready when this instruction executes - if (instrIndex == currentInstruction.getInfo().getSrcOp()) { + if (macroOpIndex == currentInstruction.getInfo().getSrcOp()) { int forward; //if no suitable operand is ready, look up to LOOK_FORWARD_CYCLES forward for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectSource(scheduleCycle, registers, gen); ++forward) { @@ -755,14 +722,14 @@ namespace RandomX { } //if no register was found, throw the instruction away and try another one if (forward == LOOK_FORWARD_CYCLES) { - instrIndex = currentInstruction.getInfo().getSize(); + macroOpIndex = currentInstruction.getInfo().getSize(); if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; continue; } if (TRACE) std::cout << "; src = r" << currentInstruction.getSource() << std::endl; } //find a destination register that will be ready when this instruction executes - if (instrIndex == currentInstruction.getInfo().getDstOp()) { + if (macroOpIndex == currentInstruction.getInfo().getDstOp()) { int forward; for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectDestination(scheduleCycle, registers, gen); ++forward) { if (TRACE) std::cout << "; dst STALL at cycle " << cycle << std::endl; @@ -770,14 +737,14 @@ namespace RandomX { ++cycle; } if (forward == LOOK_FORWARD_CYCLES) { //throw instruction away - instrIndex = currentInstruction.getInfo().getSize(); + macroOpIndex = currentInstruction.getInfo().getSize(); if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; continue; } if (TRACE) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; } //recalculate when the instruction can be scheduled for execution based on operand availability - scheduleCycle = scheduleUop(mop, portBusy, scheduleCycle, scheduleCycle); + scheduleCycle = 
scheduleMop(mop, portBusy, scheduleCycle, scheduleCycle); //calculate when the result will be ready depCycle = scheduleCycle + mop.getLatency(); @@ -785,8 +752,8 @@ namespace RandomX { //if this instruction writes the result, modify register information // RegisterInfo.latency - which cycle the register will be ready // RegisterInfo.lastOpGroup - the last operation that was applied to the register - // RegisterInfo.lastOpPar - the last operation parameter - if (instrIndex == currentInstruction.getInfo().getResultOp()) { + // RegisterInfo.lastOpPar - the last operation source value (-1 = constant, 0-7 = register) + if (macroOpIndex == currentInstruction.getInfo().getResultOp()) { int dst = currentInstruction.getDestination(); RegisterInfo& ri = registers[dst]; retireCycle = depCycle; @@ -797,7 +764,7 @@ namespace RandomX { } codeSize += mop.getSize(); bufferIndex++; - instrIndex++; + macroOpIndex++; macroOpCount++; //terminating condition @@ -807,9 +774,9 @@ namespace RandomX { cycle = topCycle; //when all macro-ops of the current instruction have been issued, add the instruction into the program - if (instrIndex >= currentInstruction.getInfo().getSize()) { - currentInstruction.toInstr(prog(outIndex++)); - mulCount += isMul(currentInstruction.getType()); + if (macroOpIndex >= currentInstruction.getInfo().getSize()) { + currentInstruction.toInstr(prog(programSize++)); + mulCount += isMultiplication(currentInstruction.getType()); } } ++cycle; @@ -820,12 +787,12 @@ namespace RandomX { int portCycles = 0; for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { - std::cout << "; " << std::setw(3) << i << " "; + //std::cout << "; " << std::setw(3) << i << " "; for (int j = 0; j < 3; ++j) { - std::cout << (portBusy[i][j] ? '*' : '_'); + //std::cout << (portBusy[i][j] ? 
'*' : '_'); portCycles += !!portBusy[i][j]; } - std::cout << std::endl; + //std::cout << std::endl; } double ipc = (macroOpCount / (double)retireCycle); @@ -833,7 +800,7 @@ namespace RandomX { if (INFO) std::cout << "; code size " << codeSize << " bytes" << std::endl; if (INFO) std::cout << "; x86 macro-ops: " << macroOpCount << std::endl; if (INFO) std::cout << "; fetch cycles: " << decodeCycle << std::endl; - if (INFO) std::cout << "; RandomX instructions: " << outIndex << std::endl; + if (INFO) std::cout << "; RandomX instructions: " << programSize << std::endl; if (INFO) std::cout << "; Execution time: " << retireCycle << " cycles" << std::endl; if (INFO) std::cout << "; IPC = " << ipc << std::endl; if (INFO) std::cout << "; Port-cycles: " << portCycles << std::endl; @@ -844,7 +811,7 @@ namespace RandomX { //Calculate ASIC latency: //Assumes 1 cycle latency for all operations and unlimited parallelization. - for (int i = 0; i < outIndex; ++i) { + for (int i = 0; i < programSize; ++i) { Instruction& instr = prog(i); int latDst = asicLatency[instr.dst] + 1; int latSrc = instr.dst != instr.src ? asicLatency[instr.src] + 1 : 0; @@ -874,8 +841,8 @@ namespace RandomX { } } - prog.setSize(outIndex); + prog.setSize(programSize); prog.setAddressRegister(addressReg); - return outIndex; + return ipc; } } \ No newline at end of file diff --git a/src/LightProgramGenerator.hpp b/src/LightProgramGenerator.hpp index d920dd0..7030d10 100644 --- a/src/LightProgramGenerator.hpp +++ b/src/LightProgramGenerator.hpp @@ -22,7 +22,7 @@ along with RandomX. If not, see. 
namespace RandomX { // Intel Ivy Bridge reference - namespace LightInstructionType { //uOPs (decode) execution ports latency code size + namespace SuperscalarInstructionType { //uOPs (decode) execution ports latency code size constexpr int ISUB_R = 0; //1 p015 1 3 constexpr int IXOR_R = 1; //1 p015 1 3 constexpr int IADD_RS = 2; //1 p01 1 4 From 9404516dd87a722b2aa763a36d6c9045b4fc4fbf Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 12 Apr 2019 14:56:20 +0200 Subject: [PATCH 17/18] Refactoring --- src/LightProgramGenerator.cpp | 47 +++++++++++++++---- src/LightProgramGenerator.hpp | 2 +- src/common.hpp | 1 + src/main.cpp | 6 +-- vcxproj/randomx.vcxproj | 29 ++++++++++++ vcxproj/randomx.vcxproj.filters | 83 +++++++++++++++++++++++++++++++++ 6 files changed, 154 insertions(+), 14 deletions(-) diff --git a/src/LightProgramGenerator.cpp b/src/LightProgramGenerator.cpp index 8692dc3..40a767b 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/LightProgramGenerator.cpp @@ -507,8 +507,16 @@ namespace RandomX { bool selectDestination(int cycle, RegisterInfo (®isters)[8], Blake2Generator& gen) { std::vector availableRegisters; + //Conditions for the destination register: + // * value must be ready at the required cycle + // * cannot be the same as the source register unless the instruction allows it + // - this avoids optimizable instructions such as "xor r, r" or "sub r, r" + // * either the last instruction applied to the register or its source must be different than this instruction + // - this avoids optimizable instruction sequences such as "xor r1, r2; xor r1, r2" or "ror r, C1; ror r, C2" or "add r, C1; add r, C2" + // - it also avoids accumulation of trailing zeroes in registers due to excessive multiplication + // * register r5 cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction) for (unsigned i = 0; i < 8; ++i) { - if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || 
registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != 5)) + if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != LimitedAddressRegister)) availableRegisters.push_back(i); } return selectRegister(availableRegisters, gen, dst_); @@ -516,13 +524,15 @@ namespace RandomX { bool selectSource(int cycle, RegisterInfo(®isters)[8], Blake2Generator& gen) { std::vector availableRegisters; + //all registers that are ready at the cycle for (unsigned i = 0; i < 8; ++i) { if (registers[i].latency <= cycle) availableRegisters.push_back(i); } + //if there are only 2 available registers for IADD_RS and one of them is r5, select it as the source because it cannot be the destination if (availableRegisters.size() == 2 && info_->getType() == SuperscalarInstructionType::IADD_RS) { - if (availableRegisters[0] == 5 || availableRegisters[1] == 5) { - opGroupPar_ = src_ = 5; + if (availableRegisters[0] == LimitedAddressRegister || availableRegisters[1] == LimitedAddressRegister) { + opGroupPar_ = src_ = LimitedAddressRegister; return true; } } @@ -656,7 +666,7 @@ namespace RandomX { return -1; } - double generateLightProg2(LightProgram& prog, Blake2Generator& gen) { + double generateSuperscalar(LightProgram& prog, Blake2Generator& gen) { ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3]; memset(portBusy, 0, sizeof(portBusy)); @@ -674,6 +684,7 @@ namespace RandomX { int programSize = 0; int mulCount = 0; int decodeCycle; + int throwAwayCount = 0; //decode instructions for RANDOMX_SUPERSCALAR_LATENCY cycles or until an execution port is saturated. //Each decode cycle decodes 16 bytes of x86 code. 
@@ -722,12 +733,20 @@ namespace RandomX { } //if no register was found, throw the instruction away and try another one if (forward == LOOK_FORWARD_CYCLES) { - macroOpIndex = currentInstruction.getInfo().getSize(); - if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; - continue; + if (throwAwayCount < MAX_THROWAWAY_COUNT) { + throwAwayCount++; + macroOpIndex = currentInstruction.getInfo().getSize(); + if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + continue; + } + //abort this decode buffer + /*if (TRACE)*/ std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - source registers not available" << std::endl; + currentInstruction = LightInstruction::Null; + break; } if (TRACE) std::cout << "; src = r" << currentInstruction.getSource() << std::endl; } + throwAwayCount = 0; //find a destination register that will be ready when this instruction executes if (macroOpIndex == currentInstruction.getInfo().getDstOp()) { int forward; @@ -737,12 +756,20 @@ namespace RandomX { ++cycle; } if (forward == LOOK_FORWARD_CYCLES) { //throw instruction away - macroOpIndex = currentInstruction.getInfo().getSize(); - if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; - continue; + if (throwAwayCount < MAX_THROWAWAY_COUNT) { + throwAwayCount++; + macroOpIndex = currentInstruction.getInfo().getSize(); + if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + continue; + } + //abort this decode buffer + /*if (TRACE)*/ std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - destination registers not available" << std::endl; + currentInstruction = LightInstruction::Null; + break; } if (TRACE) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; } + throwAwayCount = 0; //recalculate when the 
instruction can be scheduled for execution based on operand availability scheduleCycle = scheduleMop(mop, portBusy, scheduleCycle, scheduleCycle); diff --git a/src/LightProgramGenerator.hpp b/src/LightProgramGenerator.hpp index 7030d10..beb7974 100644 --- a/src/LightProgramGenerator.hpp +++ b/src/LightProgramGenerator.hpp @@ -54,5 +54,5 @@ namespace RandomX { void checkData(const size_t); }; - double generateLightProg2(LightProgram& prog, Blake2Generator& gen); + double generateSuperscalar(LightProgram& prog, Blake2Generator& gen); } \ No newline at end of file diff --git a/src/common.hpp b/src/common.hpp index 83a9bc7..034c10f 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -95,6 +95,7 @@ namespace RandomX { constexpr int ScratchpadL3Mask = (ScratchpadL3 - 1) * 8; constexpr int ScratchpadL3Mask64 = (ScratchpadL3 / 8 - 1) * 64; constexpr int RegistersCount = 8; + constexpr int LimitedAddressRegister = 5; //x86 r13 register struct Cache { uint8_t* memory; diff --git a/src/main.cpp b/src/main.cpp index 4866804..a120cf9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -228,11 +228,11 @@ int main(int argc, char** argv) { if (genSuperscalar) { RandomX::LightProgram p; RandomX::Blake2Generator gen(seed, programCount); - RandomX::generateLightProg2(p, gen); + RandomX::generateSuperscalar(p, gen); RandomX::AssemblyGeneratorX86 asmX86; asmX86.generateAsm(p); //std::ofstream file("lightProg2.asm"); - asmX86.printCode(std::cout); + //asmX86.printCode(std::cout); return 0; } @@ -288,7 +288,7 @@ int main(int argc, char** argv) { if (!legacy) { RandomX::Blake2Generator gen(seed, programCount); for (int i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { - RandomX::generateLightProg2(programs[i], gen); + RandomX::generateSuperscalar(programs[i], gen); } } if (!miningMode) { diff --git a/vcxproj/randomx.vcxproj b/vcxproj/randomx.vcxproj index 3dc09c8..1c1cae0 100644 --- a/vcxproj/randomx.vcxproj +++ b/vcxproj/randomx.vcxproj @@ -149,6 +149,35 @@ + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + diff --git a/vcxproj/randomx.vcxproj.filters b/vcxproj/randomx.vcxproj.filters index 9f33e02..5b821c8 100644 --- a/vcxproj/randomx.vcxproj.filters +++ b/vcxproj/randomx.vcxproj.filters @@ -84,4 +84,87 @@ Source Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + \ No newline at end of file From 8c37d4aac36b6a2c0a6dadfc0220dd2f4fe8dfc8 Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 12 Apr 2019 19:36:08 +0200 Subject: [PATCH 18/18] More refactoring --- src/AssemblyGeneratorX86.cpp | 6 +- src/AssemblyGeneratorX86.hpp | 6 +- src/Blake2Generator.cpp | 51 ++++ src/Blake2Generator.hpp | 36 +++ src/CompiledLightVirtualMachine.cpp | 6 +- src/CompiledLightVirtualMachine.hpp | 2 +- src/CompiledVirtualMachine.cpp | 2 +- src/CompiledVirtualMachine.hpp | 2 +- src/InterpretedVirtualMachine.cpp | 15 +- src/InterpretedVirtualMachine.hpp | 8 +- src/JitCompilerX86.cpp | 12 +- src/JitCompilerX86.hpp | 4 +- src/LightClientAsyncWorker.cpp | 113 ------- src/LightClientAsyncWorker.hpp | 57 ---- src/LightProgramGenerator.hpp | 58 ---- src/Program.hpp | 17 +- src/VirtualMachine.hpp | 2 +- src/main.cpp | 8 +- ...Generator.cpp => superscalarGenerator.cpp} | 283 ++++++++---------- src/superscalarGenerator.hpp | 47 +++ src/tests/superscalar-avalanche.cpp | 7 +- src/tests/superscalar-init.cpp | 2 +- vcxproj/randomx.vcxproj | 8 +- vcxproj/randomx.vcxproj.filters | 24 +- vcxproj/superscalar-avalanche.vcxproj | 3 +- vcxproj/superscalar-avalanche.vcxproj.filters | 9 +- vcxproj/superscalar-init.vcxproj | 3 +- 
vcxproj/superscalar-init.vcxproj.filters | 9 +- 28 files changed, 347 insertions(+), 453 deletions(-) create mode 100644 src/Blake2Generator.cpp create mode 100644 src/Blake2Generator.hpp delete mode 100644 src/LightClientAsyncWorker.cpp delete mode 100644 src/LightClientAsyncWorker.hpp delete mode 100644 src/LightProgramGenerator.hpp rename src/{LightProgramGenerator.cpp => superscalarGenerator.cpp} (76%) create mode 100644 src/superscalarGenerator.hpp diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index c4e009c..b3511c1 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -23,7 +23,7 @@ along with RandomX. If not, see. #include "common.hpp" #include "reciprocal.h" #include "Program.hpp" -#include "./LightProgramGenerator.hpp" +#include "superscalarGenerator.hpp" namespace RandomX { @@ -62,7 +62,7 @@ namespace RandomX { } } - void AssemblyGeneratorX86::generateAsm(LightProgram& prog) { + void AssemblyGeneratorX86::generateAsm(SuperscalarProgram& prog) { asmCode.str(std::string()); //clear asmCode << "ALIGN 16" << std::endl; for (unsigned i = 0; i < prog.getSize(); ++i) { @@ -126,7 +126,7 @@ namespace RandomX { } } - void AssemblyGeneratorX86::generateC(LightProgram& prog) { + void AssemblyGeneratorX86::generateC(SuperscalarProgram& prog) { asmCode.str(std::string()); //clear asmCode << "#include " << std::endl; asmCode << "#if defined(__SIZEOF_INT128__)" << std::endl; diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 8688cd4..4b777e6 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -27,7 +27,7 @@ along with RandomX. If not, see. 
namespace RandomX { class Program; - class LightProgram; + class SuperscalarProgram; class AssemblyGeneratorX86; typedef void(AssemblyGeneratorX86::*InstructionGenerator)(Instruction&, int); @@ -35,8 +35,8 @@ namespace RandomX { class AssemblyGeneratorX86 { public: void generateProgram(Program& prog); - void generateAsm(LightProgram& prog); - void generateC(LightProgram& prog); + void generateAsm(SuperscalarProgram& prog); + void generateC(SuperscalarProgram& prog); void printCode(std::ostream& os) { os << asmCode.rdbuf(); } diff --git a/src/Blake2Generator.cpp b/src/Blake2Generator.cpp new file mode 100644 index 0000000..2879088 --- /dev/null +++ b/src/Blake2Generator.cpp @@ -0,0 +1,51 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. 
+*/ + +#include "blake2/blake2.h" +#include "blake2/endian.h" +#include "Blake2Generator.hpp" +#include "common.hpp" + +namespace RandomX { + + Blake2Generator::Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) { + memset(data, 0, sizeof(data)); + memcpy(data, seed, SeedSize); + store32(&data[60], nonce); + } + + uint8_t Blake2Generator::getByte() { + checkData(1); + return data[dataIndex++]; + } + + uint32_t Blake2Generator::getInt32() { + checkData(4); + auto ret = load32(&data[dataIndex]); + dataIndex += 4; + return ret; + } + + void Blake2Generator::checkData(const size_t bytesNeeded) { + if (dataIndex + bytesNeeded > sizeof(data)) { + blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); + dataIndex = 0; + } + } +} \ No newline at end of file diff --git a/src/Blake2Generator.hpp b/src/Blake2Generator.hpp new file mode 100644 index 0000000..24f2fca --- /dev/null +++ b/src/Blake2Generator.hpp @@ -0,0 +1,36 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. + +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. 
+*/ + +#pragma once +#include + +namespace RandomX { + + class Blake2Generator { + public: + Blake2Generator(const void* seed, int nonce); + uint8_t getByte(); + uint32_t getInt32(); + private: + uint8_t data[64]; + size_t dataIndex; + + void checkData(const size_t); + }; +} \ No newline at end of file diff --git a/src/CompiledLightVirtualMachine.cpp b/src/CompiledLightVirtualMachine.cpp index 760842a..11bedf8 100644 --- a/src/CompiledLightVirtualMachine.cpp +++ b/src/CompiledLightVirtualMachine.cpp @@ -24,7 +24,7 @@ along with RandomX. If not, see. namespace RandomX { template - void CompiledLightVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { + void CompiledLightVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { mem.ds = ds; datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize; if(superscalar) @@ -32,8 +32,8 @@ namespace RandomX { //datasetBasePtr = ds.dataset.memory; } - template void CompiledLightVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]); - template void CompiledLightVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + template void CompiledLightVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + template void CompiledLightVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]); template void CompiledLightVirtualMachine::initialize() { diff --git a/src/CompiledLightVirtualMachine.hpp b/src/CompiledLightVirtualMachine.hpp index 9493c58..1d4b78e 100644 --- a/src/CompiledLightVirtualMachine.hpp +++ b/src/CompiledLightVirtualMachine.hpp @@ -39,7 +39,7 @@ namespace RandomX { _mm_free(ptr); } CompiledLightVirtualMachine() {} - void setDataset(dataset_t ds, uint64_t size, 
LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; + void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; void initialize() override; }; } \ No newline at end of file diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index 4984938..3e44476 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -29,7 +29,7 @@ namespace RandomX { CompiledVirtualMachine::CompiledVirtualMachine() { } - void CompiledVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { + void CompiledVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { mem.ds = ds; datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize; datasetBasePtr = ds.dataset.memory; diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp index 65b1885..a2866ca 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -42,7 +42,7 @@ namespace RandomX { _mm_free(ptr); } CompiledVirtualMachine(); - void setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; + void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; void initialize() override; virtual void execute() override; void* getProgram() { diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index 673fecf..132a2c9 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -22,7 +22,6 @@ along with RandomX. If not, see. #include "InterpretedVirtualMachine.hpp" #include "dataset.hpp" #include "Cache.hpp" -#include "LightClientAsyncWorker.hpp" #include #include #include @@ -36,7 +35,7 @@ along with RandomX. If not, see. 
#ifdef STATS #include #endif -#include "LightProgramGenerator.hpp" +#include "superscalarGenerator.hpp" #ifdef FPUCHECK constexpr bool fpuCheck = true; @@ -47,7 +46,7 @@ constexpr bool fpuCheck = false; namespace RandomX { template - void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { + void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) { mem.ds = ds; readDataset = &datasetReadLight; datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize; @@ -55,8 +54,8 @@ namespace RandomX { precompileSuperscalar(programs); } - template void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]); - template void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + template void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + template void InterpretedVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]); template void InterpretedVirtualMachine::initialize() { @@ -475,7 +474,7 @@ namespace RandomX { } template - void InterpretedVirtualMachine::executeSuperscalar(int_reg_t(&r)[8], LightProgram& prog, std::vector& reciprocals) { + void InterpretedVirtualMachine::executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector& reciprocals) { for (unsigned j = 0; j < prog.getSize(); ++j) { Instruction& instr = prog(j); switch (instr.opcode) @@ -539,7 +538,7 @@ namespace RandomX { Cache& cache = mem.ds.cache; for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { mixBlock = getMixBlock(registerValue, cache); - LightProgram& prog = superScalarPrograms[i]; + SuperscalarProgram& prog = superScalarPrograms[i]; executeSuperscalar(rl, prog, reciprocals); @@ 
-554,7 +553,7 @@ namespace RandomX { } template - void InterpretedVirtualMachine::precompileSuperscalar(LightProgram* programs) { + void InterpretedVirtualMachine::precompileSuperscalar(SuperscalarProgram* programs) { memcpy(superScalarPrograms, programs, sizeof(superScalarPrograms)); reciprocals.clear(); for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) { diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index ddefa67..3632112 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -70,17 +70,17 @@ namespace RandomX { } InterpretedVirtualMachine(bool soft) : softAes(soft) {} ~InterpretedVirtualMachine() {} - void setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; + void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override; void initialize() override; void execute() override; - static void executeSuperscalar(int_reg_t(&r)[8], LightProgram& prog, std::vector& reciprocals); + static void executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector& reciprocals); private: static InstructionHandler engine[256]; DatasetReadFunc readDataset; bool softAes; InstructionByteCode byteCode[RANDOMX_PROGRAM_SIZE]; std::vector reciprocals; - alignas(64) LightProgram superScalarPrograms[RANDOMX_CACHE_ACCESSES]; + alignas(64) SuperscalarProgram superScalarPrograms[RANDOMX_CACHE_ACCESSES]; #ifdef STATS int count_ADD_64 = 0; int count_ADD_32 = 0; @@ -128,7 +128,7 @@ namespace RandomX { int datasetAccess[256] = { 0 }; #endif void precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); - void precompileSuperscalar(LightProgram*); + void precompileSuperscalar(SuperscalarProgram*); void executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); void executeBytecode(int& i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); void 
executeSuperscalar(uint32_t blockNumber, int_reg_t(&r)[8]); diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index 8e15e15..ad7c85a 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -87,7 +87,7 @@ namespace RandomX { */ #include "JitCompilerX86-static.hpp" -#include "LightProgramGenerator.hpp" +#include "superscalarGenerator.hpp" #define NOP_TEST true @@ -261,16 +261,16 @@ namespace RandomX { template void JitCompilerX86::generateProgramLight(Program& prog); template - void JitCompilerX86::generateSuperScalarHash(LightProgram(&programs)[N]) { + void JitCompilerX86::generateSuperScalarHash(SuperscalarProgram(&programs)[N]) { memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize); codePos = superScalarHashOffset + codeSshInitSize; for (unsigned j = 0; j < N; ++j) { - LightProgram& prog = programs[j]; + SuperscalarProgram& prog = programs[j]; for (unsigned i = 0; i < prog.getSize(); ++i) { Instruction& instr = prog(i); instr.src %= RegistersCount; instr.dst %= RegistersCount; - generateCode(instr, i); + generateCode(instr, i); } emit(codeShhLoad, codeSshLoadSize); if (j < N - 1) { @@ -290,7 +290,7 @@ namespace RandomX { } template - void JitCompilerX86::generateSuperScalarHash(LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]); + void JitCompilerX86::generateSuperScalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]); void JitCompilerX86::generateDatasetInitCode() { memcpy(code, codeDatasetInit, datasetInitSize); @@ -345,7 +345,7 @@ namespace RandomX { } template<> - void JitCompilerX86::generateCode(Instruction& instr, int i) { + void JitCompilerX86::generateCode(Instruction& instr, int i) { switch (instr.opcode) { case RandomX::SuperscalarInstructionType::ISUB_R: diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index 9240cfe..2908b04 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -27,7 +27,7 @@ along with RandomX. If not, see. 
namespace RandomX { class Program; - class LightProgram; + class SuperscalarProgram; class JitCompilerX86; typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int); @@ -42,7 +42,7 @@ namespace RandomX { template void generateProgramLight(Program&); template - void generateSuperScalarHash(LightProgram (&programs)[N]); + void generateSuperScalarHash(SuperscalarProgram (&programs)[N]); ProgramFunc getProgramFunc() { return (ProgramFunc)code; } diff --git a/src/LightClientAsyncWorker.cpp b/src/LightClientAsyncWorker.cpp deleted file mode 100644 index fbba713..0000000 --- a/src/LightClientAsyncWorker.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* -Copyright (c) 2019 tevador - -This file is part of RandomX. - -RandomX is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -RandomX is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with RandomX. If not, see. 
-*/ - -#include "LightClientAsyncWorker.hpp" -#include "dataset.hpp" -#include "Cache.hpp" - -namespace RandomX { - - LightClientAsyncWorker::LightClientAsyncWorker(const Cache& c) : ILightClientAsyncWorker(c), output(nullptr), hasWork(false), -#ifdef TRACE - sw(true), -#endif - workerThread(&LightClientAsyncWorker::runWorker, this) { - - } - - void LightClientAsyncWorker::prepareBlock(addr_t addr) { -#ifdef TRACE - std::cout << sw.getElapsed() << ": prepareBlock-enter " << addr / CacheLineSize << std::endl; -#endif - { - std::lock_guard lk(mutex); - startBlock = addr / CacheLineSize; - blockCount = 1; - output = currentLine.data(); - hasWork = true; - } -#ifdef TRACE - std::cout << sw.getElapsed() << ": prepareBlock-notify " << startBlock << "/" << blockCount << std::endl; -#endif - notifier.notify_one(); - } - - const uint64_t* LightClientAsyncWorker::getBlock(addr_t addr) { -#ifdef TRACE - std::cout << sw.getElapsed() << ": getBlock-enter " << addr / CacheLineSize << std::endl; -#endif - uint32_t currentBlock = addr / CacheLineSize; - if (currentBlock != startBlock || output != currentLine.data()) { - initBlock(cache, (uint8_t*)currentLine.data(), currentBlock, RANDOMX_CACHE_ACCESSES / 8); - } - else { - sync(); - } -#ifdef TRACE - std::cout << sw.getElapsed() << ": getBlock-return " << addr / CacheLineSize << std::endl; -#endif - return currentLine.data(); - } - - void LightClientAsyncWorker::prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { -#ifdef TRACE - std::cout << sw.getElapsed() << ": prepareBlocks-enter " << startBlock << "/" << blockCount << std::endl; -#endif - { - std::lock_guard lk(mutex); - this->startBlock = startBlock; - this->blockCount = blockCount; - output = out; - hasWork = true; - notifier.notify_one(); - } - } - - void LightClientAsyncWorker::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { - for (uint32_t i = 0; i < blockCount; ++i) { - initBlock(cache, (uint8_t*)out + CacheLineSize * i, startBlock + i, 
RANDOMX_CACHE_ACCESSES / 8); - } - } - - void LightClientAsyncWorker::sync() { - std::unique_lock lk(mutex); - notifier.wait(lk, [this] { return !hasWork; }); - } - - void LightClientAsyncWorker::runWorker() { -#ifdef TRACE - std::cout << sw.getElapsed() << ": runWorker-enter " << std::endl; -#endif - for (;;) { - std::unique_lock lk(mutex); - notifier.wait(lk, [this] { return hasWork; }); -#ifdef TRACE - std::cout << sw.getElapsed() << ": runWorker-getBlocks " << startBlock << "/" << blockCount << std::endl; -#endif - //getBlocks(output, startBlock, blockCount); - initBlock(cache, (uint8_t*)output, startBlock, RANDOMX_CACHE_ACCESSES / 8); - hasWork = false; -#ifdef TRACE - std::cout << sw.getElapsed() << ": runWorker-finished " << startBlock << "/" << blockCount << std::endl; -#endif - lk.unlock(); - notifier.notify_one(); - } - } -} \ No newline at end of file diff --git a/src/LightClientAsyncWorker.hpp b/src/LightClientAsyncWorker.hpp deleted file mode 100644 index 7c45e53..0000000 --- a/src/LightClientAsyncWorker.hpp +++ /dev/null @@ -1,57 +0,0 @@ -/* -Copyright (c) 2019 tevador - -This file is part of RandomX. - -RandomX is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -RandomX is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with RandomX. If not, see. 
-*/ - -//#define TRACE -#include "common.hpp" - -#include -#include -#include -#include -#ifdef TRACE -#include "Stopwatch.hpp" -#include -#endif - -namespace RandomX { - - using DatasetLine = std::array; - - class LightClientAsyncWorker : public ILightClientAsyncWorker { - public: - LightClientAsyncWorker(const Cache&); - void prepareBlock(addr_t) final; - void prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) final; - const uint64_t* getBlock(addr_t) final; - void getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) final; - void sync() final; - private: - void runWorker(); - std::condition_variable notifier; - std::mutex mutex; - alignas(16) DatasetLine currentLine; - void* output; - uint32_t startBlock, blockCount; - bool hasWork; -#ifdef TRACE - Stopwatch sw; -#endif - std::thread workerThread; - }; -} \ No newline at end of file diff --git a/src/LightProgramGenerator.hpp b/src/LightProgramGenerator.hpp deleted file mode 100644 index beb7974..0000000 --- a/src/LightProgramGenerator.hpp +++ /dev/null @@ -1,58 +0,0 @@ -/* -Copyright (c) 2019 tevador - -This file is part of RandomX. - -RandomX is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -RandomX is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with RandomX. If not, see. 
-*/ - -#include "Program.hpp" - -namespace RandomX { - - // Intel Ivy Bridge reference - namespace SuperscalarInstructionType { //uOPs (decode) execution ports latency code size - constexpr int ISUB_R = 0; //1 p015 1 3 - constexpr int IXOR_R = 1; //1 p015 1 3 - constexpr int IADD_RS = 2; //1 p01 1 4 - constexpr int IMUL_R = 3; //1 p1 3 4 - constexpr int IROR_C = 4; //1 p05 1 4 - constexpr int IADD_C7 = 5; //1 p015 1 7 - constexpr int IXOR_C7 = 6; //1 p015 1 7 - constexpr int IADD_C8 = 7; //1+0 p015 1 8 - constexpr int IXOR_C8 = 8; //1+0 p015 1 8 - constexpr int IADD_C9 = 9; //1+0 p015 1 9 - constexpr int IXOR_C9 = 10; //1+0 p015 1 9 - constexpr int IMULH_R = 11; //1+2+1 0+(p1,p5)+0 3 3+3+3 - constexpr int ISMULH_R = 12; //1+2+1 0+(p1,p5)+0 3 3+3+3 - constexpr int IMUL_RCP = 13; //1+1 p015+p1 4 10+4 - - constexpr int COUNT = 14; - constexpr int INVALID = -1; - } - - class Blake2Generator { - public: - Blake2Generator(const void* seed, int nonce); - uint8_t getByte(); - uint32_t getInt32(); - private: - uint8_t data[64]; - size_t dataIndex; - - void checkData(const size_t); - }; - - double generateSuperscalar(LightProgram& prog, Blake2Generator& gen); -} \ No newline at end of file diff --git a/src/Program.hpp b/src/Program.hpp index 37c8303..2f2a402 100644 --- a/src/Program.hpp +++ b/src/Program.hpp @@ -53,12 +53,14 @@ namespace RandomX { Instruction programBuffer[RANDOMX_PROGRAM_SIZE]; }; - class LightProgram { + static_assert(sizeof(Program) % 64 == 0, "Invalid size of class Program"); + + class SuperscalarProgram { public: Instruction& operator()(int pc) { return programBuffer[pc]; } - friend std::ostream& operator<<(std::ostream& os, const LightProgram& p) { + friend std::ostream& operator<<(std::ostream& os, const SuperscalarProgram& p) { p.print(os); return os; } @@ -74,6 +76,15 @@ namespace RandomX { void setAddressRegister(uint32_t val) { addrReg = val; } + double ipc; + int codeSize; + int macroOps; + int decodeCycles; + int cpuLatency; + int asicLatency; + 
int mulCount; + int cpuLatencies[8]; + int asicLatencies[8]; private: void print(std::ostream& os) const { for (unsigned i = 0; i < size; ++i) { @@ -85,6 +96,4 @@ namespace RandomX { uint32_t size; int addrReg; }; - - static_assert(sizeof(Program) % 64 == 0, "Invalid size of class Program"); } diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp index 1edacdb..7352933 100644 --- a/src/VirtualMachine.hpp +++ b/src/VirtualMachine.hpp @@ -28,7 +28,7 @@ namespace RandomX { public: VirtualMachine(); virtual ~VirtualMachine() {} - virtual void setDataset(dataset_t ds, uint64_t size, LightProgram (&programs)[RANDOMX_CACHE_ACCESSES]) = 0; + virtual void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram (&programs)[RANDOMX_CACHE_ACCESSES]) = 0; void setScratchpad(void* ptr) { scratchpad = (uint8_t*)ptr; } diff --git a/src/main.cpp b/src/main.cpp index a120cf9..42dc15f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -36,7 +36,7 @@ along with RandomX. If not, see. #include "dataset.hpp" #include "Cache.hpp" #include "hashAes1Rx4.hpp" -#include "LightProgramGenerator.hpp" +#include "superscalarGenerator.hpp" #include "JitCompilerX86.hpp" const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; @@ -226,13 +226,13 @@ int main(int argc, char** argv) { readOption("--legacy", argc, argv, legacy); if (genSuperscalar) { - RandomX::LightProgram p; + RandomX::SuperscalarProgram p; RandomX::Blake2Generator gen(seed, programCount); RandomX::generateSuperscalar(p, gen); RandomX::AssemblyGeneratorX86 asmX86; asmX86.generateAsm(p); //std::ofstream file("lightProg2.asm"); - //asmX86.printCode(std::cout); + asmX86.printCode(std::cout); return 0; } @@ -268,7 +268,7 @@ int main(int argc, char** argv) { const uint64_t cacheSize = (RANDOMX_ARGON_MEMORY + RANDOMX_ARGON_GROWTH * epoch) * RandomX::ArgonBlockSize; const uint64_t datasetSize = (RANDOMX_DATASET_SIZE 
+ RANDOMX_DS_GROWTH * epoch); dataset.cache.size = cacheSize; - RandomX::LightProgram programs[RANDOMX_CACHE_ACCESSES]; + RandomX::SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES]; std::cout << "RandomX - " << (miningMode ? "mining" : "verification") << " mode" << std::endl; diff --git a/src/LightProgramGenerator.cpp b/src/superscalarGenerator.cpp similarity index 76% rename from src/LightProgramGenerator.cpp rename to src/superscalarGenerator.cpp index 40a767b..d4fd32a 100644 --- a/src/LightProgramGenerator.cpp +++ b/src/superscalarGenerator.cpp @@ -18,7 +18,6 @@ along with RandomX. If not, see. */ #include -#include "blake2/blake2.h" #include "configuration.h" #include "Program.hpp" #include "blake2/endian.h" @@ -27,7 +26,7 @@ along with RandomX. If not, see. #include #include #include -#include "LightProgramGenerator.hpp" +#include "superscalarGenerator.hpp" namespace RandomX { @@ -35,6 +34,7 @@ namespace RandomX { return type == SuperscalarInstructionType::IMUL_R || type == SuperscalarInstructionType::IMULH_R || type == SuperscalarInstructionType::ISMULH_R || type == SuperscalarInstructionType::IMUL_RCP; } + //uOPs (micro-ops) are represented only by the execution port they can go to namespace ExecutionPort { using type = int; constexpr type Null = 0; @@ -46,40 +46,9 @@ namespace RandomX { constexpr type P015 = P0 | P1 | P5; } - Blake2Generator::Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) { - memset(data, 0, sizeof(data)); - memcpy(data, seed, SeedSize); - store32(&data[60], nonce); - } - - uint8_t Blake2Generator::getByte() { - checkData(1); - return data[dataIndex++]; - } - - uint32_t Blake2Generator::getInt32() { - checkData(4); - auto ret = load32(&data[dataIndex]); - dataIndex += 4; - return ret; - } - - void Blake2Generator::checkData(const size_t bytesNeeded) { - if (dataIndex + bytesNeeded > sizeof(data)) { - blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); - dataIndex = 0; - } - } - - class RegisterInfo { - 
public: - RegisterInfo() : latency(0), lastOpGroup(-1), lastOpPar(-1), value(0) {} - int latency; - int lastOpGroup; - int lastOpPar; - int value; - }; - + //Macro-operation as output of the x86 decoder + //Usually one macro-op = one x86 instruction, but 2 instructions are sometimes fused into 1 macro-op + //Macro-op can consist of 1 or 2 uOPs. class MacroOp { public: MacroOp(const char* name, int size) @@ -137,10 +106,7 @@ namespace RandomX { int latency_; ExecutionPort::type uop1_; ExecutionPort::type uop2_; - int cycle_; bool dependent_ = false; - MacroOp* depDst_ = nullptr; - MacroOp* depSrc_ = nullptr; }; //Size: 3 bytes @@ -174,7 +140,7 @@ namespace RandomX { const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr }; const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) }; - class LightInstructionInfo { + class SuperscalarInstructionInfo { public: const char* getName() const { return name_; @@ -203,21 +169,21 @@ namespace RandomX { int getSrcOp() const { return srcOp_; } - static const LightInstructionInfo ISUB_R; - static const LightInstructionInfo IXOR_R; - static const LightInstructionInfo IADD_RS; - static const LightInstructionInfo IMUL_R; - static const LightInstructionInfo IROR_C; - static const LightInstructionInfo IADD_C7; - static const LightInstructionInfo IXOR_C7; - static const LightInstructionInfo IADD_C8; - static const LightInstructionInfo IXOR_C8; - static const LightInstructionInfo IADD_C9; - static const LightInstructionInfo IXOR_C9; - static const LightInstructionInfo IMULH_R; - static const LightInstructionInfo ISMULH_R; - static const LightInstructionInfo IMUL_RCP; - static const LightInstructionInfo NOP; + static const SuperscalarInstructionInfo ISUB_R; + static const SuperscalarInstructionInfo IXOR_R; + static const SuperscalarInstructionInfo IADD_RS; + static const SuperscalarInstructionInfo IMUL_R; + static const SuperscalarInstructionInfo IROR_C; + static const 
SuperscalarInstructionInfo IADD_C7; + static const SuperscalarInstructionInfo IXOR_C7; + static const SuperscalarInstructionInfo IADD_C8; + static const SuperscalarInstructionInfo IXOR_C8; + static const SuperscalarInstructionInfo IADD_C9; + static const SuperscalarInstructionInfo IXOR_C9; + static const SuperscalarInstructionInfo IMULH_R; + static const SuperscalarInstructionInfo ISMULH_R; + static const SuperscalarInstructionInfo IMUL_RCP; + static const SuperscalarInstructionInfo NOP; private: const char* name_; int type_; @@ -227,14 +193,14 @@ namespace RandomX { int dstOp_ = 0; int srcOp_; - LightInstructionInfo(const char* name) + SuperscalarInstructionInfo(const char* name) : name_(name), type_(-1), latency_(0) {} - LightInstructionInfo(const char* name, int type, const MacroOp& op, int srcOp) + SuperscalarInstructionInfo(const char* name, int type, const MacroOp& op, int srcOp) : name_(name), type_(type), latency_(op.getLatency()), srcOp_(srcOp) { ops_.push_back(MacroOp(op)); } template - LightInstructionInfo(const char* name, int type, const MacroOp(&arr)[N], int resultOp, int dstOp, int srcOp) + SuperscalarInstructionInfo(const char* name, int type, const MacroOp(&arr)[N], int resultOp, int dstOp, int srcOp) : name_(name), type_(type), latency_(0), resultOp_(resultOp), dstOp_(dstOp), srcOp_(srcOp) { for (unsigned i = 0; i < N; ++i) { ops_.push_back(MacroOp(arr[i])); @@ -244,24 +210,34 @@ namespace RandomX { } }; - const LightInstructionInfo LightInstructionInfo::ISUB_R = LightInstructionInfo("ISUB_R", SuperscalarInstructionType::ISUB_R, MacroOp::Sub_rr, 0); - const LightInstructionInfo LightInstructionInfo::IXOR_R = LightInstructionInfo("IXOR_R", SuperscalarInstructionType::IXOR_R, MacroOp::Xor_rr, 0); - const LightInstructionInfo LightInstructionInfo::IADD_RS = LightInstructionInfo("IADD_RS", SuperscalarInstructionType::IADD_RS, MacroOp::Lea_sib, 0); - const LightInstructionInfo LightInstructionInfo::IMUL_R = LightInstructionInfo("IMUL_R", 
SuperscalarInstructionType::IMUL_R, MacroOp::Imul_rr, 0); - const LightInstructionInfo LightInstructionInfo::IROR_C = LightInstructionInfo("IROR_C", SuperscalarInstructionType::IROR_C, MacroOp::Ror_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISUB_R = SuperscalarInstructionInfo("ISUB_R", SuperscalarInstructionType::ISUB_R, MacroOp::Sub_rr, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_R = SuperscalarInstructionInfo("IXOR_R", SuperscalarInstructionType::IXOR_R, MacroOp::Xor_rr, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_RS = SuperscalarInstructionInfo("IADD_RS", SuperscalarInstructionType::IADD_RS, MacroOp::Lea_sib, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_R = SuperscalarInstructionInfo("IMUL_R", SuperscalarInstructionType::IMUL_R, MacroOp::Imul_rr, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IROR_C = SuperscalarInstructionInfo("IROR_C", SuperscalarInstructionType::IROR_C, MacroOp::Ror_ri, -1); - const LightInstructionInfo LightInstructionInfo::IADD_C7 = LightInstructionInfo("IADD_C7", SuperscalarInstructionType::IADD_C7, MacroOp::Add_ri, -1); - const LightInstructionInfo LightInstructionInfo::IXOR_C7 = LightInstructionInfo("IXOR_C7", SuperscalarInstructionType::IXOR_C7, MacroOp::Xor_ri, -1); - const LightInstructionInfo LightInstructionInfo::IADD_C8 = LightInstructionInfo("IADD_C8", SuperscalarInstructionType::IADD_C8, MacroOp::Add_ri, -1); - const LightInstructionInfo LightInstructionInfo::IXOR_C8 = LightInstructionInfo("IXOR_C8", SuperscalarInstructionType::IXOR_C8, MacroOp::Xor_ri, -1); - const LightInstructionInfo LightInstructionInfo::IADD_C9 = LightInstructionInfo("IADD_C9", SuperscalarInstructionType::IADD_C9, MacroOp::Add_ri, -1); - const LightInstructionInfo LightInstructionInfo::IXOR_C9 = LightInstructionInfo("IXOR_C9", SuperscalarInstructionType::IXOR_C9, MacroOp::Xor_ri, -1); + const SuperscalarInstructionInfo 
SuperscalarInstructionInfo::IADD_C7 = SuperscalarInstructionInfo("IADD_C7", SuperscalarInstructionType::IADD_C7, MacroOp::Add_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C7 = SuperscalarInstructionInfo("IXOR_C7", SuperscalarInstructionType::IXOR_C7, MacroOp::Xor_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C8 = SuperscalarInstructionInfo("IADD_C8", SuperscalarInstructionType::IADD_C8, MacroOp::Add_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C8 = SuperscalarInstructionInfo("IXOR_C8", SuperscalarInstructionType::IXOR_C8, MacroOp::Xor_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C9 = SuperscalarInstructionInfo("IADD_C9", SuperscalarInstructionType::IADD_C9, MacroOp::Add_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C9 = SuperscalarInstructionInfo("IXOR_C9", SuperscalarInstructionType::IXOR_C9, MacroOp::Xor_ri, -1); - const LightInstructionInfo LightInstructionInfo::IMULH_R = LightInstructionInfo("IMULH_R", SuperscalarInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1); - const LightInstructionInfo LightInstructionInfo::ISMULH_R = LightInstructionInfo("ISMULH_R", SuperscalarInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1); - const LightInstructionInfo LightInstructionInfo::IMUL_RCP = LightInstructionInfo("IMUL_RCP", SuperscalarInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMULH_R = SuperscalarInstructionInfo("IMULH_R", SuperscalarInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISMULH_R = SuperscalarInstructionInfo("ISMULH_R", SuperscalarInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_RCP = SuperscalarInstructionInfo("IMUL_RCP", SuperscalarInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 
1, 1, -1); - const LightInstructionInfo LightInstructionInfo::NOP = LightInstructionInfo("NOP"); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::NOP = SuperscalarInstructionInfo("NOP"); + + //these are some of the options how to split a 16-byte window into 3 or 4 x86 instructions. + //RandomX uses instructions with a native size of 3 (sub, xor, mul, mov), 4 (lea, mul), 7 (xor, add immediate) or 10 bytes (mov 64-bit immediate). + //Slots with sizes of 8 or 9 bytes need to be padded with a nop instruction. + const int buffer0[] = { 4, 8, 4 }; + const int buffer1[] = { 7, 3, 3, 3 }; + const int buffer2[] = { 3, 7, 3, 3 }; + const int buffer3[] = { 4, 9, 3 }; + const int buffer4[] = { 4, 4, 4, 4 }; + const int buffer5[] = { 3, 3, 10 }; class DecoderBuffer { public: @@ -318,16 +294,6 @@ namespace RandomX { } }; - //these are some of the options how to split a 16-byte window into 3 or 4 x86 instructions. - //RandomX uses instructions with a native size of 3 (sub, xor, mul, mov), 4 (lea, mul), 7 (xor, add immediate) or 10 bytes (mov 64-bit immediate). - //Slots with sizes of 8 or 9 bytes need to be padded with a nop instruction. 
- const int buffer0[] = { 4, 8, 4 }; - const int buffer1[] = { 7, 3, 3, 3 }; - const int buffer2[] = { 3, 7, 3, 3 }; - const int buffer3[] = { 4, 9, 3 }; - const int buffer4[] = { 4, 4, 4, 4 }; - const int buffer5[] = { 3, 3, 10 }; - const DecoderBuffer DecoderBuffer::decodeBuffer484 = DecoderBuffer("4,8,4", 0, buffer0); const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer("7,3,3,3", 1, buffer1); const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 2, buffer2); @@ -344,13 +310,13 @@ namespace RandomX { const DecoderBuffer DecoderBuffer::Default = DecoderBuffer(); - const LightInstructionInfo* slot_3[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R }; - const LightInstructionInfo* slot_3L[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R }; - const LightInstructionInfo* slot_4[] = { &LightInstructionInfo::IROR_C, &LightInstructionInfo::IADD_RS }; - const LightInstructionInfo* slot_7[] = { &LightInstructionInfo::IXOR_C7, &LightInstructionInfo::IADD_C7 }; - const LightInstructionInfo* slot_8[] = { &LightInstructionInfo::IXOR_C8, &LightInstructionInfo::IADD_C8 }; - const LightInstructionInfo* slot_9[] = { &LightInstructionInfo::IXOR_C9, &LightInstructionInfo::IADD_C9 }; - const LightInstructionInfo* slot_10 = &LightInstructionInfo::IMUL_RCP; + const SuperscalarInstructionInfo* slot_3[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R }; + const SuperscalarInstructionInfo* slot_3L[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R, &SuperscalarInstructionInfo::IMULH_R, &SuperscalarInstructionInfo::ISMULH_R }; + const SuperscalarInstructionInfo* slot_4[] = { &SuperscalarInstructionInfo::IROR_C, &SuperscalarInstructionInfo::IADD_RS }; + const SuperscalarInstructionInfo* slot_7[] = { &SuperscalarInstructionInfo::IXOR_C7, &SuperscalarInstructionInfo::IADD_C7 }; + const 
SuperscalarInstructionInfo* slot_8[] = { &SuperscalarInstructionInfo::IXOR_C8, &SuperscalarInstructionInfo::IADD_C8 }; + const SuperscalarInstructionInfo* slot_9[] = { &SuperscalarInstructionInfo::IXOR_C9, &SuperscalarInstructionInfo::IADD_C9 }; + const SuperscalarInstructionInfo* slot_10 = &SuperscalarInstructionInfo::IMUL_RCP; static bool selectRegister(std::vector& availableRegisters, Blake2Generator& gen, int& reg) { int index; @@ -367,9 +333,19 @@ namespace RandomX { return true; } - class LightInstruction { + class RegisterInfo { public: - void toInstr(Instruction& instr) { + RegisterInfo() : latency(0), lastOpGroup(-1), lastOpPar(-1), value(0) {} + int latency; + int lastOpGroup; + int lastOpPar; + int value; + }; + + //"SuperscalarInstruction" consists of one or more macro-ops + class SuperscalarInstruction { + public: + void toInstr(Instruction& instr) { //translate to a RandomX instruction format instr.opcode = getType(); instr.dst = dst_; instr.src = src_ >= 0 ? src_ : dst_; @@ -392,7 +368,7 @@ namespace RandomX { case 4: //if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions if (fetchType == 4 && !isLast) { - create(&LightInstructionInfo::IMUL_R, gen); + create(&SuperscalarInstructionInfo::IMUL_R, gen); } else { create(slot_4[gen.getByte() & 1], gen); @@ -415,7 +391,7 @@ namespace RandomX { } } - void create(const LightInstructionInfo* info, Blake2Generator& gen) { + void create(const SuperscalarInstructionInfo* info, Blake2Generator& gen) { info_ = info; reset(); switch (info->getType()) @@ -445,7 +421,7 @@ namespace RandomX { mod_ = 0; imm32_ = 0; opGroup_ = SuperscalarInstructionType::IMUL_R; - opGroupPar_ = -1; + groupParIsSource_ = true; } break; case SuperscalarInstructionType::IROR_C: { @@ -505,18 +481,22 @@ namespace RandomX { } } - bool selectDestination(int cycle, RegisterInfo (®isters)[8], Blake2Generator& gen) { + bool selectDestination(int cycle, bool allowChainedMul, RegisterInfo (®isters)[8], Blake2Generator& 
gen) { + /*if (allowChainedMultiplication && opGroup_ == SuperscalarInstructionType::IMUL_R) + std::cout << "Selecting destination with chained MUL enabled" << std::endl;*/ std::vector availableRegisters; //Conditions for the destination register: // * value must be ready at the required cycle // * cannot be the same as the source register unless the instruction allows it // - this avoids optimizable instructions such as "xor r, r" or "sub r, r" + // * register cannot be multiplied twice in a row unless allowChainedMul is true + // - this avoids accumulation of trailing zeroes in registers due to excessive multiplication + // - allowChainedMul is set to true if an attempt to find source/destination registers failed (this is quite rare, but prevents a catastrophic failure of the generator) // * either the last instruction applied to the register or its source must be different than this instruction // - this avoids optimizable instruction sequences such as "xor r1, r2; xor r1, r2" or "ror r, C1; ror r, C2" or "add r, C1; add r, C2" - // - it also avoids accumulation of trailing zeroes in registers due to excessive multiplication // * register r5 cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction) for (unsigned i = 0; i < 8; ++i) { - if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != LimitedAddressRegister)) + if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (allowChainedMul || opGroup_ != SuperscalarInstructionType::IMUL_R || registers[i].lastOpGroup != SuperscalarInstructionType::IMUL_R) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != LimitedAddressRegister)) availableRegisters.push_back(i); } return selectRegister(availableRegisters, gen, dst_); @@ 
-560,14 +540,14 @@ namespace RandomX { return opGroupPar_; } - const LightInstructionInfo& getInfo() const { + const SuperscalarInstructionInfo& getInfo() const { return *info_; } - static const LightInstruction Null; + static const SuperscalarInstruction Null; private: - const LightInstructionInfo* info_; + const SuperscalarInstructionInfo* info_; int src_ = -1; int dst_ = -1; int mod_; @@ -582,15 +562,16 @@ namespace RandomX { canReuse_ = groupParIsSource_ = false; } - LightInstruction(const LightInstructionInfo* info) : info_(info) { + SuperscalarInstruction(const SuperscalarInstructionInfo* info) : info_(info) { } }; - const LightInstruction LightInstruction::Null = LightInstruction(&LightInstructionInfo::NOP); + const SuperscalarInstruction SuperscalarInstruction::Null = SuperscalarInstruction(&SuperscalarInstructionInfo::NOP); - constexpr int CYCLE_MAP_SIZE = RANDOMX_SUPERSCALAR_LATENCY + 3; + constexpr int CYCLE_MAP_SIZE = RANDOMX_SUPERSCALAR_LATENCY + 4; constexpr int LOOK_FORWARD_CYCLES = 4; constexpr int MAX_THROWAWAY_COUNT = 256; + #ifndef _DEBUG constexpr bool TRACE = false; constexpr bool INFO = false; @@ -602,7 +583,7 @@ namespace RandomX { template static int scheduleUop(ExecutionPort::type uop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle) { //The scheduling here is done optimistically by checking port availability in order P5 -> P0 -> P1 to not overload - //P1 (multiplication port) by instructions that can go to any port. + //port P1 (multiplication) by instructions that can go to any port. 
for (; cycle < CYCLE_MAP_SIZE; ++cycle) { if ((uop & ExecutionPort::P5) != 0 && !portBusy[cycle][2]) { if (commit) { @@ -666,14 +647,14 @@ namespace RandomX { return -1; } - double generateSuperscalar(LightProgram& prog, Blake2Generator& gen) { + void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen) { ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3]; memset(portBusy, 0, sizeof(portBusy)); RegisterInfo registers[8]; const DecoderBuffer* decodeBuffer = &DecoderBuffer::Default; - LightInstruction currentInstruction = LightInstruction::Null; + SuperscalarInstruction currentInstruction = SuperscalarInstruction::Null; int macroOpIndex = 0; int codeSize = 0; int macroOpCount = 0; @@ -719,7 +700,9 @@ namespace RandomX { int scheduleCycle = scheduleMop(mop, portBusy, cycle, depCycle); if (scheduleCycle < 0) { /*if (TRACE)*/ std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << cycle << ")" << std::endl; - return 0; + //__debugbreak(); + portsSaturated = true; + break; } //find a source register (if applicable) that will be ready when this instruction executes @@ -737,20 +720,20 @@ namespace RandomX { throwAwayCount++; macroOpIndex = currentInstruction.getInfo().getSize(); if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + //cycle = topCycle; continue; } //abort this decode buffer - /*if (TRACE)*/ std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - source registers not available" << std::endl; - currentInstruction = LightInstruction::Null; + /*if (TRACE)*/ std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - source registers not available for operation " << currentInstruction.getInfo().getName() << std::endl; + currentInstruction = SuperscalarInstruction::Null; break; } if (TRACE) std::cout << "; src = r" << currentInstruction.getSource() << std::endl; } - throwAwayCount = 
0; //find a destination register that will be ready when this instruction executes if (macroOpIndex == currentInstruction.getInfo().getDstOp()) { int forward; - for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectDestination(scheduleCycle, registers, gen); ++forward) { + for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectDestination(scheduleCycle, throwAwayCount > 0, registers, gen); ++forward) { if (TRACE) std::cout << "; dst STALL at cycle " << cycle << std::endl; ++scheduleCycle; ++cycle; @@ -760,16 +743,18 @@ namespace RandomX { throwAwayCount++; macroOpIndex = currentInstruction.getInfo().getSize(); if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + //cycle = topCycle; continue; } //abort this decode buffer /*if (TRACE)*/ std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - destination registers not available" << std::endl; - currentInstruction = LightInstruction::Null; + currentInstruction = SuperscalarInstruction::Null; break; } if (TRACE) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; } throwAwayCount = 0; + //recalculate when the instruction can be scheduled for execution based on operand availability scheduleCycle = scheduleMop(mop, portBusy, scheduleCycle, scheduleCycle); @@ -809,67 +794,53 @@ namespace RandomX { ++cycle; } - if(INFO) std::cout << "; ALU port utilization:" << std::endl; - if (INFO) std::cout << "; (* = in use, _ = idle)" << std::endl; - - int portCycles = 0; - for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { - //std::cout << "; " << std::setw(3) << i << " "; - for (int j = 0; j < 3; ++j) { - //std::cout << (portBusy[i][j] ? 
'*' : '_'); - portCycles += !!portBusy[i][j]; - } - //std::cout << std::endl; - } - double ipc = (macroOpCount / (double)retireCycle); - if (INFO) std::cout << "; code size " << codeSize << " bytes" << std::endl; - if (INFO) std::cout << "; x86 macro-ops: " << macroOpCount << std::endl; - if (INFO) std::cout << "; fetch cycles: " << decodeCycle << std::endl; - if (INFO) std::cout << "; RandomX instructions: " << programSize << std::endl; - if (INFO) std::cout << "; Execution time: " << retireCycle << " cycles" << std::endl; - if (INFO) std::cout << "; IPC = " << ipc << std::endl; - if (INFO) std::cout << "; Port-cycles: " << portCycles << std::endl; - if (INFO) std::cout << "; Multiplications: " << mulCount << std::endl; - - int asicLatency[8]; - memset(asicLatency, 0, sizeof(asicLatency)); + memset(prog.asicLatencies, 0, sizeof(prog.asicLatencies)); //Calculate ASIC latency: //Assumes 1 cycle latency for all operations and unlimited parallelization. for (int i = 0; i < programSize; ++i) { Instruction& instr = prog(i); - int latDst = asicLatency[instr.dst] + 1; - int latSrc = instr.dst != instr.src ? asicLatency[instr.src] + 1 : 0; - asicLatency[instr.dst] = std::max(latDst, latSrc); + int latDst = prog.asicLatencies[instr.dst] + 1; + int latSrc = instr.dst != instr.src ? 
prog.asicLatencies[instr.src] + 1 : 0; + prog.asicLatencies[instr.dst] = std::max(latDst, latSrc); } //address register is the register with the highest ASIC latency int asicLatencyMax = 0; int addressReg = 0; for (int i = 0; i < 8; ++i) { - if (asicLatency[i] > asicLatencyMax) { - asicLatencyMax = asicLatency[i]; + if (prog.asicLatencies[i] > asicLatencyMax) { + asicLatencyMax = prog.asicLatencies[i]; addressReg = i; } - } - - if (INFO) std::cout << "; ASIC latency: " << asicLatencyMax << std::endl; - - if (INFO) { - std::cout << "; ASIC latency:" << std::endl; - for (int i = 0; i < 8; ++i) { - std::cout << "; r" << i << " = " << asicLatency[i] << std::endl; - } - if (INFO) std::cout << "; CPU latency:" << std::endl; - for (int i = 0; i < 8; ++i) { - std::cout << "; r" << i << " = " << registers[i].latency << std::endl; - } + prog.cpuLatencies[i] = registers[i].latency; } prog.setSize(programSize); prog.setAddressRegister(addressReg); - return ipc; + + prog.cpuLatency = retireCycle; + prog.asicLatency = asicLatencyMax; + prog.codeSize = codeSize; + prog.macroOps = macroOpCount; + prog.decodeCycles = decodeCycle; + prog.ipc = ipc; + prog.mulCount = mulCount; + + + /*if(INFO) std::cout << "; ALU port utilization:" << std::endl; + if (INFO) std::cout << "; (* = in use, _ = idle)" << std::endl; + + int portCycles = 0; + for (int i = 0; i < CYCLE_MAP_SIZE; ++i) { + std::cout << "; " << std::setw(3) << i << " "; + for (int j = 0; j < 3; ++j) { + std::cout << (portBusy[i][j] ? '*' : '_'); + portCycles += !!portBusy[i][j]; + } + std::cout << std::endl; + }*/ } } \ No newline at end of file diff --git a/src/superscalarGenerator.hpp b/src/superscalarGenerator.hpp new file mode 100644 index 0000000..a64e80d --- /dev/null +++ b/src/superscalarGenerator.hpp @@ -0,0 +1,47 @@ +/* +Copyright (c) 2019 tevador + +This file is part of RandomX. 
+ +RandomX is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX. If not, see. +*/ + +#pragma once +#include "Program.hpp" +#include "Blake2Generator.hpp" + +namespace RandomX { + // Intel Ivy Bridge reference + namespace SuperscalarInstructionType { //uOPs (decode) execution ports latency code size + constexpr int ISUB_R = 0; //1 p015 1 3 (sub) + constexpr int IXOR_R = 1; //1 p015 1 3 (xor) + constexpr int IADD_RS = 2; //1 p01 1 4 (lea) + constexpr int IMUL_R = 3; //1 p1 3 4 (imul) + constexpr int IROR_C = 4; //1 p05 1 4 (ror) + constexpr int IADD_C7 = 5; //1 p015 1 7 (add) + constexpr int IXOR_C7 = 6; //1 p015 1 7 (xor) + constexpr int IADD_C8 = 7; //1+0 p015 1 7+1 (add+nop) + constexpr int IXOR_C8 = 8; //1+0 p015 1 7+1 (xor+nop) + constexpr int IADD_C9 = 9; //1+0 p015 1 7+2 (add+nop) + constexpr int IXOR_C9 = 10; //1+0 p015 1 7+2 (xor+nop) + constexpr int IMULH_R = 11; //1+2+1 0+(p1,p5)+0 3 3+3+3 (mov+mul+mov) + constexpr int ISMULH_R = 12; //1+2+1 0+(p1,p5)+0 3 3+3+3 (mov+imul+mov) + constexpr int IMUL_RCP = 13; //1+1 p015+p1 4 10+4 (mov+imul) + + constexpr int COUNT = 14; + constexpr int INVALID = -1; + } + + void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen); +} \ No newline at end of file diff --git a/src/tests/superscalar-avalanche.cpp b/src/tests/superscalar-avalanche.cpp index 9c91a88..9fa1613 100644 --- a/src/tests/superscalar-avalanche.cpp +++ b/src/tests/superscalar-avalanche.cpp @@ -20,9 +20,10 @@ along with RandomX. If not, see. 
#include #include #include -#include "../LightProgramGenerator.hpp" +#include "../superscalarGenerator.hpp" #include "../InterpretedVirtualMachine.hpp" #include "../intrinPortable.h" +#include "../Blake2Generator.hpp" const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; @@ -45,9 +46,9 @@ int main() { uint64_t rb[8]; memcpy(rb, ra, sizeof rb); rb[0] ^= (1ULL << bit); - RandomX::LightProgram p; + RandomX::SuperscalarProgram p; RandomX::Blake2Generator gen(seed, i); - RandomX::generateLightProg2(p, gen); + RandomX::generateSuperscalar(p, gen); RandomX::InterpretedVirtualMachine::executeSuperscalar(ra, p, dummy); RandomX::InterpretedVirtualMachine::executeSuperscalar(rb, p, dummy); uint64_t diff = 0; diff --git a/src/tests/superscalar-init.cpp b/src/tests/superscalar-init.cpp index b366355..a7c1208 100644 --- a/src/tests/superscalar-init.cpp +++ b/src/tests/superscalar-init.cpp @@ -21,7 +21,7 @@ along with RandomX. If not, see. 
#include #include #include -#include "../LightProgramGenerator.hpp" +#include "../superscalarGenerator.hpp" #include "../InterpretedVirtualMachine.hpp" #include "../intrinPortable.h" #include "../configuration.h" diff --git a/vcxproj/randomx.vcxproj b/vcxproj/randomx.vcxproj index 1c1cae0..d646143 100644 --- a/vcxproj/randomx.vcxproj +++ b/vcxproj/randomx.vcxproj @@ -127,6 +127,7 @@ + @@ -137,8 +138,7 @@ - - + @@ -153,6 +153,7 @@ + @@ -167,8 +168,7 @@ - - + diff --git a/vcxproj/randomx.vcxproj.filters b/vcxproj/randomx.vcxproj.filters index 5b821c8..77939bd 100644 --- a/vcxproj/randomx.vcxproj.filters +++ b/vcxproj/randomx.vcxproj.filters @@ -54,12 +54,6 @@ Source Files - - Source Files - - - Source Files - Source Files @@ -75,6 +69,12 @@ Source Files + + Source Files + + + Source Files + @@ -136,12 +136,6 @@ Header Files - - Header Files - - - Header Files - Header Files @@ -166,5 +160,11 @@ Header Files + + Header Files + + + Header Files + \ No newline at end of file diff --git a/vcxproj/superscalar-avalanche.vcxproj b/vcxproj/superscalar-avalanche.vcxproj index dab0311..1cac62b 100644 --- a/vcxproj/superscalar-avalanche.vcxproj +++ b/vcxproj/superscalar-avalanche.vcxproj @@ -118,6 +118,7 @@ + @@ -125,9 +126,9 @@ - + diff --git a/vcxproj/superscalar-avalanche.vcxproj.filters b/vcxproj/superscalar-avalanche.vcxproj.filters index 9984ed1..93b3838 100644 --- a/vcxproj/superscalar-avalanche.vcxproj.filters +++ b/vcxproj/superscalar-avalanche.vcxproj.filters @@ -45,9 +45,6 @@ Source Files - - Source Files - Source Files @@ -60,6 +57,12 @@ Source Files + + Source Files + + + Source Files + diff --git a/vcxproj/superscalar-init.vcxproj b/vcxproj/superscalar-init.vcxproj index 4c4794c..d765f85 100644 --- a/vcxproj/superscalar-init.vcxproj +++ b/vcxproj/superscalar-init.vcxproj @@ -118,6 +118,7 @@ + @@ -125,9 +126,9 @@ - + diff --git a/vcxproj/superscalar-init.vcxproj.filters b/vcxproj/superscalar-init.vcxproj.filters index 4666d07..cad6e2b 100644 --- 
a/vcxproj/superscalar-init.vcxproj.filters +++ b/vcxproj/superscalar-init.vcxproj.filters @@ -42,9 +42,6 @@ Source Files - - Source Files - Source Files @@ -60,6 +57,12 @@ Source Files + + Source Files + + + Source Files +