From 8c37d4aac36b6a2c0a6dadfc0220dd2f4fe8dfc8 Mon Sep 17 00:00:00 2001
From: tevador <tevador@gmail.com>
Date: Fri, 12 Apr 2019 19:36:08 +0200
Subject: [PATCH] More refactoring

---
 src/AssemblyGeneratorX86.cpp                  |   6 +-
 src/AssemblyGeneratorX86.hpp                  |   6 +-
 src/Blake2Generator.cpp                       |  51 ++++
 src/Blake2Generator.hpp                       |  36 +++
 src/CompiledLightVirtualMachine.cpp           |   6 +-
 src/CompiledLightVirtualMachine.hpp           |   2 +-
 src/CompiledVirtualMachine.cpp                |   2 +-
 src/CompiledVirtualMachine.hpp                |   2 +-
 src/InterpretedVirtualMachine.cpp             |  15 +-
 src/InterpretedVirtualMachine.hpp             |   8 +-
 src/JitCompilerX86.cpp                        |  12 +-
 src/JitCompilerX86.hpp                        |   4 +-
 src/LightClientAsyncWorker.cpp                | 113 -------
 src/LightClientAsyncWorker.hpp                |  57 ----
 src/LightProgramGenerator.hpp                 |  58 ----
 src/Program.hpp                               |  17 +-
 src/VirtualMachine.hpp                        |   2 +-
 src/main.cpp                                  |   8 +-
 ...Generator.cpp => superscalarGenerator.cpp} | 283 ++++++++----------
 src/superscalarGenerator.hpp                  |  47 +++
 src/tests/superscalar-avalanche.cpp           |   7 +-
 src/tests/superscalar-init.cpp                |   2 +-
 vcxproj/randomx.vcxproj                       |   8 +-
 vcxproj/randomx.vcxproj.filters               |  24 +-
 vcxproj/superscalar-avalanche.vcxproj         |   3 +-
 vcxproj/superscalar-avalanche.vcxproj.filters |   9 +-
 vcxproj/superscalar-init.vcxproj              |   3 +-
 vcxproj/superscalar-init.vcxproj.filters      |   9 +-
 28 files changed, 347 insertions(+), 453 deletions(-)
 create mode 100644 src/Blake2Generator.cpp
 create mode 100644 src/Blake2Generator.hpp
 delete mode 100644 src/LightClientAsyncWorker.cpp
 delete mode 100644 src/LightClientAsyncWorker.hpp
 delete mode 100644 src/LightProgramGenerator.hpp
 rename src/{LightProgramGenerator.cpp => superscalarGenerator.cpp} (76%)
 create mode 100644 src/superscalarGenerator.hpp

diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp
index c4e009c..b3511c1 100644
--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@@ -23,7 +23,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "common.hpp"
 #include "reciprocal.h"
 #include "Program.hpp"
-#include "./LightProgramGenerator.hpp"
+#include "superscalarGenerator.hpp"
 
 namespace RandomX {
 
@@ -62,7 +62,7 @@ namespace RandomX {
 		}
 	}
 
-	void AssemblyGeneratorX86::generateAsm(LightProgram& prog) {
+	void AssemblyGeneratorX86::generateAsm(SuperscalarProgram& prog) {
 		asmCode.str(std::string()); //clear
 		asmCode << "ALIGN 16" << std::endl;
 		for (unsigned i = 0; i < prog.getSize(); ++i) {
@@ -126,7 +126,7 @@ namespace RandomX {
 		}
 	}
 
-	void AssemblyGeneratorX86::generateC(LightProgram& prog) {
+	void AssemblyGeneratorX86::generateC(SuperscalarProgram& prog) {
 		asmCode.str(std::string()); //clear
 		asmCode << "#include <stdint.h>" << std::endl;
 		asmCode << "#if defined(__SIZEOF_INT128__)" << std::endl;
diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp
index 8688cd4..4b777e6 100644
--- a/src/AssemblyGeneratorX86.hpp
+++ b/src/AssemblyGeneratorX86.hpp
@@ -27,7 +27,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 namespace RandomX {
 
 	class Program;
-	class LightProgram;
+	class SuperscalarProgram;
 	class AssemblyGeneratorX86;
 
 	typedef void(AssemblyGeneratorX86::*InstructionGenerator)(Instruction&, int);
@@ -35,8 +35,8 @@ namespace RandomX {
 	class AssemblyGeneratorX86 {
 	public:
 		void generateProgram(Program& prog);
-		void generateAsm(LightProgram& prog);
-		void generateC(LightProgram& prog);
+		void generateAsm(SuperscalarProgram& prog);
+		void generateC(SuperscalarProgram& prog);
 		void printCode(std::ostream& os) {
 			os << asmCode.rdbuf();
 		}
diff --git a/src/Blake2Generator.cpp b/src/Blake2Generator.cpp
new file mode 100644
index 0000000..2879088
--- /dev/null
+++ b/src/Blake2Generator.cpp
@@ -0,0 +1,51 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#include "blake2/blake2.h"
+#include "blake2/endian.h"
+#include "Blake2Generator.hpp"
+#include "common.hpp"
+
+namespace RandomX {
+
+	Blake2Generator::Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) { // Builds the initial 64-byte state from seed and nonce; dataIndex starts at sizeof(data), so the first getByte()/getInt32() call rehashes the state before returning anything
+		memset(data, 0, sizeof(data)); // zero the whole buffer so bytes not written below are 0. NOTE(review): no <cstring> include is visible in this file; memset/memcpy presumably arrive through one of the included headers - confirm
+		memcpy(data, seed, SeedSize); // seed must point to at least SeedSize readable bytes (SeedSize is declared outside this file, presumably in common.hpp)
+		store32(&data[60], nonce); // store32 (presumably the 32-bit store helper from blake2/endian.h) writes nonce starting at byte offset 60. NOTE(review): assumes SeedSize <= 60, otherwise seed bytes are overwritten - confirm
+	}
+
+	uint8_t Blake2Generator::getByte() { // Returns the next byte of the generator stream
+		checkData(1); // rehashes data and resets dataIndex when no unread byte is left
+		return data[dataIndex++]; // consume exactly one byte
+	}
+
+	uint32_t Blake2Generator::getInt32() { // Returns the next 4 bytes of the generator stream as one 32-bit value
+		checkData(4); // if fewer than 4 unread bytes remain, they are discarded and the buffer is rehashed
+		auto ret = load32(&data[dataIndex]); // load32 is not defined in this file (presumably the 32-bit load helper from blake2/endian.h)
+		dataIndex += 4; // consume the 4 bytes just read
+		return ret;
+	}
+
+	void Blake2Generator::checkData(const size_t bytesNeeded) { // Ensures at least bytesNeeded unread bytes are available in data before a read
+		if (dataIndex + bytesNeeded > sizeof(data)) { // not enough unread bytes left in the current buffer
+			blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); // data is passed as both the first and the third argument, so the 64-byte buffer is replaced by a hash of itself (nullptr, 0 is presumably "no key" - confirm against blake2.h)
+			dataIndex = 0; // restart reading at the beginning; any unread tail bytes of the old buffer are dropped
+		}
+	}
+}
\ No newline at end of file
diff --git a/src/Blake2Generator.hpp b/src/Blake2Generator.hpp
new file mode 100644
index 0000000..24f2fca
--- /dev/null
+++ b/src/Blake2Generator.hpp
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+#include <cstdint>
+
+namespace RandomX {
+
+	class Blake2Generator { // Produces a stream of bytes / 32-bit values from a 64-byte buffer that is rehashed with blake2b whenever it runs out (implementation in Blake2Generator.cpp)
+	public:
+		Blake2Generator(const void* seed, int nonce); // seed: SeedSize bytes are copied to the start of the buffer; nonce is stored at byte offset 60
+		uint8_t getByte(); // returns the next byte of the stream
+		uint32_t getInt32(); // returns the next 4 bytes of the stream, decoded via load32
+	private:
+		uint8_t data[64]; // current block of generator state/output
+		size_t dataIndex; // offset of the next unread byte in data; sizeof(data) means the block is exhausted. NOTE(review): size_t is used but only <cstdint> is included here - presumably relies on the including file; confirm
+
+		void checkData(const size_t); // rehashes data and resets dataIndex when fewer than the requested number of bytes remain
+	};
+}
\ No newline at end of file
diff --git a/src/CompiledLightVirtualMachine.cpp b/src/CompiledLightVirtualMachine.cpp
index 760842a..11bedf8 100644
--- a/src/CompiledLightVirtualMachine.cpp
+++ b/src/CompiledLightVirtualMachine.cpp
@@ -24,7 +24,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 namespace RandomX {
 
 	template<bool superscalar>
-	void CompiledLightVirtualMachine<superscalar>::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) {
+	void CompiledLightVirtualMachine<superscalar>::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) {
 		mem.ds = ds;
 		datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize;
 		if(superscalar)
@@ -32,8 +32,8 @@ namespace RandomX {
 		//datasetBasePtr = ds.dataset.memory;
 	}
 
-	template void CompiledLightVirtualMachine<true>::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
-	template void CompiledLightVirtualMachine<false>::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
+	template void CompiledLightVirtualMachine<true>::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
+	template void CompiledLightVirtualMachine<false>::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
 
 	template<bool superscalar>
 	void CompiledLightVirtualMachine<superscalar>::initialize() {
diff --git a/src/CompiledLightVirtualMachine.hpp b/src/CompiledLightVirtualMachine.hpp
index 9493c58..1d4b78e 100644
--- a/src/CompiledLightVirtualMachine.hpp
+++ b/src/CompiledLightVirtualMachine.hpp
@@ -39,7 +39,7 @@ namespace RandomX {
 			_mm_free(ptr);
 		}
 		CompiledLightVirtualMachine() {}
-		void setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override;
+		void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override;
 		void initialize() override;
 	};
 }
\ No newline at end of file
diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp
index 4984938..3e44476 100644
--- a/src/CompiledVirtualMachine.cpp
+++ b/src/CompiledVirtualMachine.cpp
@@ -29,7 +29,7 @@ namespace RandomX {
 	CompiledVirtualMachine::CompiledVirtualMachine() {
 	}
 
-	void CompiledVirtualMachine::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) {
+	void CompiledVirtualMachine::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) {
 		mem.ds = ds;
 		datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize;
 		datasetBasePtr = ds.dataset.memory;
diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp
index 65b1885..a2866ca 100644
--- a/src/CompiledVirtualMachine.hpp
+++ b/src/CompiledVirtualMachine.hpp
@@ -42,7 +42,7 @@ namespace RandomX {
 			_mm_free(ptr);
 		}
 		CompiledVirtualMachine();
-		void setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override;
+		void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override;
 		void initialize() override;
 		virtual void execute() override;
 		void* getProgram() {
diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp
index 673fecf..132a2c9 100644
--- a/src/InterpretedVirtualMachine.cpp
+++ b/src/InterpretedVirtualMachine.cpp
@@ -22,7 +22,6 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "InterpretedVirtualMachine.hpp"
 #include "dataset.hpp"
 #include "Cache.hpp"
-#include "LightClientAsyncWorker.hpp"
 #include <iostream>
 #include <iomanip>
 #include <stdexcept>
@@ -36,7 +35,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #ifdef STATS
 #include <algorithm>
 #endif
-#include "LightProgramGenerator.hpp"
+#include "superscalarGenerator.hpp"
 
 #ifdef FPUCHECK
 constexpr bool fpuCheck = true;
@@ -47,7 +46,7 @@ constexpr bool fpuCheck = false;
 namespace RandomX {
 
 	template<bool superscalar>
-	void InterpretedVirtualMachine<superscalar>::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) {
+	void InterpretedVirtualMachine<superscalar>::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) {
 		mem.ds = ds;
 		readDataset = &datasetReadLight;
 		datasetRange = (size - RANDOMX_DATASET_SIZE + CacheLineSize) / CacheLineSize;
@@ -55,8 +54,8 @@ namespace RandomX {
 			precompileSuperscalar(programs);
 	}
 
-	template void InterpretedVirtualMachine<true>::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
-	template void InterpretedVirtualMachine<false>::setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
+	template void InterpretedVirtualMachine<true>::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
+	template void InterpretedVirtualMachine<false>::setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
 
 	template<bool superscalar>
 	void InterpretedVirtualMachine<superscalar>::initialize() {
@@ -475,7 +474,7 @@ namespace RandomX {
 	}
 
 	template<bool superscalar>
-	void InterpretedVirtualMachine<superscalar>::executeSuperscalar(int_reg_t(&r)[8], LightProgram& prog, std::vector<uint64_t>& reciprocals) {
+	void InterpretedVirtualMachine<superscalar>::executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector<uint64_t>& reciprocals) {
 		for (unsigned j = 0; j < prog.getSize(); ++j) {
 			Instruction& instr = prog(j);
 			switch (instr.opcode)
@@ -539,7 +538,7 @@ namespace RandomX {
 		Cache& cache = mem.ds.cache;
 		for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
 			mixBlock = getMixBlock(registerValue, cache);
-			LightProgram& prog = superScalarPrograms[i];
+			SuperscalarProgram& prog = superScalarPrograms[i];
 			
 			executeSuperscalar(rl, prog, reciprocals);
 
@@ -554,7 +553,7 @@ namespace RandomX {
 	}
 
 	template<bool superscalar>
-	void InterpretedVirtualMachine<superscalar>::precompileSuperscalar(LightProgram* programs) {
+	void InterpretedVirtualMachine<superscalar>::precompileSuperscalar(SuperscalarProgram* programs) {
 		memcpy(superScalarPrograms, programs, sizeof(superScalarPrograms));
 		reciprocals.clear();
 		for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp
index ddefa67..3632112 100644
--- a/src/InterpretedVirtualMachine.hpp
+++ b/src/InterpretedVirtualMachine.hpp
@@ -70,17 +70,17 @@ namespace RandomX {
 		}
 		InterpretedVirtualMachine(bool soft) : softAes(soft) {}
 		~InterpretedVirtualMachine() {}
-		void setDataset(dataset_t ds, uint64_t size, LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override;
+		void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]) override;
 		void initialize() override;
 		void execute() override;
-		static void executeSuperscalar(int_reg_t(&r)[8], LightProgram& prog, std::vector<uint64_t>& reciprocals);
+		static void executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector<uint64_t>& reciprocals);
 	private:
 		static InstructionHandler<superscalar> engine[256];
 		DatasetReadFunc readDataset;
 		bool softAes;
 		InstructionByteCode byteCode[RANDOMX_PROGRAM_SIZE];
 		std::vector<uint64_t> reciprocals;
-		alignas(64) LightProgram superScalarPrograms[RANDOMX_CACHE_ACCESSES];
+		alignas(64) SuperscalarProgram superScalarPrograms[RANDOMX_CACHE_ACCESSES];
 #ifdef STATS
 		int count_ADD_64 = 0;
 		int count_ADD_32 = 0;
@@ -128,7 +128,7 @@ namespace RandomX {
 		int datasetAccess[256] = { 0 };
 #endif
 		void precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]);
-		void precompileSuperscalar(LightProgram*);
+		void precompileSuperscalar(SuperscalarProgram*);
 		void executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]);
 		void executeBytecode(int& i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]);
 		void executeSuperscalar(uint32_t blockNumber, int_reg_t(&r)[8]);
diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp
index 8e15e15..ad7c85a 100644
--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
@@ -87,7 +87,7 @@ namespace RandomX {
 	*/
 
 #include "JitCompilerX86-static.hpp"
-#include "LightProgramGenerator.hpp"
+#include "superscalarGenerator.hpp"
 
 #define NOP_TEST true
 
@@ -261,16 +261,16 @@ namespace RandomX {
 	template void JitCompilerX86::generateProgramLight<false>(Program& prog);
 
 	template<size_t N>
-	void JitCompilerX86::generateSuperScalarHash(LightProgram(&programs)[N]) {
+	void JitCompilerX86::generateSuperScalarHash(SuperscalarProgram(&programs)[N]) {
 		memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize);
 		codePos = superScalarHashOffset + codeSshInitSize;
 		for (unsigned j = 0; j < N; ++j) {
-			LightProgram& prog = programs[j];
+			SuperscalarProgram& prog = programs[j];
 			for (unsigned i = 0; i < prog.getSize(); ++i) {
 				Instruction& instr = prog(i);
 				instr.src %= RegistersCount;
 				instr.dst %= RegistersCount;
-				generateCode<LightProgram>(instr, i);
+				generateCode<SuperscalarProgram>(instr, i);
 			}
 			emit(codeShhLoad, codeSshLoadSize);
 			if (j < N - 1) {
@@ -290,7 +290,7 @@ namespace RandomX {
 	}
 
 	template
-	void JitCompilerX86::generateSuperScalarHash(LightProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
+	void JitCompilerX86::generateSuperScalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES]);
 
 	void JitCompilerX86::generateDatasetInitCode() {
 		memcpy(code, codeDatasetInit, datasetInitSize);
@@ -345,7 +345,7 @@ namespace RandomX {
 	}
 
 	template<>
-	void JitCompilerX86::generateCode<LightProgram>(Instruction& instr, int i) {
+	void JitCompilerX86::generateCode<SuperscalarProgram>(Instruction& instr, int i) {
 		switch (instr.opcode)
 		{
 		case RandomX::SuperscalarInstructionType::ISUB_R:
diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp
index 9240cfe..2908b04 100644
--- a/src/JitCompilerX86.hpp
+++ b/src/JitCompilerX86.hpp
@@ -27,7 +27,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 namespace RandomX {
 
 	class Program;
-	class LightProgram;
+	class SuperscalarProgram;
 	class JitCompilerX86;
 
 	typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int);
@@ -42,7 +42,7 @@ namespace RandomX {
 		template<bool superscalar>
 		void generateProgramLight(Program&);
 		template<size_t N>
-		void generateSuperScalarHash(LightProgram (&programs)[N]);
+		void generateSuperScalarHash(SuperscalarProgram (&programs)[N]);
 		ProgramFunc getProgramFunc() {
 			return (ProgramFunc)code;
 		}
diff --git a/src/LightClientAsyncWorker.cpp b/src/LightClientAsyncWorker.cpp
deleted file mode 100644
index fbba713..0000000
--- a/src/LightClientAsyncWorker.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
-Copyright (c) 2019 tevador
-
-This file is part of RandomX.
-
-RandomX is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-RandomX is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
-*/
-
-#include "LightClientAsyncWorker.hpp"
-#include "dataset.hpp"
-#include "Cache.hpp"
-
-namespace RandomX {
-
-	LightClientAsyncWorker::LightClientAsyncWorker(const Cache& c) : ILightClientAsyncWorker(c), output(nullptr), hasWork(false), 
-#ifdef TRACE
-		sw(true),
-#endif
-		workerThread(&LightClientAsyncWorker::runWorker, this) {
-
-	}
-
-	void LightClientAsyncWorker::prepareBlock(addr_t addr) {
-#ifdef TRACE
-		std::cout << sw.getElapsed() << ": prepareBlock-enter " << addr / CacheLineSize << std::endl;
-#endif
-		{
-			std::lock_guard<std::mutex> lk(mutex);
-			startBlock = addr / CacheLineSize;
-			blockCount = 1;
-			output = currentLine.data();
-			hasWork = true;
-		}
-#ifdef TRACE
-		std::cout << sw.getElapsed() << ": prepareBlock-notify " << startBlock << "/" << blockCount << std::endl;
-#endif
-		notifier.notify_one();
-	}
-
-	const uint64_t* LightClientAsyncWorker::getBlock(addr_t addr) {
-#ifdef TRACE
-		std::cout << sw.getElapsed() << ": getBlock-enter " << addr / CacheLineSize << std::endl;
-#endif
-		uint32_t currentBlock = addr / CacheLineSize;
-		if (currentBlock != startBlock || output != currentLine.data()) {
-			initBlock(cache, (uint8_t*)currentLine.data(), currentBlock, RANDOMX_CACHE_ACCESSES / 8);
-		}
-		else {
-			sync();
-		}
-#ifdef TRACE
-		std::cout << sw.getElapsed() << ": getBlock-return " << addr / CacheLineSize << std::endl;
-#endif
-		return currentLine.data();
-	}
-
-	void LightClientAsyncWorker::prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) {
-#ifdef TRACE
-		std::cout << sw.getElapsed() << ": prepareBlocks-enter " << startBlock << "/" << blockCount << std::endl;
-#endif
-		{
-			std::lock_guard<std::mutex> lk(mutex);
-			this->startBlock = startBlock;
-			this->blockCount = blockCount;
-			output = out;
-			hasWork = true;
-			notifier.notify_one();
-		}
-	}
-
-	void LightClientAsyncWorker::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) {
-		for (uint32_t i = 0; i < blockCount; ++i) {
-			initBlock(cache, (uint8_t*)out + CacheLineSize * i, startBlock + i, RANDOMX_CACHE_ACCESSES / 8);
-		}
-	}
-
-	void LightClientAsyncWorker::sync() {
-		std::unique_lock<std::mutex> lk(mutex);
-		notifier.wait(lk, [this] { return !hasWork; });
-	}
-
-	void LightClientAsyncWorker::runWorker() {
-#ifdef TRACE
-		std::cout << sw.getElapsed() << ": runWorker-enter " << std::endl;
-#endif
-		for (;;) {
-			std::unique_lock<std::mutex> lk(mutex);
-			notifier.wait(lk, [this] { return hasWork; });
-#ifdef TRACE
-			std::cout << sw.getElapsed() << ": runWorker-getBlocks " << startBlock << "/" << blockCount << std::endl;
-#endif
-			//getBlocks(output, startBlock, blockCount);
-			initBlock(cache, (uint8_t*)output, startBlock, RANDOMX_CACHE_ACCESSES / 8);
-			hasWork = false;
-#ifdef TRACE
-			std::cout << sw.getElapsed() << ": runWorker-finished " << startBlock << "/" << blockCount << std::endl;
-#endif
-			lk.unlock();
-			notifier.notify_one();
-		}
-	}
-}
\ No newline at end of file
diff --git a/src/LightClientAsyncWorker.hpp b/src/LightClientAsyncWorker.hpp
deleted file mode 100644
index 7c45e53..0000000
--- a/src/LightClientAsyncWorker.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
-Copyright (c) 2019 tevador
-
-This file is part of RandomX.
-
-RandomX is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-RandomX is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
-*/
-
-//#define TRACE
-#include "common.hpp"
-
-#include <thread>
-#include <mutex>
-#include <condition_variable>
-#include <array>
-#ifdef TRACE
-#include "Stopwatch.hpp"
-#include <iostream>
-#endif
-
-namespace RandomX {
-
-	using DatasetLine = std::array<uint64_t, CacheLineSize / sizeof(uint64_t)>;
-
-	class LightClientAsyncWorker : public ILightClientAsyncWorker {
-	public:
-		LightClientAsyncWorker(const Cache&);
-		void prepareBlock(addr_t) final;
-		void prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) final;
-		const uint64_t* getBlock(addr_t) final;
-		void getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) final;
-		void sync() final;
-	private:
-		void runWorker();
-		std::condition_variable notifier;
-		std::mutex mutex;
-		alignas(16) DatasetLine currentLine;
-		void* output;
-		uint32_t startBlock, blockCount;
-		bool hasWork;
-#ifdef TRACE
-		Stopwatch sw;
-#endif
-		std::thread workerThread;
-	};
-}
\ No newline at end of file
diff --git a/src/LightProgramGenerator.hpp b/src/LightProgramGenerator.hpp
deleted file mode 100644
index beb7974..0000000
--- a/src/LightProgramGenerator.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
-Copyright (c) 2019 tevador
-
-This file is part of RandomX.
-
-RandomX is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-RandomX is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
-*/
-
-#include "Program.hpp"
-
-namespace RandomX {
-
-	//                             Intel Ivy Bridge reference
-	namespace SuperscalarInstructionType {        //uOPs (decode)   execution ports         latency       code size
-		constexpr int ISUB_R = 0;           //1               p015                    1               3
-		constexpr int IXOR_R = 1;           //1               p015                    1               3
-		constexpr int IADD_RS = 2;          //1               p01                     1               4
-		constexpr int IMUL_R = 3;           //1               p1                      3               4
-		constexpr int IROR_C = 4;           //1               p05                     1               4
-		constexpr int IADD_C7 = 5;          //1               p015                    1               7
-		constexpr int IXOR_C7 = 6;          //1               p015                    1               7
-		constexpr int IADD_C8 = 7;          //1+0             p015                    1               8
-		constexpr int IXOR_C8 = 8;          //1+0             p015                    1               8
-		constexpr int IADD_C9 = 9;          //1+0             p015                    1               9
-		constexpr int IXOR_C9 = 10;         //1+0             p015                    1               9
-		constexpr int IMULH_R = 11;         //1+2+1           0+(p1,p5)+0             3               3+3+3
-		constexpr int ISMULH_R = 12;        //1+2+1           0+(p1,p5)+0             3               3+3+3
-		constexpr int IMUL_RCP = 13;        //1+1             p015+p1                 4              10+4
-
-		constexpr int COUNT = 14;
-		constexpr int INVALID = -1;
-	}
-
-	class Blake2Generator {
-	public:
-		Blake2Generator(const void* seed, int nonce);
-		uint8_t getByte();
-		uint32_t getInt32();
-	private:
-		uint8_t data[64];
-		size_t dataIndex;
-
-		void checkData(const size_t);
-	};
-
-	double generateSuperscalar(LightProgram& prog, Blake2Generator& gen);
-}
\ No newline at end of file
diff --git a/src/Program.hpp b/src/Program.hpp
index 37c8303..2f2a402 100644
--- a/src/Program.hpp
+++ b/src/Program.hpp
@@ -53,12 +53,14 @@ namespace RandomX {
 		Instruction programBuffer[RANDOMX_PROGRAM_SIZE];
 	};
 
-	class LightProgram {
+	static_assert(sizeof(Program) % 64 == 0, "Invalid size of class Program");
+
+	class SuperscalarProgram {
 	public:
 		Instruction& operator()(int pc) {
 			return programBuffer[pc];
 		}
-		friend std::ostream& operator<<(std::ostream& os, const LightProgram& p) {
+		friend std::ostream& operator<<(std::ostream& os, const SuperscalarProgram& p) {
 			p.print(os);
 			return os;
 		}
@@ -74,6 +76,15 @@ namespace RandomX {
 		void setAddressRegister(uint32_t val) {
 			addrReg = val;
 		}
+		double ipc;
+		int codeSize;
+		int macroOps;
+		int decodeCycles;
+		int cpuLatency;
+		int asicLatency;
+		int mulCount;
+		int cpuLatencies[8];
+		int asicLatencies[8];
 	private:
 		void print(std::ostream& os) const {
 			for (unsigned i = 0; i < size; ++i) {
@@ -85,6 +96,4 @@ namespace RandomX {
 		uint32_t size;
 		int addrReg;
 	};
-
-	static_assert(sizeof(Program) % 64 == 0, "Invalid size of class Program");
 }
diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp
index 1edacdb..7352933 100644
--- a/src/VirtualMachine.hpp
+++ b/src/VirtualMachine.hpp
@@ -28,7 +28,7 @@ namespace RandomX {
 	public:
 		VirtualMachine();
 		virtual ~VirtualMachine() {}
-		virtual void setDataset(dataset_t ds, uint64_t size, LightProgram (&programs)[RANDOMX_CACHE_ACCESSES]) = 0;
+		virtual void setDataset(dataset_t ds, uint64_t size, SuperscalarProgram (&programs)[RANDOMX_CACHE_ACCESSES]) = 0;
 		void setScratchpad(void* ptr) {
 			scratchpad = (uint8_t*)ptr;
 		}
diff --git a/src/main.cpp b/src/main.cpp
index a120cf9..42dc15f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -36,7 +36,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "dataset.hpp"
 #include "Cache.hpp"
 #include "hashAes1Rx4.hpp"
-#include "LightProgramGenerator.hpp"
+#include "superscalarGenerator.hpp"
 #include "JitCompilerX86.hpp"
 
 const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 };
@@ -226,13 +226,13 @@ int main(int argc, char** argv) {
 	readOption("--legacy", argc, argv, legacy);
 
 	if (genSuperscalar) {
-		RandomX::LightProgram p;
+		RandomX::SuperscalarProgram p;
 		RandomX::Blake2Generator gen(seed, programCount);
 		RandomX::generateSuperscalar(p, gen);
 		RandomX::AssemblyGeneratorX86 asmX86;
 		asmX86.generateAsm(p);
 		//std::ofstream file("lightProg2.asm");
-		//asmX86.printCode(std::cout);
+		asmX86.printCode(std::cout);
 		return 0;
 	}
 
@@ -268,7 +268,7 @@ int main(int argc, char** argv) {
 	const uint64_t cacheSize = (RANDOMX_ARGON_MEMORY + RANDOMX_ARGON_GROWTH * epoch) * RandomX::ArgonBlockSize;
 	const uint64_t datasetSize = (RANDOMX_DATASET_SIZE + RANDOMX_DS_GROWTH * epoch);
 	dataset.cache.size = cacheSize;
-	RandomX::LightProgram programs[RANDOMX_CACHE_ACCESSES];
+	RandomX::SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES];
 
 	std::cout << "RandomX - " << (miningMode ? "mining" : "verification") << " mode" << std::endl;
 
diff --git a/src/LightProgramGenerator.cpp b/src/superscalarGenerator.cpp
similarity index 76%
rename from src/LightProgramGenerator.cpp
rename to src/superscalarGenerator.cpp
index 40a767b..d4fd32a 100644
--- a/src/LightProgramGenerator.cpp
+++ b/src/superscalarGenerator.cpp
@@ -18,7 +18,6 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */
 
 #include <stddef.h>
-#include "blake2/blake2.h"
 #include "configuration.h"
 #include "Program.hpp"
 #include "blake2/endian.h"
@@ -27,7 +26,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <algorithm>
 #include <stdexcept>
 #include <iomanip>
-#include "LightProgramGenerator.hpp"
+#include "superscalarGenerator.hpp"
 
 namespace RandomX {
 
@@ -35,6 +34,7 @@ namespace RandomX {
 		return type == SuperscalarInstructionType::IMUL_R || type == SuperscalarInstructionType::IMULH_R || type == SuperscalarInstructionType::ISMULH_R || type == SuperscalarInstructionType::IMUL_RCP;
 	}
 
+	//uOPs (micro-ops) are represented only by the execution port they can go to
 	namespace ExecutionPort {
 		using type = int;
 		constexpr type Null = 0;
@@ -46,40 +46,9 @@ namespace RandomX {
 		constexpr type P015 = P0 | P1 | P5;
 	}
 
-	Blake2Generator::Blake2Generator(const void* seed, int nonce) : dataIndex(sizeof(data)) {
-		memset(data, 0, sizeof(data));
-		memcpy(data, seed, SeedSize);
-		store32(&data[60], nonce);
-	}
-
-	uint8_t Blake2Generator::getByte() {
-		checkData(1);
-		return data[dataIndex++];
-	}
-
-	uint32_t Blake2Generator::getInt32() {
-		checkData(4);
-		auto ret = load32(&data[dataIndex]);
-		dataIndex += 4;
-		return ret;
-	}
-
-	void Blake2Generator::checkData(const size_t bytesNeeded) {
-		if (dataIndex + bytesNeeded > sizeof(data))	{
-			blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0);
-			dataIndex = 0;
-		}
-	}
-
-	class RegisterInfo {
-	public:
-		RegisterInfo() : latency(0), lastOpGroup(-1), lastOpPar(-1), value(0) {}
-		int latency;
-		int lastOpGroup;
-		int lastOpPar;
-		int value;
-	};
-
+	//Macro-operation as output of the x86 decoder
+	//Usually one macro-op = one x86 instruction, but 2 instructions are sometimes fused into 1 macro-op
+	//Macro-op can consist of 1 or 2 uOPs.
 	class MacroOp {
 	public:
 		MacroOp(const char* name, int size)
@@ -137,10 +106,7 @@ namespace RandomX {
 		int latency_;
 		ExecutionPort::type uop1_;
 		ExecutionPort::type uop2_;
-		int cycle_;
 		bool dependent_ = false;
-		MacroOp* depDst_ = nullptr;
-		MacroOp* depSrc_ = nullptr;
 	};
 
 	//Size: 3 bytes
@@ -174,7 +140,7 @@ namespace RandomX {
 	const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr };
 	const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) };
 
-	class LightInstructionInfo {
+	class SuperscalarInstructionInfo {
 	public:
 		const char* getName() const {
 			return name_;
@@ -203,21 +169,21 @@ namespace RandomX {
 		int getSrcOp() const {
 			return srcOp_;
 		}
-		static const LightInstructionInfo ISUB_R;
-		static const LightInstructionInfo IXOR_R;
-		static const LightInstructionInfo IADD_RS;
-		static const LightInstructionInfo IMUL_R;
-		static const LightInstructionInfo IROR_C;
-		static const LightInstructionInfo IADD_C7;
-		static const LightInstructionInfo IXOR_C7;
-		static const LightInstructionInfo IADD_C8;
-		static const LightInstructionInfo IXOR_C8;
-		static const LightInstructionInfo IADD_C9;
-		static const LightInstructionInfo IXOR_C9;
-		static const LightInstructionInfo IMULH_R;
-		static const LightInstructionInfo ISMULH_R;
-		static const LightInstructionInfo IMUL_RCP;
-		static const LightInstructionInfo NOP;
+		static const SuperscalarInstructionInfo ISUB_R;
+		static const SuperscalarInstructionInfo IXOR_R;
+		static const SuperscalarInstructionInfo IADD_RS;
+		static const SuperscalarInstructionInfo IMUL_R;
+		static const SuperscalarInstructionInfo IROR_C;
+		static const SuperscalarInstructionInfo IADD_C7;
+		static const SuperscalarInstructionInfo IXOR_C7;
+		static const SuperscalarInstructionInfo IADD_C8;
+		static const SuperscalarInstructionInfo IXOR_C8;
+		static const SuperscalarInstructionInfo IADD_C9;
+		static const SuperscalarInstructionInfo IXOR_C9;
+		static const SuperscalarInstructionInfo IMULH_R;
+		static const SuperscalarInstructionInfo ISMULH_R;
+		static const SuperscalarInstructionInfo IMUL_RCP;
+		static const SuperscalarInstructionInfo NOP;
 	private:
 		const char* name_;
 		int type_;
@@ -227,14 +193,14 @@ namespace RandomX {
 		int dstOp_ = 0;
 		int srcOp_;
 
-		LightInstructionInfo(const char* name)
+		SuperscalarInstructionInfo(const char* name)
 			: name_(name), type_(-1), latency_(0) {}
-		LightInstructionInfo(const char* name, int type, const MacroOp& op, int srcOp)
+		SuperscalarInstructionInfo(const char* name, int type, const MacroOp& op, int srcOp)
 			: name_(name), type_(type), latency_(op.getLatency()), srcOp_(srcOp) {
 			ops_.push_back(MacroOp(op));
 		}
 		template <size_t N>
-		LightInstructionInfo(const char* name, int type, const MacroOp(&arr)[N], int resultOp, int dstOp, int srcOp)
+		SuperscalarInstructionInfo(const char* name, int type, const MacroOp(&arr)[N], int resultOp, int dstOp, int srcOp)
 			: name_(name), type_(type), latency_(0), resultOp_(resultOp), dstOp_(dstOp), srcOp_(srcOp) {
 			for (unsigned i = 0; i < N; ++i) {
 				ops_.push_back(MacroOp(arr[i]));
@@ -244,24 +210,34 @@ namespace RandomX {
 		}
 	};
 
-	const LightInstructionInfo LightInstructionInfo::ISUB_R = LightInstructionInfo("ISUB_R", SuperscalarInstructionType::ISUB_R, MacroOp::Sub_rr, 0);
-	const LightInstructionInfo LightInstructionInfo::IXOR_R = LightInstructionInfo("IXOR_R", SuperscalarInstructionType::IXOR_R, MacroOp::Xor_rr, 0);
-	const LightInstructionInfo LightInstructionInfo::IADD_RS = LightInstructionInfo("IADD_RS", SuperscalarInstructionType::IADD_RS, MacroOp::Lea_sib, 0);
-	const LightInstructionInfo LightInstructionInfo::IMUL_R = LightInstructionInfo("IMUL_R", SuperscalarInstructionType::IMUL_R, MacroOp::Imul_rr, 0);
-	const LightInstructionInfo LightInstructionInfo::IROR_C = LightInstructionInfo("IROR_C", SuperscalarInstructionType::IROR_C, MacroOp::Ror_ri, -1);
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISUB_R = SuperscalarInstructionInfo("ISUB_R", SuperscalarInstructionType::ISUB_R, MacroOp::Sub_rr, 0);
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_R = SuperscalarInstructionInfo("IXOR_R", SuperscalarInstructionType::IXOR_R, MacroOp::Xor_rr, 0);
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_RS = SuperscalarInstructionInfo("IADD_RS", SuperscalarInstructionType::IADD_RS, MacroOp::Lea_sib, 0);
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_R = SuperscalarInstructionInfo("IMUL_R", SuperscalarInstructionType::IMUL_R, MacroOp::Imul_rr, 0);
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IROR_C = SuperscalarInstructionInfo("IROR_C", SuperscalarInstructionType::IROR_C, MacroOp::Ror_ri, -1);
 
-	const LightInstructionInfo LightInstructionInfo::IADD_C7 = LightInstructionInfo("IADD_C7", SuperscalarInstructionType::IADD_C7, MacroOp::Add_ri, -1);
-	const LightInstructionInfo LightInstructionInfo::IXOR_C7 = LightInstructionInfo("IXOR_C7", SuperscalarInstructionType::IXOR_C7, MacroOp::Xor_ri, -1);
-	const LightInstructionInfo LightInstructionInfo::IADD_C8 = LightInstructionInfo("IADD_C8", SuperscalarInstructionType::IADD_C8, MacroOp::Add_ri, -1);
-	const LightInstructionInfo LightInstructionInfo::IXOR_C8 = LightInstructionInfo("IXOR_C8", SuperscalarInstructionType::IXOR_C8, MacroOp::Xor_ri, -1);
-	const LightInstructionInfo LightInstructionInfo::IADD_C9 = LightInstructionInfo("IADD_C9", SuperscalarInstructionType::IADD_C9, MacroOp::Add_ri, -1);
-	const LightInstructionInfo LightInstructionInfo::IXOR_C9 = LightInstructionInfo("IXOR_C9", SuperscalarInstructionType::IXOR_C9, MacroOp::Xor_ri, -1);
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C7 = SuperscalarInstructionInfo("IADD_C7", SuperscalarInstructionType::IADD_C7, MacroOp::Add_ri, -1);
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C7 = SuperscalarInstructionInfo("IXOR_C7", SuperscalarInstructionType::IXOR_C7, MacroOp::Xor_ri, -1);
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C8 = SuperscalarInstructionInfo("IADD_C8", SuperscalarInstructionType::IADD_C8, MacroOp::Add_ri, -1);
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C8 = SuperscalarInstructionInfo("IXOR_C8", SuperscalarInstructionType::IXOR_C8, MacroOp::Xor_ri, -1);
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C9 = SuperscalarInstructionInfo("IADD_C9", SuperscalarInstructionType::IADD_C9, MacroOp::Add_ri, -1);
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C9 = SuperscalarInstructionInfo("IXOR_C9", SuperscalarInstructionType::IXOR_C9, MacroOp::Xor_ri, -1);
 
-	const LightInstructionInfo LightInstructionInfo::IMULH_R = LightInstructionInfo("IMULH_R", SuperscalarInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1);
-	const LightInstructionInfo LightInstructionInfo::ISMULH_R = LightInstructionInfo("ISMULH_R", SuperscalarInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1);
-	const LightInstructionInfo LightInstructionInfo::IMUL_RCP = LightInstructionInfo("IMUL_RCP", SuperscalarInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1);
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMULH_R = SuperscalarInstructionInfo("IMULH_R", SuperscalarInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1);
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISMULH_R = SuperscalarInstructionInfo("ISMULH_R", SuperscalarInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1);
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_RCP = SuperscalarInstructionInfo("IMUL_RCP", SuperscalarInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1);
 	
-	const LightInstructionInfo LightInstructionInfo::NOP = LightInstructionInfo("NOP");
+	const SuperscalarInstructionInfo SuperscalarInstructionInfo::NOP = SuperscalarInstructionInfo("NOP");
+
+	//These are some of the ways to split a 16-byte window into 3 or 4 x86 instructions.
+	//RandomX uses instructions with a native size of 3 (sub, xor, mul, mov), 4 (lea, mul), 7 (xor, add immediate) or 10 bytes (mov 64-bit immediate).
+	//Slots with sizes of 8 or 9 bytes need to be padded with a nop instruction.
+	const int buffer0[] = { 4, 8, 4 };
+	const int buffer1[] = { 7, 3, 3, 3 };
+	const int buffer2[] = { 3, 7, 3, 3 };
+	const int buffer3[] = { 4, 9, 3 };
+	const int buffer4[] = { 4, 4, 4, 4 };
+	const int buffer5[] = { 3, 3, 10 };
 
 	class DecoderBuffer {
 	public:
@@ -318,16 +294,6 @@ namespace RandomX {
 		}
 	};
 
-	//these are some of the options how to split a 16-byte window into 3 or 4 x86 instructions.
-	//RandomX uses instructions with a native size of 3 (sub, xor, mul, mov), 4 (lea, mul), 7 (xor, add immediate) or 10 bytes (mov 64-bit immediate).
-	//Slots with sizes of 8 or 9 bytes need to be padded with a nop instruction.
-	const int buffer0[] = { 4, 8, 4 };
-	const int buffer1[] = { 7, 3, 3, 3 };
-	const int buffer2[] = { 3, 7, 3, 3 };
-	const int buffer3[] = { 4, 9, 3 };
-	const int buffer4[] = { 4, 4, 4, 4 };
-	const int buffer5[] = { 3, 3, 10 };
-
 	const DecoderBuffer DecoderBuffer::decodeBuffer484 = DecoderBuffer("4,8,4", 0, buffer0);
 	const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer("7,3,3,3", 1, buffer1);
 	const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 2, buffer2);
@@ -344,13 +310,13 @@ namespace RandomX {
 
 	const DecoderBuffer DecoderBuffer::Default = DecoderBuffer();
 
-	const LightInstructionInfo* slot_3[]  = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R };
-	const LightInstructionInfo* slot_3L[] = { &LightInstructionInfo::ISUB_R, &LightInstructionInfo::IXOR_R, &LightInstructionInfo::IMULH_R, &LightInstructionInfo::ISMULH_R };
-	const LightInstructionInfo* slot_4[]  = { &LightInstructionInfo::IROR_C, &LightInstructionInfo::IADD_RS };
-	const LightInstructionInfo* slot_7[]  = { &LightInstructionInfo::IXOR_C7, &LightInstructionInfo::IADD_C7 };
-	const LightInstructionInfo* slot_8[] = { &LightInstructionInfo::IXOR_C8, &LightInstructionInfo::IADD_C8 };
-	const LightInstructionInfo* slot_9[] = { &LightInstructionInfo::IXOR_C9, &LightInstructionInfo::IADD_C9 };
-	const LightInstructionInfo* slot_10   = &LightInstructionInfo::IMUL_RCP;
+	const SuperscalarInstructionInfo* slot_3[]  = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R };
+	const SuperscalarInstructionInfo* slot_3L[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R, &SuperscalarInstructionInfo::IMULH_R, &SuperscalarInstructionInfo::ISMULH_R };
+	const SuperscalarInstructionInfo* slot_4[]  = { &SuperscalarInstructionInfo::IROR_C, &SuperscalarInstructionInfo::IADD_RS };
+	const SuperscalarInstructionInfo* slot_7[]  = { &SuperscalarInstructionInfo::IXOR_C7, &SuperscalarInstructionInfo::IADD_C7 };
+	const SuperscalarInstructionInfo* slot_8[] = { &SuperscalarInstructionInfo::IXOR_C8, &SuperscalarInstructionInfo::IADD_C8 };
+	const SuperscalarInstructionInfo* slot_9[] = { &SuperscalarInstructionInfo::IXOR_C9, &SuperscalarInstructionInfo::IADD_C9 };
+	const SuperscalarInstructionInfo* slot_10   = &SuperscalarInstructionInfo::IMUL_RCP;
 
 	static bool selectRegister(std::vector<int>& availableRegisters, Blake2Generator& gen, int& reg) {
 		int index;
@@ -367,9 +333,19 @@ namespace RandomX {
 		return true;
 	}
 
-	class LightInstruction {
+	class RegisterInfo {
 	public:
-		void toInstr(Instruction& instr) {
+		RegisterInfo() : latency(0), lastOpGroup(-1), lastOpPar(-1), value(0) {}
+		int latency;
+		int lastOpGroup;
+		int lastOpPar;
+		int value;
+	};
+
+	//"SuperscalarInstruction" consists of one or more macro-ops
+	class SuperscalarInstruction {
+	public:
+		void toInstr(Instruction& instr) { //translate to a RandomX instruction format
 			instr.opcode = getType();
 			instr.dst = dst_;
 			instr.src = src_ >= 0 ? src_ : dst_;
@@ -392,7 +368,7 @@ namespace RandomX {
 			case 4:
 				//if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions
 				if (fetchType == 4 && !isLast) {
-					create(&LightInstructionInfo::IMUL_R, gen);
+					create(&SuperscalarInstructionInfo::IMUL_R, gen);
 				}
 				else {
 					create(slot_4[gen.getByte() & 1], gen);
@@ -415,7 +391,7 @@ namespace RandomX {
 			}
 		}
 
-		void create(const LightInstructionInfo* info, Blake2Generator& gen) {
+		void create(const SuperscalarInstructionInfo* info, Blake2Generator& gen) {
 			info_ = info;
 			reset();
 			switch (info->getType())
@@ -445,7 +421,7 @@ namespace RandomX {
 				mod_ = 0;
 				imm32_ = 0;
 				opGroup_ = SuperscalarInstructionType::IMUL_R;
-				opGroupPar_ = -1;
+				groupParIsSource_ = true;
 			} break;
 
 			case SuperscalarInstructionType::IROR_C: {
@@ -505,18 +481,22 @@ namespace RandomX {
 			}
 		}
 
-		bool selectDestination(int cycle, RegisterInfo (&registers)[8], Blake2Generator& gen) {
+		bool selectDestination(int cycle, bool allowChainedMul, RegisterInfo (&registers)[8], Blake2Generator& gen) {
+			/*if (allowChainedMul && opGroup_ == SuperscalarInstructionType::IMUL_R)
+				std::cout << "Selecting destination with chained MUL enabled" << std::endl;*/
 			std::vector<int> availableRegisters;
 			//Conditions for the destination register:
 			// * value must be ready at the required cycle
 			// * cannot be the same as the source register unless the instruction allows it
 			//   - this avoids optimizable instructions such as "xor r, r" or "sub r, r"
+			// * register cannot be multiplied twice in a row unless allowChainedMul is true
+			//   - this avoids accumulation of trailing zeroes in registers due to excessive multiplication
+			//   - the caller sets allowChainedMul to true after an attempt to find source/destination registers has failed (this is quite rare, but it prevents a catastrophic failure of the generator)
 			// * either the last instruction applied to the register or its source must be different than this instruction
 			//   - this avoids optimizable instruction sequences such as "xor r1, r2; xor r1, r2" or "ror r, C1; ror r, C2" or "add r, C1; add r, C2"
-			//   - it also avoids accumulation of trailing zeroes in registers due to excessive multiplication
 			// * register r5 cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction)
 			for (unsigned i = 0; i < 8; ++i) {
-				if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != LimitedAddressRegister))
+				if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (allowChainedMul || opGroup_ != SuperscalarInstructionType::IMUL_R || registers[i].lastOpGroup != SuperscalarInstructionType::IMUL_R) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != LimitedAddressRegister))
 					availableRegisters.push_back(i);
 			}
 			return selectRegister(availableRegisters, gen, dst_);
@@ -560,14 +540,14 @@ namespace RandomX {
 			return opGroupPar_;
 		}
 
-		const LightInstructionInfo& getInfo() const {
+		const SuperscalarInstructionInfo& getInfo() const {
 			return *info_;
 		}
 
-		static const LightInstruction Null;
+		static const SuperscalarInstruction Null;
 
 	private:
-		const LightInstructionInfo* info_;
+		const SuperscalarInstructionInfo* info_;
 		int src_ = -1;
 		int dst_ = -1;
 		int mod_;
@@ -582,15 +562,16 @@ namespace RandomX {
 			canReuse_ = groupParIsSource_ = false;
 		}
 
-		LightInstruction(const LightInstructionInfo* info) : info_(info) {
+		SuperscalarInstruction(const SuperscalarInstructionInfo* info) : info_(info) {
 		}
 	};
 
-	const LightInstruction LightInstruction::Null = LightInstruction(&LightInstructionInfo::NOP);
+	const SuperscalarInstruction SuperscalarInstruction::Null = SuperscalarInstruction(&SuperscalarInstructionInfo::NOP);
 
-	constexpr int CYCLE_MAP_SIZE = RANDOMX_SUPERSCALAR_LATENCY + 3;
+	constexpr int CYCLE_MAP_SIZE = RANDOMX_SUPERSCALAR_LATENCY + 4;
 	constexpr int LOOK_FORWARD_CYCLES = 4;
 	constexpr int MAX_THROWAWAY_COUNT = 256;
+
 #ifndef _DEBUG
 	constexpr bool TRACE = false;
 	constexpr bool INFO = false;
@@ -602,7 +583,7 @@ namespace RandomX {
 	template<bool commit>
 	static int scheduleUop(ExecutionPort::type uop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle) {
 		//The scheduling here is done optimistically by checking port availability in order P5 -> P0 -> P1 to not overload
-		//P1 (multiplication port) by instructions that can go to any port.
+		//port P1 (multiplication) by instructions that can go to any port.
 		for (; cycle < CYCLE_MAP_SIZE; ++cycle) {
 			if ((uop & ExecutionPort::P5) != 0 && !portBusy[cycle][2]) {
 				if (commit) {
@@ -666,14 +647,14 @@ namespace RandomX {
 		return -1;
 	}
 
-	double generateSuperscalar(LightProgram& prog, Blake2Generator& gen) {
+	void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen) {
 
 		ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3];
 		memset(portBusy, 0, sizeof(portBusy));
 		RegisterInfo registers[8];
 
 		const DecoderBuffer* decodeBuffer = &DecoderBuffer::Default;
-		LightInstruction currentInstruction = LightInstruction::Null;
+		SuperscalarInstruction currentInstruction = SuperscalarInstruction::Null;
 		int macroOpIndex = 0;
 		int codeSize = 0;
 		int macroOpCount = 0;
@@ -719,7 +700,9 @@ namespace RandomX {
 				int scheduleCycle = scheduleMop<false>(mop, portBusy, cycle, depCycle);
 				if (scheduleCycle < 0) {
 					/*if (TRACE)*/ std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << cycle << ")" << std::endl;
-					return 0;
+					//__debugbreak();
+					portsSaturated = true;
+					break;
 				}
 
 				//find a source register (if applicable) that will be ready when this instruction executes
@@ -737,20 +720,20 @@ namespace RandomX {
 							throwAwayCount++;
 							macroOpIndex = currentInstruction.getInfo().getSize();
 							if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl;
+							//cycle = topCycle;
 							continue;
 						}
 						//abort this decode buffer
-						/*if (TRACE)*/ std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - source registers not available" << std::endl;
-						currentInstruction = LightInstruction::Null;
+						/*if (TRACE)*/ std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - source registers not available for operation " << currentInstruction.getInfo().getName() << std::endl;
+						currentInstruction = SuperscalarInstruction::Null;
 						break;
 					}
 					if (TRACE) std::cout << "; src = r" << currentInstruction.getSource() << std::endl;
 				}
-				throwAwayCount = 0;
 				//find a destination register that will be ready when this instruction executes
 				if (macroOpIndex == currentInstruction.getInfo().getDstOp()) {
 					int forward;
-					for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectDestination(scheduleCycle, registers, gen); ++forward) {
+					for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectDestination(scheduleCycle, throwAwayCount > 0, registers, gen); ++forward) {
 						if (TRACE) std::cout << "; dst STALL at cycle " << cycle << std::endl;
 						++scheduleCycle;
 						++cycle;
@@ -760,16 +743,18 @@ namespace RandomX {
 							throwAwayCount++;
 							macroOpIndex = currentInstruction.getInfo().getSize();
 							if (TRACE) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl;
+							//cycle = topCycle;
 							continue;
 						}
 						//abort this decode buffer
 						/*if (TRACE)*/ std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - destination registers not available" << std::endl;
-						currentInstruction = LightInstruction::Null;
+						currentInstruction = SuperscalarInstruction::Null;
 						break;
 					}
 					if (TRACE) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl;
 				}
 				throwAwayCount = 0;
+
 				//recalculate when the instruction can be scheduled for execution based on operand availability
 				scheduleCycle = scheduleMop<true>(mop, portBusy, scheduleCycle, scheduleCycle);
 
@@ -809,67 +794,53 @@ namespace RandomX {
 			++cycle;
 		}
 
-		if(INFO) std::cout << "; ALU port utilization:" << std::endl;
-		if (INFO) std::cout << "; (* = in use, _ = idle)" << std::endl;
-
-		int portCycles = 0;
-		for (int i = 0; i < CYCLE_MAP_SIZE; ++i) {
-			//std::cout << "; " << std::setw(3) << i << " ";
-			for (int j = 0; j < 3; ++j) {
-				//std::cout << (portBusy[i][j] ? '*' : '_');
-				portCycles += !!portBusy[i][j];
-			}
-			//std::cout << std::endl;
-		}
-
 		double ipc = (macroOpCount / (double)retireCycle);
 
-		if (INFO) std::cout << "; code size " << codeSize << " bytes" << std::endl;
-		if (INFO) std::cout << "; x86 macro-ops: " << macroOpCount << std::endl;
-		if (INFO) std::cout << "; fetch cycles: " << decodeCycle << std::endl;
-		if (INFO) std::cout << "; RandomX instructions: " << programSize << std::endl;
-		if (INFO) std::cout << "; Execution time: " << retireCycle << " cycles" << std::endl;
-		if (INFO) std::cout << "; IPC = " << ipc << std::endl;
-		if (INFO) std::cout << "; Port-cycles: " << portCycles << std::endl;
-		if (INFO) std::cout << "; Multiplications: " << mulCount << std::endl;
-
-		int asicLatency[8];
-		memset(asicLatency, 0, sizeof(asicLatency));
+		memset(prog.asicLatencies, 0, sizeof(prog.asicLatencies));
 
 		//Calculate ASIC latency:
 		//Assumes 1 cycle latency for all operations and unlimited parallelization.
 		for (int i = 0; i < programSize; ++i) {
 			Instruction& instr = prog(i);
-			int latDst = asicLatency[instr.dst] + 1;
-			int latSrc = instr.dst != instr.src ? asicLatency[instr.src] + 1 : 0;
-			asicLatency[instr.dst] = std::max(latDst, latSrc);
+			int latDst = prog.asicLatencies[instr.dst] + 1;
+			int latSrc = instr.dst != instr.src ? prog.asicLatencies[instr.src] + 1 : 0;
+			prog.asicLatencies[instr.dst] = std::max(latDst, latSrc);
 		}
 
 		//address register is the register with the highest ASIC latency
 		int asicLatencyMax = 0;
 		int addressReg = 0;
 		for (int i = 0; i < 8; ++i) {
-			if (asicLatency[i] > asicLatencyMax) {
-				asicLatencyMax = asicLatency[i];
+			if (prog.asicLatencies[i] > asicLatencyMax) {
+				asicLatencyMax = prog.asicLatencies[i];
 				addressReg = i;
 			}
-		}
-
-		if (INFO) std::cout << "; ASIC latency: " << asicLatencyMax << std::endl;
-
-		if (INFO) {
-			std::cout << "; ASIC latency:" << std::endl;
-			for (int i = 0; i < 8; ++i) {
-				std::cout << ";  r" << i << " = " << asicLatency[i] << std::endl;
-			}
-			if (INFO) std::cout << "; CPU latency:" << std::endl;
-			for (int i = 0; i < 8; ++i) {
-				std::cout << ";  r" << i << " = " << registers[i].latency << std::endl;
-			}
+			prog.cpuLatencies[i] = registers[i].latency;
 		}
 
 		prog.setSize(programSize);
 		prog.setAddressRegister(addressReg);
-		return ipc;
+
+		prog.cpuLatency = retireCycle;
+		prog.asicLatency = asicLatencyMax;
+		prog.codeSize = codeSize;
+		prog.macroOps = macroOpCount;
+		prog.decodeCycles = decodeCycle;
+		prog.ipc = ipc;
+		prog.mulCount = mulCount;
+		
+
+		/*if(INFO) std::cout << "; ALU port utilization:" << std::endl;
+		if (INFO) std::cout << "; (* = in use, _ = idle)" << std::endl;
+
+		int portCycles = 0;
+		for (int i = 0; i < CYCLE_MAP_SIZE; ++i) {
+			std::cout << "; " << std::setw(3) << i << " ";
+			for (int j = 0; j < 3; ++j) {
+				std::cout << (portBusy[i][j] ? '*' : '_');
+				portCycles += !!portBusy[i][j];
+			}
+			std::cout << std::endl;
+		}*/
 	}
 }
\ No newline at end of file
diff --git a/src/superscalarGenerator.hpp b/src/superscalarGenerator.hpp
new file mode 100644
index 0000000..a64e80d
--- /dev/null
+++ b/src/superscalarGenerator.hpp
@@ -0,0 +1,47 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+#include "Program.hpp"
+#include "Blake2Generator.hpp"
+
+namespace RandomX {
+	                                              //                  Intel Ivy Bridge reference
+	namespace SuperscalarInstructionType {        //uOPs (decode)   execution ports         latency       code size
+		constexpr int ISUB_R = 0;                 //1               p015                    1               3 (sub)
+		constexpr int IXOR_R = 1;                 //1               p015                    1               3 (xor)
+		constexpr int IADD_RS = 2;                //1               p01                     1               4 (lea)
+		constexpr int IMUL_R = 3;                 //1               p1                      3               4 (imul)
+		constexpr int IROR_C = 4;                 //1               p05                     1               4 (ror)
+		constexpr int IADD_C7 = 5;                //1               p015                    1               7 (add)
+		constexpr int IXOR_C7 = 6;                //1               p015                    1               7 (xor)
+		constexpr int IADD_C8 = 7;                //1+0             p015                    1               7+1 (add+nop)
+		constexpr int IXOR_C8 = 8;                //1+0             p015                    1               7+1 (xor+nop)
+		constexpr int IADD_C9 = 9;                //1+0             p015                    1               7+2 (add+nop)
+		constexpr int IXOR_C9 = 10;               //1+0             p015                    1               7+2 (xor+nop)
+		constexpr int IMULH_R = 11;               //1+2+1           0+(p1,p5)+0             3               3+3+3 (mov+mul+mov)
+		constexpr int ISMULH_R = 12;              //1+2+1           0+(p1,p5)+0             3               3+3+3 (mov+imul+mov)
+		constexpr int IMUL_RCP = 13;              //1+1             p015+p1                 4              10+4   (mov+imul)
+
+		constexpr int COUNT = 14;                 //number of instruction types listed above (values 0..13)
+		constexpr int INVALID = -1;               //lies outside the range 0..COUNT-1, i.e. not one of the types above
+	}
+
+	void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen); //fills 'prog' (instructions, size, address register and statistics such as ipc and latencies) using data drawn from 'gen'
+}
\ No newline at end of file
diff --git a/src/tests/superscalar-avalanche.cpp b/src/tests/superscalar-avalanche.cpp
index 9c91a88..9fa1613 100644
--- a/src/tests/superscalar-avalanche.cpp
+++ b/src/tests/superscalar-avalanche.cpp
@@ -20,9 +20,10 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <iostream>
 #include <cstdint>
 #include <vector>
-#include "../LightProgramGenerator.hpp"
+#include "../superscalarGenerator.hpp"
 #include "../InterpretedVirtualMachine.hpp"
 #include "../intrinPortable.h"
+#include "../Blake2Generator.hpp"
 
 const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 };
 
@@ -45,9 +46,9 @@ int main() {
 			uint64_t rb[8];
 			memcpy(rb, ra, sizeof rb);
 			rb[0] ^= (1ULL << bit);
-			RandomX::LightProgram p;
+			RandomX::SuperscalarProgram p;
 			RandomX::Blake2Generator gen(seed, i);
-			RandomX::generateLightProg2(p, gen);
+			RandomX::generateSuperscalar(p, gen);
 			RandomX::InterpretedVirtualMachine<false>::executeSuperscalar(ra, p, dummy);
 			RandomX::InterpretedVirtualMachine<false>::executeSuperscalar(rb, p, dummy);
 			uint64_t diff = 0;
diff --git a/src/tests/superscalar-init.cpp b/src/tests/superscalar-init.cpp
index b366355..a7c1208 100644
--- a/src/tests/superscalar-init.cpp
+++ b/src/tests/superscalar-init.cpp
@@ -21,7 +21,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <cstdint>
 #include <vector>
 #include <unordered_set>
-#include "../LightProgramGenerator.hpp"
+#include "../superscalarGenerator.hpp"
 #include "../InterpretedVirtualMachine.hpp"
 #include "../intrinPortable.h"
 #include "../configuration.h"
diff --git a/vcxproj/randomx.vcxproj b/vcxproj/randomx.vcxproj
index 1c1cae0..d646143 100644
--- a/vcxproj/randomx.vcxproj
+++ b/vcxproj/randomx.vcxproj
@@ -127,6 +127,7 @@
     <ClCompile Include="..\src\argon2_core.c" />
     <ClCompile Include="..\src\argon2_ref.c" />
     <ClCompile Include="..\src\AssemblyGeneratorX86.cpp" />
+    <ClCompile Include="..\src\Blake2Generator.cpp" />
     <ClCompile Include="..\src\blake2\blake2b.c" />
     <ClCompile Include="..\src\Cache.cpp" />
     <ClCompile Include="..\src\CompiledLightVirtualMachine.cpp" />
@@ -137,8 +138,7 @@
     <ClCompile Include="..\src\instructionsPortable.cpp" />
     <ClCompile Include="..\src\InterpretedVirtualMachine.cpp" />
     <ClCompile Include="..\src\JitCompilerX86.cpp" />
-    <ClCompile Include="..\src\LightClientAsyncWorker.cpp" />
-    <ClCompile Include="..\src\LightProgramGenerator.cpp" />
+    <ClCompile Include="..\src\superscalarGenerator.cpp" />
     <ClCompile Include="..\src\main.cpp" />
     <ClCompile Include="..\src\reciprocal.c" />
     <ClCompile Include="..\src\softAes.cpp" />
@@ -153,6 +153,7 @@
     <ClInclude Include="..\src\argon2.h" />
     <ClInclude Include="..\src\argon2_core.h" />
     <ClInclude Include="..\src\AssemblyGeneratorX86.hpp" />
+    <ClInclude Include="..\src\Blake2Generator.hpp" />
     <ClInclude Include="..\src\Cache.hpp" />
     <ClInclude Include="..\src\catch.hpp" />
     <ClInclude Include="..\src\common.hpp" />
@@ -167,8 +168,7 @@
     <ClInclude Include="..\src\intrinPortable.h" />
     <ClInclude Include="..\src\JitCompilerX86-static.hpp" />
     <ClInclude Include="..\src\JitCompilerX86.hpp" />
-    <ClInclude Include="..\src\LightClientAsyncWorker.hpp" />
-    <ClInclude Include="..\src\LightProgramGenerator.hpp" />
+    <ClInclude Include="..\src\superscalarGenerator.hpp" />
     <ClInclude Include="..\src\Program.hpp" />
     <ClInclude Include="..\src\reciprocal.h" />
     <ClInclude Include="..\src\softAes.h" />
diff --git a/vcxproj/randomx.vcxproj.filters b/vcxproj/randomx.vcxproj.filters
index 5b821c8..77939bd 100644
--- a/vcxproj/randomx.vcxproj.filters
+++ b/vcxproj/randomx.vcxproj.filters
@@ -54,12 +54,6 @@
     <ClCompile Include="..\src\JitCompilerX86.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\src\LightClientAsyncWorker.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
-    <ClCompile Include="..\src\LightProgramGenerator.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
     <ClCompile Include="..\src\main.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
@@ -75,6 +69,12 @@
     <ClCompile Include="..\src\blake2\blake2b.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\src\Blake2Generator.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\superscalarGenerator.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <MASM Include="..\src\JitCompilerX86-static.asm">
@@ -136,12 +136,6 @@
     <ClInclude Include="..\src\JitCompilerX86-static.hpp">
       <Filter>Header Files</Filter>
     </ClInclude>
-    <ClInclude Include="..\src\LightClientAsyncWorker.hpp">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\LightProgramGenerator.hpp">
-      <Filter>Header Files</Filter>
-    </ClInclude>
     <ClInclude Include="..\src\Program.hpp">
       <Filter>Header Files</Filter>
     </ClInclude>
@@ -166,5 +160,11 @@
     <ClInclude Include="..\src\virtualMemory.hpp">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\src\Blake2Generator.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\superscalarGenerator.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/vcxproj/superscalar-avalanche.vcxproj b/vcxproj/superscalar-avalanche.vcxproj
index dab0311..1cac62b 100644
--- a/vcxproj/superscalar-avalanche.vcxproj
+++ b/vcxproj/superscalar-avalanche.vcxproj
@@ -118,6 +118,7 @@
   <ItemGroup>
     <ClCompile Include="..\src\argon2_core.c" />
     <ClCompile Include="..\src\argon2_ref.c" />
+    <ClCompile Include="..\src\Blake2Generator.cpp" />
     <ClCompile Include="..\src\blake2\blake2b.c" />
     <ClCompile Include="..\src\Cache.cpp" />
     <ClCompile Include="..\src\dataset.cpp" />
@@ -125,9 +126,9 @@
     <ClCompile Include="..\src\Instruction.cpp" />
     <ClCompile Include="..\src\instructionsPortable.cpp" />
     <ClCompile Include="..\src\InterpretedVirtualMachine.cpp" />
-    <ClCompile Include="..\src\LightProgramGenerator.cpp" />
     <ClCompile Include="..\src\reciprocal.c" />
     <ClCompile Include="..\src\softAes.cpp" />
+    <ClCompile Include="..\src\superscalarGenerator.cpp" />
     <ClCompile Include="..\src\tests\superscalar-avalanche.cpp" />
     <ClCompile Include="..\src\VirtualMachine.cpp" />
     <ClCompile Include="..\src\virtualMemory.cpp" />
diff --git a/vcxproj/superscalar-avalanche.vcxproj.filters b/vcxproj/superscalar-avalanche.vcxproj.filters
index 9984ed1..93b3838 100644
--- a/vcxproj/superscalar-avalanche.vcxproj.filters
+++ b/vcxproj/superscalar-avalanche.vcxproj.filters
@@ -45,9 +45,6 @@
     <ClCompile Include="..\src\blake2\blake2b.c">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\src\LightProgramGenerator.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
     <ClCompile Include="..\src\hashAes1Rx4.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
@@ -60,6 +57,12 @@
     <ClCompile Include="..\src\virtualMemory.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\src\superscalarGenerator.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\Blake2Generator.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <MASM Include="..\src\squareHash.asm">
diff --git a/vcxproj/superscalar-init.vcxproj b/vcxproj/superscalar-init.vcxproj
index 4c4794c..d765f85 100644
--- a/vcxproj/superscalar-init.vcxproj
+++ b/vcxproj/superscalar-init.vcxproj
@@ -118,6 +118,7 @@
   <ItemGroup>
     <ClCompile Include="..\src\argon2_core.c" />
     <ClCompile Include="..\src\argon2_ref.c" />
+    <ClCompile Include="..\src\Blake2Generator.cpp" />
     <ClCompile Include="..\src\blake2\blake2b.c" />
     <ClCompile Include="..\src\Cache.cpp" />
     <ClCompile Include="..\src\dataset.cpp" />
@@ -125,9 +126,9 @@
     <ClCompile Include="..\src\Instruction.cpp" />
     <ClCompile Include="..\src\instructionsPortable.cpp" />
     <ClCompile Include="..\src\InterpretedVirtualMachine.cpp" />
-    <ClCompile Include="..\src\LightProgramGenerator.cpp" />
     <ClCompile Include="..\src\reciprocal.c" />
     <ClCompile Include="..\src\softAes.cpp" />
+    <ClCompile Include="..\src\superscalarGenerator.cpp" />
     <ClCompile Include="..\src\tests\superscalar-init.cpp" />
     <ClCompile Include="..\src\VirtualMachine.cpp" />
     <ClCompile Include="..\src\virtualMemory.cpp" />
diff --git a/vcxproj/superscalar-init.vcxproj.filters b/vcxproj/superscalar-init.vcxproj.filters
index 4666d07..cad6e2b 100644
--- a/vcxproj/superscalar-init.vcxproj.filters
+++ b/vcxproj/superscalar-init.vcxproj.filters
@@ -42,9 +42,6 @@
     <ClCompile Include="..\src\InterpretedVirtualMachine.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\src\LightProgramGenerator.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
     <ClCompile Include="..\src\reciprocal.c">
       <Filter>Source Files</Filter>
     </ClCompile>
@@ -60,6 +57,12 @@
     <ClCompile Include="..\src\virtualMemory.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\src\superscalarGenerator.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\Blake2Generator.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <MASM Include="..\src\squareHash.asm">