diff --git a/makefile b/makefile
index 8aff8c5..3ffacec 100644
--- a/makefile
+++ b/makefile
@@ -9,10 +9,9 @@ endif
 BINDIR=bin
 SRCDIR=src
 OBJDIR=obj
-LDFLAGS=
+LDFLAGS=-lpthread
 TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
-ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o)
-SRC1=$(addprefix $(SRCDIR)/,TestAluFpu.cpp instructions.hpp Pcg32.hpp)
+ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o)
 
 all: release test
 
@@ -52,7 +51,7 @@ $(OBJDIR)/blake2b.o: $(addprefix $(SRCDIR)/blake2/,blake2b.c blake2.h blake2-imp
 $(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachine.cpp CompiledVirtualMachine.hpp Pcg32.hpp common.hpp instructions.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/CompiledVirtualMachine.cpp -o $@
   
-$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp argon2_core.h) | $(OBJDIR)
+$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp dataset.hpp common.hpp Pcg32.hpp Cache.hpp) | $(OBJDIR) # headers included by dataset.cpp
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/dataset.cpp -o $@
 
 $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp) | $(OBJDIR)
@@ -72,6 +71,9 @@ $(OBJDIR)/main.o: $(addprefix $(SRCDIR)/,main.cpp InterpretedVirtualMachine.hpp
   
 $(OBJDIR)/Program.o: $(addprefix $(SRCDIR)/,Program.cpp Program.hpp Pcg32.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/Program.cpp -o $@
+
+$(OBJDIR)/Cache.o: $(addprefix $(SRCDIR)/,Cache.cpp Cache.hpp Pcg32.hpp argon2_core.h argon2.h softAes.h common.hpp dataset.hpp) | $(OBJDIR) # headers included by Cache.cpp / Cache.hpp
+	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/Cache.cpp -o $@
   
 $(OBJDIR)/softAes.o: $(addprefix $(SRCDIR)/,softAes.cpp softAes.h) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/softAes.cpp -o $@
diff --git a/src/Cache.cpp b/src/Cache.cpp
new file mode 100644
index 0000000..171fa58
--- /dev/null
+++ b/src/Cache.cpp
@@ -0,0 +1,147 @@
+/*
+Copyright (c) 2018 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#include <cstring>
+#include "Cache.hpp"
+#include "softAes.h"
+#include "argon2.h"
+#include "Pcg32.hpp"
+#include "argon2_core.h"
+
+namespace RandomX {
+
+	static_assert(ArgonMemorySize % (ArgonLanes * ARGON2_SYNC_POINTS) == 0, "ArgonMemorySize - invalid value");
+
+	// Prefix-XOR over the four 32-bit lanes of the input:
+	// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
+	static inline __m128i sl_xor(__m128i tmp1) {
+		// Each copy is moved up by one more 32-bit lane (4 bytes) before being folded in.
+		const __m128i by1 = _mm_slli_si128(tmp1, 4);
+		const __m128i by2 = _mm_slli_si128(by1, 4);
+		const __m128i by3 = _mm_slli_si128(by2, 4);
+		__m128i acc = _mm_xor_si128(tmp1, by1);
+		acc = _mm_xor_si128(acc, by2);
+		acc = _mm_xor_si128(acc, by3);
+		return acc;
+	}
+
+	// Advances the key pair (*xout0, *xout2) in place to the next pair of round keys.
+	// rcon is passed to the first key-assist step; soft selects the software AES routine.
+	template<uint8_t rcon, bool soft>
+	static inline void aesGenKeys(__m128i* xout0, __m128i* xout2) {
+		// Assist value from *xout2: its top 32-bit lane (shuffle 0xFF) is broadcast and XORed into *xout0
+		const __m128i assistA = soft ? soft_aeskeygenassist(*xout2, rcon) : _mm_aeskeygenassist_si128(*xout2, rcon);
+		*xout0 = _mm_xor_si128(sl_xor(*xout0), _mm_shuffle_epi32(assistA, 0xFF));
+		// Assist value from the updated *xout0: lane 2 (shuffle 0xAA) is broadcast and XORed into *xout2
+		const __m128i assistB = soft ? soft_aeskeygenassist(*xout0, 0x00) : _mm_aeskeygenassist_si128(*xout0, 0x00);
+		*xout2 = _mm_xor_si128(sl_xor(*xout2), _mm_shuffle_epi32(assistB, 0xAA));
+	}
+
+	// Expands the 32-byte seed into 10 round keys, written to keys[0..9] (the seed itself is keys[0..1]).
+	template<bool soft>
+	static inline void expandAesKeys(const __m128i* seed, __m128i* keys) {
+		__m128i xout0 = _mm_loadu_si128(seed);     // unaligned loads: the seed pointer comes from a
+		__m128i xout2 = _mm_loadu_si128(seed + 1); // caller-supplied buffer with no alignment guarantee
+		*keys++ = xout0;
+		*keys++ = xout2;
+		aesGenKeys<0x01, soft>(&xout0, &xout2);
+		*keys++ = xout0;
+		*keys++ = xout2;
+		aesGenKeys<0x02, soft>(&xout0, &xout2);
+		*keys++ = xout0;
+		*keys++ = xout2;
+		aesGenKeys<0x04, soft>(&xout0, &xout2);
+		*keys++ = xout0;
+		*keys++ = xout2;
+		aesGenKeys<0x08, soft>(&xout0, &xout2);
+		*keys++ = xout0;
+		*keys++ = xout2;
+	}
+
+	// Fills the cache buffer `memory` using Argon2d with `seed` as the password and ArgonSalt as the salt.
+	// `seedSize` is passed to Argon2 as the password length.
+	void Cache::argonFill(const void* seed, size_t seedSize) {
+		argon2_context context;
+
+		// Inputs: no secret key, no associated data, and no output tag is requested
+		context.pwd = CONST_CAST(uint8_t *)seed;
+		context.pwdlen = (uint32_t)seedSize;
+		context.salt = CONST_CAST(uint8_t *)ArgonSalt;
+		context.saltlen = (uint32_t)ArgonSaltSize;
+		context.secret = nullptr;
+		context.secretlen = 0;
+		context.ad = nullptr;
+		context.adlen = 0;
+		context.out = nullptr;
+		context.outlen = 0;
+
+		// Cost parameters and library settings
+		context.t_cost = ArgonIterations;
+		context.m_cost = ArgonMemorySize;
+		context.lanes = ArgonLanes;
+		context.threads = 1;
+		context.allocate_cbk = nullptr;
+		context.free_cbk = nullptr;
+		context.flags = ARGON2_DEFAULT_FLAGS;
+		context.version = ARGON2_VERSION_NUMBER;
+
+		// m_cost is used unchanged as the block count; it is divided into
+		// lanes * ARGON2_SYNC_POINTS segments
+		const uint32_t memory_blocks = context.m_cost;
+		const uint32_t segment_length = memory_blocks / (context.lanes * ARGON2_SYNC_POINTS);
+
+		argon2_instance_t instance;
+		// Argon2 writes straight into this object's buffer
+		instance.memory = (block*)memory;
+		instance.version = context.version;
+		instance.passes = context.t_cost;
+		instance.memory_blocks = memory_blocks;
+		instance.segment_length = segment_length;
+		instance.lane_length = segment_length * ARGON2_SYNC_POINTS;
+		instance.lanes = context.lanes;
+		instance.type = Argon2_d;
+
+		// Never use more threads than there are lanes
+		instance.threads = context.threads > instance.lanes ? instance.lanes : context.threads;
+
+		// Hash the inputs and fill the first blocks, then run the memory-filling passes.
+		// NOTE(review): the int result of argon_initialize is not checked here.
+		argon_initialize(&instance, &context);
+
+		fill_memory_blocks(&instance);
+	}
+
+	// Builds the cache from `seed`: Argon2d fill, wrap-around copy, then AES round keys.
+	template<bool softAes>
+	void Cache::initialize(const void* seed, size_t seedSize) {
+		argonFill(seed, seedSize);
+
+		// Emulate a circular shift by CacheShift bytes: duplicate the first CacheShift
+		// bytes behind the end of the buffer; readers then start at memory + CacheShift.
+		memcpy(memory + CacheSize, memory, CacheShift);
+
+		// The round keys are expanded from the leading bytes of the same seed.
+		const __m128i* const seedWords = (const __m128i*)seed;
+		expandAesKeys<softAes>(seedWords, keys.data());
+	}
+
+	// Explicit instantiations for both values of softAes
+	template void Cache::initialize<true>(const void*, size_t);
+	template void Cache::initialize<false>(const void*, size_t);
+}
\ No newline at end of file
diff --git a/src/Cache.hpp b/src/Cache.hpp
new file mode 100644
index 0000000..7a34ee8
--- /dev/null
+++ b/src/Cache.hpp
@@ -0,0 +1,57 @@
+/*
+Copyright (c) 2018 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <cstdint>
+#include <new>
+#include "common.hpp"
+#include "dataset.hpp"
+
+namespace RandomX {
+
+	// Holds the AES round keys and the Argon2d-filled buffer that dataset blocks are generated from.
+	class Cache {
+	public:
+		// Instances are allocated through _mm_malloc with 16-byte alignment; throws std::bad_alloc on failure.
+		void* operator new(size_t size) {
+			void* ptr = _mm_malloc(size, sizeof(__m128i));
+			if (ptr == nullptr)
+				throw std::bad_alloc();
+			return ptr;
+		}
+		void operator delete(void* ptr) { _mm_free(ptr); }
+		// Fills the buffer and the keys from `seed`; softAes selects the software AES key expansion.
+		template<bool softAes>
+		void initialize(const void* seed, size_t seedSize);
+		// Round keys produced by initialize().
+		const KeysContainer& getKeys() const {
+			return keys;
+		}
+		// Read-only view of the buffer starting CacheShift bytes in; const so it also works on a const Cache.
+		const uint8_t* getCache() const {
+			return memory + CacheShift;
+		}
+	private:
+		alignas(16) KeysContainer keys;
+		// CacheSize bytes plus CacheShift extra bytes at the end
+		uint8_t memory[CacheSize + CacheShift];
+		void argonFill(const void* seed, size_t seedSize);
+	};
+}
\ No newline at end of file
diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp
index fdb1498..35e4ba5 100644
--- a/src/CompiledVirtualMachine.cpp
+++ b/src/CompiledVirtualMachine.cpp
@@ -31,11 +31,11 @@ namespace RandomX {
 #endif
 	}
 
-	void CompiledVirtualMachine::initializeDataset(const void* seed, bool lightClient) {
+	void CompiledVirtualMachine::setDataset(dataset_t ds, bool lightClient) {
 		if (lightClient) {
 			throw std::runtime_error("Compiled VM does not support light-client mode");
 		}
-		VirtualMachine::initializeDataset(seed, lightClient);
+		VirtualMachine::setDataset(ds, lightClient);
 	}
 
 	void CompiledVirtualMachine::initializeProgram(const void* seed) {
diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp
index b5b1d63..c2e108d 100644
--- a/src/CompiledVirtualMachine.hpp
+++ b/src/CompiledVirtualMachine.hpp
@@ -27,7 +27,7 @@ namespace RandomX {
 	class CompiledVirtualMachine : public VirtualMachine {
 	public:
 		CompiledVirtualMachine(bool softAes);
-		void initializeDataset(const void* seed, bool light = false) override;
+		void setDataset(dataset_t ds, bool light = false) override;
 		void initializeProgram(const void* seed) override;
 		virtual void execute() override;
 		void* getProgram() {
diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp
index 4ea7ac0..21c52ac 100644
--- a/src/VirtualMachine.cpp
+++ b/src/VirtualMachine.cpp
@@ -20,58 +20,65 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "VirtualMachine.hpp"
 #include "common.hpp"
 #include "dataset.hpp"
+#include "Cache.hpp"
 #include "t1ha/t1ha.h"
 #include "blake2/blake2.h"
 #include <cstring>
 
 namespace RandomX {
 	VirtualMachine::VirtualMachine(bool softAes) : softAes(softAes), lightClient(false) {
-		mem.dataset = nullptr;
+		mem.ds.dataset = nullptr;
 	}
 
-	void VirtualMachine::initializeDataset(const void* seed, bool light) {
+	VirtualMachine::~VirtualMachine() { // releases only what setDataset() allocated in light-client mode
 		if (lightClient) {
-			_mm_free(mem.lcm->cache);
-			_mm_free(mem.lcm->block);
+			if (mem.ds.lightDataset != nullptr) _mm_free(mem.ds.lightDataset->block); // block comes from _mm_malloc, so it must not go through delete
+			delete mem.ds.lightDataset; // the Cache it points to was passed in by the caller and is not freed here
+		}
+	}
+
+	void VirtualMachine::setDataset(dataset_t ds, bool light) { // attaches a dataset (or, when light is true, a Cache) to this VM
+		if (mem.ds.dataset != nullptr) { // a dataset was already attached by an earlier call
+			throw std::runtime_error("Dataset is already initialized");
 		}
-		_mm_free(mem.dataset);
 		lightClient = light;
 		if (light) {
+			auto lds = mem.ds.lightDataset = new LightClientDataset(); // light mode: the VM creates its own wrapper around the caller's Cache
+			lds->cache = ds.cache;
+			lds->block = (uint8_t*)_mm_malloc(DatasetBlockSize, sizeof(__m128i)); // buffer for one dataset block, released in the destructor
+			lds->blockNumber = -1; // blockNumber is unsigned, so this stores the maximum value as a "no block generated yet" marker
+			if (lds->block == nullptr) {
+				throw std::bad_alloc();
+			}
 			if (softAes) {
-				datasetInitLight<true>(seed, mem.lcm);
 				readDataset = &datasetReadLight<true>;
 			}
 			else {
-				datasetInitLight<false>(seed, mem.lcm);
 				readDataset = &datasetReadLight<false>;
 			}
 		}
 		else {
+			mem.ds = ds; // full mode: use the caller's dataset pointer directly
 			readDataset = &datasetRead;
-			if (softAes) {
-				datasetInit<true>(seed, mem.dataset);
-			}
-			else {
-				datasetInit<false>(seed, mem.dataset);
-			}
 		}
 	}
 
 	void VirtualMachine::initializeScratchpad(uint32_t index) {
 		if (lightClient) {
+			auto cache = mem.ds.lightDataset->cache;
 			if (softAes) {
 				for (int i = 0; i < ScratchpadSize / DatasetBlockSize; ++i) {
-					initBlock<true>(mem.lcm->cache + CacheShift, ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, mem.lcm->keys);
+					initBlock<true>(cache->getCache(), ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, cache->getKeys());
 				}
 			}
 			else {
 				for (int i = 0; i < ScratchpadSize / DatasetBlockSize; ++i) {
-					initBlock<false>(mem.lcm->cache + CacheShift, ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, mem.lcm->keys);
+					initBlock<false>(cache->getCache(), ((uint8_t*)scratchpad) + DatasetBlockSize * i, (ScratchpadSize / DatasetBlockSize) * index + i, cache->getKeys());
 				}
 			}
 		}
 		else {
-			memcpy(scratchpad, mem.dataset + ScratchpadSize * index, ScratchpadSize);
+			memcpy(scratchpad, mem.ds.dataset + ScratchpadSize * index, ScratchpadSize);
 		}
 	}
 
diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp
index 5c83fa5..569718c 100644
--- a/src/VirtualMachine.hpp
+++ b/src/VirtualMachine.hpp
@@ -26,30 +26,12 @@ namespace RandomX {
 	class VirtualMachine {
 	public:
 		VirtualMachine(bool softAes);
-		virtual ~VirtualMachine() {}
-		virtual void initializeDataset(const void* seed, bool light = false);
+		virtual ~VirtualMachine();
+		virtual void setDataset(dataset_t ds, bool light = false);
 		void initializeScratchpad(uint32_t index);
 		virtual void initializeProgram(const void* seed) = 0;
 		virtual void execute() = 0;
 		void getResult(void*);
-		const RegisterFile& getRegisterFile() const {
-			return reg;
-		}
-		const convertible_t* getScratchpad() const {
-			return scratchpad;
-		}
-		const void* getCache() {
-			if (lightClient) {
-				return mem.lcm->cache;
-			}
-			return nullptr;
-		}
-		const __m128i* getKeys() {
-			if (lightClient) {
-				return mem.lcm->keys;
-			}
-			return nullptr;
-		}
 	protected:
 		bool softAes, lightClient;
 		RegisterFile reg;
diff --git a/src/argon2_core.c b/src/argon2_core.c
index f90a0d7..cf07be5 100644
--- a/src/argon2_core.c
+++ b/src/argon2_core.c
@@ -473,7 +473,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type type)
 	blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
 }
 
-int initialize(argon2_instance_t *instance, argon2_context *context) {
+int argon_initialize(argon2_instance_t *instance, argon2_context *context) {
 	uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH];
 	int result = ARGON2_OK;
 
diff --git a/src/argon2_core.h b/src/argon2_core.h
index 6886fac..69a6339 100644
--- a/src/argon2_core.h
+++ b/src/argon2_core.h
@@ -204,7 +204,7 @@ void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance);
  * @return Zero if successful, -1 if memory failed to allocate. @context->state
  * will be modified if successful.
  */
-int initialize(argon2_instance_t *instance, argon2_context *context);
+int argon_initialize(argon2_instance_t *instance, argon2_context *context);
 
 /*
  * XORing the last block of each lane, hashing it, making the tag. Deallocates
diff --git a/src/common.hpp b/src/common.hpp
index 1343e62..04333a5 100644
--- a/src/common.hpp
+++ b/src/common.hpp
@@ -20,8 +20,6 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #pragma once
 
 #include <cstdint>
-#include <new>
-#include "intrinPortable.h"
 
 namespace RandomX {
 
@@ -55,13 +53,13 @@ namespace RandomX {
 	constexpr bool trace = false;
 #endif
 
-	typedef union {
+	union convertible_t {
 		double f64;
 		int64_t i64;
 		uint64_t u64;
 		int32_t i32;
 		uint32_t u32;
-	} convertible_t;
+	};
 
 	constexpr int ProgramLength = 512;
 	constexpr int InstructionCount = 1024 * 1024;
@@ -71,34 +69,27 @@ namespace RandomX {
 	constexpr uint32_t ScratchpadL2 = ScratchpadSize / sizeof(convertible_t);
 	constexpr int RegistersCount = 8;
 
+	class Cache;
+
 	inline int wrapInstr(int i) {
 		return i % RandomX::ProgramLength;
 	}
 
-	struct LightClientMemory {
-		uint8_t* cache;
+	struct LightClientDataset { // state used by light-client reads: source cache plus the most recently generated block
+		Cache* cache; // cache the blocks are generated from
 		uint8_t* block;
 		uint32_t blockNumber;
-		alignas(16) __m128i keys[10];
+	};
 
-		void* operator new(size_t size) {
-			void* ptr = _mm_malloc(size, sizeof(__m128i));
-			if (ptr == nullptr)
-				throw std::bad_alloc();
-			return ptr;
-		}
-
-		void operator delete(void* ptr) {
-			_mm_free(ptr);
-		}
+	union dataset_t { // single pointer; which member is meaningful depends on how the value was produced
+		uint8_t* dataset; // full dataset buffer (set by datasetAlloc)
+		Cache* cache; // cache object (set by datasetInitCache)
+		LightClientDataset* lightDataset; // light-client wrapper (set by VirtualMachine::setDataset in light mode)
 	};
 
 	struct MemoryRegisters {
 		addr_t ma, mx;
-		union {
-			uint8_t* dataset;
-			LightClientMemory* lcm;
-		};
+		dataset_t ds;
 	};
 
 	static_assert(sizeof(MemoryRegisters) == 2 * sizeof(addr_t) + sizeof(uintptr_t), "Invalid alignment of struct RandomX::MemoryRegisters");
diff --git a/src/dataset.cpp b/src/dataset.cpp
index 0738b4f..a265fdf 100644
--- a/src/dataset.cpp
+++ b/src/dataset.cpp
@@ -19,135 +19,25 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 
 // Parts of this file are originally copyright (c) xmr-stak
 
-#include "common.hpp"
-#include "dataset.hpp"
-#include "Pcg32.hpp"
-#include "argon2_core.h"
 #include <new>
 #include <algorithm>
 #include <stdexcept>
 #include <cstring>
 
-#if defined(_MSC_VER)
-#if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)
-#define __SSE2__ 1
-#endif
-#endif
+#include "common.hpp"
+#include "dataset.hpp"
+#include "Pcg32.hpp"
+#include "Cache.hpp"
 
 #if defined(__SSE2__)
 #include <wmmintrin.h>
-#define PREFETCH(memory) _mm_prefetch((const char *)((memory).dataset + (memory).ma), _MM_HINT_T0)
+#define PREFETCH(memory) _mm_prefetch((const char *)((memory).ds.dataset + (memory).ma), _MM_HINT_T0)
 #else
 #define PREFETCH(memory)
 #endif
 
 namespace RandomX {
 
-	void initializeCache(const void* input, size_t inputLength, void* memory) {
-		uint32_t memory_blocks, segment_length;
-		argon2_instance_t instance;
-		argon2_context context;
-
-		context.out = nullptr;
-		context.outlen = 0;
-		context.pwd = CONST_CAST(uint8_t *)input;
-		context.pwdlen = (uint32_t)inputLength;
-		context.salt = CONST_CAST(uint8_t *)ArgonSalt;
-		context.saltlen = (uint32_t)ArgonSaltSize;
-		context.secret = NULL;
-		context.secretlen = 0;
-		context.ad = NULL;
-		context.adlen = 0;
-		context.t_cost = ArgonIterations;
-		context.m_cost = ArgonMemorySize;
-		context.lanes = ArgonLanes;
-		context.threads = 1;
-		context.allocate_cbk = NULL;
-		context.free_cbk = NULL;
-		context.flags = ARGON2_DEFAULT_FLAGS;
-		context.version = ARGON2_VERSION_NUMBER;
-
-		/* 2. Align memory size */
-		/* Minimum memory_blocks = 8L blocks, where L is the number of lanes */
-		memory_blocks = context.m_cost;
-
-		segment_length = memory_blocks / (context.lanes * ARGON2_SYNC_POINTS);
-
-		instance.version = context.version;
-		instance.memory = NULL;
-		instance.passes = context.t_cost;
-		instance.memory_blocks = memory_blocks;
-		instance.segment_length = segment_length;
-		instance.lane_length = segment_length * ARGON2_SYNC_POINTS;
-		instance.lanes = context.lanes;
-		instance.threads = context.threads;
-		instance.type = Argon2_d;
-		instance.memory = (block*)memory;
-
-		if (instance.threads > instance.lanes) {
-			instance.threads = instance.lanes;
-		}
-
-		/* 3. Initialization: Hashing inputs, allocating memory, filling first
-		 * blocks
-		 */
-		initialize(&instance, &context);
-
-		fill_memory_blocks(&instance);
-	}
-
-	// This will shift and xor tmp1 into itself as 4 32-bit vals such as
-	// sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
-	static inline __m128i sl_xor(__m128i tmp1) {
-		__m128i tmp4;
-		tmp4 = _mm_slli_si128(tmp1, 0x04);
-		tmp1 = _mm_xor_si128(tmp1, tmp4);
-		tmp4 = _mm_slli_si128(tmp4, 0x04);
-		tmp1 = _mm_xor_si128(tmp1, tmp4);
-		tmp4 = _mm_slli_si128(tmp4, 0x04);
-		tmp1 = _mm_xor_si128(tmp1, tmp4);
-		return tmp1;
-	}
-
-	template<uint8_t rcon, bool soft>
-	static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2) {
-		__m128i xout1 = soft ? soft_aeskeygenassist(*xout2, rcon) : _mm_aeskeygenassist_si128(*xout2, rcon);
-		xout1 = _mm_shuffle_epi32(xout1, 0xFF);
-		*xout0 = sl_xor(*xout0);
-		*xout0 = _mm_xor_si128(*xout0, xout1);
-		xout1 = soft ? soft_aeskeygenassist(*xout0, 0x00) : _mm_aeskeygenassist_si128(*xout0, 0x00);
-		xout1 = _mm_shuffle_epi32(xout1, 0xAA);
-		*xout2 = sl_xor(*xout2);
-		*xout2 = _mm_xor_si128(*xout2, xout1);
-	}
-
-	template<bool soft>
-	void expandAesKeys(const __m128i* seed, __m128i* keys) {
-		__m128i xout0, xout2;
-		xout0 = _mm_load_si128(seed);
-		xout2 = _mm_load_si128(seed + 1);
-		*keys++ = xout0;
-		*keys++ = xout2;
-		aes_genkey_sub<0x01, soft>(&xout0, &xout2);
-		*keys++ = xout0;
-		*keys++ = xout2;
-		aes_genkey_sub<0x02, soft>(&xout0, &xout2);
-		*keys++ = xout0;
-		*keys++ = xout2;
-		aes_genkey_sub<0x04, soft>(&xout0, &xout2);
-		*keys++ = xout0;
-		*keys++ = xout2;
-		aes_genkey_sub<0x08, soft>(&xout0, &xout2);
-		*keys++ = xout0;
-		*keys++ = xout2;
-	}
-
-	template
-		void expandAesKeys<true>(const __m128i* seed, __m128i* keys);
-
-	template
-		void expandAesKeys<false>(const __m128i* seed, __m128i* keys);
-
 	template<typename T>
 	static inline void shuffle(T* buffer, size_t bytes, Pcg32& gen) {
 		auto count = bytes / sizeof(T);
@@ -157,8 +47,18 @@ namespace RandomX {
 		}
 	}
 
+	// Single AES round helpers: soft selects the software implementation over the AES-NI intrinsic.
+	template<bool soft> static inline __m128i aesenc(__m128i in, __m128i key) {
+		if (soft) return soft_aesenc(in, key);
+		return _mm_aesenc_si128(in, key);
+	}
+	template<bool soft> static inline __m128i aesdec(__m128i in, __m128i key) {
+		if (soft) return soft_aesdec(in, key);
+		return _mm_aesdec_si128(in, key);
+	}
+
 	template<bool soft, bool enc>
-	void initBlock(uint8_t* in, uint8_t* out, uint32_t blockNumber, const __m128i keys[10]) {
+	void initBlock(const uint8_t* in, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) {
 		__m128i xin, xout;
 		//Initialization vector = block number extended to 128 bits
 		xout = _mm_cvtsi32_si128(blockNumber);
@@ -200,20 +100,20 @@ namespace RandomX {
 	}
 
 	template
-		void initBlock<true, true>(uint8_t* in, uint8_t* out, uint32_t blockNumber, const __m128i keys[10]);
+		void initBlock<true, true>(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&);
 
 	template
-		void initBlock<true, false>(uint8_t* in, uint8_t* out, uint32_t blockNumber, const __m128i keys[10]);
+		void initBlock<true, false>(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&);
 
 	template
-		void initBlock<false, true>(uint8_t* in, uint8_t* out, uint32_t blockNumber, const __m128i keys[10]);
+		void initBlock<false, true>(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&);
 
 	template
-		void initBlock<false, false>(uint8_t* in, uint8_t* out, uint32_t blockNumber, const __m128i keys[10]);
+		void initBlock<false, false>(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&);
 
 	convertible_t datasetRead(addr_t addr, MemoryRegisters& memory) {
 		convertible_t data;
-		data.u64 = *(uint64_t*)(memory.dataset + memory.ma);
+		data.u64 = *(uint64_t*)(memory.ds.dataset + memory.ma);
 		memory.ma += 8;
 		memory.mx ^= addr;
 		if ((memory.mx & 0xFFF8) == 0) {
@@ -224,24 +124,25 @@ namespace RandomX {
 	}
 
 	template<bool softAes>
-	void initBlock(uint8_t* cache, uint8_t* block, uint32_t blockNumber, const __m128i k[10]) {
+	void initBlock(const uint8_t* cache, uint8_t* block, uint32_t blockNumber, const KeysContainer& keys) {
 		if (blockNumber % 2 == 1) {
-			initBlock<softAes, true>(cache + blockNumber * CacheBlockSize, block, blockNumber, k);
+			initBlock<softAes, true>(cache + blockNumber * CacheBlockSize, block, blockNumber, keys);
 		}
 		else {
-			initBlock<softAes, false>(cache + blockNumber * CacheBlockSize, block, blockNumber, k);
+			initBlock<softAes, false>(cache + blockNumber * CacheBlockSize, block, blockNumber, keys);
 		}
 	}
 
 	template<bool softAes>
 	convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory) {
 		convertible_t data;
+		LightClientDataset* lds = memory.ds.lightDataset;
 		auto blockNumber = memory.ma / DatasetBlockSize;
-		if (memory.lcm->blockNumber != blockNumber) {
-			initBlock<softAes>(memory.lcm->cache + CacheShift, (uint8_t*)memory.lcm->block, blockNumber, memory.lcm->keys);
-			memory.lcm->blockNumber = blockNumber;
+		if (lds->blockNumber != blockNumber) {
+			initBlock<softAes>(lds->cache->getCache(), (uint8_t*)lds->block, blockNumber, lds->cache->getKeys());
+			lds->blockNumber = blockNumber;
 		}
-		data.u64 = *(uint64_t*)(memory.lcm->block + (memory.ma % DatasetBlockSize));
+		data.u64 = *(uint64_t*)(lds->block + (memory.ma % DatasetBlockSize));
 		memory.ma += 8;
 		memory.mx ^= addr;
 		if ((memory.mx & 0xFFF8) == 0) {
@@ -256,54 +157,37 @@ namespace RandomX {
 	template
 		convertible_t datasetReadLight<true>(addr_t addr, MemoryRegisters& memory);
 
-	template<bool softAes>
-	void datasetInit(const void* seed, uint8_t*& dataset) {
+	void datasetAlloc(dataset_t& ds) { // allocates DatasetSize bytes into ds.dataset; throws std::runtime_error on failure
 		if (sizeof(size_t) <= 4)
 			throw std::runtime_error("Platform doesn't support enough memory for the dataset");
-		dataset = (uint8_t*)_mm_malloc(DatasetSize, sizeof(__m128i));
-		if (dataset == nullptr) {
-			throw std::runtime_error("Dataset memory allocation failed. >4 GiB of virtual memory is needed.");
+		ds.dataset = (uint8_t*)_mm_malloc(DatasetSize, 64); // aligned to 64 bytes (previously sizeof(__m128i))
+		if (ds.dataset == nullptr) { // the buffer is not freed in this function; release it with _mm_free
+			throw std::runtime_error("Dataset memory allocation failed. >4 GiB of free virtual memory is needed.");
 		}
-		uint8_t* cache = (uint8_t*)_mm_malloc(CacheSize + CacheShift, sizeof(__m128i));
-		if (cache == nullptr) {
-			throw std::bad_alloc();
-		}
-		initializeCache(seed, SeedSize, cache);
-		memcpy(cache + CacheSize, cache, CacheShift);
-		alignas(16) __m128i keys[10];
-		expandAesKeys<softAes>((const __m128i*)seed, keys);
-		for (uint32_t i = 0; i < DatasetBlockCount; ++i) {
-			initBlock<softAes>(cache + CacheShift, dataset + i * DatasetBlockSize, i, keys);
-		}
-		_mm_free(cache);
 	}
 
-	template
-		void datasetInit<false>(const void*, uint8_t*&);
-
-	template
-		void datasetInit<true>(const void*, uint8_t*&);
-
 	template<bool softAes>
-	void datasetInitLight(const void* seed, LightClientMemory*& lcm) {
-		lcm = new LightClientMemory();
-		lcm->cache = (uint8_t*)_mm_malloc(CacheSize + CacheShift, sizeof(__m128i));
-		if (lcm->cache == nullptr) {
-			throw std::bad_alloc();
+	void datasetInit(Cache* cache, dataset_t ds, uint32_t startBlock, uint32_t blockCount) { // generates blocks [startBlock, startBlock + blockCount) of ds.dataset from cache
+		for (uint32_t i = startBlock; i < startBlock + blockCount; ++i) { // ranges are independent, so callers can split the work
+			initBlock<softAes>(cache->getCache(), ds.dataset + (size_t)i * DatasetBlockSize, i, cache->getKeys()); // widen before multiplying so the byte offset cannot wrap in 32 bits
 		}
-		initializeCache(seed, SeedSize, lcm->cache);
-		memcpy(lcm->cache + CacheSize, lcm->cache, CacheShift);
-		expandAesKeys<softAes>((__m128i*)seed, lcm->keys);
-		lcm->block = (uint8_t*)_mm_malloc(DatasetBlockSize, sizeof(__m128i));
-		if (lcm->block == nullptr) {
-			throw std::bad_alloc();
-		}
-		lcm->blockNumber = -1;
 	}
 
 	template
-		void datasetInitLight<false>(const void*, LightClientMemory*&);
+		void datasetInit<false>(Cache*, dataset_t, uint32_t, uint32_t);
 
 	template
-		void datasetInitLight<true>(const void*, LightClientMemory*&);
+		void datasetInit<true>(Cache*, dataset_t, uint32_t, uint32_t);
+
+	// Allocates a new Cache, stores it in ds.cache and initializes it from `seed` (SeedSize bytes).
+	// The Cache is not freed here.
+	template<bool softAes>
+	void datasetInitCache(const void* seed, dataset_t& ds) {
+		Cache* const freshCache = new Cache();
+		ds.cache = freshCache;
+		freshCache->initialize<softAes>(seed, SeedSize);
+	}
+
+	template void datasetInitCache<false>(const void*, dataset_t&);
+	template void datasetInitCache<true>(const void*, dataset_t&);
 }
diff --git a/src/dataset.hpp b/src/dataset.hpp
index 42d63e6..bb29197 100644
--- a/src/dataset.hpp
+++ b/src/dataset.hpp
@@ -20,43 +20,30 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #pragma once
 
 #include <cstdint>
+#include <array>
 #include "intrinPortable.h"
-#include "argon2.h"
 #include "common.hpp"
 #include "softAes.h"
 
 namespace RandomX {
 
-	static_assert(ArgonMemorySize % (ArgonLanes * ARGON2_SYNC_POINTS) == 0, "ArgonMemorySize - invalid value");
-
-	void initializeCache(const void* input, size_t inputLength, void* memory);
-
-	template<bool soft>
-	void expandAesKeys(const __m128i* seed, __m128i* keys);
-
-	template<bool soft>
-	inline __m128i aesenc(__m128i in, __m128i key) {
-		return soft ? soft_aesenc(in, key) : _mm_aesenc_si128(in, key);
-	}
-
-	template<bool soft>
-	inline __m128i aesdec(__m128i in, __m128i key) {
-		return soft ? soft_aesdec(in, key) : _mm_aesdec_si128(in, key);
-	}
+	using KeysContainer = std::array<__m128i, 10>;
 
 	template<bool soft, bool enc>
-	void initBlock(uint8_t* in, uint8_t* out, uint32_t blockNumber, const __m128i keys[10]);
+	void initBlock(const uint8_t* in, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys);
 
 	template<bool softAes>
-	void initBlock(uint8_t* cache, uint8_t* block, uint32_t blockNumber, const __m128i keys[10]);
+	void initBlock(const uint8_t* cache, uint8_t* block, uint32_t blockNumber, const KeysContainer& keys);
+
+	void datasetAlloc(dataset_t& ds);
 
 	template<bool softAes>
-	void datasetInit(const void* seed, uint8_t*& dataset);
+	void datasetInit(Cache* cache, dataset_t ds, uint32_t startBlock, uint32_t blockCount);
 
 	convertible_t datasetRead(addr_t addr, MemoryRegisters& memory);
 
 	template<bool softAes>
-	void datasetInitLight(const void* seed, LightClientMemory*& lcm);
+	void datasetInitCache(const void* seed, dataset_t& dataset);
 
 	template<bool softAes>
 	convertible_t datasetReadLight(addr_t addr, MemoryRegisters& memory);
diff --git a/src/main.cpp b/src/main.cpp
index f3f8124..b486c1a 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -30,6 +30,10 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include "Program.hpp"
 #include <string>
 #include "instructions.hpp"
+#include <thread>
+#include <atomic>
+#include "dataset.hpp"
+#include "Cache.hpp"
 
 const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 };
 
@@ -45,7 +49,6 @@ void outputHex(std::ostream& os, const char* data, int length) {
 		os << hexmap[(data[i] & 0xF0) >> 4];
 		os << hexmap[data[i] & 0x0F];
 	}
-	os << std::endl;
 }
 
 void readOption(const char* option, int argc, char** argv, bool& out) {
@@ -58,6 +61,15 @@ void readOption(const char* option, int argc, char** argv, bool& out) {
 	out = false;
 }
 
+// Looks for `option` followed by a positive integer in argv; stores that integer in `out`, else `defaultValue`.
+void readIntOption(const char* option, int argc, char** argv, int& out, int defaultValue) {
+	for (int pos = 1; pos < argc; ++pos) {
+		if (strcmp(argv[pos - 1], option) != 0) continue;
+		if ((out = atoi(argv[pos])) > 0) return;
+	}
+	out = defaultValue;
+}
+
 void readInt(int argc, char** argv, int& out, int defaultValue) {
 	for (int i = 0; i < argc; ++i) {
 		if (*argv[i] != '-' && (out = atoi(argv[i])) > 0) {
@@ -75,81 +87,144 @@ std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) {
 	return os;
 }
 
-int main(int argc, char** argv) {
-	bool softAes, lightClient, genAsm, compiled;
-	int programCount;
-	readOption("--softAes", argc, argv, softAes);
-	readOption("--lightClient", argc, argv, lightClient);
-	readOption("--genAsm", argc, argv, genAsm);
-	readOption("--compiled", argc, argv, compiled);
-	readInt(argc, argv, programCount, 1000);
+class AtomicHash { // Four 64-bit atomic words that callers XOR their results into; each word is updated atomically, the four-word group is not.
+public:
+	AtomicHash() {
+		for (int i = 0; i < 4; ++i)
+			hash[i].store(0); // every word starts at zero
+	}
+	void xorWith(uint64_t update[4]) { // XORs update[i] into word i, one atomic fetch_xor per word
+		for (int i = 0; i < 4; ++i)
+			hash[i].fetch_xor(update[i]);
+	}
+	void print(std::ostream& os) { // writes the four words as hex to `os`, followed by a newline
+		for (int i = 0; i < 4; ++i)
+			print(hash[i], os);
+		os << std::endl;
+	}
+private:
+	void print(std::atomic<uint64_t>& hash, std::ostream& os) { // hex-dumps the bytes of a single word to `os`
+		auto h = hash.load();
+		outputHex(os, (char*)&h, sizeof(h)); // write to the caller's stream, not unconditionally to std::cout
+	}
+	std::atomic<uint64_t> hash[4];
+};
 
+void mine(RandomX::VirtualMachine* vm, std::atomic<int>& atomicNonce, AtomicHash& result, int noncesCount, int thread) { // Worker loop: claims nonces from atomicNonce until noncesCount is reached, runs the VM for each and XORs the result into `result`; `thread` is only referenced by the commented-out debug print.
+	uint64_t hash[4]; // holds the blake2b digest of the template, then is reused for the VM result
 	unsigned char blockTemplate[] = {
 		0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14,
 		0x5a, 0xc5, 0xfa, 0xd3, 0xaa, 0x3a, 0xf6, 0xea, 0x44, 0xc1, 0x18, 0x69, 0xdc, 0x4f, 0x85, 0x3f, 0x00, 0x2b, 0x2e,
 		0xea, 0x00, 0x00, 0x00, 0x00, 0x77, 0xb2, 0x06, 0xa0, 0x2c, 0xa5, 0xb1, 0xd4, 0xce, 0x6b, 0xbf, 0xdf, 0x0a, 0xca,
 		0xc3, 0x8b, 0xde, 0xd3, 0x4d, 0x2d, 0xcd, 0xee, 0xf9, 0x5c, 0xd2, 0x0c, 0xef, 0xc1, 0x2f, 0x61, 0xd5, 0x61, 0x09
 	};
-	int* nonce = (int*)(blockTemplate + 39);
-	uint8_t hash[RandomX::ResultSize];
+	int* noncePtr = (int*)(blockTemplate + 39); // the nonce is written at byte offset 39 of this thread's private template copy
+	int nonce = atomicNonce.fetch_add(1); // claim the first nonce for this thread
 
-	if (genAsm) {
-		*nonce = programCount;
+	while (nonce < noncesCount) {
+		//std::cout << "Thread " << thread << " nonce " << nonce << std::endl;
+		*noncePtr = nonce;
 		blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0);
-		RandomX::AssemblyGeneratorX86 asmX86;
-		asmX86.generateProgram(hash);
-		asmX86.printCode(std::cout);
-		return 0;
+		int spIndex = ((uint8_t*)hash)[24] | ((((uint8_t*)hash)[25] & 63) << 8); // 14-bit value: hash byte 24 plus the low 6 bits of byte 25
+		vm->initializeScratchpad(spIndex);
+		vm->initializeProgram(hash);
+		vm->execute();
+		vm->getResult(hash); // NOTE(review): presumably overwrites `hash` with the VM result - confirm against VirtualMachine
+		result.xorWith(hash);
+		if (RandomX::trace) {
+			std::cout << "Nonce: " << nonce << " "; // NOTE(review): std::cout is not locked here, so output from several threads may interleave
+			outputHex(std::cout, (char*)hash, sizeof(hash));
+			std::cout << std::endl;
+		}
+		nonce = atomicNonce.fetch_add(1); // claim the next unprocessed nonce
 	}
+}
+
+int main(int argc, char** argv) {
+	bool softAes, lightClient, genAsm, compiled;
+	int programCount, threadCount;
+	readOption("--softAes", argc, argv, softAes);
+	readOption("--lightClient", argc, argv, lightClient);
+	readOption("--genAsm", argc, argv, genAsm);
+	readOption("--compiled", argc, argv, compiled);
+	readIntOption("--threads", argc, argv, threadCount, 1);
+	readIntOption("--nonces", argc, argv, programCount, 1000);
+
+	std::atomic<int> atomicNonce(0);
+	AtomicHash result;
+	std::vector<RandomX::VirtualMachine*> vms;
+	std::vector<std::thread> threads;
+	RandomX::dataset_t dataset;
 
 	if (softAes)
 		std::cout << "Using software AES." << std::endl;
-
-	char cumulative[RandomX::ResultSize] = { 0 };
-
-	RandomX::VirtualMachine* vm;
+	std::cout << "Initializing..." << std::endl;
 
 	try {
-		if (compiled) {
-			vm = new RandomX::CompiledVirtualMachine(softAes);
+		Stopwatch sw(true);
+		if (softAes) {
+			RandomX::datasetInitCache<true>(seed, dataset);
 		}
 		else {
-			vm = new RandomX::InterpretedVirtualMachine(softAes);
+			RandomX::datasetInitCache<false>(seed, dataset);
 		}
-		std::cout << "Initializing..." << std::endl;
-		Stopwatch sw(true);
-		vm->initializeDataset(seed, lightClient);
-		if(lightClient)
+		if (RandomX::trace) {
+			std::cout << "Keys: " << std::endl;
+			for (int i = 0; i < dataset.cache->getKeys().size(); ++i) {
+				outputHex(std::cout, (char*)&dataset.cache->getKeys()[i], sizeof(__m128i));
+			}
+			std::cout << std::endl;
+			std::cout << "Cache: " << std::endl;
+			outputHex(std::cout, (char*)dataset.cache->getCache(), sizeof(__m128i));
+			std::cout << std::endl;
+		}
+		if (lightClient) {
 			std::cout << "Cache (64 MiB) initialized in " << sw.getElapsed() << " s" << std::endl;
-		else
+		}
+		else {
+			RandomX::Cache* cache = dataset.cache;
+			RandomX::datasetAlloc(dataset);
+			auto perThread = RandomX::DatasetBlockCount / threadCount;
+			auto remainder = RandomX::DatasetBlockCount % threadCount;
+			for (int i = 0; i < threadCount; ++i) {
+				auto count = perThread + (i == threadCount - 1 ? remainder : 0);
+				if (softAes) {
+					threads.push_back(std::thread(&RandomX::datasetInit<true>, cache, dataset, i * perThread, count));
+				}
+				else {
+					threads.push_back(std::thread(&RandomX::datasetInit<false>, cache, dataset, i * perThread, count));
+				}
+			}
+			for (int i = 0; i < threads.size(); ++i) {
+				threads[i].join();
+			}
+			delete cache;
+			threads.clear();
 			std::cout << "Dataset (4 GiB) initialized in " << sw.getElapsed() << " s" << std::endl;
+		}
+		std::cout << "Initializing " << threadCount << " virtual machine(s)..." << std::endl;
+		for (int i = 0; i < threadCount; ++i) {
+			RandomX::VirtualMachine* vm;
+			if (compiled) {
+				vm = new RandomX::CompiledVirtualMachine(softAes);
+			}
+			else {
+				vm = new RandomX::InterpretedVirtualMachine(softAes);
+			}
+			vm->setDataset(dataset, lightClient);
+			vms.push_back(vm);
+		}
 		std::cout << "Running benchmark (" << programCount << " programs) ..." << std::endl;
 		sw.restart();
-		for (int i = 0; i < programCount; ++i) {
-			*nonce = i;
-			if (RandomX::trace) std::cout << "Nonce: " << i << " ";
-			blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0);
-			int spIndex = hash[24] | ((hash[25] & 63) << 8);
-			vm->initializeScratchpad(spIndex);
-			//dump((const char *)vm.getScratchpad(), RandomX::ScratchpadSize, "scratchpad-before.txt");
-			//return 0;
-			vm->initializeProgram(hash);
-			vm->execute();
-			/*std::string fileName("scratchpad-after-");
-			fileName = fileName + std::to_string(i) + ".txt";
-			dump((const char *)vm.getScratchpad(), RandomX::ScratchpadSize, fileName.c_str());*/
-			vm->getResult(hash);
-			if (RandomX::trace) {
-				outputHex(std::cout, (char*)hash, sizeof(hash));
-			}
-			((uint64_t*)cumulative)[0] ^= ((uint64_t*)hash)[0];
-			((uint64_t*)cumulative)[1] ^= ((uint64_t*)hash)[1];
-			((uint64_t*)cumulative)[2] ^= ((uint64_t*)hash)[2];
-			((uint64_t*)cumulative)[3] ^= ((uint64_t*)hash)[3];
+		for (int i = 0; i < vms.size(); ++i) {
+			threads.push_back(std::thread(&mine, vms[i], std::ref(atomicNonce), std::ref(result), programCount, i));
+		}
+		for (int i = 0; i < threads.size(); ++i) {
+			threads[i].join();
 		}
 		double elapsed = sw.getElapsed();
 		std::cout << "Calculated result: ";
-		outputHex(std::cout, cumulative, sizeof(cumulative));
+		result.print(std::cout);
 		if(programCount == 1000)
 		std::cout << "Reference result:  d62ed85c39030cd2c5704fca3a23019f1244f2b03447c9a6b39dea5390ed1d10" << std::endl;
 		std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl;