Added comments to hashAes1Rx4 and fillAes1Rx4

Fixed gcc compilation. Added performance numbers.
2024-08-15 00:23:14 +00:00 · 2019-02-09 19:32:53 +01:00 · 2019-02-09 19:32:53 +01:00 · b8ce504be6
commit b8ce504be6
parent 2798d78717
12 changed files with 72 additions and 325 deletions
--- a/README.md
+++ b/README.md
@ -1,6 +1,3 @@
-
-
-
 # RandomX
 RandomX is a proof-of-work (PoW) algorithm that is optimized for general-purpose CPUs. RandomX uses random code execution (hence the name) together with several memory-hard techniques to achieve the following goals:

@ -26,7 +23,7 @@ The structure of the VM mimics the components that are found in a typical genera

 The VM executes programs in a special instruction set, which was designed in such a way that any random 8-byte word is a valid instruction and any sequence of valid instructions is a valid program. For more details see [RandomX ISA documentation](doc/isa.md). Because there are no "syntax" rules, generating a random program is as easy as filling the program buffer with random data. A RandomX program consists of 256 instructions. See [program.inc](src/program.inc) for an example of a RandomX program translated into x86-64 assembly.

-#### Hash calculation
+### Hash calculation

 Calculating a RandomX hash consists of initializing the 2 MiB scratchpad with random data, executing 8 RandomX loops and calculating a hash of the scratchpad.

@ -40,15 +37,27 @@ Hash of the register state after 2048 iterations is used to initialize the rand

 The loads from the dataset are fully prefetched, so they don't slow down the loop.

-RandomX uses the [Blake2b](https://en.wikipedia.org/wiki/BLAKE_%28hash_function%29#BLAKE2) cryptographic hash function. Special hashing functions based on [AES](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) encryption are used to initialize and hash the scratchpad.
+RandomX uses the [Blake2b](https://en.wikipedia.org/wiki/BLAKE_%28hash_function%29#BLAKE2) cryptographic hash function. Special hashing functions `fillAes1Rx4` and `hashAes1Rx4` based on [AES](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) encryption are used to initialize and hash the scratchpad ([hashAes1Rx4.cpp](src/hashAes1Rx4.cpp)).

-#### Hash verification
+### Hash verification

 RandomX is a symmetric PoW algorithm, so the verifying party has to repeat the same steps as when a hash is calculated.

 However, to allow hash verification on devices that cannot store the whole 4 GiB dataset, RandomX allows a time-memory tradeoff by using just 256 MiB of memory at the cost of 16 times more random memory accesses. See [Dataset initialization](doc/dataset.md) for more details.

-#### Documentation
+### Performance
+Preliminary mining performance with the x86-64 JIT compiled VM:
+
+|CPU|RAM|threads|hashrate [H/s]|comment|
+|-----|-----|----|----------|-----|
+|AMD Ryzen 1700|DDR4-2933|8|4100|
+|Intel i5-3230M|DDR3-1333|1|280|without large pages
+|Intel i7-8550U|DDR4-2400|4|1200|limited by thermals
+|Intel i5-2500K|DDR3-1333|3|1350|
+
+Hash verification is performed using the portable interpreter in "light-client mode" and takes 30-70 ms depending on RAM latency and CPU clock speed. Hash verification in "mining mode" takes 2-4 ms.
+
+### Documentation
 * [RandomX ISA](doc/isa.md)
 * [RandomX instruction listing](doc/isa-ops.md)
 * [Dataset initialization](doc/dataset.md)
--- a/28
+++ b/28
@ -11,12 +11,12 @@ SRCDIR=src
 OBJDIR=obj
 LDFLAGS=-lpthread
 TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
-ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o t1ha2.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o AddressTransform.o hashAes1Rx4.o)
+ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o hashAes1Rx4.o)
 ifeq ($(PLATFORM),x86_64)
    ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o
 endif

-all: release test
+all: release

 release: CXXFLAGS += -march=native -O3 -flto
 release: CCFLAGS += -march=native -O3 -flto
@ -41,11 +41,8 @@ $(BINDIR)/randomx: $(ROBJS) | $(BINDIR)
 $(BINDIR)/AluFpuTest: $(TOBJS) | $(BINDIR)
 	$(CXX) $(TOBJS) $(LDFLAGS) -o $@
  
-$(OBJDIR)/TestAluFpu.o: $(addprefix $(SRCDIR)/,TestAluFpu.cpp instructions.hpp Pcg32.hpp) | $(OBJDIR)
+$(OBJDIR)/TestAluFpu.o: $(addprefix $(SRCDIR)/,TestAluFpu.cpp instructions.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/TestAluFpu.cpp -o $@
-
-$(OBJDIR)/AddressTransform.o: $(addprefix $(SRCDIR)/,AddressTransform.cpp InterpretedVirtualMachine.hpp common.hpp) | $(OBJDIR)
-	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/AddressTransform.cpp -o $@
  
 $(OBJDIR)/argon2_core.o: $(addprefix $(SRCDIR)/,argon2_core.c argon2_core.h blake2/blake2.h blake2/blake2-impl.h) | $(OBJDIR)
 	$(CC) $(CCFLAGS) -c $(SRCDIR)/argon2_core.c -o $@
@ -53,16 +50,16 @@ $(OBJDIR)/argon2_core.o: $(addprefix $(SRCDIR)/,argon2_core.c argon2_core.h blak
 $(OBJDIR)/argon2_ref.o: $(addprefix $(SRCDIR)/,argon2_ref.c argon2.h argon2_core.h blake2/blake2.h blake2/blake2-impl.h blake2/blamka-round-ref.h) | $(OBJDIR)
 	$(CC) $(CCFLAGS) -c $(SRCDIR)/argon2_ref.c -o $@

-$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp Pcg32.hpp common.hpp instructions.hpp instructionWeights.hpp) | $(OBJDIR)
+$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp common.hpp instructionWeights.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/AssemblyGeneratorX86.cpp -o $@

 $(OBJDIR)/blake2b.o: $(addprefix $(SRCDIR)/blake2/,blake2b.c blake2.h blake2-impl.h) | $(OBJDIR)
 	$(CC) $(CCFLAGS) -c $(SRCDIR)/blake2/blake2b.c -o $@

-$(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachine.cpp CompiledVirtualMachine.hpp Pcg32.hpp common.hpp instructions.hpp) | $(OBJDIR)
+$(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachine.cpp CompiledVirtualMachine.hpp common.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/CompiledVirtualMachine.cpp -o $@
  
-$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp Pcg32.hpp) | $(OBJDIR)
+$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/dataset.cpp -o $@

 $(OBJDIR)/divideByConstantCodegen.o: $(addprefix $(SRCDIR)/,divideByConstantCodegen.c divideByConstantCodegen.h) | $(OBJDIR)
@ -74,19 +71,19 @@ $(OBJDIR)/hashAes1Rx4.o: $(addprefix $(SRCDIR)/,hashAes1Rx4.cpp softAes.h) | $(O
 $(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@

-$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read.inc)) | $(OBJDIR)
+$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_dataset.inc loop_load.inc loop_store.inc xmm_constants.inc)) | $(OBJDIR)
 	$(CXX) -x assembler-with-cpp -c $(SRCDIR)/JitCompilerX86-static.S -o $@

 $(OBJDIR)/squareHash.o: $(addprefix $(SRCDIR)/,squareHash.S $(addprefix asm/, squareHash.inc))  | $(OBJDIR)
 	$(CXX) -x assembler-with-cpp -c $(SRCDIR)/squareHash.S -o $@

-$(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp instructions.hpp intrinPortable.h) | $(OBJDIR)
+$(OBJDIR)/instructionsPortable.o: $(addprefix $(SRCDIR)/,instructionsPortable.cpp intrinPortable.h) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/instructionsPortable.cpp -o $@

 $(OBJDIR)/Instruction.o: $(addprefix $(SRCDIR)/,Instruction.cpp Instruction.hpp instructionWeights.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/Instruction.cpp -o $@
  
-$(OBJDIR)/InterpretedVirtualMachine.o: $(addprefix $(SRCDIR)/,InterpretedVirtualMachine.cpp InterpretedVirtualMachine.hpp Pcg32.hpp instructions.hpp instructionWeights.hpp) | $(OBJDIR)
+$(OBJDIR)/InterpretedVirtualMachine.o: $(addprefix $(SRCDIR)/,InterpretedVirtualMachine.cpp InterpretedVirtualMachine.hpp instructionWeights.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/InterpretedVirtualMachine.cpp -o $@

 $(OBJDIR)/LightClientAsyncWorker.o: $(addprefix $(SRCDIR)/,LightClientAsyncWorker.cpp LightClientAsyncWorker.hpp common.hpp) | $(OBJDIR)
@ -95,10 +92,10 @@ $(OBJDIR)/LightClientAsyncWorker.o: $(addprefix $(SRCDIR)/,LightClientAsyncWorke
 $(OBJDIR)/main.o: $(addprefix $(SRCDIR)/,main.cpp InterpretedVirtualMachine.hpp Stopwatch.hpp blake2/blake2.h) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/main.cpp -o $@
  
-$(OBJDIR)/Program.o: $(addprefix $(SRCDIR)/,Program.cpp Program.hpp Pcg32.hpp) | $(OBJDIR)
+$(OBJDIR)/Program.o: $(addprefix $(SRCDIR)/,Program.cpp Program.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/Program.cpp -o $@

-$(OBJDIR)/Cache.o: $(addprefix $(SRCDIR)/,Cache.cpp Cache.hpp Pcg32.hpp argon2_core.h) | $(OBJDIR)
+$(OBJDIR)/Cache.o: $(addprefix $(SRCDIR)/,Cache.cpp Cache.hpp argon2_core.h) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/Cache.cpp -o $@
  
 $(OBJDIR)/softAes.o: $(addprefix $(SRCDIR)/,softAes.cpp softAes.h) | $(OBJDIR)
@ -109,9 +106,6 @@ $(OBJDIR)/VirtualMachine.o: $(addprefix $(SRCDIR)/,VirtualMachine.cpp VirtualMac

 $(OBJDIR)/virtualMemory.o: $(addprefix $(SRCDIR)/,virtualMemory.cpp virtualMemory.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/virtualMemory.cpp -o $@
-
-$(OBJDIR)/t1ha2.o: $(addprefix $(SRCDIR)/t1ha/,t1ha2.c t1ha.h t1ha_bits.h) | $(OBJDIR)
-	$(CC) $(CCFLAGS) -c $(SRCDIR)/t1ha/t1ha2.c -o $@
  
 $(OBJDIR):
 	mkdir $(OBJDIR)
--- a/src/AddressTransform.cpp
+++ b/src/AddressTransform.cpp
@ -1,292 +0,0 @@
-/*
-Copyright (c) 2019 tevador
-
-This file is part of RandomX.
-
-RandomX is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-RandomX is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
-*/
-
-#include "common.hpp"
-#include "InterpretedVirtualMachine.hpp"
-
-#include <iostream>
-#include <iomanip>
-#include <sstream>
-
-namespace RandomX {
-
-	class Mul9Transform : public ITransform {
-	public:
-		Mul9Transform(int32_t cc) : c(cc) {
-			std::ostringstream oss;
-			oss << "mul9_" << std::hex << (cc & 255);
-			name = oss.str();
-		}
-		int32_t apply(int32_t x) const override {
-			return 9 * x + c;
-		}
-		const char* getName() const override {
-			return name.c_str();
-		}
-		std::ostream& printAsm(std::ostream& os) const override {
-			os << "lea ecx, [rcx+rcx*8" << std::showpos << c << "]" << std::noshowpos << std::endl;
-			return os;
-		}
-		std::ostream& printCxx(std::ostream& os) const override {
-			os << "static const Mul9Transform " << name << "(" << c << ");" << std::endl;
-			return os;
-		}
-	private:
-		int32_t c;
-		std::string name;
-	};
-
-	class AddTransform : public ITransform {
-	public:
-		AddTransform(int32_t cc) : c(cc) {
-			std::ostringstream oss;
-			oss << "add_" << std::hex << (cc & 255);
-			name = oss.str();
-		}
-		int32_t apply(int32_t x) const override {
-			return x + c;
-		}
-		const char* getName() const override {
-			return name.c_str();
-		}
-		std::ostream& printAsm(std::ostream& os) const override {
-			os << "db 64" << std::endl;
-			os << "add ecx, " << c << std::endl;
-			return os;
-		}
-		std::ostream& printCxx(std::ostream& os) const override {
-			os << "static const AddTransform " << name << "(" << c << ");" << std::endl;
-			return os;
-		}
-	private:
-		int32_t c;
-		std::string name;
-	};
-
-	class XorTransform : public ITransform {
-	public:
-		XorTransform(int32_t cc) : c(cc) {
-			std::ostringstream oss;
-			oss << "xor_" << std::hex << (cc & 255);
-			name = oss.str();
-		}
-		int32_t apply(int32_t x) const override {
-			return x ^ c;
-		}
-		const char* getName() const override {
-			return name.c_str();
-		}
-		std::ostream& printAsm(std::ostream& os) const override {
-			os << "db 64" << std::endl;
-			os << "xor ecx, " << c << std::endl;
-			return os;
-		}
-		std::ostream& printCxx(std::ostream& os) const override {
-			os << "static const XorTransform " << name << "(" << c << ");" << std::endl;
-			return os;
-		}
-	private:
-		int32_t c;
-		std::string name;
-	};
-
-	static const Mul9Transform mul9_6d(109);
-	static const XorTransform xor_60(96);
-	static const Mul9Transform mul9_ed(-19);
-	static const AddTransform add_9e(-98);
-	static const AddTransform add_eb(-21);
-	static const XorTransform xor_b0(-80);
-	static const Mul9Transform mul9_a4(-92);
-	static const AddTransform add_71(113);
-	static const Mul9Transform mul9_64(100);
-	static const AddTransform add_d9(-39);
-	static const XorTransform xor_78(120);
-	static const Mul9Transform mul9_89(-119);
-	static const AddTransform add_8f(-113);
-	static const AddTransform add_6f(111);
-	static const XorTransform xor_68(104);
-	static const Mul9Transform mul9_ad(-83);
-	static const Mul9Transform mul9_7f(127);
-	static const XorTransform xor_90(-112);
-	static const AddTransform add_59(89);
-	static const AddTransform add_e0(-32);
-	static const AddTransform add_68(104);
-	static const XorTransform xor_88(-120);
-	static const XorTransform xor_18(24);
-	static const Mul9Transform mul9_9(9);
-	static const AddTransform add_e1(-31);
-	static const XorTransform xor_f0(-16);
-	static const AddTransform add_44(68);
-	static const Mul9Transform mul9_92(-110);
-	static const XorTransform xor_40(64);
-	static const XorTransform xor_d8(-40);
-	static const XorTransform xor_f8(-8);
-	static const AddTransform add_f6(-10);
-	static const XorTransform xor_e0(-32);
-	static const AddTransform add_e(14);
-	static const Mul9Transform mul9_d2(-46);
-	static const XorTransform xor_98(-104);
-	static const Mul9Transform mul9_24(36);
-	static const AddTransform add_64(100);
-	static const Mul9Transform mul9_bf(-65);
-	static const Mul9Transform mul9_1b(27);
-	static const Mul9Transform mul9_5b(91);
-	static const AddTransform add_9b(-101);
-	static const AddTransform add_a2(-94);
-	static const Mul9Transform mul9_f6(-10);
-	static const XorTransform xor_50(80);
-	static const AddTransform add_94(-108);
-	static const AddTransform add_c6(-58);
-	static const XorTransform xor_30(48);
-	static const Mul9Transform mul9_49(73);
-	static const XorTransform xor_d0(-48);
-	static const XorTransform xor_20(32);
-	static const XorTransform xor_a0(-96);
-	static const AddTransform add_76(118);
-	static const AddTransform add_5b(91);
-	static const Mul9Transform mul9_12(18);
-	static const AddTransform add_f5(-11);
-	static const Mul9Transform mul9_3f(63);
-	static const AddTransform add_72(114);
-	static const Mul9Transform mul9_2d(45);
-	static const AddTransform add_bd(-67);
-	static const AddTransform add_35(53);
-	static const Mul9Transform mul9_9b(-101);
-	static const Mul9Transform mul9_ff(-1);
-	static const XorTransform xor_10(16);
-	static const Mul9Transform mul9_db(-37);
-	static const Mul9Transform mul9_e4(-28);
-	static const Mul9Transform mul9_c9(-55);
-	static const XorTransform xor_a8(-88);
-	static const XorTransform xor_b8(-72);
-	static const AddTransform add_24(36);
-	static const XorTransform xor_c8(-56);
-	static const AddTransform add_74(116);
-	static const XorTransform xor_58(88);
-	static const XorTransform xor_80(-128);
-	static const AddTransform add_32(50);
-	static const AddTransform add_69(105);
-	static const AddTransform add_db(-37);
-	static const XorTransform xor_70(112);
-	static const XorTransform xor_8(8);
-	static const XorTransform xor_e8(-24);
-	static const Mul9Transform mul9_76(118);
-	static const XorTransform xor_48(72);
-	static const XorTransform xor_c0(-64);
-	static const AddTransform add_28(40);
-	static const Mul9Transform mul9_b6(-74);
-	static const Mul9Transform mul9_52(82);
-	static const Mul9Transform mul9_36(54);
-	static const XorTransform xor_38(56);
-	static const XorTransform xor_28(40);
-	static const AddTransform add_57(87);
-
-	const ITransform* InterpretedVirtualMachine::addressTransformations[TransformationCount] = {
-		(ITransform*)&mul9_6d,
-		(ITransform*)&xor_60,
-		(ITransform*)&mul9_ed,
-		(ITransform*)&add_9e,
-		(ITransform*)&add_eb,
-		(ITransform*)&xor_b0,
-		(ITransform*)&mul9_a4,
-		(ITransform*)&add_71,
-		(ITransform*)&mul9_64,
-		(ITransform*)&add_d9,
-		(ITransform*)&xor_78,
-		(ITransform*)&mul9_89,
-		(ITransform*)&add_8f,
-		(ITransform*)&add_6f,
-		(ITransform*)&xor_68,
-		(ITransform*)&mul9_ad,
-		(ITransform*)&mul9_7f,
-		(ITransform*)&xor_90,
-		(ITransform*)&add_59,
-		(ITransform*)&add_e0,
-		(ITransform*)&add_68,
-		(ITransform*)&xor_88,
-		(ITransform*)&xor_18,
-		(ITransform*)&mul9_9,
-		(ITransform*)&add_e1,
-		(ITransform*)&xor_f0,
-		(ITransform*)&add_44,
-		(ITransform*)&mul9_92,
-		(ITransform*)&xor_40,
-		(ITransform*)&xor_d8,
-		(ITransform*)&xor_f8,
-		(ITransform*)&add_f6,
-		(ITransform*)&xor_e0,
-		(ITransform*)&add_e,
-		(ITransform*)&mul9_d2,
-		(ITransform*)&xor_98,
-		(ITransform*)&mul9_24,
-		(ITransform*)&add_64,
-		(ITransform*)&mul9_bf,
-		(ITransform*)&mul9_1b,
-		(ITransform*)&mul9_5b,
-		(ITransform*)&add_9b,
-		(ITransform*)&add_a2,
-		(ITransform*)&mul9_f6,
-		(ITransform*)&xor_50,
-		(ITransform*)&add_94,
-		(ITransform*)&add_c6,
-		(ITransform*)&xor_30,
-		(ITransform*)&mul9_49,
-		(ITransform*)&xor_d0,
-		(ITransform*)&xor_20,
-		(ITransform*)&xor_a0,
-		(ITransform*)&add_76,
-		(ITransform*)&add_5b,
-		(ITransform*)&mul9_12,
-		(ITransform*)&add_f5,
-		(ITransform*)&mul9_3f,
-		(ITransform*)&add_72,
-		(ITransform*)&mul9_2d,
-		(ITransform*)&add_bd,
-		(ITransform*)&add_35,
-		(ITransform*)&mul9_9b,
-		(ITransform*)&mul9_ff,
-		(ITransform*)&xor_10,
-		(ITransform*)&mul9_db,
-		(ITransform*)&mul9_e4,
-		(ITransform*)&mul9_c9,
-		(ITransform*)&xor_a8,
-		(ITransform*)&xor_b8,
-		(ITransform*)&add_24,
-		(ITransform*)&xor_c8,
-		(ITransform*)&add_74,
-		(ITransform*)&xor_58,
-		(ITransform*)&xor_80,
-		(ITransform*)&add_32,
-		(ITransform*)&add_69,
-		(ITransform*)&add_db,
-		(ITransform*)&xor_70,
-		(ITransform*)&xor_8,
-		(ITransform*)&xor_e8,
-		(ITransform*)&mul9_76,
-		(ITransform*)&xor_48,
-		(ITransform*)&xor_c0,
-		(ITransform*)&add_28,
-		(ITransform*)&mul9_b6,
-		(ITransform*)&mul9_52,
-		(ITransform*)&mul9_36,
-		(ITransform*)&xor_38,
-		(ITransform*)&xor_28,
-		(ITransform*)&add_57,
-	};
-}
--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@ -450,6 +450,8 @@ namespace RandomX {
 				return "l";
 			case 7:
 				return "ge";
+			default:
+				UNREACHABLE;
 		}
 	}

--- a/src/Instruction.cpp
+++ b/src/Instruction.cpp
@ -296,6 +296,8 @@ namespace RandomX {
 			return "lt";
 		case 7:
 			return "ge";
+		default:
+			UNREACHABLE;
 		}
 	}

--- a/src/InterpretedVirtualMachine.cpp
+++ b/src/InterpretedVirtualMachine.cpp
@ -27,6 +27,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <stdexcept>
 #include <sstream>
 #include <cmath>
+#include <cfloat>
 #include <thread>
 #include "intrinPortable.h"
 #ifdef STATS
@ -262,7 +263,7 @@ namespace RandomX {
 		uint32_t spAddr0 = mem.mx;
 		uint32_t spAddr1 = mem.ma;

-		for(int iter = 0; iter < InstructionCount; ++iter) {
+		for(unsigned iter = 0; iter < InstructionCount; ++iter) {
 			//std::cout << "Iteration " << iter << std::endl;
 			spAddr0 ^= r[readReg0];
 			spAddr0 &= ScratchpadL3Mask64;
--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
@ -705,6 +705,8 @@ namespace RandomX {
 				return 0x9c; //setl
 			case 7:
 				return 0x9d; //setge
+			default:
+				UNREACHABLE;
 		}
 	}

--- a/src/JitCompilerX86.hpp
+++ b/src/JitCompilerX86.hpp
@ -83,7 +83,7 @@ namespace RandomX {

 		template<size_t N>
 		void emit(const uint8_t (&src)[N]) {
-			for (int i = 0; i < N; ++i) {
+			for (unsigned i = 0; i < N; ++i) {
 				code[codePos + i] = src[i];
 			}
 			codePos += N;
--- a/src/VirtualMachine.cpp
+++ b/src/VirtualMachine.cpp
@ -28,9 +28,15 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) {
 	for (int i = 0; i < RandomX::RegistersCount; ++i)
 		os << std::hex << "r" << i << " = " << rf.r[i] << std::endl << std::dec;
-	for (int i = 0; i < RandomX::RegistersCount; ++i)
+	for (int i = 0; i < 4; ++i)
 		os << std::hex << "f" << i << " = " << *(uint64_t*)&rf.f[i].hi << " (" << rf.f[i].hi << ")" << std::endl
 		<< "   = " << *(uint64_t*)&rf.f[i].lo << " (" << rf.f[i].lo << ")" << std::endl << std::dec;
+	for (int i = 0; i < 4; ++i)
+		os << std::hex << "e" << i << " = " << *(uint64_t*)&rf.e[i].hi << " (" << rf.e[i].hi << ")" << std::endl
+		<< "   = " << *(uint64_t*)&rf.e[i].lo << " (" << rf.e[i].lo << ")" << std::endl << std::dec;
+	for (int i = 0; i < 4; ++i)
+		os << std::hex << "a" << i << " = " << *(uint64_t*)&rf.a[i].hi << " (" << rf.a[i].hi << ")" << std::endl
+		<< "   = " << *(uint64_t*)&rf.a[i].lo << " (" << rf.a[i].lo << ")" << std::endl << std::dec;
 	return os;
 }

--- a/src/common.hpp
+++ b/src/common.hpp
@ -93,6 +93,7 @@ namespace RandomX {

 	class ILightClientAsyncWorker {
 	public:
+		virtual ~ILightClientAsyncWorker() {}
 		virtual void prepareBlock(addr_t) = 0;
 		virtual void prepareBlocks(void* out, uint32_t startBlock, uint32_t blockCount) = 0;
 		virtual const uint64_t* getBlock(addr_t) = 0;
--- a/src/hashAes1Rx4.cpp
+++ b/src/hashAes1Rx4.cpp
@ -19,6 +19,18 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.

 #include "softAes.h"

+/*
+	Calculate a 512-bit hash of 'input' using 4 lanes of AES.
+	The input is treated as a set of round keys for the encryption
+	of the initial state.
+
+	'inputSize' must be a multiple of 64.
+
+	For a 2 MiB input, this has the same security as 32768-round
+	AES encryption.
+
+	Hashing throughput: >20 GiB/s per CPU core with hardware AES
+*/
 template<bool softAes>
 void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
 	const uint8_t* inptr = (uint8_t*)input;
@ -72,6 +84,16 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
 template void hashAes1Rx4<false>(const void *input, size_t inputSize, void *hash);
 template void hashAes1Rx4<true>(const void *input, size_t inputSize, void *hash);

+/*
+	Fill 'buffer' with pseudorandom data based on 512-bit 'state'.
+	The state is encrypted using a single AES round per 16 bytes of output
+	in 4 lanes.
+
+	'outputSize' must be a multiple of 64.
+
+	The modified state is written back to 'state' to allow multiple
+	calls to this function.
+*/
 template<bool softAes>
 void fillAes1Rx4(void *state, size_t outputSize, void *buffer) {
 	const uint8_t* outptr = (uint8_t*)buffer;
--- a/src/main.cpp
+++ b/src/main.cpp
@ -254,7 +254,7 @@ int main(int argc, char** argv) {
 		}
 		if (RandomX::trace) {
 			std::cout << "Keys: " << std::endl;
-			for (int i = 0; i < dataset.cache->getKeys().size(); ++i) {
+			for (unsigned i = 0; i < dataset.cache->getKeys().size(); ++i) {
 				outputHex(std::cout, (char*)&dataset.cache->getKeys()[i], sizeof(__m128i));
 			}
 			std::cout << std::endl;
@ -280,7 +280,7 @@ int main(int argc, char** argv) {
 						threads.push_back(std::thread(&RandomX::datasetInit<false>, cache, dataset, i * perThread, count));
 					}
 				}
-				for (int i = 0; i < threads.size(); ++i) {
+				for (unsigned i = 0; i < threads.size(); ++i) {
 					threads[i].join();
 				}
 			}
@ -318,10 +318,10 @@ int main(int argc, char** argv) {
 		std::cout << "Running benchmark (" << programCount << " programs) ..." << std::endl;
 		sw.restart();
 		if (threadCount > 1) {
-			for (int i = 0; i < vms.size(); ++i) {
+			for (unsigned i = 0; i < vms.size(); ++i) {
 				threads.push_back(std::thread(&mine, vms[i], std::ref(atomicNonce), std::ref(result), programCount, i, scratchpadMem + RandomX::ScratchpadSize * i));
 			}
-			for (int i = 0; i < threads.size(); ++i) {
+			for (unsigned i = 0; i < threads.size(); ++i) {
 				threads[i].join();
 			}
 		}
@ -336,10 +336,10 @@ int main(int argc, char** argv) {
 		if(programCount == 1000)
 		std::cout << "Reference result:  3e1c5f9b9d0bf8ffa250f860bf5f7ab76ac823b206ddee6a592660119a3640c6" << std::endl;
 		if (lightClient) {
-			std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per program" << std::endl;
+			std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per hash" << std::endl;
 		}
 		else {
-			std::cout << "Performance: " << programCount / elapsed << " programs per second" << std::endl;
+			std::cout << "Performance: " << programCount / elapsed << " hashes per second" << std::endl;
 		}
 	}
 	catch (std::exception& e) {