rx-wow-fix-3: Revert "Increase the frequency of CBRANCH (#118 )"

This reverts commit 22689eda49.
rx-wow-fix-2: Revert "Decrease the frequency of FADD/FSUB in favor of FMUL (#77 )"
2024-08-15 00:23:14 +00:00 · 2020-07-13 08:55:05 +03:00 · 2020-07-13 08:54:49 +03:00 · 2020-07-13 08:54:35 +03:00 · 2020-07-13 08:54:21 +03:00 · 2020-07-04 14:57:56 +02:00
19 changed files with 191 additions and 71 deletions
--- a/README.md
+++ b/README.md
@ -48,6 +48,8 @@ cmake -DARCH=native ..
 make
 ```

+To build portable binaries, omit the `ARCH` option when executing cmake.
+
 ### Windows

 On Windows, it is possible to build using MinGW (same procedure as on Linux) or using Visual Studio (solution file is provided).
--- a/doc/design.md
+++ b/doc/design.md
@ -255,7 +255,7 @@ The Scratchpad is split into 3 levels to mimic the typical CPU cache hierarchy [
 |----------------|----------|----------|----------|------|
 ARM Cortex A55|2|6|-|[[24](https://www.anandtech.com/show/11441/dynamiq-and-arms-new-cpus-cortex-a75-a55/4)]
 |AMD Zen+|4|12|40|[[25](https://en.wikichip.org/wiki/amd/microarchitectures/zen%2B#Memory_Hierarchy)]|
-|Intel Skylake|4|12|42|[[26](https://en.wikichip.org/wiki/amd/microarchitectures/zen%2B#Memory_Hierarchy)]
+|Intel Skylake|4|12|42|[[26](https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client)#Memory_Hierarchy)]

 The L3 cache is much larger and located further from the CPU core. As a result, its access latencies are much higher and can cause stalls in program execution.

@ -638,7 +638,7 @@ state3 = 00000000000000000000000000000000

 [25] AMD Zen+ Microarchitecture - https://en.wikichip.org/wiki/amd/microarchitectures/zen%2B#Memory_Hierarchy

-[26] Intel Skylake Microarchitecture - https://en.wikichip.org/wiki/amd/microarchitectures/zen%2B#Memory_Hierarchy
+[26] Intel Skylake Microarchitecture - https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client)#Memory_Hierarchy

 [27] Biryukov et al.: Fast and Tradeoff-Resilient Memory-Hard Functions for
 Cryptocurrencies and Password Hashing - https://eprint.iacr.org/2015/430.pdf Table 2, page 8
@ -647,4 +647,4 @@ Cryptocurrencies and Password Hashing - https://eprint.iacr.org/2015/430.pdf Tab

 [29] 7-Zip File archiver - https://www.7-zip.org/

-[30] TestU01 library - http://simul.iro.umontreal.ca/testu01/tu01.html
+[30] TestU01 library - http://simul.iro.umontreal.ca/testu01/tu01.html
--- a/src/aes_hash.cpp
+++ b/src/aes_hash.cpp
@ -175,10 +175,10 @@ template void fillAes1Rx4<false>(void *state, size_t outputSize, void *buffer);
 //key0, key1, key2, key3 = Blake2b-512("RandomX AesGenerator4R keys 0-3")
 //key4, key5, key6, key7 = Blake2b-512("RandomX AesGenerator4R keys 4-7")

-#define AES_GEN_4R_KEY0 0x99e5d23f, 0x2f546d2b, 0xd1833ddb, 0x6421aadd
-#define AES_GEN_4R_KEY1 0xa5dfcde5, 0x06f79d53, 0xb6913f55, 0xb20e3450
-#define AES_GEN_4R_KEY2 0x171c02bf, 0x0aa4679f, 0x515e7baf, 0x5c3ed904
-#define AES_GEN_4R_KEY3 0xd8ded291, 0xcd673785, 0xe78f5d08, 0x85623763
+#define AES_GEN_4R_KEY0 0xcf359e95, 0x141f82b7, 0x7ffbe4a6, 0xf890465d
+#define AES_GEN_4R_KEY1 0x6741ffdc, 0xbd5c5ac3, 0xfee8278a, 0x6a55c450
+#define AES_GEN_4R_KEY2 0x3d324aac, 0xa7279ad2, 0xd524fde4, 0x114c47a4
+#define AES_GEN_4R_KEY3 0x76f6db08, 0x42d3dbd9, 0x99a9aeff, 0x810c3a2a
 #define AES_GEN_4R_KEY4 0x229effb4, 0x3d518b6d, 0xe3d6a7a6, 0xb5826f73
 #define AES_GEN_4R_KEY5 0xb272b7d2, 0xe9024d4e, 0x9c10b3d9, 0xc7566bf3
 #define AES_GEN_4R_KEY6 0xf63befa7, 0x2ba9660a, 0xf765a38b, 0xf273c9e7
@ -197,10 +197,6 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
 	key1 = rx_set_int_vec_i128(AES_GEN_4R_KEY1);
 	key2 = rx_set_int_vec_i128(AES_GEN_4R_KEY2);
 	key3 = rx_set_int_vec_i128(AES_GEN_4R_KEY3);
-	key4 = rx_set_int_vec_i128(AES_GEN_4R_KEY4);
-	key5 = rx_set_int_vec_i128(AES_GEN_4R_KEY5);
-	key6 = rx_set_int_vec_i128(AES_GEN_4R_KEY6);
-	key7 = rx_set_int_vec_i128(AES_GEN_4R_KEY7);

 	state0 = rx_load_vec_i128((rx_vec_i128*)state + 0);
 	state1 = rx_load_vec_i128((rx_vec_i128*)state + 1);
@ -210,23 +206,23 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
 	while (outptr < outputEnd) {
 		state0 = aesdec<softAes>(state0, key0);
 		state1 = aesenc<softAes>(state1, key0);
-		state2 = aesdec<softAes>(state2, key4);
-		state3 = aesenc<softAes>(state3, key4);
+		state2 = aesdec<softAes>(state2, key0);
+		state3 = aesenc<softAes>(state3, key0);

 		state0 = aesdec<softAes>(state0, key1);
 		state1 = aesenc<softAes>(state1, key1);
-		state2 = aesdec<softAes>(state2, key5);
-		state3 = aesenc<softAes>(state3, key5);
+		state2 = aesdec<softAes>(state2, key1);
+		state3 = aesenc<softAes>(state3, key1);

 		state0 = aesdec<softAes>(state0, key2);
 		state1 = aesenc<softAes>(state1, key2);
-		state2 = aesdec<softAes>(state2, key6);
-		state3 = aesenc<softAes>(state3, key6);
+		state2 = aesdec<softAes>(state2, key2);
+		state3 = aesenc<softAes>(state3, key2);

 		state0 = aesdec<softAes>(state0, key3);
 		state1 = aesenc<softAes>(state1, key3);
-		state2 = aesdec<softAes>(state2, key7);
-		state3 = aesenc<softAes>(state3, key7);
+		state2 = aesdec<softAes>(state2, key3);
+		state3 = aesenc<softAes>(state3, key3);

 		rx_store_vec_i128((rx_vec_i128*)outptr + 0, state0);
 		rx_store_vec_i128((rx_vec_i128*)outptr + 1, state1);
--- a/src/asm/configuration.asm
+++ b/src/asm/configuration.asm
@ -15,7 +15,7 @@ RANDOMX_SCRATCHPAD_L2 EQU 262144t
 RANDOMX_SCRATCHPAD_L1 EQU 16384t
 RANDOMX_JUMP_BITS EQU 8t
 RANDOMX_JUMP_OFFSET EQU 8t
-RANDOMX_FREQ_IADD_RS EQU 16t
+RANDOMX_FREQ_IADD_RS EQU 25t
 RANDOMX_FREQ_IADD_M EQU 7t
 RANDOMX_FREQ_ISUB_R EQU 16t
 RANDOMX_FREQ_ISUB_M EQU 7t
@ -29,19 +29,19 @@ RANDOMX_FREQ_IMUL_RCP EQU 8t
 RANDOMX_FREQ_INEG_R EQU 2t
 RANDOMX_FREQ_IXOR_R EQU 15t
 RANDOMX_FREQ_IXOR_M EQU 5t
-RANDOMX_FREQ_IROR_R EQU 8t
-RANDOMX_FREQ_IROL_R EQU 2t
+RANDOMX_FREQ_IROR_R EQU 10t
+RANDOMX_FREQ_IROL_R EQU 0t
 RANDOMX_FREQ_ISWAP_R EQU 4t
-RANDOMX_FREQ_FSWAP_R EQU 4t
-RANDOMX_FREQ_FADD_R EQU 16t
+RANDOMX_FREQ_FSWAP_R EQU 8t
+RANDOMX_FREQ_FADD_R EQU 20t
 RANDOMX_FREQ_FADD_M EQU 5t
-RANDOMX_FREQ_FSUB_R EQU 16t
+RANDOMX_FREQ_FSUB_R EQU 20t
 RANDOMX_FREQ_FSUB_M EQU 5t
 RANDOMX_FREQ_FSCAL_R EQU 6t
-RANDOMX_FREQ_FMUL_R EQU 32t
+RANDOMX_FREQ_FMUL_R EQU 20t
 RANDOMX_FREQ_FDIV_M EQU 4t
 RANDOMX_FREQ_FSQRT_R EQU 6t
-RANDOMX_FREQ_CBRANCH EQU 25t
+RANDOMX_FREQ_CBRANCH EQU 16t
 RANDOMX_FREQ_CFROUND EQU 1t
 RANDOMX_FREQ_ISTORE EQU 16t
 RANDOMX_FREQ_NOP EQU 0t
--- a/src/common.hpp
+++ b/src/common.hpp
@ -67,7 +67,7 @@ namespace randomx {
 	constexpr int wtSum = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_ISUB_R + \
 		RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + \
 		RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M + RANDOMX_FREQ_IMUL_RCP + \
-		RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R + RANDOMX_FREQ_ISWAP_R + \
+		RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_ISWAP_R + \
 		RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + \
 		RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R + RANDOMX_FREQ_CBRANCH + \
 		RANDOMX_FREQ_CFROUND + RANDOMX_FREQ_ISTORE + RANDOMX_FREQ_NOP;
--- a/src/configuration.h
+++ b/src/configuration.h
@ -1,5 +1,6 @@
 /*
 Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+Copyright (c) 2019, Wownero Inc., a Monero Enterprise Alliance partner company

 All rights reserved.

@ -38,7 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define RANDOMX_ARGON_LANES        1

 //Argon2d salt
-#define RANDOMX_ARGON_SALT         "RandomX\x03"
+#define RANDOMX_ARGON_SALT         "RandomWOW\x01"

 //Number of random Cache accesses per Dataset item. Minimum is 2.
 #define RANDOMX_CACHE_ACCESSES     8
@ -56,16 +57,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define RANDOMX_PROGRAM_SIZE       256

 //Number of iterations during VM execution.
-#define RANDOMX_PROGRAM_ITERATIONS 2048
+#define RANDOMX_PROGRAM_ITERATIONS 1024

 //Number of chained VM executions per hash.
-#define RANDOMX_PROGRAM_COUNT      8
+#define RANDOMX_PROGRAM_COUNT      16

 //Scratchpad L3 size in bytes. Must be a power of 2.
-#define RANDOMX_SCRATCHPAD_L3      2097152
+#define RANDOMX_SCRATCHPAD_L3      1048576

 //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3.
-#define RANDOMX_SCRATCHPAD_L2      262144
+#define RANDOMX_SCRATCHPAD_L2      131072

 //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2.
 #define RANDOMX_SCRATCHPAD_L1      16384
@ -82,7 +83,7 @@ Total sum of frequencies must be 256
 */

 //Integer instructions
-#define RANDOMX_FREQ_IADD_RS       16
+#define RANDOMX_FREQ_IADD_RS       25
 #define RANDOMX_FREQ_IADD_M         7
 #define RANDOMX_FREQ_ISUB_R        16
 #define RANDOMX_FREQ_ISUB_M         7
@ -96,23 +97,23 @@ Total sum of frequencies must be 256
 #define RANDOMX_FREQ_INEG_R         2
 #define RANDOMX_FREQ_IXOR_R        15
 #define RANDOMX_FREQ_IXOR_M         5
-#define RANDOMX_FREQ_IROR_R         8
-#define RANDOMX_FREQ_IROL_R         2
+#define RANDOMX_FREQ_IROR_R        10
+#define RANDOMX_FREQ_IROL_R         0
 #define RANDOMX_FREQ_ISWAP_R        4

 //Floating point instructions
-#define RANDOMX_FREQ_FSWAP_R        4
-#define RANDOMX_FREQ_FADD_R        16
+#define RANDOMX_FREQ_FSWAP_R        8
+#define RANDOMX_FREQ_FADD_R        20
 #define RANDOMX_FREQ_FADD_M         5
-#define RANDOMX_FREQ_FSUB_R        16
+#define RANDOMX_FREQ_FSUB_R        20
 #define RANDOMX_FREQ_FSUB_M         5
 #define RANDOMX_FREQ_FSCAL_R        6
-#define RANDOMX_FREQ_FMUL_R        32
+#define RANDOMX_FREQ_FMUL_R        20
 #define RANDOMX_FREQ_FDIV_M         4
 #define RANDOMX_FREQ_FSQRT_R        6

 //Control instructions
-#define RANDOMX_FREQ_CBRANCH       25
+#define RANDOMX_FREQ_CBRANCH       16
 #define RANDOMX_FREQ_CFROUND        1

 //Store instruction
--- a/src/instructions_portable.cpp
+++ b/src/instructions_portable.cpp
@ -157,6 +157,21 @@ void rx_set_rounding_mode(uint32_t mode) {
 	}
 }

+uint32_t rx_get_rounding_mode() {
+	switch (fegetround()) {
+	case FE_DOWNWARD:
+		return RoundDown;
+	case FE_UPWARD:
+		return RoundUp;
+	case FE_TOWARDZERO:
+		return RoundToZero;
+	case FE_TONEAREST:
+		return RoundToNearest;
+	default:
+		UNREACHABLE;
+	}
+}
+
 #endif

 #ifdef RANDOMX_USE_X87
--- a/src/intrin_portable.h
+++ b/src/intrin_portable.h
@ -173,6 +173,10 @@ FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) {
 	_mm_setcsr(rx_mxcsr_default | (mode << 13));
 }

+FORCE_INLINE uint32_t rx_get_rounding_mode() {
+	return (_mm_getcsr() >> 13) & 3;
+}
+
 #elif defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__) //sadly only POWER7 and newer will be able to use SIMD acceleration. Earlier processors cant use doubles or 64 bit integers with SIMD
 #include <cstdint>
 #include <stdexcept>
@ -736,6 +740,8 @@ void rx_reset_float_state();

 void rx_set_rounding_mode(uint32_t mode);

+uint32_t rx_get_rounding_mode();
+
 #endif

 double loadDoublePortable(const void* addr);
--- a/src/jit_compiler.hpp
+++ b/src/jit_compiler.hpp
@ -35,3 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #include "jit_compiler_fallback.hpp"
 #endif
+
+#if defined(__OpenBSD__) || defined(__NetBSD__)
+#define RANDOMX_FORCE_SECURE
+#endif
--- a/src/jit_compiler_x86.cpp
+++ b/src/jit_compiler_x86.cpp
@ -295,7 +295,7 @@ namespace randomx {

 	void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) {
 		instructionOffsets.clear();
-		for (unsigned i = 0; i < 8; ++i) {
+		for (unsigned i = 0; i < RegistersCount; ++i) {
 			registerUsage[i] = -1;
 		}

--- a/src/randomx.cpp
+++ b/src/randomx.cpp
@ -36,13 +36,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cpu.hpp"
 #include <cassert>
 #include <limits>
+#include <cfenv>

 extern "C" {

 	randomx_flags randomx_get_flags() {
 		randomx_flags flags = RANDOMX_HAVE_COMPILER ? RANDOMX_FLAG_JIT : RANDOMX_FLAG_DEFAULT;
 		randomx::Cpu cpu;
-#ifdef __OpenBSD__
+#ifdef RANDOMX_FORCE_SECURE
 		if (flags == RANDOMX_FLAG_JIT) {
 			flags |= RANDOMX_FLAG_SECURE;
 		}
@ -328,7 +329,7 @@ extern "C" {
 	void randomx_vm_set_cache(randomx_vm *machine, randomx_cache* cache) {
 		assert(machine != nullptr);
 		assert(cache != nullptr && cache->isInitialized());
-		if (machine->cacheKey != cache->cacheKey) {
+		if (machine->cacheKey != cache->cacheKey || machine->getMemory() != cache->memory) {
 			machine->setCache(cache);
 			machine->cacheKey = cache->cacheKey;
 		}
@ -349,6 +350,8 @@ extern "C" {
 		assert(machine != nullptr);
 		assert(inputSize == 0 || input != nullptr);
 		assert(output != nullptr);
+		fenv_t fpstate;
+		fegetenv(&fpstate);
 		alignas(16) uint64_t tempHash[8];
 		int blakeResult = blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
 		assert(blakeResult == 0);
@ -361,6 +364,7 @@ extern "C" {
 		}
 		machine->run(&tempHash);
 		machine->getFinalResult(output, RANDOMX_HASH_SIZE);
+		fesetenv(&fpstate);
 	}

 	void randomx_calculate_hash_first(randomx_vm* machine, const void* input, size_t inputSize) {
@ -380,4 +384,14 @@ extern "C" {
 		blake2b(machine->tempHash, sizeof(machine->tempHash), nextInput, nextInputSize, nullptr, 0);
 		machine->hashAndFill(output, RANDOMX_HASH_SIZE, machine->tempHash);
 	}
+
+	void randomx_calculate_hash_last(randomx_vm* machine, void* output) {
+		machine->resetRoundingMode();
+		for (int chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) {
+			machine->run(machine->tempHash);
+			blake2b(machine->tempHash, sizeof(machine->tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
+		}
+		machine->run(machine->tempHash);
+		machine->getFinalResult(output, RANDOMX_HASH_SIZE);
+	}
 }
--- a/src/randomx.h
+++ b/src/randomx.h
@ -240,9 +240,13 @@ RANDOMX_EXPORT void randomx_destroy_vm(randomx_vm *machine);
 RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output);

 /**
- * Paired functions used to calculate multiple RandomX hashes more efficiently.
- * randomx_calculate_hash_first is called for the first input value.
- * randomx_calculate_hash_next will output the hash value of the previous input.
+ * Set of functions used to calculate multiple RandomX hashes more efficiently.
+ * randomx_calculate_hash_first will begin a hash calculation.
+ * randomx_calculate_hash_next  will output the hash value of the previous input
+ *                              and begin the calculation of the next hash.
+ * randomx_calculate_hash_last  will output the hash value of the previous input.
+ *
+ * WARNING: These functions may alter the floating point rounding mode of the calling thread.
 *
 * @param machine is a pointer to a randomx_vm structure. Must not be NULL.
 * @param input is a pointer to memory to be hashed. Must not be NULL.
@ -254,6 +258,7 @@ RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *inpu
 */
 RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, const void* input, size_t inputSize);
 RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, const void* nextInput, size_t nextInputSize, void* output);
+RANDOMX_EXPORT void randomx_calculate_hash_last(randomx_vm* machine, void* output);

 #if defined(__cplusplus)
 }
--- a/src/tests/affinity.cpp
+++ b/src/tests/affinity.cpp
@ -65,7 +65,7 @@ set_thread_affinity(std::thread::native_handle_type thread,
            (thread_policy_t)&policy, 1);
 #elif defined(_WIN32) || defined(__CYGWIN__)
    rc = SetThreadAffinityMask(reinterpret_cast<HANDLE>(thread), 1ULL << cpuid) == 0 ? -2 : 0;
-#elif !defined(__OpenBSD__) && !defined(__FreeBSD__) && !defined(__ANDROID__)
+#elif !defined(__OpenBSD__) && !defined(__FreeBSD__) && !defined(__ANDROID__) && !defined(__NetBSD__)
    cpu_set_t cs;
    CPU_ZERO(&cs);
    CPU_SET(cpuid, &cs);
--- a/src/tests/benchmark.cpp
+++ b/src/tests/benchmark.cpp
@ -40,9 +40,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "../dataset.hpp"
 #include "../blake2/endian.h"
 #include "../common.hpp"
+#include "../jit_compiler.hpp"
 #ifdef _WIN32
 #include <windows.h>
-#include <VersionHelpers.h>
+#include <versionhelpers.h>
 #endif
 #include "affinity.hpp"

@ -94,6 +95,7 @@ void printUsage(const char* executable) {
 	std::cout << "  --ssse3       use optimized Argon2 for SSSE3 CPUs" << std::endl;
 	std::cout << "  --avx2        use optimized Argon2 for AVX2 CPUs" << std::endl;
 	std::cout << "  --auto        select the best options for the current CPU" << std::endl;
+	std::cout << "  --noBatch     calculate hashes one by one (default: batch)" << std::endl;
 }

 struct MemoryException : public std::exception {
@ -109,11 +111,14 @@ struct DatasetAllocException : public MemoryException {
 	}
 };

-void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result, uint32_t noncesCount, int thread, int cpuid=-1) {
+using MineFunc = void(randomx_vm * vm, std::atomic<uint32_t> & atomicNonce, AtomicHash & result, uint32_t noncesCount, int thread, int cpuid);
+
+template<bool batch>
+void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result, uint32_t noncesCount, int thread, int cpuid = -1) {
 	if (cpuid >= 0) {
 		int rc = set_thread_affinity(cpuid);
 		if (rc) {
-			std::cerr << "Failed to set thread affinity for thread " << thread << " (error=" << rc << ")" <<  std::endl;
+			std::cerr << "Failed to set thread affinity for thread " << thread << " (error=" << rc << ")" << std::endl;
 		}
 	}
 	uint64_t hash[RANDOMX_HASH_SIZE / sizeof(uint64_t)];
@ -122,19 +127,27 @@ void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result
 	void* noncePtr = blockTemplate + 39;
 	auto nonce = atomicNonce.fetch_add(1);

-	store32(noncePtr, nonce);
-	randomx_calculate_hash_first(vm, blockTemplate, sizeof(blockTemplate));
+	if (batch) {
+		store32(noncePtr, nonce);
+		randomx_calculate_hash_first(vm, blockTemplate, sizeof(blockTemplate));
+	}

 	while (nonce < noncesCount) {
-		nonce = atomicNonce.fetch_add(1);
+		if (batch) {
+			nonce = atomicNonce.fetch_add(1);
+		}
 		store32(noncePtr, nonce);
-		randomx_calculate_hash_next(vm, blockTemplate, sizeof(blockTemplate), &hash);
+		(batch ? randomx_calculate_hash_next : randomx_calculate_hash)(vm, blockTemplate, sizeof(blockTemplate), &hash);
 		result.xorWith(hash);
+		if (!batch) {
+			nonce = atomicNonce.fetch_add(1);
+		}
 	}
 }

 int main(int argc, char** argv) {
-	bool softAes, miningMode, verificationMode, help, largePages, jit, secure, ssse3, avx2, autoFlags;
+	bool softAes, miningMode, verificationMode, help, largePages, jit, secure;
+	bool ssse3, avx2, autoFlags, noBatch;
 	int noncesCount, threadCount, initThreadCount;
 	uint64_t threadAffinity;
 	int32_t seedValue;
@ -158,10 +171,11 @@ int main(int argc, char** argv) {
 	readOption("--ssse3", argc, argv, ssse3);
 	readOption("--avx2", argc, argv, avx2);
 	readOption("--auto", argc, argv, autoFlags);
+	readOption("--noBatch", argc, argv, noBatch);

 	store32(&seed, seedValue);

-	std::cout << "RandomX benchmark v1.1.7" << std::endl;
+	std::cout << "RandomX benchmark v1.1.8" << std::endl;

 	if (help) {
 		printUsage(argv[0]);
@ -199,7 +213,7 @@ int main(int argc, char** argv) {
 		}
 		if (jit) {
 			flags |= RANDOMX_FLAG_JIT;
-#ifdef __OpenBSD__
+#ifdef RANDOMX_FORCE_SECURE
 			flags |= RANDOMX_FLAG_SECURE;
 #endif
 		}
@ -211,7 +225,7 @@ int main(int argc, char** argv) {
 	if (miningMode) {
 		flags |= RANDOMX_FLAG_FULL_MEM;
 	}
-#ifndef __OpenBSD__
+#ifndef RANDOMX_FORCE_SECURE
 	if (secure) {
 		flags |= RANDOMX_FLAG_SECURE;
 	}
@ -263,6 +277,16 @@ int main(int argc, char** argv) {
 		std::cout << " - thread affinity (" << mask_to_string(threadAffinity) << ")" << std::endl;
 	}

+	MineFunc* func;
+
+	if (noBatch) {
+		func = &mine<false>;
+	}
+	else {
+		func = &mine<true>;
+		std::cout << " - batch mode" << std::endl;
+	}
+
 	std::cout << "Initializing";
 	if (miningMode)
 		std::cout << " (" << initThreadCount << " thread" << (initThreadCount > 1 ? "s)" : ")");
@ -333,14 +357,14 @@ int main(int argc, char** argv) {
 				int cpuid = -1;
 				if (threadAffinity)
 					cpuid = cpuid_from_mask(threadAffinity, i);
-				threads.push_back(std::thread(&mine, vms[i], std::ref(atomicNonce), std::ref(result), noncesCount, i, cpuid));
+				threads.push_back(std::thread(func, vms[i], std::ref(atomicNonce), std::ref(result), noncesCount, i, cpuid));
 			}
 			for (unsigned i = 0; i < threads.size(); ++i) {
 				threads[i].join();
 			}
 		}
 		else {
-			mine(vms[0], std::ref(atomicNonce), std::ref(result), noncesCount, 0);
+			func(vms[0], std::ref(atomicNonce), std::ref(result), noncesCount, 0, -1);
 		}

 		double elapsed = sw.getElapsed();
--- a/src/tests/tests.cpp
+++ b/src/tests/tests.cpp
@ -143,7 +143,7 @@ int main() {
 		randomx::JitCompiler jit;
 		jit.generateSuperscalarHash(cache->programs, cache->reciprocalCache);
 		jit.generateDatasetInitCode();
-#ifdef __OpenBSD__
+#ifdef RANDOMX_FORCE_SECURE
 		jit.enableExecution();
 #else
 		jit.enableAll();
@ -954,7 +954,7 @@ int main() {
 		assert(ibc.memMask == randomx::ScratchpadL3Mask);
 	});

-#ifdef __OpenBSD__
+#ifdef RANDOMX_FORCE_SECURE
 	vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT | RANDOMX_FLAG_SECURE, cache, nullptr);
 #else
 	vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT, cache, nullptr);
@ -1009,10 +1009,10 @@ int main() {
 		vm = nullptr;
 		cache = randomx_alloc_cache(RANDOMX_FLAG_JIT);
 		initCache("test key 000");
-#ifdef __OpenBSD__
-		vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT | RANDOMX_FLAG_SECURE, cache, nullptr);
+#ifdef RANDOMX_FORCE_SECURE
+		vm = randomx_create_vm(RANDOMX_FLAG_JIT | RANDOMX_FLAG_SECURE, cache, nullptr);
 #else
-		vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT, cache, nullptr);
+		vm = randomx_create_vm(RANDOMX_FLAG_JIT, cache, nullptr);
 #endif
 	}

@ -1026,9 +1026,6 @@ int main() {

 	runTest("Hash test 2e (compiler)", RANDOMX_HAVE_COMPILER && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), test_e);

-	randomx_destroy_vm(vm);
-	vm = nullptr;
-
 	auto flags = randomx_get_flags();

 	randomx_release_cache(cache);
@ -1054,6 +1051,40 @@ int main() {
 		assert(cacheMemory[33554431] == 0x1f47f056d05cd99b);
 	});

+	if (cache != nullptr)
+		randomx_release_cache(cache);
+	cache = randomx_alloc_cache(RANDOMX_FLAG_DEFAULT);
+
+	runTest("Hash batch test", RANDOMX_HAVE_COMPILER && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() {
+		char hash1[RANDOMX_HASH_SIZE];
+		char hash2[RANDOMX_HASH_SIZE];
+		char hash3[RANDOMX_HASH_SIZE];
+		initCache("test key 000");
+		char input1[] = "This is a test";
+		char input2[] = "Lorem ipsum dolor sit amet";
+		char input3[] = "sed do eiusmod tempor incididunt ut labore et dolore magna aliqua";
+
+		randomx_calculate_hash_first(vm, input1, sizeof(input1) - 1);
+		randomx_calculate_hash_next(vm, input2, sizeof(input2) - 1, &hash1);
+		randomx_calculate_hash_next(vm, input3, sizeof(input3) - 1, &hash2);
+		randomx_calculate_hash_last(vm, &hash3);
+
+		assert(equalsHex(hash1, "639183aae1bf4c9a35884cb46b09cad9175f04efd7684e7262a0ac1c2f0b4e3f"));
+		assert(equalsHex(hash2, "300a0adb47603dedb42228ccb2b211104f4da45af709cd7547cd049e9489c969"));
+		assert(equalsHex(hash3, "c36d4ed4191e617309867ed66a443be4075014e2b061bcdaf9ce7b721d2b77a8"));
+	});
+
+	runTest("Preserve rounding mode", RANDOMX_FREQ_CFROUND > 0, []() {
+		rx_set_rounding_mode(RoundToNearest);
+		char hash[RANDOMX_HASH_SIZE];
+		calcStringHash("test key 000", "Lorem ipsum dolor sit amet", &hash);
+		assert(equalsHex(hash, "300a0adb47603dedb42228ccb2b211104f4da45af709cd7547cd049e9489c969"));
+		assert(rx_get_rounding_mode() == RoundToNearest);
+	});
+
+	randomx_destroy_vm(vm);
+	vm = nullptr;
+
 	if (cache != nullptr)
 		randomx_release_cache(cache);

--- a/src/virtual_machine.hpp
+++ b/src/virtual_machine.hpp
@ -54,6 +54,9 @@ public:
 	{
 		return program;
 	}
+	const uint8_t* getMemory() const {
+		return mem.memory;
+	}
 protected:
 	void initialize();
 	alignas(64) randomx::Program program;
--- a/src/virtual_memory.cpp
+++ b/src/virtual_memory.cpp
@ -94,7 +94,12 @@ void* allocMemoryPages(std::size_t bytes) {
 	if (mem == nullptr)
 		throw std::runtime_error(getErrorMessage("allocMemoryPages - VirtualAlloc"));
 #else
-	mem = mmap(nullptr, bytes, PAGE_READWRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	#if defined(__NetBSD__)
+		#define RESERVED_FLAGS PROT_MPROTECT(PROT_EXEC)
+	#else
+		#define RESERVED_FLAGS 0
+	#endif
+	mem = mmap(nullptr, bytes, PAGE_READWRITE | RESERVED_FLAGS, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 	if (mem == MAP_FAILED)
 		throw std::runtime_error("allocMemoryPages - mmap failed");
 #endif
@ -141,7 +146,7 @@ void* allocLargePagesMemory(std::size_t bytes) {
 	mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0);
 #elif defined(__FreeBSD__)
 	mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER, -1, 0);
-#elif defined(__OpenBSD__)
+#elif defined(__OpenBSD__) || defined(__NetBSD__)
 	mem = MAP_FAILED; // OpenBSD does not support huge pages
 #else
 	mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0);
--- a/vcxproj/randomx-dll.vcxproj
+++ b/vcxproj/randomx-dll.vcxproj
@ -54,12 +54,17 @@
  <ItemGroup>
    <ClCompile Include="..\src\aes_hash.cpp" />
    <ClCompile Include="..\src\allocator.cpp" />
+    <ClCompile Include="..\src\argon2_avx2.c">
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+    </ClCompile>
    <ClCompile Include="..\src\argon2_core.c" />
    <ClCompile Include="..\src\argon2_ref.c" />
+    <ClCompile Include="..\src\argon2_ssse3.c" />
    <ClCompile Include="..\src\assembly_generator_x86.cpp" />
    <ClCompile Include="..\src\blake2\blake2b.c" />
    <ClCompile Include="..\src\blake2_generator.cpp" />
    <ClCompile Include="..\src\bytecode_machine.cpp" />
+    <ClCompile Include="..\src\cpu.cpp" />
    <ClCompile Include="..\src\dataset.cpp" />
    <ClCompile Include="..\src\instruction.cpp" />
    <ClCompile Include="..\src\instructions_portable.cpp" />
--- a/vcxproj/randomx-dll.vcxproj.filters
+++ b/vcxproj/randomx-dll.vcxproj.filters
@ -172,5 +172,14 @@
    <ClCompile Include="..\src\bytecode_machine.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
+    <ClCompile Include="..\src\argon2_avx2.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\argon2_ssse3.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\cpu.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
  </ItemGroup>
 </Project>
Author	SHA1	Message	Date
fuwa	89b7c02bba	rx-wow-fix-3: Revert "Increase the frequency of CBRANCH (#118 )" This reverts commit `22689eda49`.	2020-07-13 08:55:05 +03:00
fuwa	d2416d6157	rx-wow-fix-2: Revert "Decrease the frequency of FADD/FSUB in favor of FMUL (#77 )" This reverts commit `91cd35ff13`.	2020-07-13 08:54:49 +03:00
fuwa	b4ada42599	rx-wow-fix-1: AesGenerator4R	2020-07-13 08:54:35 +03:00
wowario	597b1e9c91	RandomWOW parameters	2020-07-13 08:54:21 +03:00
tevador	5ce5f4906c	add --noBatch benchmark option	2020-07-04 14:57:56 +02:00
tevador	9905ec9c5a	Merge pull request #188 from cryptonote-social/master replace hardcoded literal with its appropriate symbol	2020-06-28 16:36:40 +02:00
tevador	863765bbe6	Merge pull request #185 from tevador/pr-crosscomp Fix windows-target cross-compilation	2020-06-28 16:36:12 +02:00
tevador	a1c08a2f41	Merge pull request #187 from tevador/pr-netbsd Fix compilation and JIT support on NetBSD 1. Disable hugepages (not supported). 2. Force W^X (required). 3. When allocating JIT memory, PROT_EXEC must be reserved in order to set the pages executable later.	2020-06-28 16:35:19 +02:00
tevador	708a4e50c5	Fix compilation and JIT support on NetBSD: 1. Disable hugepages (not supported). 2. Force W^X (required). 3. When allocating JIT memory, PROT_EXEC must be reserved in order to set the pages executable later.	2020-06-28 16:16:20 +02:00
tevador	6a4afc721f	Merge pull request #189 from tevador/pr-set-cache Fix potential use-after-free when reallocating cache	2020-06-27 20:42:15 +02:00
tevador	32ab5dea54	fix potential use-after-free when reallocating cache	2020-06-27 20:21:06 +02:00
cryptonote-social	a7733de1e7	replace hardcoded literal with its appropriate symbol	2020-06-27 09:53:46 -07:00
tevador	bece0a7206	fix #184	2020-06-09 19:10:56 +02:00
tevador	7741eb1e97	Merge pull request #182 from tevador/pr-restore-fpstate Preserve floating point state when calling randomx_calculate_hash	2020-05-16 23:19:37 +02:00
tevador	148b923f71	fix test 92 not failing properly on GCC/amd64	2020-05-06 13:48:53 +02:00
tevador	6a764e90d0	Preserve floating point state when calling randomx_calculate_hash	2020-05-06 12:42:30 +02:00
tevador	ac574e3743	Merge pull request #179 from tevador/pr-hash-batch Add a missing function to calculate a batch of hashes	2020-02-07 19:33:36 +01:00
tevador	01381ccef3	Add a missing function to calculate a batch of hashes Add a test for batch calculation	2020-02-06 18:14:38 +01:00
tevador	913e495c53	Merge branch 'master' of git@github.com:tevador/RandomX.git	2020-02-06 18:13:52 +01:00
tevador	72ac5e49b6	Update dll project	2019-12-29 19:14:00 +01:00
tevador	bbbb34757b	Add a note about building portable binaries	2019-12-26 12:32:04 +01:00
tevador	a223b6b33b	Fixed an incorrect URL the the documentation	2019-12-18 12:30:49 +01:00