rx-wow-fix-3: Revert "Increase the frequency of CBRANCH (#118 )"

This reverts commit 22689eda49.
rx-wow-fix-2: Revert "Decrease the frequency of FADD/FSUB in favor of FMUL (#77 )"
2020-07-13 08:55:05 +03:00 · 2020-07-13 08:54:49 +03:00 · 2020-07-13 08:54:35 +03:00 · 2020-07-13 08:54:21 +03:00 · 2020-07-04 14:57:56 +02:00 · 2020-06-28 16:36:40 +02:00
30 changed files with 472 additions and 141 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1,3 @@
+.gitignore export-ignore
+.gitattributes export-ignore
+audits export-ignore
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -28,7 +28,9 @@

 cmake_minimum_required(VERSION 2.8.7)

-set (randomx_sources
+project(RandomX)
+
+set(randomx_sources
 src/aes_hash.cpp
 src/argon2_ref.c
 src/argon2_ssse3.c
@ -94,32 +96,50 @@ function(add_flag flag)
 endfunction()

 # x86-64
-if (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STREQUAL "x86-64" OR ARCH_ID STREQUAL "amd64")
+if(ARCH_ID STREQUAL "x86_64" OR ARCH_ID STREQUAL "x86-64" OR ARCH_ID STREQUAL "amd64")
  list(APPEND randomx_sources
-    src/jit_compiler_x86_static.S
    src/jit_compiler_x86.cpp)
-  # cheat because cmake and ccache hate each other
-  set_property(SOURCE src/jit_compiler_x86_static.S PROPERTY LANGUAGE C)
-  set_property(SOURCE src/jit_compiler_x86_static.S PROPERTY XCODE_EXPLICIT_FILE_TYPE sourcecode.asm)

-  if(ARCH STREQUAL "native")
-    add_flag("-march=native")
+  if(MSVC)
+    enable_language(ASM_MASM)
+    list(APPEND randomx_sources src/jit_compiler_x86_static.asm)
+
+    set_property(SOURCE src/jit_compiler_x86_static.asm PROPERTY LANGUAGE ASM_MASM)
+
+    set_source_files_properties(src/argon2_avx2.c COMPILE_FLAGS /arch:AVX2)
+
+    add_custom_command(OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/src/asm/configuration.asm
+      COMMAND powershell -ExecutionPolicy Bypass -File h2inc.ps1 ..\\src\\configuration.h > ..\\src\\asm\\configuration.asm SET ERRORLEVEL = 0
+      COMMENT "Generating configuration.asm at ${CMAKE_CURRENT_SOURCE_DIR}"
+      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vcxproj)
+    add_custom_target(generate-asm
+      DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/asm/configuration.asm)
  else()
-    # default build has hardware AES enabled (software AES can be selected at runtime)
-    add_flag("-maes")
-    check_c_compiler_flag(-mssse3 HAVE_SSSE3)
-    if(HAVE_SSSE3)
-      set_source_files_properties(src/argon2_ssse3.c COMPILE_FLAGS -mssse3)
-    endif()
-    check_c_compiler_flag(-mavx2 HAVE_AVX2)
-    if(HAVE_AVX2)
-      set_source_files_properties(src/argon2_avx2.c COMPILE_FLAGS -mavx2)
+    list(APPEND randomx_sources src/jit_compiler_x86_static.S)
+
+    # cheat because cmake and ccache hate each other
+    set_property(SOURCE src/jit_compiler_x86_static.S PROPERTY LANGUAGE C)
+    set_property(SOURCE src/jit_compiler_x86_static.S PROPERTY XCODE_EXPLICIT_FILE_TYPE sourcecode.asm)
+
+    if(ARCH STREQUAL "native")
+      add_flag("-march=native")
+    else()
+      # default build has hardware AES enabled (software AES can be selected at runtime)
+      add_flag("-maes")
+      check_c_compiler_flag(-mssse3 HAVE_SSSE3)
+      if(HAVE_SSSE3)
+        set_source_files_properties(src/argon2_ssse3.c COMPILE_FLAGS -mssse3)
+      endif()
+      check_c_compiler_flag(-mavx2 HAVE_AVX2)
+      if(HAVE_AVX2)
+        set_source_files_properties(src/argon2_avx2.c COMPILE_FLAGS -mavx2)
+      endif()
    endif()
  endif()
 endif()

 # PowerPC
-if (ARCH_ID STREQUAL "ppc64" OR ARCH_ID STREQUAL "ppc64le")
+if(ARCH_ID STREQUAL "ppc64" OR ARCH_ID STREQUAL "ppc64le")
  if(ARCH STREQUAL "native")
    add_flag("-mcpu=native")
  endif()
@ -127,7 +147,7 @@ if (ARCH_ID STREQUAL "ppc64" OR ARCH_ID STREQUAL "ppc64le")
 endif()

 # ARMv8
-if (ARM_ID STREQUAL "aarch64" OR ARM_ID STREQUAL "arm64" OR ARM_ID STREQUAL "armv8-a")
+if(ARM_ID STREQUAL "aarch64" OR ARM_ID STREQUAL "arm64" OR ARM_ID STREQUAL "armv8-a")
  list(APPEND randomx_sources
    src/jit_compiler_a64_static.S
    src/jit_compiler_a64.cpp)
@ -152,14 +172,22 @@ endif()

 set(RANDOMX_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/src" CACHE STRING "RandomX Include path")

-add_library(randomx
-  ${randomx_sources})
+add_library(randomx ${randomx_sources})
+
+if(TARGET generate-asm)
+  add_dependencies(randomx generate-asm)
+endif()
+
 set_property(TARGET randomx PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET randomx PROPERTY CXX_STANDARD 11)
 set_property(TARGET randomx PROPERTY CXX_STANDARD_REQUIRED ON)
+set_property(TARGET randomx PROPERTY PUBLIC_HEADER src/randomx.h)
+
+include(GNUInstallDirs)
 install(TARGETS randomx
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib)
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

 add_executable(randomx-tests
  src/tests/tests.cpp)
@ -176,7 +204,7 @@ target_link_libraries(randomx-codegen
 set_property(TARGET randomx-codegen PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET randomx-codegen PROPERTY CXX_STANDARD 11)

-if (NOT Threads_FOUND AND UNIX AND NOT APPLE)
+if(NOT Threads_FOUND AND UNIX AND NOT APPLE)
  set(THREADS_PREFER_PTHREAD_FLAG ON)
  find_package(Threads)
 endif()
--- a/README.md
+++ b/README.md
@ -48,6 +48,8 @@ cmake -DARCH=native ..
 make
 ```

+To build portable binaries, omit the `ARCH` option when executing cmake.
+
 ### Windows

 On Windows, it is possible to build using MinGW (same procedure as on Linux) or using Visual Studio (solution file is provided).
@ -63,8 +65,12 @@ RandomX was primarily designed as a PoW algorithm for [Monero](https://www.getmo
 * The key `K` is selected to be the hash of a block in the blockchain - this block is called the 'key block'. For optimal mining and verification performance, the key should change every 2048 blocks (~2.8 days) and there should be a delay of 64 blocks (~2 hours) between the key block and the change of the key `K`. This can be achieved by changing the key when `blockHeight % 2048 == 64` and selecting key block such that `keyBlockHeight % 2048 == 0`.
 * The input `H` is the standard hashing blob with a selected nonce value.

+RandomX was successfully activated on the Monero network on the 30th November 2019.
+
 If you wish to use RandomX as a PoW algorithm for your cryptocurrency, please follow the [configuration guidelines](doc/configuration.md).

+**Note**: To achieve ASIC resistance, the key `K` must change and must not be miner-selectable. We recommend to use blockchain data as the key in a similar way to the Monero example above. If blockchain data cannot be used for some reason, use a predefined sequence of keys.
+
 ### CPU performance
 The table below lists the performance of selected CPUs using the optimal number of threads (T) and large pages (if possible), in hashes per second (H/s). "CNv4" refers to the CryptoNight variant 4 (CN/R) hashrate measured using [XMRig](https://github.com/xmrig/xmrig) v2.14.1. "Fast mode" and "Light mode" are the two modes of RandomX.

@ -106,7 +112,12 @@ Most Intel and AMD CPUs made since 2011 should be fairly efficient at RandomX. M
    * DDR4 memory is limited to about 4000-6000 H/s per channel  (depending on frequency and timings)

 ### Does RandomX facilitate botnets/malware mining or web mining?
-Efficient mining requires more than 2 GiB of memory, which is difficult to hide in an infected computer and disqualifies many low-end machines such as IoT devices. Web mining is infeasible due to the large memory requirement and the lack of directed rounding support for floating point operations in both Javascript and WebAssembly.
+
+Due to the way the algorithm works, mining malware is much easier to detect. [RandomX Sniffer](https://github.com/tevador/randomx-sniffer) is a proof of concept tool that can detect illicit mining activity on Windows.
+
+Efficient mining requires more than 2 GiB of memory, which also disqualifies many low-end machines such as IoT devices, which are often parts of large botnets.
+
+Web mining is infeasible due to the large memory requirement and the lack of directed rounding support for floating point operations in both Javascript and WebAssembly.

 ### Since RandomX uses floating point math, does it give reproducible results on different platforms?

--- a/doc/design.md
+++ b/doc/design.md
@ -255,7 +255,7 @@ The Scratchpad is split into 3 levels to mimic the typical CPU cache hierarchy [
 |----------------|----------|----------|----------|------|
 ARM Cortex A55|2|6|-|[[24](https://www.anandtech.com/show/11441/dynamiq-and-arms-new-cpus-cortex-a75-a55/4)]
 |AMD Zen+|4|12|40|[[25](https://en.wikichip.org/wiki/amd/microarchitectures/zen%2B#Memory_Hierarchy)]|
-|Intel Skylake|4|12|42|[[26](https://en.wikichip.org/wiki/amd/microarchitectures/zen%2B#Memory_Hierarchy)]
+|Intel Skylake|4|12|42|[[26](https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client)#Memory_Hierarchy)]

 The L3 cache is much larger and located further from the CPU core. As a result, its access latencies are much higher and can cause stalls in program execution.

@ -638,7 +638,7 @@ state3 = 00000000000000000000000000000000

 [25] AMD Zen+ Microarchitecture - https://en.wikichip.org/wiki/amd/microarchitectures/zen%2B#Memory_Hierarchy

-[26] Intel Skylake Microarchitecture - https://en.wikichip.org/wiki/amd/microarchitectures/zen%2B#Memory_Hierarchy
+[26] Intel Skylake Microarchitecture - https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(client)#Memory_Hierarchy

 [27] Biryukov et al.: Fast and Tradeoff-Resilient Memory-Hard Functions for
 Cryptocurrencies and Password Hashing - https://eprint.iacr.org/2015/430.pdf Table 2, page 8
@ -647,4 +647,4 @@ Cryptocurrencies and Password Hashing - https://eprint.iacr.org/2015/430.pdf Tab

 [29] 7-Zip File archiver - https://www.7-zip.org/

-[30] TestU01 library - http://simul.iro.umontreal.ca/testu01/tu01.html
+[30] TestU01 library - http://simul.iro.umontreal.ca/testu01/tu01.html
--- a/doc/tevador.asc
+++ b/doc/tevador.asc
@ -0,0 +1,13 @@
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+
+mDMEXd+PeBYJKwYBBAHaRw8BAQdAZ0nqJ+nRYoScG2QLX62pl+WO1+Mkv6Yyt2Kb
+ntGUuLq0G3RldmFkb3IgPHRldmFkb3JAZ21haWwuY29tPoiWBBMWCAA+FiEEMoWj
+LVEwdmMs6CUQWijIaue9c6YFAl3fj3gCGwMFCQWnqDgFCwkIBwIGFQoJCAsCBBYC
+AwECHgECF4AACgkQWijIaue9c6YBFQD+N1XTUqSCZp9jB/yTHQ9ahSaIUMtmuvdT
+So2s+quudP4A/R5wLwukpfGN9UZ4cfpmKCJ9jO1HJ2udmlGMsJbQpDAIuDgEXd+P
+eBIKKwYBBAGXVQEFAQEHQBNbQuPcDojMCkRb5B5u7Ld/AFLClOh+6ElL+u61rIY/
+AwEIB4h+BBgWCAAmFiEEMoWjLVEwdmMs6CUQWijIaue9c6YFAl3fj3gCGwwFCQWn
+qDgACgkQWijIaue9c6YJvgD+IY1Q9mCM1P1iZIoXuafRihXJ7UgVXpQqW2yoaUT3
+bfQA/RkisI2eElYoOjdwPszPP6VfL5+SViwDmDuJG2P5llgE
+=V4vd
+-----END PGP PUBLIC KEY BLOCK-----
--- a/src/aes_hash.cpp
+++ b/src/aes_hash.cpp
@ -175,10 +175,10 @@ template void fillAes1Rx4<false>(void *state, size_t outputSize, void *buffer);
 //key0, key1, key2, key3 = Blake2b-512("RandomX AesGenerator4R keys 0-3")
 //key4, key5, key6, key7 = Blake2b-512("RandomX AesGenerator4R keys 4-7")

-#define AES_GEN_4R_KEY0 0x99e5d23f, 0x2f546d2b, 0xd1833ddb, 0x6421aadd
-#define AES_GEN_4R_KEY1 0xa5dfcde5, 0x06f79d53, 0xb6913f55, 0xb20e3450
-#define AES_GEN_4R_KEY2 0x171c02bf, 0x0aa4679f, 0x515e7baf, 0x5c3ed904
-#define AES_GEN_4R_KEY3 0xd8ded291, 0xcd673785, 0xe78f5d08, 0x85623763
+#define AES_GEN_4R_KEY0 0xcf359e95, 0x141f82b7, 0x7ffbe4a6, 0xf890465d
+#define AES_GEN_4R_KEY1 0x6741ffdc, 0xbd5c5ac3, 0xfee8278a, 0x6a55c450
+#define AES_GEN_4R_KEY2 0x3d324aac, 0xa7279ad2, 0xd524fde4, 0x114c47a4
+#define AES_GEN_4R_KEY3 0x76f6db08, 0x42d3dbd9, 0x99a9aeff, 0x810c3a2a
 #define AES_GEN_4R_KEY4 0x229effb4, 0x3d518b6d, 0xe3d6a7a6, 0xb5826f73
 #define AES_GEN_4R_KEY5 0xb272b7d2, 0xe9024d4e, 0x9c10b3d9, 0xc7566bf3
 #define AES_GEN_4R_KEY6 0xf63befa7, 0x2ba9660a, 0xf765a38b, 0xf273c9e7
@ -197,10 +197,6 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
 	key1 = rx_set_int_vec_i128(AES_GEN_4R_KEY1);
 	key2 = rx_set_int_vec_i128(AES_GEN_4R_KEY2);
 	key3 = rx_set_int_vec_i128(AES_GEN_4R_KEY3);
-	key4 = rx_set_int_vec_i128(AES_GEN_4R_KEY4);
-	key5 = rx_set_int_vec_i128(AES_GEN_4R_KEY5);
-	key6 = rx_set_int_vec_i128(AES_GEN_4R_KEY6);
-	key7 = rx_set_int_vec_i128(AES_GEN_4R_KEY7);

 	state0 = rx_load_vec_i128((rx_vec_i128*)state + 0);
 	state1 = rx_load_vec_i128((rx_vec_i128*)state + 1);
@ -210,23 +206,23 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
 	while (outptr < outputEnd) {
 		state0 = aesdec<softAes>(state0, key0);
 		state1 = aesenc<softAes>(state1, key0);
-		state2 = aesdec<softAes>(state2, key4);
-		state3 = aesenc<softAes>(state3, key4);
+		state2 = aesdec<softAes>(state2, key0);
+		state3 = aesenc<softAes>(state3, key0);

 		state0 = aesdec<softAes>(state0, key1);
 		state1 = aesenc<softAes>(state1, key1);
-		state2 = aesdec<softAes>(state2, key5);
-		state3 = aesenc<softAes>(state3, key5);
+		state2 = aesdec<softAes>(state2, key1);
+		state3 = aesenc<softAes>(state3, key1);

 		state0 = aesdec<softAes>(state0, key2);
 		state1 = aesenc<softAes>(state1, key2);
-		state2 = aesdec<softAes>(state2, key6);
-		state3 = aesenc<softAes>(state3, key6);
+		state2 = aesdec<softAes>(state2, key2);
+		state3 = aesenc<softAes>(state3, key2);

 		state0 = aesdec<softAes>(state0, key3);
 		state1 = aesenc<softAes>(state1, key3);
-		state2 = aesdec<softAes>(state2, key7);
-		state3 = aesenc<softAes>(state3, key7);
+		state2 = aesdec<softAes>(state2, key3);
+		state3 = aesenc<softAes>(state3, key3);

 		rx_store_vec_i128((rx_vec_i128*)outptr + 0, state0);
 		rx_store_vec_i128((rx_vec_i128*)outptr + 1, state1);
@ -239,3 +235,84 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {

 template void fillAes4Rx4<true>(void *state, size_t outputSize, void *buffer);
 template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);
+
+template<bool softAes>
+void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
+	uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
+	const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;
+
+	// initial state
+	rx_vec_i128 hash_state0 = rx_set_int_vec_i128(AES_HASH_1R_STATE0);
+	rx_vec_i128 hash_state1 = rx_set_int_vec_i128(AES_HASH_1R_STATE1);
+	rx_vec_i128 hash_state2 = rx_set_int_vec_i128(AES_HASH_1R_STATE2);
+	rx_vec_i128 hash_state3 = rx_set_int_vec_i128(AES_HASH_1R_STATE3);
+
+	const rx_vec_i128 key0 = rx_set_int_vec_i128(AES_GEN_1R_KEY0);
+	const rx_vec_i128 key1 = rx_set_int_vec_i128(AES_GEN_1R_KEY1);
+	const rx_vec_i128 key2 = rx_set_int_vec_i128(AES_GEN_1R_KEY2);
+	const rx_vec_i128 key3 = rx_set_int_vec_i128(AES_GEN_1R_KEY3);
+
+	rx_vec_i128 fill_state0 = rx_load_vec_i128((rx_vec_i128*)fill_state + 0);
+	rx_vec_i128 fill_state1 = rx_load_vec_i128((rx_vec_i128*)fill_state + 1);
+	rx_vec_i128 fill_state2 = rx_load_vec_i128((rx_vec_i128*)fill_state + 2);
+	rx_vec_i128 fill_state3 = rx_load_vec_i128((rx_vec_i128*)fill_state + 3);
+
+	constexpr int PREFETCH_DISTANCE = 4096;
+	const char* prefetchPtr = ((const char*)scratchpad) + PREFETCH_DISTANCE;
+	scratchpadEnd -= PREFETCH_DISTANCE;
+
+	for (int i = 0; i < 2; ++i) {
+		//process 64 bytes at a time in 4 lanes
+		while (scratchpadPtr < scratchpadEnd) {
+			hash_state0 = aesenc<softAes>(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 0));
+			hash_state1 = aesdec<softAes>(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 1));
+			hash_state2 = aesenc<softAes>(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 2));
+			hash_state3 = aesdec<softAes>(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 3));
+
+			fill_state0 = aesdec<softAes>(fill_state0, key0);
+			fill_state1 = aesenc<softAes>(fill_state1, key1);
+			fill_state2 = aesdec<softAes>(fill_state2, key2);
+			fill_state3 = aesenc<softAes>(fill_state3, key3);
+
+			rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 0, fill_state0);
+			rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 1, fill_state1);
+			rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 2, fill_state2);
+			rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 3, fill_state3);
+
+			rx_prefetch_t0(prefetchPtr);
+
+			scratchpadPtr += 64;
+			prefetchPtr += 64;
+		}
+		prefetchPtr = (const char*) scratchpad;
+		scratchpadEnd += PREFETCH_DISTANCE;
+	}
+
+	rx_store_vec_i128((rx_vec_i128*)fill_state + 0, fill_state0);
+	rx_store_vec_i128((rx_vec_i128*)fill_state + 1, fill_state1);
+	rx_store_vec_i128((rx_vec_i128*)fill_state + 2, fill_state2);
+	rx_store_vec_i128((rx_vec_i128*)fill_state + 3, fill_state3);
+
+	//two extra rounds to achieve full diffusion
+	rx_vec_i128 xkey0 = rx_set_int_vec_i128(AES_HASH_1R_XKEY0);
+	rx_vec_i128 xkey1 = rx_set_int_vec_i128(AES_HASH_1R_XKEY1);
+
+	hash_state0 = aesenc<softAes>(hash_state0, xkey0);
+	hash_state1 = aesdec<softAes>(hash_state1, xkey0);
+	hash_state2 = aesenc<softAes>(hash_state2, xkey0);
+	hash_state3 = aesdec<softAes>(hash_state3, xkey0);
+
+	hash_state0 = aesenc<softAes>(hash_state0, xkey1);
+	hash_state1 = aesdec<softAes>(hash_state1, xkey1);
+	hash_state2 = aesenc<softAes>(hash_state2, xkey1);
+	hash_state3 = aesdec<softAes>(hash_state3, xkey1);
+
+	//output hash
+	rx_store_vec_i128((rx_vec_i128*)hash + 0, hash_state0);
+	rx_store_vec_i128((rx_vec_i128*)hash + 1, hash_state1);
+	rx_store_vec_i128((rx_vec_i128*)hash + 2, hash_state2);
+	rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3);
+}
+
+template void hashAndFillAes1Rx4<false>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
+template void hashAndFillAes1Rx4<true>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
--- a/src/aes_hash.hpp
+++ b/src/aes_hash.hpp
@ -38,3 +38,6 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer);

 template<bool softAes>
 void fillAes4Rx4(void *state, size_t outputSize, void *buffer);
+
+template<bool softAes>
+void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
--- a/src/allocator.cpp
+++ b/src/allocator.cpp
@ -47,7 +47,7 @@ namespace randomx {
 		rx_aligned_free(ptr);
 	}

-	template class AlignedAllocator<CacheLineSize>;
+	template struct AlignedAllocator<CacheLineSize>;

 	void* LargePageAllocator::allocMemory(size_t count) {
 		return allocLargePagesMemory(count);
--- a/src/asm/configuration.asm
+++ b/src/asm/configuration.asm
@ -15,7 +15,7 @@ RANDOMX_SCRATCHPAD_L2 EQU 262144t
 RANDOMX_SCRATCHPAD_L1 EQU 16384t
 RANDOMX_JUMP_BITS EQU 8t
 RANDOMX_JUMP_OFFSET EQU 8t
-RANDOMX_FREQ_IADD_RS EQU 16t
+RANDOMX_FREQ_IADD_RS EQU 25t
 RANDOMX_FREQ_IADD_M EQU 7t
 RANDOMX_FREQ_ISUB_R EQU 16t
 RANDOMX_FREQ_ISUB_M EQU 7t
@ -29,19 +29,19 @@ RANDOMX_FREQ_IMUL_RCP EQU 8t
 RANDOMX_FREQ_INEG_R EQU 2t
 RANDOMX_FREQ_IXOR_R EQU 15t
 RANDOMX_FREQ_IXOR_M EQU 5t
-RANDOMX_FREQ_IROR_R EQU 8t
-RANDOMX_FREQ_IROL_R EQU 2t
+RANDOMX_FREQ_IROR_R EQU 10t
+RANDOMX_FREQ_IROL_R EQU 0t
 RANDOMX_FREQ_ISWAP_R EQU 4t
-RANDOMX_FREQ_FSWAP_R EQU 4t
-RANDOMX_FREQ_FADD_R EQU 16t
+RANDOMX_FREQ_FSWAP_R EQU 8t
+RANDOMX_FREQ_FADD_R EQU 20t
 RANDOMX_FREQ_FADD_M EQU 5t
-RANDOMX_FREQ_FSUB_R EQU 16t
+RANDOMX_FREQ_FSUB_R EQU 20t
 RANDOMX_FREQ_FSUB_M EQU 5t
 RANDOMX_FREQ_FSCAL_R EQU 6t
-RANDOMX_FREQ_FMUL_R EQU 32t
+RANDOMX_FREQ_FMUL_R EQU 20t
 RANDOMX_FREQ_FDIV_M EQU 4t
 RANDOMX_FREQ_FSQRT_R EQU 6t
-RANDOMX_FREQ_CBRANCH EQU 25t
+RANDOMX_FREQ_CBRANCH EQU 16t
 RANDOMX_FREQ_CFROUND EQU 1t
 RANDOMX_FREQ_ISTORE EQU 16t
 RANDOMX_FREQ_NOP EQU 0t
--- a/src/common.hpp
+++ b/src/common.hpp
@ -67,7 +67,7 @@ namespace randomx {
 	constexpr int wtSum = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_ISUB_R + \
 		RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + \
 		RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M + RANDOMX_FREQ_IMUL_RCP + \
-		RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R + RANDOMX_FREQ_ISWAP_R + \
+		RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_ISWAP_R + \
 		RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + \
 		RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R + RANDOMX_FREQ_CBRANCH + \
 		RANDOMX_FREQ_CFROUND + RANDOMX_FREQ_ISTORE + RANDOMX_FREQ_NOP;
--- a/src/configuration.h
+++ b/src/configuration.h
@ -1,5 +1,6 @@
 /*
 Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+Copyright (c) 2019, Wownero Inc., a Monero Enterprise Alliance partner company

 All rights reserved.

@ -38,7 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define RANDOMX_ARGON_LANES        1

 //Argon2d salt
-#define RANDOMX_ARGON_SALT         "RandomX\x03"
+#define RANDOMX_ARGON_SALT         "RandomWOW\x01"

 //Number of random Cache accesses per Dataset item. Minimum is 2.
 #define RANDOMX_CACHE_ACCESSES     8
@ -56,16 +57,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define RANDOMX_PROGRAM_SIZE       256

 //Number of iterations during VM execution.
-#define RANDOMX_PROGRAM_ITERATIONS 2048
+#define RANDOMX_PROGRAM_ITERATIONS 1024

 //Number of chained VM executions per hash.
-#define RANDOMX_PROGRAM_COUNT      8
+#define RANDOMX_PROGRAM_COUNT      16

 //Scratchpad L3 size in bytes. Must be a power of 2.
-#define RANDOMX_SCRATCHPAD_L3      2097152
+#define RANDOMX_SCRATCHPAD_L3      1048576

 //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3.
-#define RANDOMX_SCRATCHPAD_L2      262144
+#define RANDOMX_SCRATCHPAD_L2      131072

 //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2.
 #define RANDOMX_SCRATCHPAD_L1      16384
@ -82,7 +83,7 @@ Total sum of frequencies must be 256
 */

 //Integer instructions
-#define RANDOMX_FREQ_IADD_RS       16
+#define RANDOMX_FREQ_IADD_RS       25
 #define RANDOMX_FREQ_IADD_M         7
 #define RANDOMX_FREQ_ISUB_R        16
 #define RANDOMX_FREQ_ISUB_M         7
@ -96,23 +97,23 @@ Total sum of frequencies must be 256
 #define RANDOMX_FREQ_INEG_R         2
 #define RANDOMX_FREQ_IXOR_R        15
 #define RANDOMX_FREQ_IXOR_M         5
-#define RANDOMX_FREQ_IROR_R         8
-#define RANDOMX_FREQ_IROL_R         2
+#define RANDOMX_FREQ_IROR_R        10
+#define RANDOMX_FREQ_IROL_R         0
 #define RANDOMX_FREQ_ISWAP_R        4

 //Floating point instructions
-#define RANDOMX_FREQ_FSWAP_R        4
-#define RANDOMX_FREQ_FADD_R        16
+#define RANDOMX_FREQ_FSWAP_R        8
+#define RANDOMX_FREQ_FADD_R        20
 #define RANDOMX_FREQ_FADD_M         5
-#define RANDOMX_FREQ_FSUB_R        16
+#define RANDOMX_FREQ_FSUB_R        20
 #define RANDOMX_FREQ_FSUB_M         5
 #define RANDOMX_FREQ_FSCAL_R        6
-#define RANDOMX_FREQ_FMUL_R        32
+#define RANDOMX_FREQ_FMUL_R        20
 #define RANDOMX_FREQ_FDIV_M         4
 #define RANDOMX_FREQ_FSQRT_R        6

 //Control instructions
-#define RANDOMX_FREQ_CBRANCH       25
+#define RANDOMX_FREQ_CBRANCH       16
 #define RANDOMX_FREQ_CFROUND        1

 //Store instruction
--- a/src/dataset.hpp
+++ b/src/dataset.hpp
@ -61,8 +61,17 @@ struct randomx_cache {

 //A pointer to a standard-layout struct object points to its initial member
 static_assert(std::is_standard_layout<randomx_dataset>(), "randomx_dataset must be a standard-layout struct");
+
 //the following assert fails when compiling Debug in Visual Studio (JIT mode will crash in Debug)
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && defined(_DEBUG)
+#define TO_STR(x) #x
+#define STR(x) TO_STR(x)
+#pragma message ( __FILE__ "(" STR(__LINE__) ") warning: check std::is_standard_layout<randomx_cache>() is disabled for Debug configuration. JIT mode will crash." )
+#undef STR
+#undef TO_STR
+#else
 static_assert(std::is_standard_layout<randomx_cache>(), "randomx_cache must be a standard-layout struct");
+#endif

 namespace randomx {

--- a/src/instructions_portable.cpp
+++ b/src/instructions_portable.cpp
@ -157,6 +157,21 @@ void rx_set_rounding_mode(uint32_t mode) {
 	}
 }

+uint32_t rx_get_rounding_mode() {
+	switch (fegetround()) {
+	case FE_DOWNWARD:
+		return RoundDown;
+	case FE_UPWARD:
+		return RoundUp;
+	case FE_TOWARDZERO:
+		return RoundToZero;
+	case FE_TONEAREST:
+		return RoundToNearest;
+	default:
+		UNREACHABLE;
+	}
+}
+
 #endif

 #ifdef RANDOMX_USE_X87
--- a/src/intrin_portable.h
+++ b/src/intrin_portable.h
@ -102,6 +102,7 @@ typedef __m128d rx_vec_f128;
 #define rx_aligned_alloc(a, b) _mm_malloc(a,b)
 #define rx_aligned_free(a) _mm_free(a)
 #define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
+#define rx_prefetch_t0(x) _mm_prefetch((const char *)(x), _MM_HINT_T0)

 #define rx_load_vec_f128 _mm_load_pd
 #define rx_store_vec_f128 _mm_store_pd
@ -172,6 +173,10 @@ FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) {
 	_mm_setcsr(rx_mxcsr_default | (mode << 13));
 }

+FORCE_INLINE uint32_t rx_get_rounding_mode() {
+	return (_mm_getcsr() >> 13) & 3;
+}
+
 #elif defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__) //sadly only POWER7 and newer will be able to use SIMD acceleration. Earlier processors cant use doubles or 64 bit integers with SIMD
 #include <cstdint>
 #include <stdexcept>
@ -201,6 +206,7 @@ typedef union{
 #define rx_aligned_alloc(a, b) malloc(a)
 #define rx_aligned_free(a) free(a)
 #define rx_prefetch_nta(x)
+#define rx_prefetch_t0(x)

 /* Splat 64-bit long long to 2 64-bit long longs */
 FORCE_INLINE __m128i vec_splat2sd (int64_t scalar)
@ -399,6 +405,10 @@ inline void rx_prefetch_nta(void* ptr) {
 	asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
 }

+inline void rx_prefetch_t0(const void* ptr) {
+	asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
+}
+
 FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
 	return vld1q_f64((const float64_t*)pd);
 }
@ -532,6 +542,7 @@ typedef union {
 #define rx_aligned_alloc(a, b) malloc(a)
 #define rx_aligned_free(a) free(a)
 #define rx_prefetch_nta(x)
+#define rx_prefetch_t0(x)

 FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
 	rx_vec_f128 x;
@ -729,6 +740,8 @@ void rx_reset_float_state();

 void rx_set_rounding_mode(uint32_t mode);

+uint32_t rx_get_rounding_mode();
+
 #endif

 double loadDoublePortable(const void* addr);
--- a/src/jit_compiler.hpp
+++ b/src/jit_compiler.hpp
@ -35,3 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #include "jit_compiler_fallback.hpp"
 #endif
+
+#if defined(__OpenBSD__) || defined(__NetBSD__)
+#define RANDOMX_FORCE_SECURE
+#endif
--- a/src/jit_compiler_a64.hpp
+++ b/src/jit_compiler_a64.hpp
@ -38,7 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace randomx {

 	class Program;
-	class ProgramConfiguration;
+	struct ProgramConfiguration;
 	class SuperscalarProgram;
 	class Instruction;

--- a/src/jit_compiler_a64_static.S
+++ b/src/jit_compiler_a64_static.S
@ -25,26 +25,32 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+#if defined(__APPLE__)
+#define DECL(x) _##x
+#else
+#define DECL(x) x
+#endif
+
 	.arch armv8-a
 	.text
-	.global	randomx_program_aarch64
-	.global	randomx_program_aarch64_main_loop
-	.global	randomx_program_aarch64_vm_instructions
-	.global randomx_program_aarch64_imul_rcp_literals_end
-	.global	randomx_program_aarch64_vm_instructions_end
-	.global randomx_program_aarch64_cacheline_align_mask1
-	.global randomx_program_aarch64_cacheline_align_mask2
-	.global randomx_program_aarch64_update_spMix1
-	.global randomx_program_aarch64_vm_instructions_end_light
-	.global randomx_program_aarch64_light_cacheline_align_mask
-	.global randomx_program_aarch64_light_dataset_offset
-	.global randomx_init_dataset_aarch64
-	.global randomx_init_dataset_aarch64_end
-	.global randomx_calc_dataset_item_aarch64
-	.global randomx_calc_dataset_item_aarch64_prefetch
-	.global randomx_calc_dataset_item_aarch64_mix
-	.global randomx_calc_dataset_item_aarch64_store_result
-	.global randomx_calc_dataset_item_aarch64_end
+	.global DECL(randomx_program_aarch64)
+	.global DECL(randomx_program_aarch64_main_loop)
+	.global DECL(randomx_program_aarch64_vm_instructions)
+	.global DECL(randomx_program_aarch64_imul_rcp_literals_end)
+	.global DECL(randomx_program_aarch64_vm_instructions_end)
+	.global DECL(randomx_program_aarch64_cacheline_align_mask1)
+	.global DECL(randomx_program_aarch64_cacheline_align_mask2)
+	.global DECL(randomx_program_aarch64_update_spMix1)
+	.global DECL(randomx_program_aarch64_vm_instructions_end_light)
+	.global DECL(randomx_program_aarch64_light_cacheline_align_mask)
+	.global DECL(randomx_program_aarch64_light_dataset_offset)
+	.global DECL(randomx_init_dataset_aarch64)
+	.global DECL(randomx_init_dataset_aarch64_end)
+	.global DECL(randomx_calc_dataset_item_aarch64)
+	.global DECL(randomx_calc_dataset_item_aarch64_prefetch)
+	.global DECL(randomx_calc_dataset_item_aarch64_mix)
+	.global DECL(randomx_calc_dataset_item_aarch64_store_result)
+	.global DECL(randomx_calc_dataset_item_aarch64_end)

 #include "configuration.h"

@ -101,7 +107,7 @@
 # v31 -> scale mask   = 0x81f000000000000081f0000000000000

 	.balign 4
-randomx_program_aarch64:
+DECL(randomx_program_aarch64):
 	# Save callee-saved registers
 	sub	sp, sp, 192
 	stp	x16, x17, [sp]
@ -189,7 +195,7 @@ randomx_program_aarch64:
 	ldr	q14, literal_v14
 	ldr	q15, literal_v15

-randomx_program_aarch64_main_loop:
+DECL(randomx_program_aarch64_main_loop):
 	# spAddr0 = spMix1 & ScratchpadL3Mask64;
 	# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
 	lsr	x18, x10, 32
@ -262,7 +268,7 @@ randomx_program_aarch64_main_loop:
 	orr	v23.16b, v23.16b, v30.16b

 	# Execute VM instructions
-randomx_program_aarch64_vm_instructions:
+DECL(randomx_program_aarch64_vm_instructions):

 	# buffer for generated instructions
 	# FDIV_M is the largest instruction taking up to 12 ARMv8 instructions
@ -281,7 +287,7 @@ literal_x27: .fill 1,8,0
 literal_x28: .fill 1,8,0
 literal_x29: .fill 1,8,0
 literal_x30: .fill 1,8,0
-randomx_program_aarch64_imul_rcp_literals_end:
+DECL(randomx_program_aarch64_imul_rcp_literals_end):

 literal_v0:  .fill 2,8,0
 literal_v1:  .fill 2,8,0
@ -300,14 +306,14 @@ literal_v13: .fill 2,8,0
 literal_v14: .fill 2,8,0
 literal_v15: .fill 2,8,0

-randomx_program_aarch64_vm_instructions_end:
+DECL(randomx_program_aarch64_vm_instructions_end):

 	# mx ^= r[readReg2] ^ r[readReg3];
 	eor	x9, x9, x18

 	# Calculate dataset pointer for dataset prefetch
 	mov	w18, w9
-randomx_program_aarch64_cacheline_align_mask1:
+DECL(randomx_program_aarch64_cacheline_align_mask1):
 	# Actual mask will be inserted by JIT compiler
 	and	x18, x18, 1
 	add	x18, x18, x1
@ -320,12 +326,12 @@ randomx_program_aarch64_cacheline_align_mask1:

 	# Calculate dataset pointer for dataset read
 	mov	w10, w9
-randomx_program_aarch64_cacheline_align_mask2:
+DECL(randomx_program_aarch64_cacheline_align_mask2):
 	# Actual mask will be inserted by JIT compiler
 	and	x10, x10, 1
 	add	x10, x10, x1

-randomx_program_aarch64_xor_with_dataset_line:
+DECL(randomx_program_aarch64_xor_with_dataset_line):
 	# xor integer registers with dataset data
 	ldp	x18, x19, [x10]
 	eor	x4, x4, x18
@ -340,7 +346,7 @@ randomx_program_aarch64_xor_with_dataset_line:
 	eor	x14, x14, x18
 	eor	x15, x15, x19

-randomx_program_aarch64_update_spMix1:
+DECL(randomx_program_aarch64_update_spMix1):
 	# JIT compiler will replace it with "eor x10, config.readReg0, config.readReg1"
 	eor	x10, x0, x0

@ -361,7 +367,7 @@ randomx_program_aarch64_update_spMix1:
 	stp	q18, q19, [x16, 32]

 	subs	x3, x3, 1
-	bne	randomx_program_aarch64_main_loop
+	bne	DECL(randomx_program_aarch64_main_loop)
 	
 	# Restore x0
 	ldr	x0, [sp], 16
@ -395,7 +401,7 @@ randomx_program_aarch64_update_spMix1:

 	ret

-randomx_program_aarch64_vm_instructions_end_light:
+DECL(randomx_program_aarch64_vm_instructions_end_light):
 	sub	sp, sp, 96
 	stp	x0, x1, [sp, 64]
 	stp	x2, x30, [sp, 80]
@ -412,26 +418,26 @@ randomx_program_aarch64_vm_instructions_end_light:
 	# x1 -> pointer to output
 	mov	x1, sp

-randomx_program_aarch64_light_cacheline_align_mask:
+DECL(randomx_program_aarch64_light_cacheline_align_mask):
 	# Actual mask will be inserted by JIT compiler
 	and	w2, w9, 1

 	# x2 -> item number
 	lsr	x2, x2, 6

-randomx_program_aarch64_light_dataset_offset:
+DECL(randomx_program_aarch64_light_dataset_offset):
 	# Apply dataset offset (filled in by JIT compiler)
 	add	x2, x2, 0
 	add	x2, x2, 0

-	bl	randomx_calc_dataset_item_aarch64
+	bl	DECL(randomx_calc_dataset_item_aarch64)

 	mov	x10, sp
 	ldp	x0, x1, [sp, 64]
 	ldp	x2, x30, [sp, 80]
 	add	sp, sp, 96

-	b	randomx_program_aarch64_xor_with_dataset_line
+	b	DECL(randomx_program_aarch64_xor_with_dataset_line)



@ -442,26 +448,26 @@ randomx_program_aarch64_light_dataset_offset:
 # x2 -> start item
 # x3 -> end item

-randomx_init_dataset_aarch64:
+DECL(randomx_init_dataset_aarch64):
 	# Save x30 (return address)
 	str	x30, [sp, -16]!

 	# Load pointer to cache memory
 	ldr	x0, [x0]

-randomx_init_dataset_aarch64_main_loop:
-	bl	randomx_calc_dataset_item_aarch64
+DECL(randomx_init_dataset_aarch64_main_loop):
+	bl	DECL(randomx_calc_dataset_item_aarch64)
 	add	x1, x1, 64
 	add	x2, x2, 1
 	cmp	x2, x3
-	bne	randomx_init_dataset_aarch64_main_loop
+	bne	DECL(randomx_init_dataset_aarch64_main_loop)

 	# Restore x30 (return address)
 	ldr	x30, [sp], 16

 	ret

-randomx_init_dataset_aarch64_end:
+DECL(randomx_init_dataset_aarch64_end):

 # Input parameters
 #
@ -479,7 +485,7 @@ randomx_init_dataset_aarch64_end:
 # x12 -> temporary
 # x13 -> temporary

-randomx_calc_dataset_item_aarch64:
+DECL(randomx_calc_dataset_item_aarch64):
 	sub	sp, sp, 112
 	stp	x0, x1, [sp]
 	stp	x2, x3, [sp, 16]
@ -526,7 +532,7 @@ randomx_calc_dataset_item_aarch64:
 	ldr	x12, superscalarAdd7
 	eor	x7, x0, x12

-	b	randomx_calc_dataset_item_aarch64_prefetch
+	b	DECL(randomx_calc_dataset_item_aarch64_prefetch)

 superscalarMul0: .quad 6364136223846793005
 superscalarAdd1: .quad 9298411001130361340
@ -539,7 +545,7 @@ superscalarAdd7: .quad 9549104520008361294

 # Prefetch -> SuperScalar hash -> Mix will be repeated N times

-randomx_calc_dataset_item_aarch64_prefetch:
+DECL(randomx_calc_dataset_item_aarch64_prefetch):
 	# Actual mask will be inserted by JIT compiler
 	and	x11, x10, 1
 	add	x11, x8, x11, lsl 6
@ -547,7 +553,7 @@ randomx_calc_dataset_item_aarch64_prefetch:

 	# Generated SuperScalar hash program goes here

-randomx_calc_dataset_item_aarch64_mix:
+DECL(randomx_calc_dataset_item_aarch64_mix):
 	ldp	x12, x13, [x11]
 	eor	x0, x0, x12
 	eor	x1, x1, x13
@ -561,7 +567,7 @@ randomx_calc_dataset_item_aarch64_mix:
 	eor	x6, x6, x12
 	eor	x7, x7, x13

-randomx_calc_dataset_item_aarch64_store_result:
+DECL(randomx_calc_dataset_item_aarch64_store_result):
 	stp	x0, x1, [x9]
 	stp	x2, x3, [x9, 16]
 	stp	x4, x5, [x9, 32]
@ -578,4 +584,4 @@ randomx_calc_dataset_item_aarch64_store_result:

 	ret

-randomx_calc_dataset_item_aarch64_end:
+DECL(randomx_calc_dataset_item_aarch64_end):
--- a/src/jit_compiler_fallback.hpp
+++ b/src/jit_compiler_fallback.hpp
@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace randomx {

 	class Program;
-	class ProgramConfiguration;
+	struct ProgramConfiguration;
 	class SuperscalarProgram;

 	class JitCompilerFallback {
--- a/src/jit_compiler_x86.cpp
+++ b/src/jit_compiler_x86.cpp
@ -295,7 +295,7 @@ namespace randomx {

 	void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) {
 		instructionOffsets.clear();
-		for (unsigned i = 0; i < 8; ++i) {
+		for (unsigned i = 0; i < RegistersCount; ++i) {
 			registerUsage[i] = -1;
 		}

--- a/src/jit_compiler_x86.hpp
+++ b/src/jit_compiler_x86.hpp
@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace randomx {

 	class Program;
-	class ProgramConfiguration;
+	struct ProgramConfiguration;
 	class SuperscalarProgram;
 	class JitCompilerX86;
 	class Instruction;
--- a/src/randomx.cpp
+++ b/src/randomx.cpp
@ -36,13 +36,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cpu.hpp"
 #include <cassert>
 #include <limits>
+#include <cfenv>

 extern "C" {

 	randomx_flags randomx_get_flags() {
 		randomx_flags flags = RANDOMX_HAVE_COMPILER ? RANDOMX_FLAG_JIT : RANDOMX_FLAG_DEFAULT;
 		randomx::Cpu cpu;
-#ifdef __OpenBSD__
+#ifdef RANDOMX_FORCE_SECURE
 		if (flags == RANDOMX_FLAG_JIT) {
 			flags |= RANDOMX_FLAG_SECURE;
 		}
@ -328,7 +329,7 @@ extern "C" {
 	void randomx_vm_set_cache(randomx_vm *machine, randomx_cache* cache) {
 		assert(machine != nullptr);
 		assert(cache != nullptr && cache->isInitialized());
-		if (machine->cacheKey != cache->cacheKey) {
+		if (machine->cacheKey != cache->cacheKey || machine->getMemory() != cache->memory) {
 			machine->setCache(cache);
 			machine->cacheKey = cache->cacheKey;
 		}
@ -349,6 +350,8 @@ extern "C" {
 		assert(machine != nullptr);
 		assert(inputSize == 0 || input != nullptr);
 		assert(output != nullptr);
+		fenv_t fpstate;
+		fegetenv(&fpstate);
 		alignas(16) uint64_t tempHash[8];
 		int blakeResult = blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
 		assert(blakeResult == 0);
@ -361,6 +364,34 @@ extern "C" {
 		}
 		machine->run(&tempHash);
 		machine->getFinalResult(output, RANDOMX_HASH_SIZE);
+		fesetenv(&fpstate);
 	}

+	void randomx_calculate_hash_first(randomx_vm* machine, const void* input, size_t inputSize) {
+		blake2b(machine->tempHash, sizeof(machine->tempHash), input, inputSize, nullptr, 0);
+		machine->initScratchpad(machine->tempHash);
+	}
+
+	void randomx_calculate_hash_next(randomx_vm* machine, const void* nextInput, size_t nextInputSize, void* output) {
+		machine->resetRoundingMode();
+		for (uint32_t chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) {
+			machine->run(machine->tempHash);
+			blake2b(machine->tempHash, sizeof(machine->tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
+		}
+		machine->run(machine->tempHash);
+
+		// Finish current hash and fill the scratchpad for the next hash at the same time
+		blake2b(machine->tempHash, sizeof(machine->tempHash), nextInput, nextInputSize, nullptr, 0);
+		machine->hashAndFill(output, RANDOMX_HASH_SIZE, machine->tempHash);
+	}
+
+	void randomx_calculate_hash_last(randomx_vm* machine, void* output) {
+		machine->resetRoundingMode();
+		for (int chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) {
+			machine->run(machine->tempHash);
+			blake2b(machine->tempHash, sizeof(machine->tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
+		}
+		machine->run(machine->tempHash);
+		machine->getFinalResult(output, RANDOMX_HASH_SIZE);
+	}
 }
--- a/src/randomx.h
+++ b/src/randomx.h
@ -30,6 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define RANDOMX_H

 #include <stddef.h>
+#include <stdint.h>

 #define RANDOMX_HASH_SIZE 32
 #define RANDOMX_DATASET_ITEM_SIZE 64
@ -238,6 +239,27 @@ RANDOMX_EXPORT void randomx_destroy_vm(randomx_vm *machine);
 */
 RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output);

+/**
+ * Set of functions used to calculate multiple RandomX hashes more efficiently.
+ * randomx_calculate_hash_first will begin a hash calculation.
+ * randomx_calculate_hash_next  will output the hash value of the previous input
+ *                              and begin the calculation of the next hash.
+ * randomx_calculate_hash_last  will output the hash value of the previous input.
+ *
+ * WARNING: These functions may alter the floating point rounding mode of the calling thread.
+ *
+ * @param machine is a pointer to a randomx_vm structure. Must not be NULL.
+ * @param input is a pointer to memory to be hashed. Must not be NULL.
+ * @param inputSize is the number of bytes to be hashed.
+ * @param nextInput is a pointer to memory to be hashed for the next hash. Must not be NULL.
+ * @param nextInputSize is the number of bytes to be hashed for the next hash.
+ * @param output is a pointer to memory where the hash will be stored. Must not
+ *        be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing.
+*/
+RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, const void* input, size_t inputSize);
+RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, const void* nextInput, size_t nextInputSize, void* output);
+RANDOMX_EXPORT void randomx_calculate_hash_last(randomx_vm* machine, void* output);
+
 #if defined(__cplusplus)
 }
 #endif
--- a/src/tests/affinity.cpp
+++ b/src/tests/affinity.cpp
@ -65,7 +65,7 @@ set_thread_affinity(std::thread::native_handle_type thread,
            (thread_policy_t)&policy, 1);
 #elif defined(_WIN32) || defined(__CYGWIN__)
    rc = SetThreadAffinityMask(reinterpret_cast<HANDLE>(thread), 1ULL << cpuid) == 0 ? -2 : 0;
-#elif !defined(__OpenBSD__)
+#elif !defined(__OpenBSD__) && !defined(__FreeBSD__) && !defined(__ANDROID__) && !defined(__NetBSD__)
    cpu_set_t cs;
    CPU_ZERO(&cs);
    CPU_SET(cpuid, &cs);
--- a/src/tests/benchmark.cpp
+++ b/src/tests/benchmark.cpp
@ -40,9 +40,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "../dataset.hpp"
 #include "../blake2/endian.h"
 #include "../common.hpp"
+#include "../jit_compiler.hpp"
 #ifdef _WIN32
 #include <windows.h>
-#include <VersionHelpers.h>
+#include <versionhelpers.h>
 #endif
 #include "affinity.hpp"

@ -94,6 +95,7 @@ void printUsage(const char* executable) {
 	std::cout << "  --ssse3       use optimized Argon2 for SSSE3 CPUs" << std::endl;
 	std::cout << "  --avx2        use optimized Argon2 for AVX2 CPUs" << std::endl;
 	std::cout << "  --auto        select the best options for the current CPU" << std::endl;
+	std::cout << "  --noBatch     calculate hashes one by one (default: batch)" << std::endl;
 }

 struct MemoryException : public std::exception {
@ -109,11 +111,14 @@ struct DatasetAllocException : public MemoryException {
 	}
 };

-void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result, uint32_t noncesCount, int thread, int cpuid=-1) {
+using MineFunc = void(randomx_vm * vm, std::atomic<uint32_t> & atomicNonce, AtomicHash & result, uint32_t noncesCount, int thread, int cpuid);
+
+template<bool batch>
+void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result, uint32_t noncesCount, int thread, int cpuid = -1) {
 	if (cpuid >= 0) {
 		int rc = set_thread_affinity(cpuid);
 		if (rc) {
-			std::cerr << "Failed to set thread affinity for thread " << thread << " (error=" << rc << ")" <<  std::endl;
+			std::cerr << "Failed to set thread affinity for thread " << thread << " (error=" << rc << ")" << std::endl;
 		}
 	}
 	uint64_t hash[RANDOMX_HASH_SIZE / sizeof(uint64_t)];
@ -122,16 +127,27 @@ void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result
 	void* noncePtr = blockTemplate + 39;
 	auto nonce = atomicNonce.fetch_add(1);

-	while (nonce < noncesCount) {
+	if (batch) {
 		store32(noncePtr, nonce);
-		randomx_calculate_hash(vm, blockTemplate, sizeof(blockTemplate), &hash);
+		randomx_calculate_hash_first(vm, blockTemplate, sizeof(blockTemplate));
+	}
+
+	while (nonce < noncesCount) {
+		if (batch) {
+			nonce = atomicNonce.fetch_add(1);
+		}
+		store32(noncePtr, nonce);
+		(batch ? randomx_calculate_hash_next : randomx_calculate_hash)(vm, blockTemplate, sizeof(blockTemplate), &hash);
 		result.xorWith(hash);
-		nonce = atomicNonce.fetch_add(1);
+		if (!batch) {
+			nonce = atomicNonce.fetch_add(1);
+		}
 	}
 }

 int main(int argc, char** argv) {
-	bool softAes, miningMode, verificationMode, help, largePages, jit, secure, ssse3, avx2, autoFlags;
+	bool softAes, miningMode, verificationMode, help, largePages, jit, secure;
+	bool ssse3, avx2, autoFlags, noBatch;
 	int noncesCount, threadCount, initThreadCount;
 	uint64_t threadAffinity;
 	int32_t seedValue;
@ -155,16 +171,23 @@ int main(int argc, char** argv) {
 	readOption("--ssse3", argc, argv, ssse3);
 	readOption("--avx2", argc, argv, avx2);
 	readOption("--auto", argc, argv, autoFlags);
+	readOption("--noBatch", argc, argv, noBatch);

 	store32(&seed, seedValue);

-	std::cout << "RandomX benchmark v1.1.5" << std::endl;
+	std::cout << "RandomX benchmark v1.1.8" << std::endl;

-	if (help || (!miningMode && !verificationMode)) {
+	if (help) {
 		printUsage(argv[0]);
 		return 0;
 	}

+	if (!miningMode && !verificationMode) {
+		std::cout << "Please select either the fast mode (--mine) or the slow mode (--verify)" << std::endl;
+		std::cout << "Run '" << argv[0] << " --help' to see all supported options" << std::endl;
+		return 0;
+	}
+
 	std::atomic<uint32_t> atomicNonce(0);
 	AtomicHash result;
 	std::vector<randomx_vm*> vms;
@ -190,7 +213,7 @@ int main(int argc, char** argv) {
 		}
 		if (jit) {
 			flags |= RANDOMX_FLAG_JIT;
-#ifdef __OpenBSD__
+#ifdef RANDOMX_FORCE_SECURE
 			flags |= RANDOMX_FLAG_SECURE;
 #endif
 		}
@ -202,7 +225,7 @@ int main(int argc, char** argv) {
 	if (miningMode) {
 		flags |= RANDOMX_FLAG_FULL_MEM;
 	}
-#ifndef __OpenBSD__
+#ifndef RANDOMX_FORCE_SECURE
 	if (secure) {
 		flags |= RANDOMX_FLAG_SECURE;
 	}
@ -254,6 +277,16 @@ int main(int argc, char** argv) {
 		std::cout << " - thread affinity (" << mask_to_string(threadAffinity) << ")" << std::endl;
 	}

+	MineFunc* func;
+
+	if (noBatch) {
+		func = &mine<false>;
+	}
+	else {
+		func = &mine<true>;
+		std::cout << " - batch mode" << std::endl;
+	}
+
 	std::cout << "Initializing";
 	if (miningMode)
 		std::cout << " (" << initThreadCount << " thread" << (initThreadCount > 1 ? "s)" : ")");
@ -324,14 +357,14 @@ int main(int argc, char** argv) {
 				int cpuid = -1;
 				if (threadAffinity)
 					cpuid = cpuid_from_mask(threadAffinity, i);
-				threads.push_back(std::thread(&mine, vms[i], std::ref(atomicNonce), std::ref(result), noncesCount, i, cpuid));
+				threads.push_back(std::thread(func, vms[i], std::ref(atomicNonce), std::ref(result), noncesCount, i, cpuid));
 			}
 			for (unsigned i = 0; i < threads.size(); ++i) {
 				threads[i].join();
 			}
 		}
 		else {
-			mine(vms[0], std::ref(atomicNonce), std::ref(result), noncesCount, 0);
+			func(vms[0], std::ref(atomicNonce), std::ref(result), noncesCount, 0, -1);
 		}

 		double elapsed = sw.getElapsed();
--- a/src/tests/tests.cpp
+++ b/src/tests/tests.cpp
@ -143,7 +143,7 @@ int main() {
 		randomx::JitCompiler jit;
 		jit.generateSuperscalarHash(cache->programs, cache->reciprocalCache);
 		jit.generateDatasetInitCode();
-#ifdef __OpenBSD__
+#ifdef RANDOMX_FORCE_SECURE
 		jit.enableExecution();
 #else
 		jit.enableAll();
@ -954,7 +954,7 @@ int main() {
 		assert(ibc.memMask == randomx::ScratchpadL3Mask);
 	});

-#ifdef __OpenBSD__
+#ifdef RANDOMX_FORCE_SECURE
 	vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT | RANDOMX_FLAG_SECURE, cache, nullptr);
 #else
 	vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT, cache, nullptr);
@ -1009,10 +1009,10 @@ int main() {
 		vm = nullptr;
 		cache = randomx_alloc_cache(RANDOMX_FLAG_JIT);
 		initCache("test key 000");
-#ifdef __OpenBSD__
-		vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT | RANDOMX_FLAG_SECURE, cache, nullptr);
+#ifdef RANDOMX_FORCE_SECURE
+		vm = randomx_create_vm(RANDOMX_FLAG_JIT | RANDOMX_FLAG_SECURE, cache, nullptr);
 #else
-		vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT, cache, nullptr);
+		vm = randomx_create_vm(RANDOMX_FLAG_JIT, cache, nullptr);
 #endif
 	}

@ -1026,9 +1026,6 @@ int main() {

 	runTest("Hash test 2e (compiler)", RANDOMX_HAVE_COMPILER && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), test_e);

-	randomx_destroy_vm(vm);
-	vm = nullptr;
-
 	auto flags = randomx_get_flags();

 	randomx_release_cache(cache);
@ -1054,6 +1051,40 @@ int main() {
 		assert(cacheMemory[33554431] == 0x1f47f056d05cd99b);
 	});

+	if (cache != nullptr)
+		randomx_release_cache(cache);
+	cache = randomx_alloc_cache(RANDOMX_FLAG_DEFAULT);
+
+	runTest("Hash batch test", RANDOMX_HAVE_COMPILER && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() {
+		char hash1[RANDOMX_HASH_SIZE];
+		char hash2[RANDOMX_HASH_SIZE];
+		char hash3[RANDOMX_HASH_SIZE];
+		initCache("test key 000");
+		char input1[] = "This is a test";
+		char input2[] = "Lorem ipsum dolor sit amet";
+		char input3[] = "sed do eiusmod tempor incididunt ut labore et dolore magna aliqua";
+
+		randomx_calculate_hash_first(vm, input1, sizeof(input1) - 1);
+		randomx_calculate_hash_next(vm, input2, sizeof(input2) - 1, &hash1);
+		randomx_calculate_hash_next(vm, input3, sizeof(input3) - 1, &hash2);
+		randomx_calculate_hash_last(vm, &hash3);
+
+		assert(equalsHex(hash1, "639183aae1bf4c9a35884cb46b09cad9175f04efd7684e7262a0ac1c2f0b4e3f"));
+		assert(equalsHex(hash2, "300a0adb47603dedb42228ccb2b211104f4da45af709cd7547cd049e9489c969"));
+		assert(equalsHex(hash3, "c36d4ed4191e617309867ed66a443be4075014e2b061bcdaf9ce7b721d2b77a8"));
+	});
+
+	runTest("Preserve rounding mode", RANDOMX_FREQ_CFROUND > 0, []() {
+		rx_set_rounding_mode(RoundToNearest);
+		char hash[RANDOMX_HASH_SIZE];
+		calcStringHash("test key 000", "Lorem ipsum dolor sit amet", &hash);
+		assert(equalsHex(hash, "300a0adb47603dedb42228ccb2b211104f4da45af709cd7547cd049e9489c969"));
+		assert(rx_get_rounding_mode() == RoundToNearest);
+	});
+
+	randomx_destroy_vm(vm);
+	vm = nullptr;
+
 	if (cache != nullptr)
 		randomx_release_cache(cache);

--- a/src/virtual_machine.cpp
+++ b/src/virtual_machine.cpp
@ -120,6 +120,12 @@ namespace randomx {
 		blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
 	}

+	template<class Allocator, bool softAes>
+	void VmBase<Allocator, softAes>::hashAndFill(void* out, size_t outSize, uint64_t *fill_state) {
+		hashAndFillAes1Rx4<softAes>((void*) getScratchpad(), ScratchpadSize, &reg.a, fill_state);
+		blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
+	}
+
 	template<class Allocator, bool softAes>
 	void VmBase<Allocator, softAes>::initScratchpad(void* seed) {
 		fillAes1Rx4<softAes>(seed, ScratchpadSize, scratchpad);
--- a/src/virtual_machine.hpp
+++ b/src/virtual_machine.hpp
@ -38,6 +38,7 @@ public:
 	virtual ~randomx_vm() = 0;
 	virtual void allocate() = 0;
 	virtual void getFinalResult(void* out, size_t outSize) = 0;
+	virtual void hashAndFill(void* out, size_t outSize, uint64_t *fill_state) = 0;
 	virtual void setDataset(randomx_dataset* dataset) { }
 	virtual void setCache(randomx_cache* cache) { }
 	virtual void initScratchpad(void* seed) = 0;
@ -53,6 +54,9 @@ public:
 	{
 		return program;
 	}
+	const uint8_t* getMemory() const {
+		return mem.memory;
+	}
 protected:
 	void initialize();
 	alignas(64) randomx::Program program;
@ -67,6 +71,7 @@ protected:
 	uint64_t datasetOffset;
 public:
 	std::string cacheKey;
+	alignas(16) uint64_t tempHash[8]; //8 64-bit values used to store intermediate data
 };

 namespace randomx {
@ -78,6 +83,7 @@ namespace randomx {
 		void allocate() override;
 		void initScratchpad(void* seed) override;
 		void getFinalResult(void* out, size_t outSize) override;
+		void hashAndFill(void* out, size_t outSize, uint64_t *fill_state) override;
 	protected:
 		void generateProgram(void* seed);
 	};
--- a/src/virtual_memory.cpp
+++ b/src/virtual_memory.cpp
@ -94,7 +94,12 @@ void* allocMemoryPages(std::size_t bytes) {
 	if (mem == nullptr)
 		throw std::runtime_error(getErrorMessage("allocMemoryPages - VirtualAlloc"));
 #else
-	mem = mmap(nullptr, bytes, PAGE_READWRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	#if defined(__NetBSD__)
+		#define RESERVED_FLAGS PROT_MPROTECT(PROT_EXEC)
+	#else
+		#define RESERVED_FLAGS 0
+	#endif
+	mem = mmap(nullptr, bytes, PAGE_READWRITE | RESERVED_FLAGS, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 	if (mem == MAP_FAILED)
 		throw std::runtime_error("allocMemoryPages - mmap failed");
 #endif
@ -141,7 +146,7 @@ void* allocLargePagesMemory(std::size_t bytes) {
 	mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0);
 #elif defined(__FreeBSD__)
 	mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER, -1, 0);
-#elif defined(__OpenBSD__)
+#elif defined(__OpenBSD__) || defined(__NetBSD__)
 	mem = MAP_FAILED; // OpenBSD does not support huge pages
 #else
 	mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0);
--- a/vcxproj/randomx-dll.vcxproj
+++ b/vcxproj/randomx-dll.vcxproj
@ -54,12 +54,17 @@
  <ItemGroup>
    <ClCompile Include="..\src\aes_hash.cpp" />
    <ClCompile Include="..\src\allocator.cpp" />
+    <ClCompile Include="..\src\argon2_avx2.c">
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+    </ClCompile>
    <ClCompile Include="..\src\argon2_core.c" />
    <ClCompile Include="..\src\argon2_ref.c" />
+    <ClCompile Include="..\src\argon2_ssse3.c" />
    <ClCompile Include="..\src\assembly_generator_x86.cpp" />
    <ClCompile Include="..\src\blake2\blake2b.c" />
    <ClCompile Include="..\src\blake2_generator.cpp" />
    <ClCompile Include="..\src\bytecode_machine.cpp" />
+    <ClCompile Include="..\src\cpu.cpp" />
    <ClCompile Include="..\src\dataset.cpp" />
    <ClCompile Include="..\src\instruction.cpp" />
    <ClCompile Include="..\src\instructions_portable.cpp" />
--- a/vcxproj/randomx-dll.vcxproj.filters
+++ b/vcxproj/randomx-dll.vcxproj.filters
@ -172,5 +172,14 @@
    <ClCompile Include="..\src\bytecode_machine.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
+    <ClCompile Include="..\src\argon2_avx2.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\argon2_ssse3.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\cpu.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
  </ItemGroup>
 </Project>
Author	SHA1	Message	Date
fuwa	89b7c02bba	rx-wow-fix-3: Revert "Increase the frequency of CBRANCH (#118 )" This reverts commit `22689eda49`.	2020-07-13 08:55:05 +03:00
fuwa	d2416d6157	rx-wow-fix-2: Revert "Decrease the frequency of FADD/FSUB in favor of FMUL (#77 )" This reverts commit `91cd35ff13`.	2020-07-13 08:54:49 +03:00
fuwa	b4ada42599	rx-wow-fix-1: AesGenerator4R	2020-07-13 08:54:35 +03:00
wowario	597b1e9c91	RandomWOW parameters	2020-07-13 08:54:21 +03:00
tevador	5ce5f4906c	add --noBatch benchmark option	2020-07-04 14:57:56 +02:00
tevador	9905ec9c5a	Merge pull request #188 from cryptonote-social/master replace hardcoded literal with its appropriate symbol	2020-06-28 16:36:40 +02:00
tevador	863765bbe6	Merge pull request #185 from tevador/pr-crosscomp Fix windows-target cross-compilation	2020-06-28 16:36:12 +02:00
tevador	a1c08a2f41	Merge pull request #187 from tevador/pr-netbsd Fix compilation and JIT support on NetBSD 1. Disable hugepages (not supported). 2. Force W^X (required). 3. When allocating JIT memory, PROT_EXEC must be reserved in order to set the pages executable later.	2020-06-28 16:35:19 +02:00
tevador	708a4e50c5	Fix compilation and JIT support on NetBSD: 1. Disable hugepages (not supported). 2. Force W^X (required). 3. When allocating JIT memory, PROT_EXEC must be reserved in order to set the pages executable later.	2020-06-28 16:16:20 +02:00
tevador	6a4afc721f	Merge pull request #189 from tevador/pr-set-cache Fix potential use-after-free when reallocating cache	2020-06-27 20:42:15 +02:00
tevador	32ab5dea54	fix potential use-after-free when reallocating cache	2020-06-27 20:21:06 +02:00
cryptonote-social	a7733de1e7	replace hardcoded literal with its appropriate symbol	2020-06-27 09:53:46 -07:00
tevador	bece0a7206	fix #184	2020-06-09 19:10:56 +02:00
tevador	7741eb1e97	Merge pull request #182 from tevador/pr-restore-fpstate Preserve floating point state when calling randomx_calculate_hash	2020-05-16 23:19:37 +02:00
tevador	148b923f71	fix test 92 not failing properly on GCC/amd64	2020-05-06 13:48:53 +02:00
tevador	6a764e90d0	Preserve floating point state when calling randomx_calculate_hash	2020-05-06 12:42:30 +02:00
tevador	ac574e3743	Merge pull request #179 from tevador/pr-hash-batch Add a missing function to calculate a batch of hashes	2020-02-07 19:33:36 +01:00
tevador	01381ccef3	Add a missing function to calculate a batch of hashes Add a test for batch calculation	2020-02-06 18:14:38 +01:00
tevador	913e495c53	Merge branch 'master' of git@github.com:tevador/RandomX.git	2020-02-06 18:13:52 +01:00
tevador	72ac5e49b6	Update dll project	2019-12-29 19:14:00 +01:00
tevador	bbbb34757b	Add a note about building portable binaries	2019-12-26 12:32:04 +01:00
tevador	a223b6b33b	Fixed an incorrect URL the the documentation	2019-12-18 12:30:49 +01:00
tevador	30969c0e25	Benchmark version	2019-12-01 21:07:13 +01:00
tevador	6e842d22bb	Merge pull request #171 from tevador/pr-affinityfix Disable thread affinity on FreeBSD and Android	2019-12-01 20:51:36 +01:00
tevador	aa19c5b9b6	Disable thread affinity on FreeBSD and Android	2019-12-01 20:41:20 +01:00
tevador	70d4b0f2f1	Merge pull request #169 from tevador/pr-gitignore Add .gitattributes	2019-12-01 20:04:30 +01:00
tevador	f872ce0b94	Merge pull request #170 from tevador/pr-temphash Hide tempHash from the public API	2019-12-01 20:04:00 +01:00
tevador	3910d49b49	Hide tempHash from the public API	2019-12-01 18:19:09 +01:00
SChernykh	219c02e1e5	Combined hash and fill AES loop (#166 ) Adds more parallelizm into AES loop so modern CPUs can take advantage of it. Also, scratchpad data moves between L1 and L3 caches only one time which saves time and energy per hash.	2019-12-01 16:58:38 +01:00
tevador	6235852e00	Add .gitattributes	2019-12-01 16:39:42 +01:00
tevador	e3561d661e	Updated readme with RandomX Sniffer	2019-12-01 11:52:56 +01:00
tevador	65fae68287	Add GPG public key	2019-11-29 15:12:28 +01:00
tevador	fd96d3df22	Merge pull request #160 from tevador/pr-install1 Improve install target (fixed)	2019-11-22 18:25:16 +01:00
Vladimir	01914b49cd	Fixes for cmake build with visual studio (#144 ) * Fixed CMake configuration for visual studio build Added proper asm source and set correct type. * Disabled stadard layout check of randomx_cache for visual studio debug Required to silence static_assert which fails on Visual Studio Debug configuation. * Fixed warning message and defines check * Removed unsupported flags for MSVC compiler * Enabled AVX2 for msvc * Fixed formatting in CmakeLists * Added generation of configuration.asm by CMake for MSVC	2019-11-22 18:24:16 +01:00
tevador	7e20c8e56e	Merge pull request #161 from tevador/pr-struct Fix inconsistent class/struct usage	2019-11-22 18:23:11 +01:00
tevador	7646cfede6	Add a note about safe key usage	2019-11-22 18:20:43 +01:00
tevador	88268ae325	Improved benchmark instructions	2019-11-22 18:16:58 +01:00
tevador	57545d1c53	Fix inconsistent class/struct usage	2019-11-19 23:17:55 +01:00
tevador	79c53ae9b0	Improve install target (fixed)	2019-11-19 22:51:43 +01:00
tevador	cb299e5a25	Merge pull request #158 from tevador/revert-150-dev/beber/build Revert "dev/beber/build: improve install target"	2019-11-15 20:33:09 +01:00
tevador	4381ec3c89	Revert "dev/beber/build: improve install target (#150 )" This reverts commit `66c039030f`.	2019-11-15 20:28:19 +01:00
tevador	5e53ed9409	Merge pull request #156 from SChernykh/pr-apple Fix function names for clang on Apple	2019-11-15 12:56:18 +01:00
Bertrand Jacquin	66c039030f	dev/beber/build: improve install target (#150 ) * build: remove hardcoded lib/ destination cmake internally has a logic to know where is the best place to install a given library. For example on amd64 it does not make sense to install the library in /usr/lib, this change allow the library to the installed in /usr/lib64 * build: mark src/randomx.h as a public header this allow src/randomx.h to be installed in /usr/include/randomx.h automagically during make install	2019-11-15 12:55:11 +01:00
SChernykh	e43267fa86	Fix function names for clang on Apple	2019-11-12 13:04:11 +01:00