add CNv4 support

2019-02-24 11:30:51 -05:00 · 2019-02-24 11:30:51 -05:00 · 3c76d658c7
parent 9b396bcee9
commit 3c76d658c7
32 changed files with 3769 additions and 215 deletions
--- a/14
+++ b/14
@ -77,20 +77,23 @@ CC = gcc
 STORE = build/$(TYPE)
 SOURCE := $(foreach DIR,$(DIRS),$(wildcard $(DIR)/*.cpp))
 CSOURCE := $(foreach DIR,$(DIRS),$(wildcard $(DIR)/*.c))
+SSOURCE := $(foreach DIR,$(DIRS),$(wildcard $(DIR)/*.S))
 HTMLSOURCE := $(foreach DIR,$(DIRS),$(wildcard $(DIR)/*.html))
 HEADERS := $(foreach DIR,$(DIRS),$(wildcard $(DIR)/*.h))
 OBJECTS := $(addprefix $(STORE)/, $(SOURCE:.cpp=.o))
 COBJECTS := $(addprefix $(STORE)/, $(CSOURCE:.c=.o))
+SOBJECTS := $(addprefix $(STORE)/, $(SSOURCE:.S=.o))
 HTMLOBJECTS := $(addprefix $(STORE)/, $(HTMLSOURCE:.html=.o))
 DFILES := $(addprefix $(STORE)/,$(SOURCE:.cpp=.d))
 CDFILES := $(addprefix $(STORE)/,$(CSOURCE:.c=.d))
+# Dependency files for the assembly sources. Must be derived from SSOURCE:
+# the .S=.d substitution never matches the .c names held in CSOURCE.
+SDFILES := $(addprefix $(STORE)/,$(SSOURCE:.S=.d))


 .PHONY: clean backup dirs

-$(TARGET): dirs $(OBJECTS) $(COBJECTS) $(HTMLOBJECTS)
+$(TARGET): dirs $(OBJECTS) $(COBJECTS) $(SOBJECTS) $(HTMLOBJECTS)
 	@echo Linking $(OBJECTS)...
-	$(C++) -o $(STORE)/$(TARGET) $(OBJECTS) $(COBJECTS) $(HTMLOBJECTS) $(LDPARAM) $(foreach LIBRARY, $(LIBS),-l$(LIBRARY)) $(foreach LIB,$(LIBPATH),-L$(LIB)) $(PKG_LIBS) $(STATIC_LIBS)
+	$(C++) -o $(STORE)/$(TARGET) $(OBJECTS) $(COBJECTS) $(SOBJECTS) $(HTMLOBJECTS) $(LDPARAM) $(foreach LIBRARY, $(LIBS),-l$(LIBRARY)) $(foreach LIB,$(LIBPATH),-L$(LIB)) $(PKG_LIBS) $(STATIC_LIBS)
 	@cp pool.conf $(STORE)/

 $(STORE)/%.o: %.cpp
@ -107,6 +110,13 @@ $(STORE)/%.o: %.c
 	@sed -e '1s/^\(.*\)$$/$(subst /,\/,$(dir $@))\1/' $(STORE)/$*.dd > $(STORE)/$*.d
 	@rm -f $(STORE)/$*.dd

+# Pattern rule for preprocessed assembly (.S) sources.
+# $(CC) is run with -Wp,-MMD so the preprocessor writes a dependency file to
+# $(STORE)/$*.dd; sed then prefixes the first line of that file with the
+# object's directory and stores the result as $(STORE)/$*.d, after which the
+# temporary .dd file is removed.
+$(STORE)/%.o: %.S
+	@echo Creating object file for $*...
+	$(CC) -Wp,-MMD,$(STORE)/$*.dd $(CCPARAM) $(foreach INC,$(INCPATH),-I$(INC)) $(PKG_INC)\
+		$(foreach CPPDEF,$(CPPDEFS),-D$(CPPDEF)) -c $< -o $@
+	@sed -e '1s/^\(.*\)$$/$(subst /,\/,$(dir $@))\1/' $(STORE)/$*.dd > $(STORE)/$*.d
+	@rm -f $(STORE)/$*.dd
+
 $(STORE)/%.o: %.html
 	@echo Creating object file for $*...
 	xxd -i $< | sed -e 's/src_//' -e 's/embed_//' > $(STORE)/$*.c
--- a/monero/blockchain_db/blockchain_db.cpp
+++ b/monero/blockchain_db/blockchain_db.cpp
@ -197,6 +197,7 @@ void BlockchainDB::add_transaction(const crypto::hash& blk_hash, const transacti

 uint64_t BlockchainDB::add_block( const block& blk
                                , size_t block_weight
+                                , uint64_t long_term_block_weight
                                , const difficulty_type& cumulative_difficulty
                                , const uint64_t& coins_generated
                                , const std::vector<transaction>& txs
@ -241,7 +242,7 @@ uint64_t BlockchainDB::add_block( const block& blk

  // call out to subclass implementation to add the block & metadata
  time1 = epee::misc_utils::get_tick_count();
-  add_block(blk, block_weight, cumulative_difficulty, coins_generated, num_rct_outs, blk_hash);
+  add_block(blk, block_weight, long_term_block_weight, cumulative_difficulty, coins_generated, num_rct_outs, blk_hash);
  TIME_MEASURE_FINISH(time1);
  time_add_block1 += time1;

--- a/monero/blockchain_db/blockchain_db.h
+++ b/monero/blockchain_db/blockchain_db.h
@ -359,12 +359,14 @@ private:
   *
   * @param blk the block to be added
   * @param block_weight the weight of the block (transactions and all)
+   * @param long_term_block_weight the long term block weight of the block (transactions and all)
   * @param cumulative_difficulty the accumulated difficulty after this block
   * @param coins_generated the number of coins generated total after this block
   * @param blk_hash the hash of the block
   */
  virtual void add_block( const block& blk
                , size_t block_weight
+                , uint64_t long_term_block_weight
                , const difficulty_type& cumulative_difficulty
                , const uint64_t& coins_generated
                , uint64_t num_rct_outs
@ -376,7 +378,7 @@ private:
   *
   * The subclass implementing this will remove the block data from the top
   * block in the chain.  The data to be removed is that which was added in
-   * BlockchainDB::add_block(const block& blk, size_t block_weight, const difficulty_type& cumulative_difficulty, const uint64_t& coins_generated, const crypto::hash& blk_hash)
+   * BlockchainDB::add_block(const block& blk, size_t block_weight, uint64_t long_term_block_weight, const difficulty_type& cumulative_difficulty, const uint64_t& coins_generated, const crypto::hash& blk_hash)
   *
   * If any of this cannot be done, the subclass should throw the corresponding
   * subclass of DB_EXCEPTION
@ -790,6 +792,7 @@ public:
   *
   * @param blk the block to be added
   * @param block_weight the size of the block (transactions and all)
+   * @param long_term_block_weight the long term weight of the block (transactions and all)
   * @param cumulative_difficulty the accumulated difficulty after this block
   * @param coins_generated the number of coins generated total after this block
   * @param txs the transactions in the block
@ -798,6 +801,7 @@ public:
   */
  virtual uint64_t add_block( const block& blk
                            , size_t block_weight
+                            , uint64_t long_term_block_weight
                            , const difficulty_type& cumulative_difficulty
                            , const uint64_t& coins_generated
                            , const std::vector<transaction>& txs
@ -985,6 +989,17 @@ public:
   */
  virtual uint64_t get_block_already_generated_coins(const uint64_t& height) const = 0;

+  /**
+   * @brief fetch a block's long term weight
+   *
+   * If the block does not exist, the subclass should throw BLOCK_DNE
+   *
+   * @param height the height requested
+   *
+   * @return the long term weight
+   */
+  virtual uint64_t get_block_long_term_weight(const uint64_t& height) const = 0;
+
  /**
   * @brief fetch a block's hash
   *
--- a/monero/common/util.h
+++ b/monero/common/util.h
@ -238,4 +238,6 @@ namespace tools
 #ifdef _WIN32
  std::string input_line_win();
 #endif
+
+  void closefrom(int fd);
 }
--- a/monero/crypto/CryptonightR_JIT.c
+++ b/monero/crypto/CryptonightR_JIT.c
@ -0,0 +1,102 @@
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "common/int-util.h"
+#include "hash-ops.h"
+#include "variant4_random_math.h"
+#include "CryptonightR_JIT.h"
+#include "CryptonightR_template.h"
+
+// Machine code emitted in front of the generated instruction sequence.
+// Per the disassembly notes on each line: keeps the incoming argument (rdi)
+// in r10, pushes rbx/rbp/r15, stashes rsp in r11 (esp is loaded with data
+// afterwards), and loads nine 32-bit values from [r10+0] .. [r10+32] into
+// ebx, esi, edi, ebp, esp, r15d, eax, edx and r9d.
+static const uint8_t prologue[] = {
+	0x4C, 0x8B, 0xD7,	// mov r10, rdi
+	0x53,			// push rbx
+	0x55,			// push rbp
+	0x41, 0x57,		// push r15
+	0x4C, 0x8B, 0xDC,	// mov r11, rsp
+	0x41, 0x8B, 0x1A,	// mov ebx, DWORD PTR [r10]
+	0x41, 0x8B, 0x72, 0x04,	// mov esi, DWORD PTR [r10+4]
+	0x41, 0x8B, 0x7A, 0x08,	// mov edi, DWORD PTR [r10+8]
+	0x41, 0x8B, 0x6A, 0x0C,	// mov ebp, DWORD PTR [r10+12]
+	0x41, 0x8B, 0x62, 0x10,	// mov esp, DWORD PTR [r10+16]
+	0x45, 0x8B, 0x7A, 0x14,	// mov r15d, DWORD PTR [r10+20]
+	0x41, 0x8B, 0x42, 0x18,	// mov eax, DWORD PTR [r10+24]
+	0x41, 0x8B, 0x52, 0x1C,	// mov edx, DWORD PTR [r10+28]
+	0x45, 0x8B, 0x4A, 0x20,	// mov r9d, DWORD PTR [r10+32]
+};
+
+// Machine code emitted after the generated instruction sequence.
+// Per the disassembly notes on each line: restores rsp from r11, writes
+// ebx/esi/edi/ebp back to the first four 32-bit slots at [r10], pops
+// r15/rbp/rbx (reverse of the prologue's push order) and returns.
+static const uint8_t epilogue[] = {
+	0x49, 0x8B, 0xE3,	// mov rsp, r11
+	0x41, 0x89, 0x1A,	// mov DWORD PTR [r10], ebx
+	0x41, 0x89, 0x72, 0x04,	// mov DWORD PTR [r10+4], esi
+	0x41, 0x89, 0x7A, 0x08,	// mov DWORD PTR [r10+8], edi
+	0x41, 0x89, 0x6A, 0x0C,	// mov DWORD PTR [r10+12], ebp
+	0x41, 0x5F,		// pop r15
+	0x5D,			// pop rbp
+	0x5B,			// pop rbx
+	0xC3,			// ret
+};
+
+// Copies `size` bytes from `src` to the current output position and advances
+// that position. Expects `JIT_code` (write cursor) and `JIT_code_end` (end of
+// the buffer) in the enclosing scope, and makes the enclosing function
+// return -1 when the bytes would not fit.
+#define APPEND_CODE(src, size) \
+	do { \
+		if (JIT_code + (size) > JIT_code_end) \
+			return -1; \
+		memcpy(JIT_code, (src), (size)); \
+		JIT_code += (size); \
+	} while (0)
+
+// Generates machine code for the random math program `code`.
+//
+// code     - instruction array; processing stops at the first RET opcode
+// buf      - destination for the generated code
+// buf_size - capacity of `buf` in bytes
+//
+// Output layout: prologue, one pre-assembled fragment per instruction
+// (taken from the `instructions` / `instructions_mov` tables), epilogue.
+// Returns 0 on success; APPEND_CODE makes the function return -1 as soon as
+// a piece would not fit into the buffer.
+int v4_generate_JIT_code(const struct V4_Instruction* code, v4_random_math_JIT_func buf, const size_t buf_size)
+{
+	uint8_t* JIT_code = (uint8_t*) buf;
+	const uint8_t* JIT_code_end = JIT_code + buf_size;
+
+	APPEND_CODE(prologue, sizeof(prologue));
+
+	// Source index for which an instructions_mov fragment was last emitted;
+	// 0xFFFFFFFF means no such fragment is currently remembered.
+	uint32_t prev_rot_src = 0xFFFFFFFFU;
+
+	for (int i = 0;; ++i)
+	{
+		const struct V4_Instruction inst = code[i];
+		if (inst.opcode == RET)
+			break;
+
+		// MUL keeps its opcode value; every other opcode is offset by 2 for the table lookup.
+		const uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2);
+
+		const uint32_t a = inst.dst_index;
+		const uint32_t b = inst.src_index;
+		// Table index packs opcode, dst index and src index; a src index of 8 is replaced by the dst index.
+		const uint8_t c = opcode | (inst.dst_index << V4_OPCODE_BITS) | (((inst.src_index == 8) ? inst.dst_index : inst.src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
+
+		switch (inst.opcode)
+		{
+		case ROR:
+		case ROL:
+			// Rotations get an extra instructions_mov fragment first, unless
+			// one for the same source index is still remembered.
+			if (b != prev_rot_src)
+			{
+				prev_rot_src = b;
+				const uint8_t* p1 = (const uint8_t*) instructions_mov[c];
+				const uint8_t* p2 = (const uint8_t*) instructions_mov[c + 1];
+				APPEND_CODE(p1, p2 - p1);
+			}
+			break;
+		}
+
+		// An instruction whose destination index equals the remembered source index forgets it.
+		if (a == prev_rot_src)
+			prev_rot_src = 0xFFFFFFFFU;
+
+		// Table entries c and c + 1 delimit the bytes of this instruction's fragment.
+		const uint8_t* p1 = (const uint8_t*) instructions[c];
+		const uint8_t* p2 = (const uint8_t*) instructions[c + 1];
+		APPEND_CODE(p1, p2 - p1);
+
+		// ADD: overwrite the last 4 emitted bytes with the instruction's constant.
+		if (inst.opcode == ADD)
+			*(uint32_t*)(JIT_code - 4) = inst.C;
+	}
+
+	APPEND_CODE(epilogue, sizeof(epilogue));
+
+	// Flush the instruction cache for the range that was just written.
+	__builtin___clear_cache((char*)buf, (char*)JIT_code);
+
+	return 0;
+}
--- a/monero/crypto/CryptonightR_JIT.h
+++ b/monero/crypto/CryptonightR_JIT.h
@ -0,0 +1,18 @@
+#ifndef CRYPTONIGHTR_JIT_H
+#define CRYPTONIGHTR_JIT_H
+
+// Minimalistic JIT code generator for random math sequence in CryptonightR
+//
+// Usage:
+// - Allocate writable and executable memory
+// - Call v4_generate_JIT_code with "buf" pointed to memory allocated on previous step
+// - Call the generated code instead of "v4_random_math(code, r)", omit the "code" parameter
+
+// size_t and uint32_t are used below; include them here so the header does
+// not depend on what the including file happened to include first.
+#include <stddef.h>
+#include <stdint.h>
+
+// Forward declaration so the prototype below does not introduce a
+// parameter-list-scoped struct when this header is included before the
+// definition (expected to live in variant4_random_math.h).
+struct V4_Instruction;
+
+// Signature of the generated code: takes the register array "r" and uses the
+// System V calling convention regardless of the platform default.
+typedef void (*v4_random_math_JIT_func)(uint32_t* r) __attribute__((sysv_abi));
+
+// Given the random math sequence, generates machine code (x86-64) for it
+// Returns 0 if code was generated successfully
+// Returns -1 if provided buffer was too small
+int v4_generate_JIT_code(const struct V4_Instruction* code, v4_random_math_JIT_func buf, const size_t buf_size);
+
+#endif // CRYPTONIGHTR_JIT_H
--- a/monero/crypto/CryptonightR_template.S
+++ b/monero/crypto/CryptonightR_template.S
--- a/monero/crypto/CryptonightR_template.h
+++ b/monero/crypto/CryptonightR_template.h
--- a/monero/crypto/chacha.h
+++ b/monero/crypto/chacha.h
@ -73,18 +73,18 @@ namespace crypto {
+  // Derives a chacha key from `data`: runs cn_slow_hash (variant 0, not
+  // prehashed, height 0) once on the input and then kdf_rounds - 1 more times
+  // on its own output; the first sizeof(key) bytes of the final hash are
+  // copied into `key`.
  inline void generate_chacha_key(const void *data, size_t size, chacha_key& key, uint64_t kdf_rounds) {
    static_assert(sizeof(chacha_key) <= sizeof(hash), "Size of hash must be at least that of chacha_key");
    epee::mlocked<tools::scrubbed_arr<char, HASH_SIZE>> pwd_hash;
-    crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 0/*prehashed*/);
+    crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 0/*prehashed*/, 0/*height*/);
    for (uint64_t n = 1; n < kdf_rounds; ++n)
-      crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/);
+      crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/, 0/*height*/);
    memcpy(&unwrap(unwrap(key)), pwd_hash.data(), sizeof(key));
  }

+  // Same derivation as generate_chacha_key, except that the first
+  // cn_slow_hash call is made with prehashed = 1; the remaining
+  // kdf_rounds - 1 rounds re-hash the intermediate result with prehashed = 0.
  inline void generate_chacha_key_prehashed(const void *data, size_t size, chacha_key& key, uint64_t kdf_rounds) {
    static_assert(sizeof(chacha_key) <= sizeof(hash), "Size of hash must be at least that of chacha_key");
    epee::mlocked<tools::scrubbed_arr<char, HASH_SIZE>> pwd_hash;
-    crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 1/*prehashed*/);
+    crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 1/*prehashed*/, 0/*height*/);
    for (uint64_t n = 1; n < kdf_rounds; ++n)
-      crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/);
+      crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/, 0/*height*/);
    memcpy(&unwrap(unwrap(key)), pwd_hash.data(), sizeof(key));
  }

--- a/monero/crypto/hash-ops.h
+++ b/monero/crypto/hash-ops.h
@ -79,7 +79,7 @@ enum {
 };

 void cn_fast_hash(const void *data, size_t length, char *hash);
-void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed);
+void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height);

 void hash_extra_blake(const void *data, size_t length, char *hash);
 void hash_extra_groestl(const void *data, size_t length, char *hash);
--- a/monero/crypto/hash.h
+++ b/monero/crypto/hash.h
@ -71,12 +71,12 @@ namespace crypto {
    return h;
  }

+  // Typed wrapper over the C cn_slow_hash: writes the result into `hash`,
+  // passes prehashed = 0 and forwards `variant` and `height` (both default 0).
-  inline void cn_slow_hash(const void *data, std::size_t length, hash &hash, int variant = 0) {
-    cn_slow_hash(data, length, reinterpret_cast<char *>(&hash), variant, 0/*prehashed*/);
+  inline void cn_slow_hash(const void *data, std::size_t length, hash &hash, int variant = 0, uint64_t height = 0) {
+    cn_slow_hash(data, length, reinterpret_cast<char *>(&hash), variant, 0/*prehashed*/, height);
  }

+  // Typed wrapper over the C cn_slow_hash that passes prehashed = 1;
+  // `variant` and `height` are forwarded (both default 0).
-  inline void cn_slow_hash_prehashed(const void *data, std::size_t length, hash &hash, int variant = 0) {
-    cn_slow_hash(data, length, reinterpret_cast<char *>(&hash), variant, 1/*prehashed*/);
+  inline void cn_slow_hash_prehashed(const void *data, std::size_t length, hash &hash, int variant = 0, uint64_t height = 0) {
+    cn_slow_hash(data, length, reinterpret_cast<char *>(&hash), variant, 1/*prehashed*/, height);
  }

  inline void tree_hash(const hash *hashes, std::size_t count, hash &root_hash) {
--- a/monero/crypto/slow-hash.c
+++ b/monero/crypto/slow-hash.c
@ -39,6 +39,11 @@
 #include "hash-ops.h"
 #include "oaes_lib.h"
 #include "variant2_int_sqrt.h"
+#include "variant4_random_math.h"
+#include "CryptonightR_JIT.h"
+
+#include <errno.h>
+#include <string.h>

 #define MEMORY         (1 << 21) // 2MB scratchpad
 #define ITER           (1 << 20)
@ -47,8 +52,18 @@
 #define INIT_SIZE_BLK   8
 #define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)

-extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
-extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
+extern void aesb_single_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
+extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
+
+/* Prints `msg` followed by a newline to stderr and terminates the process:
+ * abort() in builds without NDEBUG, _exit(1) when NDEBUG is defined. */
+static void local_abort(const char *msg)
+{
+  fprintf(stderr, "%s\n", msg);
+#ifndef NDEBUG
+  abort();
+#else
+  _exit(1);
+#endif
+}

 #define VARIANT1_1(p) \
  do if (variant == 1) \
@ -109,69 +124,96 @@ extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *exp
    memcpy(b + AES_BLOCK_SIZE, state.hs.b + 64, AES_BLOCK_SIZE); \
    xor64(b + AES_BLOCK_SIZE, state.hs.b + 80); \
    xor64(b + AES_BLOCK_SIZE + 8, state.hs.b + 88); \
-    division_result = state.hs.w[12]; \
-    sqrt_result = state.hs.w[13]; \
+    division_result = SWAP64LE(state.hs.w[12]); \
+    sqrt_result = SWAP64LE(state.hs.w[13]); \
  } while (0)

+/* SSE2 shuffle step for variant >= 2: loads the three 16-byte chunks at
+ * base_ptr + (offset ^ 0x10 / 0x20 / 0x30), then stores chunk3 + _b1,
+ * chunk1 + _b and chunk2 + _a (64-bit lane additions) to the 0x10, 0x20 and
+ * 0x30 positions respectively. For variant >= 4, _c is additionally XORed
+ * with the loaded chunk1, chunk2 and chunk3 values.
+ * Expects variant, _a, _b, _b1 and _c in the enclosing scope. */
 #define VARIANT2_SHUFFLE_ADD_SSE2(base_ptr, offset) \
  do if (variant >= 2) \
  { \
-    const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
+    __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
    const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
    const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
    _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
    _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
    _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
+    if (variant >= 4) \
+    { \
+      chunk1 = _mm_xor_si128(chunk1, chunk2); \
+      _c = _mm_xor_si128(_c, chunk3); \
+      _c = _mm_xor_si128(_c, chunk1); \
+    } \
  } while (0)

+/* NEON shuffle step for variant >= 2: loads the three 16-byte chunks at
+ * base_ptr + (offset ^ 0x10 / 0x20 / 0x30), then stores chunk3 + _b1,
+ * chunk1 + _b and chunk2 + _a (64-bit lane additions) to the 0x10, 0x20 and
+ * 0x30 positions respectively. For variant >= 4, _c is additionally XORed
+ * with the loaded chunk1, chunk2 and chunk3 values.
+ * Expects variant, _a, _b, _b1 and _c in the enclosing scope. */
 #define VARIANT2_SHUFFLE_ADD_NEON(base_ptr, offset) \
  do if (variant >= 2) \
  { \
-    const uint64x2_t chunk1 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x10))); \
+    uint64x2_t chunk1 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x10))); \
    const uint64x2_t chunk2 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x20))); \
    const uint64x2_t chunk3 = vld1q_u64(U64((base_ptr) + ((offset) ^ 0x30))); \
    vst1q_u64(U64((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \
    vst1q_u64(U64((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \
    vst1q_u64(U64((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \
+    if (variant >= 4) \
+    { \
+      chunk1 = veorq_u64(chunk1, chunk2); \
+      _c = vreinterpretq_u8_u64(veorq_u64(vreinterpretq_u64_u8(_c), chunk3)); \
+      _c = vreinterpretq_u8_u64(veorq_u64(vreinterpretq_u64_u8(_c), chunk1)); \
+    } \
  } while (0)

+/* Portable (non-SIMD) shuffle step for variant >= 2. For the three 16-byte
+ * chunks at base_ptr + (offset ^ 0x10 / 0x20 / 0x30) it stores, per 64-bit
+ * lane and using the values read before any store:
+ *   chunk1 <- old chunk3 + b1   (b1 read from b + 16)
+ *   chunk3 <- old chunk2 + a0   (a0 read from a_)
+ *   chunk2 <- old chunk1 + b0   (b0 read from b)
+ * For variant >= 4, `out` is additionally XORed with the old chunk1, chunk2
+ * and chunk3 values.
+ * Every load goes through SWAP64LE / memcpy_swap64le exactly once and every
+ * store converts the finished sum exactly once (these helpers are assumed to
+ * translate between little-endian storage and host order).
+ * Expects `variant` and the byte buffer `b` in the enclosing scope. */
-#define VARIANT2_PORTABLE_SHUFFLE_ADD(base_ptr, offset) \
+#define VARIANT2_PORTABLE_SHUFFLE_ADD(out, a_, base_ptr, offset) \
  do if (variant >= 2) \
  { \
    uint64_t* chunk1 = U64((base_ptr) + ((offset) ^ 0x10)); \
    uint64_t* chunk2 = U64((base_ptr) + ((offset) ^ 0x20)); \
    uint64_t* chunk3 = U64((base_ptr) + ((offset) ^ 0x30)); \
    \
-    const uint64_t chunk1_old[2] = { chunk1[0], chunk1[1] }; \
+    uint64_t chunk1_old[2] = { SWAP64LE(chunk1[0]), SWAP64LE(chunk1[1]) }; \
+    const uint64_t chunk2_old[2] = { SWAP64LE(chunk2[0]), SWAP64LE(chunk2[1]) }; \
+    const uint64_t chunk3_old[2] = { SWAP64LE(chunk3[0]), SWAP64LE(chunk3[1]) }; \
    \
    uint64_t b1[2]; \
-    memcpy(b1, b + 16, 16); \
-    chunk1[0] = chunk3[0] + b1[0]; \
-    chunk1[1] = chunk3[1] + b1[1]; \
+    memcpy_swap64le(b1, b + 16, 2); \
+    chunk1[0] = SWAP64LE(chunk3_old[0] + b1[0]); \
+    chunk1[1] = SWAP64LE(chunk3_old[1] + b1[1]); \
    \
    uint64_t a0[2]; \
-    memcpy(a0, a, 16); \
-    chunk3[0] = chunk2[0] + a0[0]; \
-    chunk3[1] = chunk2[1] + a0[1]; \
+    memcpy_swap64le(a0, a_, 2); \
+    chunk3[0] = SWAP64LE(chunk2_old[0] + a0[0]); \
+    chunk3[1] = SWAP64LE(chunk2_old[1] + a0[1]); \
    \
    uint64_t b0[2]; \
-    memcpy(b0, b, 16); \
-    chunk2[0] = chunk1_old[0] + b0[0]; \
-    chunk2[1] = chunk1_old[1] + b0[1]; \
+    memcpy_swap64le(b0, b, 2); \
+    /* chunk1_old[] was already converted when it was loaded above; only the sum is converted for the store */ \
+    chunk2[0] = SWAP64LE(chunk1_old[0] + b0[0]); \
+    chunk2[1] = SWAP64LE(chunk1_old[1] + b0[1]); \
+    if (variant >= 4) \
+    { \
+      uint64_t out_copy[2]; \
+      memcpy_swap64le(out_copy, out, 2); \
+      chunk1_old[0] ^= chunk2_old[0]; \
+      chunk1_old[1] ^= chunk2_old[1]; \
+      out_copy[0] ^= chunk3_old[0]; \
+      out_copy[1] ^= chunk3_old[1]; \
+      out_copy[0] ^= chunk1_old[0]; \
+      out_copy[1] ^= chunk1_old[1]; \
+      memcpy_swap64le(out, out_copy, 2); \
+    } \
  } while (0)

+/* Division part of the variant 2 integer math. Expands to statements (not a
+ * single expression): XORs the first 64-bit word of `b` with
+ * division_result ^ (sqrt_result << 32), then recomputes division_result
+ * from the two 64-bit words at `ptr` -- low 32 bits are the quotient, high
+ * 32 bits the remainder of word[1] / divisor, where
+ * divisor = (word[0] + (uint32_t)(sqrt_result << 1)) | 0x80000001 -- and
+ * finally declares sqrt_input = word[0] + division_result for the following
+ * square-root step. Declares `tmpx` and `sqrt_input` in the enclosing scope
+ * and expects division_result and sqrt_result to exist there. */
 #define VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr) \
-  ((uint64_t*)(b))[0] ^= division_result ^ (sqrt_result << 32); \
+  uint64_t tmpx = division_result ^ (sqrt_result << 32); \
+  ((uint64_t*)(b))[0] ^= SWAP64LE(tmpx); \
  { \
-    const uint64_t dividend = ((uint64_t*)(ptr))[1]; \
-    const uint32_t divisor = (((uint64_t*)(ptr))[0] + (uint32_t)(sqrt_result << 1)) | 0x80000001UL; \
+    const uint64_t dividend = SWAP64LE(((uint64_t*)(ptr))[1]); \
+    const uint32_t divisor = (SWAP64LE(((uint64_t*)(ptr))[0]) + (uint32_t)(sqrt_result << 1)) | 0x80000001UL; \
    division_result = ((uint32_t)(dividend / divisor)) + \
                     (((uint64_t)(dividend % divisor)) << 32); \
  } \
-  const uint64_t sqrt_input = ((uint64_t*)(ptr))[0] + division_result
+  const uint64_t sqrt_input = SWAP64LE(((uint64_t*)(ptr))[0]) + division_result

 #define VARIANT2_INTEGER_MATH_SSE2(b, ptr) \
-  do if (variant >= 2) \
+  do if ((variant == 2) || (variant == 3)) \
  { \
    VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr); \
    VARIANT2_INTEGER_MATH_SQRT_STEP_SSE2(); \
@ -181,7 +223,7 @@ extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *exp
 #if defined DBL_MANT_DIG && (DBL_MANT_DIG >= 50)
  // double precision floating point type has enough bits of precision on current platform
  #define VARIANT2_PORTABLE_INTEGER_MATH(b, ptr) \
-    do if (variant >= 2) \
+    do if ((variant == 2) || (variant == 3)) \
    { \
      VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr); \
      VARIANT2_INTEGER_MATH_SQRT_STEP_FP64(); \
@ -191,7 +233,7 @@ extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *exp
  // double precision floating point type is not good enough on current platform
  // fall back to the reference code (integer only)
  #define VARIANT2_PORTABLE_INTEGER_MATH(b, ptr) \
-    do if (variant >= 2) \
+    do if ((variant == 2) || (variant == 3)) \
    { \
      VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr); \
      VARIANT2_INTEGER_MATH_SQRT_STEP_REF(); \
@ -199,18 +241,80 @@ extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *exp
 #endif

+/* Variants 2 and 3 only: XORs `d` into the block at long_state + (j ^ 0x10),
+ * then XORs the block at long_state + (j ^ 0x20) into `d`.
+ * Expects variant, long_state, j and d in the enclosing scope. */
 #define VARIANT2_2_PORTABLE() \
-    if (variant >= 2) { \
+    if (variant == 2 || variant == 3) { \
      xor_blocks(long_state + (j ^ 0x10), d); \
      xor_blocks(d, long_state + (j ^ 0x20)); \
    }

+/* Variants 2 and 3 only: XORs hi/lo into the two 64-bit words at
+ * hp_state + (j ^ 0x10), then XORs the two 64-bit words at
+ * hp_state + (j ^ 0x20) into hi/lo. Memory values pass through SWAP64LE.
+ * Expects variant, hp_state, j, hi and lo in the enclosing scope. */
 #define VARIANT2_2() \
-  do if (variant >= 2) \
+  do if (variant == 2 || variant == 3) \
  { \
-    *U64(hp_state + (j ^ 0x10)) ^= hi; \
-    *(U64(hp_state + (j ^ 0x10)) + 1) ^= lo; \
-    hi ^= *U64(hp_state + (j ^ 0x20)); \
-    lo ^= *(U64(hp_state + (j ^ 0x20)) + 1); \
+    *U64(hp_state + (j ^ 0x10)) ^= SWAP64LE(hi); \
+    *(U64(hp_state + (j ^ 0x10)) + 1) ^= SWAP64LE(lo); \
+    hi ^= SWAP64LE(*U64(hp_state + (j ^ 0x20))); \
+    lo ^= SWAP64LE(*(U64(hp_state + (j ^ 0x20)) + 1)); \
+  } while (0)
+
+/* Loads one v4_reg: copies sizeof(v4_reg) bytes from `src` into `*dst` with
+ * memcpy, then passes the value through SWAP32LE or SWAP64LE depending on
+ * whether v4_reg is 32 or 64 bits wide. */
+#define V4_REG_LOAD(dst, src) \
+  do { \
+    memcpy((dst), (src), sizeof(v4_reg)); \
+    if (sizeof(v4_reg) == sizeof(uint32_t)) \
+      *(dst) = SWAP32LE(*(dst)); \
+    else \
+      *(dst) = SWAP64LE(*(dst)); \
+  } while (0)
+
+/* Declares the random-math working set in the enclosing scope: the register
+ * array r[9], the instruction buffer `code`, and `jit` (result of
+ * use_v4_jit()). Because it expands to declarations it must be placed where
+ * declarations are allowed.
+ * For variant >= 4 it then loads r[0..3] from consecutive v4_reg-sized
+ * slots starting at state.hs.w + 12, builds the program with
+ * v4_random_math_init(code, height) and, when `jit` is set, generates
+ * machine code into hp_jitfunc (4096-byte limit), calling local_abort if
+ * generation fails.
+ * Expects variant, state, height and hp_jitfunc in the enclosing scope. */
+#define VARIANT4_RANDOM_MATH_INIT() \
+  v4_reg r[9]; \
+  struct V4_Instruction code[NUM_INSTRUCTIONS_MAX + 1]; \
+  int jit = use_v4_jit(); \
+  do if (variant >= 4) \
+  { \
+    for (int i = 0; i < 4; ++i) \
+      V4_REG_LOAD(r + i, (uint8_t*)(state.hs.w + 12) + sizeof(v4_reg) * i); \
+    v4_random_math_init(code, height); \
+    if (jit) \
+    { \
+      int ret = v4_generate_JIT_code(code, hp_jitfunc, 4096); \
+      if (ret < 0) \
+        local_abort("Error generating CryptonightR code"); \
+    } \
+  } while (0)
+
+/* One random-math round for variant >= 4:
+ *  1. XORs the first 8 bytes of `b` with a value built from r[0] + r[1] and
+ *     r[2] + r[3].
+ *  2. Loads r[4], r[5] from the two 64-bit halves of `a`, r[6] from `_b`,
+ *     and r[7], r[8] from the two 64-bit halves of `_b1`.
+ *  3. Runs the program on r, through hp_jitfunc when `jit` is set and
+ *     through v4_random_math(code, r) otherwise.
+ *  4. XORs the first 8 bytes of `a` with r[2]/r[3] and the next 8 bytes with
+ *     r[0]/r[1].
+ * Buffers are accessed via memcpy and values pass through SWAP64LE.
+ * Expects variant, jit, code and hp_jitfunc in the enclosing scope. */
+#define VARIANT4_RANDOM_MATH(a, b, r, _b, _b1) \
+  do if (variant >= 4) \
+  { \
+    uint64_t t[2]; \
+    memcpy(t, b, sizeof(uint64_t)); \
+    \
+    if (sizeof(v4_reg) == sizeof(uint32_t)) \
+      t[0] ^= SWAP64LE((r[0] + r[1]) | ((uint64_t)(r[2] + r[3]) << 32)); \
+    else \
+      t[0] ^= SWAP64LE((r[0] + r[1]) ^ (r[2] + r[3])); \
+    \
+    memcpy(b, t, sizeof(uint64_t)); \
+    \
+    V4_REG_LOAD(r + 4, a); \
+    V4_REG_LOAD(r + 5, (uint64_t*)(a) + 1); \
+    V4_REG_LOAD(r + 6, _b); \
+    V4_REG_LOAD(r + 7, _b1); \
+    V4_REG_LOAD(r + 8, (uint64_t*)(_b1) + 1); \
+    \
+    if (jit) \
+      (*hp_jitfunc)(r); \
+    else \
+      v4_random_math(code, r); \
+    \
+    memcpy(t, a, sizeof(uint64_t) * 2); \
+    \
+    if (sizeof(v4_reg) == sizeof(uint32_t)) { \
+      t[0] ^= SWAP64LE(r[2] | ((uint64_t)(r[3]) << 32)); \
+      t[1] ^= SWAP64LE(r[0] | ((uint64_t)(r[1]) << 32)); \
+    } else { \
+      t[0] ^= SWAP64LE(r[2] ^ r[3]); \
+      t[1] ^= SWAP64LE(r[0] ^ r[1]); \
+    } \
+    memcpy(a, t, sizeof(uint64_t) * 2); \
  } while (0)


@ -297,6 +401,7 @@ extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *exp
  p = U64(&hp_state[j]); \
  b[0] = p[0]; b[1] = p[1]; \
  VARIANT2_INTEGER_MATH_SSE2(b, c); \
+  VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \
  __mul(); \
  VARIANT2_2(); \
  VARIANT2_SHUFFLE_ADD_SSE2(hp_state, j); \
@ -328,6 +433,9 @@ union cn_slow_hash_state

 THREADV uint8_t *hp_state = NULL;
 THREADV int hp_allocated = 0;
+THREADV v4_random_math_JIT_func hp_jitfunc = NULL;
+THREADV uint8_t *hp_jitfunc_memory = NULL;
+THREADV int hp_jitfunc_allocated = 0;

 #if defined(_MSC_VER)
 #define cpuid(info,x)    __cpuidex(info,x,0)
@ -386,6 +494,31 @@ STATIC INLINE int force_software_aes(void)
  return use;
 }

+volatile int use_v4_jit_flag = -1;
+
+/* Reports whether the CryptonightR JIT should be used.
+ * On x86-64 the answer is taken from the MONERO_USE_CNV4_JIT environment
+ * variable the first time it is needed and cached in use_v4_jit_flag
+ * (-1 = not decided yet): unset, "0" or "no" give 0, any other value gives 1.
+ * On every other architecture the result is always 0. */
+STATIC INLINE int use_v4_jit(void)
+{
+#if defined(__x86_64__)
+  if (use_v4_jit_flag == -1)
+  {
+    const char *setting = getenv("MONERO_USE_CNV4_JIT");
+    const int disabled = (setting == NULL) || (strcmp(setting, "0") == 0) || (strcmp(setting, "no") == 0);
+    use_v4_jit_flag = disabled ? 0 : 1;
+  }
+  return use_v4_jit_flag;
+#else
+  return 0;
+#endif
+}
+
 STATIC INLINE int check_aes_hw(void)
 {
    int cpuid_results[4];
@ -637,6 +770,33 @@ void slow_hash_allocate_state(void)
        hp_allocated = 0;
        hp_state = (uint8_t *) malloc(MEMORY);
    }
+
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    hp_jitfunc_memory = (uint8_t *) VirtualAlloc(hp_jitfunc_memory, 4096 + 4095,
+                                                 MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
+#else
+#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \
+  defined(__DragonFly__) || defined(__NetBSD__)
+    hp_jitfunc_memory = mmap(0, 4096 + 4095, PROT_READ | PROT_WRITE | PROT_EXEC,
+                    MAP_PRIVATE | MAP_ANON, 0, 0);
+#else
+    hp_jitfunc_memory = mmap(0, 4096 + 4095, PROT_READ | PROT_WRITE | PROT_EXEC,
+                    MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+#endif
+    if(hp_jitfunc_memory == MAP_FAILED)
+        hp_jitfunc_memory = NULL;
+#endif
+    hp_jitfunc_allocated = 1;
+    if (hp_jitfunc_memory == NULL)
+    {
+        hp_jitfunc_allocated = 0;
+        hp_jitfunc_memory = malloc(4096 + 4095);
+    }
+    hp_jitfunc = (v4_random_math_JIT_func)((size_t)(hp_jitfunc_memory + 4095) & ~4095);
+#if !(defined(_MSC_VER) || defined(__MINGW32__))
+    mprotect(hp_jitfunc, 4096, PROT_READ | PROT_WRITE | PROT_EXEC);
+#endif
 }

 /**
@ -659,8 +819,22 @@ void slow_hash_free_state(void)
 #endif
    }

+    if(!hp_jitfunc_allocated)
+        free(hp_jitfunc_memory);
+    else
+    {
+#if defined(_MSC_VER) || defined(__MINGW32__)
+        VirtualFree(hp_jitfunc_memory, 0, MEM_RELEASE);
+#else
+        munmap(hp_jitfunc_memory, 4096 + 4095);
+#endif
+    }
+
    hp_state = NULL;
    hp_allocated = 0;
+    hp_jitfunc = NULL;
+    hp_jitfunc_memory = NULL;
+    hp_jitfunc_allocated = 0;
 }

 /**
@ -693,7 +867,7 @@ void slow_hash_free_state(void)
 * @param length the length in bytes of the data
 * @param hash a pointer to a buffer in which the final 256 bit hash will be stored
 */
-void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed)
+void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height)
 {
    RDATA_ALIGN16 uint8_t expandedKey[240];  /* These buffers are aligned to use later with SSE functions */

@ -729,6 +903,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int

    VARIANT1_INIT64();
    VARIANT2_INIT64();
+    VARIANT4_RANDOM_MATH_INIT();

    /* CryptoNight Step 2:  Iteratively encrypt the results from Keccak to fill
     * the 2MB large random access buffer.
@ -900,6 +1075,7 @@ union cn_slow_hash_state
  p = U64(&hp_state[j]); \
  b[0] = p[0]; b[1] = p[1]; \
  VARIANT2_PORTABLE_INTEGER_MATH(b, c); \
+  VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \
  __mul(); \
  VARIANT2_2(); \
  VARIANT2_SHUFFLE_ADD_NEON(hp_state, j); \
@ -1062,7 +1238,7 @@ STATIC INLINE void aligned_free(void *ptr)
 }
 #endif /* FORCE_USE_HEAP */

-void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed)
+void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height)
 {
    RDATA_ALIGN16 uint8_t expandedKey[240];

@ -1099,6 +1275,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int

    VARIANT1_INIT64();
    VARIANT2_INIT64();
+    VARIANT4_RANDOM_MATH_INIT();

    /* CryptoNight Step 2:  Iteratively encrypt the results from Keccak to fill
     * the 2MB large random access buffer.
@ -1277,10 +1454,11 @@ STATIC INLINE void xor_blocks(uint8_t* a, const uint8_t* b)
  U64(a)[1] ^= U64(b)[1];
 }

-void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed)
+void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height)
 {
    uint8_t text[INIT_SIZE_BYTE];
    uint8_t a[AES_BLOCK_SIZE];
+    uint8_t a1[AES_BLOCK_SIZE];
    uint8_t b[AES_BLOCK_SIZE * 2];
    uint8_t c[AES_BLOCK_SIZE];
    uint8_t c1[AES_BLOCK_SIZE];
@ -1316,6 +1494,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int

    VARIANT1_INIT64();
    VARIANT2_INIT64();
+    VARIANT4_RANDOM_MATH_INIT();

    // use aligned data
    memcpy(expandedKey, aes_ctx->key->exp_data, aes_ctx->key->exp_data_len);
@ -1339,10 +1518,10 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
      // Iteration 1
      j = state_index(a);
      p = &long_state[j];
-      aesb_single_round(p, p, a);
-      copy_block(c1, p);
+      aesb_single_round(p, c1, a);

-      VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
+      VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
+      copy_block(p, c1);
      xor_blocks(p, b);
      VARIANT1_1(p);

@ -1351,13 +1530,15 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
      p = &long_state[j];
      copy_block(c, p);

+      copy_block(a1, a);
      VARIANT2_PORTABLE_INTEGER_MATH(c, c1);
+      VARIANT4_RANDOM_MATH(a1, c, r, b, b + AES_BLOCK_SIZE);
      mul(c1, c, d);
      VARIANT2_2_PORTABLE();
-      VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
-      sum_half_blocks(a, d);
-      swap_blocks(a, c);
-      xor_blocks(a, c);
+      VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
+      sum_half_blocks(a1, d);
+      swap_blocks(a1, c);
+      xor_blocks(a1, c);
      VARIANT1_2(U64(c) + 1);
      copy_block(p, c);

@ -1365,6 +1546,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
        copy_block(b + AES_BLOCK_SIZE, b);
      }
      copy_block(b, c1);
+      copy_block(a, a1);
    }

    memcpy(text, state.init, INIT_SIZE_BYTE);
@ -1408,10 +1590,7 @@ static void (*const extra_hashes[4])(const void *, size_t, char *) = {
  hash_extra_blake, hash_extra_groestl, hash_extra_jh, hash_extra_skein
 };

-extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
-extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
-
+// Maps the first 64-bit word of `a` (after SWAP64LE) to an index: divides it
+// by AES_BLOCK_SIZE and masks the result with count - 1 (so `count` is
+// presumably a power of two -- TODO confirm against callers).
-static size_t e2i(const uint8_t* a, size_t count) { return (*((uint64_t*)a) / AES_BLOCK_SIZE) & (count - 1); }
+static size_t e2i(const uint8_t* a, size_t count) { return (SWAP64LE(*((uint64_t*)a)) / AES_BLOCK_SIZE) & (count - 1); }

 static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) {
  uint64_t a0, b0;
@ -1478,7 +1657,7 @@ union cn_slow_hash_state {
 };
 #pragma pack(pop)

-void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed) {
+void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height) {
 #ifndef FORCE_USE_HEAP
  uint8_t long_state[MEMORY];
 #else
@ -1488,6 +1667,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
  union cn_slow_hash_state state;
  uint8_t text[INIT_SIZE_BYTE];
  uint8_t a[AES_BLOCK_SIZE];
+  uint8_t a1[AES_BLOCK_SIZE];
  uint8_t b[AES_BLOCK_SIZE * 2];
  uint8_t c1[AES_BLOCK_SIZE];
  uint8_t c2[AES_BLOCK_SIZE];
@ -1507,6 +1687,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int

  VARIANT1_PORTABLE_INIT();
  VARIANT2_PORTABLE_INIT();
+  VARIANT4_RANDOM_MATH_INIT();

  oaes_key_import_data(aes_ctx, aes_key, AES_KEY_SIZE);
  for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) {
@ -1530,7 +1711,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
    j = e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE;
    copy_block(c1, &long_state[j]);
    aesb_single_round(c1, c1, a);
-    VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
+    VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
    copy_block(&long_state[j], c1);
    xor_blocks(&long_state[j], b);
    assert(j == e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE);
@ -1538,22 +1719,22 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int
    /* Iteration 2 */
    j = e2i(c1, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE;
    copy_block(c2, &long_state[j]);
+    copy_block(a1, a);
    VARIANT2_PORTABLE_INTEGER_MATH(c2, c1);
+    VARIANT4_RANDOM_MATH(a1, c2, r, b, b + AES_BLOCK_SIZE);
    mul(c1, c2, d);
    VARIANT2_2_PORTABLE();
-    VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j);
-    swap_blocks(a, c1);
-    sum_half_blocks(c1, d);
-    swap_blocks(c1, c2);
-    xor_blocks(c1, c2);
+    VARIANT2_PORTABLE_SHUFFLE_ADD(c1, a, long_state, j);
+    sum_half_blocks(a1, d);
+    swap_blocks(a1, c2);
+    xor_blocks(a1, c2);
    VARIANT1_2(c2 + 8);
    copy_block(&long_state[j], c2);
-    assert(j == e2i(a, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE);
    if (variant >= 2) {
      copy_block(b + AES_BLOCK_SIZE, b);
    }
-    copy_block(b, a);
-    copy_block(a, c1);
+    copy_block(b, c1);
+    copy_block(a, a1);
  }

  memcpy(text, state.init, INIT_SIZE_BYTE);
--- a/monero/crypto/variant4_random_math.h
+++ b/monero/crypto/variant4_random_math.h
@ -0,0 +1,441 @@
+#ifndef VARIANT4_RANDOM_MATH_H
+#define VARIANT4_RANDOM_MATH_H
+
+// Register size can be configured to either 32 bit (uint32_t) or 64 bit (uint64_t)
+typedef uint32_t v4_reg;
+
+enum V4_Settings
+{
+	// Target latency of the generated program on the modelled CPU: 45 cycles, which is equivalent to 15 dependent multiplications at 3 cycles each
+	TOTAL_LATENCY = 15 * 3,
+	
+	// Always generate at least 60 instructions
+	NUM_INSTRUCTIONS_MIN = 60,
+
+	// Never generate more than 70 instructions (final RET instruction doesn't count here, so a code buffer needs NUM_INSTRUCTIONS_MAX + 1 entries)
+	NUM_INSTRUCTIONS_MAX = 70,
+
+	// Available ALUs for MUL
+	// Modern CPUs typically have only 1 ALU which can do multiplications
+	ALU_COUNT_MUL = 1,
+
+	// Total available ALUs
+	// Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code
+	ALU_COUNT = 3,
+};
+
+enum V4_InstructionList
+{
+	MUL,	// a*b
+	ADD,	// a+b + C, C is an unsigned 32-bit constant
+	SUB,	// a-b
+	ROR,	// rotate right "a" by "b modulo the register width in bits" ("b & 31" for the default 32-bit v4_reg)
+	ROL,	// rotate left "a" by "b modulo the register width in bits" ("b & 31" for the default 32-bit v4_reg)
+	XOR,	// a^b
+	RET,	// finish execution
+	V4_INSTRUCTION_COUNT = RET, // number of opcodes the generator can pick (MUL..XOR); RET is excluded and only appended as the terminator
+};
+
+// V4_InstructionDefinition gives the widths of the fields packed into one random byte (low bits to high bits: opcode, dst_index, src_index) when generating code from random data
+// Every random sequence of bytes is a valid code
+//
+// There are 9 registers in total:
+// - 4 variable registers
+// - 5 constant registers initialized from loop variables
+// Only the 4 variable registers can be a destination, which is why dst_index is 2 bits; the 3-bit src_index selects R0-R7, and R8 is only used when the generator substitutes it for a source equal to the destination
+enum V4_InstructionDefinition
+{
+	V4_OPCODE_BITS = 3, // bits 0-2 of the random byte
+	V4_DST_INDEX_BITS = 2, // bits 3-4 of the random byte
+	V4_SRC_INDEX_BITS = 3, // bits 5-7 of the random byte
+};
+
+struct V4_Instruction
+{
+	uint8_t opcode; // a V4_InstructionList value; RET terminates the program
+	uint8_t dst_index; // index of the register that is read and overwritten
+	uint8_t src_index; // index of the register that is only read
+	uint32_t C; // constant added by ADD; the generator sets it to 0 for every other opcode
+};
+
+#ifndef FORCEINLINE
+#if defined(__GNUC__)
+#define FORCEINLINE __attribute__((always_inline)) inline
+#elif defined(_MSC_VER)
+#define FORCEINLINE __forceinline
+#else
+#define FORCEINLINE inline
+#endif
+#endif
+
+#ifndef UNREACHABLE_CODE
+#if defined(__GNUC__)
+#define UNREACHABLE_CODE __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define UNREACHABLE_CODE __assume(false)
+#else
+#define UNREACHABLE_CODE
+#endif
+#endif
+
+// Executes the program "code" on the register file "r" in place and returns at the first RET. The interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU:
+// every switch-case will point to the same destination on every iteration of Cryptonight main loop
+// At most code[0..69] is read; "r" is indexed by each instruction's dst_index and src_index, so it must cover every register index the program uses
+// This is about as fast as it can get without using low-level machine code generation
+static FORCEINLINE void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
+{
+	enum
+	{
+		REG_BITS = sizeof(v4_reg) * 8, // width of one register in bits; rotation counts are reduced modulo this
+	};
+
+#define V4_EXEC(i) \
+	{ \
+		const struct V4_Instruction* op = code + i; \
+		const v4_reg src = r[op->src_index]; \
+		v4_reg* dst = r + op->dst_index; \
+		switch (op->opcode) \
+		{ \
+		case MUL: \
+			*dst *= src; \
+			break; \
+		case ADD: \
+			*dst += src + op->C; \
+			break; \
+		case SUB: \
+			*dst -= src; \
+			break; \
+		case ROR: \
+			{ \
+				const uint32_t shift = src % REG_BITS; \
+				*dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \
+			} \
+			break; \
+		case ROL: \
+			{ \
+				const uint32_t shift = src % REG_BITS; \
+				*dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \
+			} \
+			break; \
+		case XOR: \
+			*dst ^= src; \
+			break; \
+		case RET: \
+			return; \
+		default: \
+			UNREACHABLE_CODE; \
+			break; \
+		} \
+	}
+
+#define V4_EXEC_10(j) \
+	V4_EXEC(j + 0) \
+	V4_EXEC(j + 1) \
+	V4_EXEC(j + 2) \
+	V4_EXEC(j + 3) \
+	V4_EXEC(j + 4) \
+	V4_EXEC(j + 5) \
+	V4_EXEC(j + 6) \
+	V4_EXEC(j + 7) \
+	V4_EXEC(j + 8) \
+	V4_EXEC(j + 9)
+
+	// Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency
+	// I've checked all block heights < 10,000,000 and here is the distribution of program sizes:
+	//
+	// 60      27960
+	// 61      105054
+	// 62      2452759
+	// 63      5115997
+	// 64      1022269
+	// 65      1109635
+	// 66      153145
+	// 67      8550
+	// 68      4529
+	// 69      102
+
+	// Unroll 70 instructions here (NUM_INSTRUCTIONS_MAX); a RET inside the program returns from the function before the remaining slots are touched
+	V4_EXEC_10(0);		// instructions 0-9
+	V4_EXEC_10(10);		// instructions 10-19
+	V4_EXEC_10(20);		// instructions 20-29
+	V4_EXEC_10(30);		// instructions 30-39
+	V4_EXEC_10(40);		// instructions 40-49
+	V4_EXEC_10(50);		// instructions 50-59
+	V4_EXEC_10(60);		// instructions 60-69
+
+#undef V4_EXEC_10
+#undef V4_EXEC
+}
+
+// Makes sure "bytes_needed" bytes can be read from "data" starting at *data_index;
+// when they cannot, the buffer is regenerated and reading restarts at offset 0
+static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size)
+{
+	const size_t end_of_request = *data_index + bytes_needed;
+	if (end_of_request <= data_size)
+	{
+		// Enough unread bytes remain, nothing to do
+		return;
+	}
+
+	// Overwrite the buffer with hash_extra_blake of its current contents and rewind
+	hash_extra_blake(data, data_size, (char*) data);
+	*data_index = 0;
+}
+
+// Generates the random math program for block "height": as many random math operations as possible with given latency and ALU restrictions; the byte stream driving it is seeded only from "height"
+// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions (the program plus the terminating RET); returns the number of instructions written, not counting that RET
+static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_t height)
+{
+	// MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle
+	// These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake
+	//
+	// AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors
+	// Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors
+	// AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same
+	// Source: https://www.agner.org/optimize/instruction_tables.pdf
+	const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 }; // indexed by opcode: MUL, ADD, SUB, ROR, ROL, XOR
+
+	// Instruction latencies for theoretical ASIC implementation
+	const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 }; // same opcode order as op_latency
+
+	// Available ALUs for each instruction
+	const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT };
+
+	int8_t data[32]; // pool of random bytes; seeded below and regenerated by check_data() whenever it runs out
+	memset(data, 0, sizeof(data));
+	uint64_t tmp = SWAP64LE(height); // bytes 0-7 of the seed come from the height (SWAP64LE presumably fixes the byte order to little-endian; confirm against its definition)
+	memcpy(data, &tmp, sizeof(uint64_t));
+	data[20] = 0xda; // change seed
+
+	// Set data_index past the last byte in data
+	// to trigger full data update with blake hash
+	// before we start using it
+	size_t data_index = sizeof(data);
+
+	int code_size; // number of instructions emitted so far in the current attempt (RET excluded)
+
+	// There is a small chance (1.8%) that register R8 won't be used in the generated program
+	// So we keep track of it and try again if it's not used
+	bool r8_used;
+	do {
+		int latency[9]; // per register: cycle at which its latest value is ready on the modelled CPU
+		int asic_latency[9]; // per register: the same for the unconstrained ASIC model
+
+		// Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution
+		// byte 0: current value of the destination register
+		// byte 1: instruction opcode
+		// byte 2: current value of the source register
+		//
+		// Registers R4-R8 are constant and are treated as having the same value because when we do
+		// the same operation twice with two constant source registers, it can be optimized into a single operation
+		uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF };
+
+		bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT]; // alu_busy[cycle][alu]: an instruction starts on that ALU in that cycle
+		bool is_rotation[V4_INSTRUCTION_COUNT]; // lookup table: true only for ROR and ROL
+		bool rotated[4]; // rotated[r]: the last instruction emitted with destination Rr was a rotation
+		int rotate_count = 0; // number of rotations emitted so far in this attempt
+
+		memset(latency, 0, sizeof(latency));
+		memset(asic_latency, 0, sizeof(asic_latency));
+		memset(alu_busy, 0, sizeof(alu_busy));
+		memset(is_rotation, 0, sizeof(is_rotation));
+		memset(rotated, 0, sizeof(rotated));
+		is_rotation[ROR] = true;
+		is_rotation[ROL] = true;
+
+		int num_retries = 0; // candidates rejected because they would finish after TOTAL_LATENCY
+		code_size = 0;
+
+		int total_iterations = 0;
+		r8_used = false;
+
+		// Generate random code to achieve minimal required latency for our abstract CPU
+		// Try to get this latency for all 4 registers
+		while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64))
+		{
+			// Fail-safe to guarantee loop termination
+			++total_iterations;
+			if (total_iterations > 256)
+				break;
+
+			check_data(&data_index, 1, data, sizeof(data));
+
+			const uint8_t c = ((uint8_t*)data)[data_index++]; // one random byte encodes opcode, dst_index and src_index
+
+			// MUL = opcodes 0-2
+			// ADD = opcode 3
+			// SUB = opcode 4
+			// ROR/ROL = opcode 5, shift direction is selected randomly
+			// XOR = opcodes 6-7
+			uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1);
+			if (opcode == 5)
+			{
+				check_data(&data_index, 1, data, sizeof(data));
+				opcode = (data[data_index++] >= 0) ? ROR : ROL; // data is int8_t, so the sign of the next byte picks the direction
+			}
+			else if (opcode >= 6)
+			{
+				opcode = XOR;
+			}
+			else
+			{
+				opcode = (opcode <= 2) ? MUL : (opcode - 2);
+			}
+
+			uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1);
+			uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1);
+
+			const int a = dst_index; // destination register
+			int b = src_index; // source register (may be redirected to R8 below)
+
+			// Don't do ADD/SUB/XOR with the same register
+			if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b))
+			{
+				// Use register R8 as source instead
+				b = 8;
+				src_index = 8;
+			}
+
+			// Don't do rotation with the same destination twice because it's equal to a single rotation
+			if (is_rotation[opcode] && rotated[a])
+			{
+				continue;
+			}
+
+			// Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized:
+			// 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations
+			// 2xXOR(a, b) = NOP
+			if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16)))
+			{
+				continue;
+			}
+
+			// Find which ALU is available (and when) for this instruction
+			int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b];
+			int alu_index = -1; // stays -1 when no ALU slot is free before TOTAL_LATENCY
+			while (next_latency < TOTAL_LATENCY)
+			{
+				for (int i = op_ALUs[opcode] - 1; i >= 0; --i)
+				{
+					if (!alu_busy[next_latency][i])
+					{
+						// ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check
+						if ((opcode == ADD) && alu_busy[next_latency + 1][i])
+						{
+							continue;
+						}
+
+						// Rotation can only start when previous rotation is finished, so do an additional availability check
+						if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode]))
+						{
+							continue;
+						}
+
+						alu_index = i;
+						break;
+					}
+				}
+				if (alu_index >= 0)
+				{
+					break;
+				}
+				++next_latency;
+			}
+
+			// Don't generate instructions that leave some register unchanged for more than 7 cycles
+			if (next_latency > latency[a] + 7)
+			{
+				continue;
+			}
+
+			next_latency += op_latency[opcode]; // from here on next_latency is the cycle at which the result is ready
+
+			if (next_latency <= TOTAL_LATENCY)
+			{
+				if (is_rotation[opcode])
+				{
+					++rotate_count;
+				}
+
+				// Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined
+				alu_busy[next_latency - op_latency[opcode]][alu_index] = true;
+				latency[a] = next_latency;
+
+				// ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple
+				asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode];
+
+				rotated[a] = is_rotation[opcode];
+
+				inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16);
+
+				code[code_size].opcode = opcode;
+				code[code_size].dst_index = dst_index;
+				code[code_size].src_index = src_index;
+				code[code_size].C = 0;
+
+				if (src_index == 8)
+				{
+					r8_used = true;
+				}
+
+				if (opcode == ADD)
+				{
+					// ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too
+					alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true;
+
+					// ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C"
+					check_data(&data_index, sizeof(uint32_t), data, sizeof(data));
+					uint32_t t;
+					memcpy(&t, data + data_index, sizeof(uint32_t));
+					code[code_size].C = SWAP32LE(t);
+					data_index += sizeof(uint32_t);
+				}
+
+				++code_size;
+				if (code_size >= NUM_INSTRUCTIONS_MIN)
+				{
+					break;
+				}
+			}
+			else
+			{
+				++num_retries;
+			}
+		}
+
+		// ASIC has more execution resources and can extract as much parallelism from the code as possible
+		// We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC
+		// Get this latency for at least 1 of the 4 registers
+		const int prev_code_size = code_size; // size of the randomly generated part, used to cycle through the padding pattern
+		while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
+		{
+			int min_idx = 0;
+			int max_idx = 0;
+			for (int i = 1; i < 4; ++i)
+			{
+				if (asic_latency[i] < asic_latency[min_idx]) min_idx = i;
+				if (asic_latency[i] > asic_latency[max_idx]) max_idx = i;
+			}
+
+			const uint8_t pattern[3] = { ROR, MUL, MUL };
+			const uint8_t opcode = pattern[(code_size - prev_code_size) % 3];
+			latency[min_idx] = latency[max_idx] + op_latency[opcode];
+			asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode];
+
+			code[code_size].opcode = opcode;
+			code[code_size].dst_index = min_idx;
+			code[code_size].src_index = max_idx;
+			code[code_size].C = 0;
+			++code_size;
+		}
+
+	// There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time
+	// It never does more than 4 iterations for all block heights < 10,000,000
+	}  while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX));
+
+	// It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here
+	// Add final instruction to stop the interpreter
+	code[code_size].opcode = RET;
+	code[code_size].dst_index = 0;
+	code[code_size].src_index = 0;
+	code[code_size].C = 0;
+
+	return code_size;
+}
+
+#endif
--- a/monero/cryptonote_basic/cryptonote_boost_serialization.h
+++ b/monero/cryptonote_basic/cryptonote_boost_serialization.h
@ -249,7 +249,6 @@ namespace boost
  {
    a & x.mask;
    a & x.amount;
-    // a & x.senderPk; // not serialized, as we do not use it in monero currently
  }

  template <class Archive>
@ -295,7 +294,7 @@ namespace boost
    a & x.type;
    if (x.type == rct::RCTTypeNull)
      return;
-    if (x.type != rct::RCTTypeFull && x.type != rct::RCTTypeSimple && x.type != rct::RCTTypeBulletproof)
+    if (x.type != rct::RCTTypeFull && x.type != rct::RCTTypeSimple && x.type != rct::RCTTypeBulletproof && x.type != rct::RCTTypeBulletproof2)
      throw boost::archive::archive_exception(boost::archive::archive_exception::other_exception, "Unsupported rct type");
    // a & x.message; message is not serialized, as it can be reconstructed from the tx data
    // a & x.mixRing; mixRing is not serialized, as it can be reconstructed from the offsets
@ -323,7 +322,7 @@ namespace boost
    a & x.type;
    if (x.type == rct::RCTTypeNull)
      return;
-    if (x.type != rct::RCTTypeFull && x.type != rct::RCTTypeSimple && x.type != rct::RCTTypeBulletproof)
+    if (x.type != rct::RCTTypeFull && x.type != rct::RCTTypeSimple && x.type != rct::RCTTypeBulletproof && x.type != rct::RCTTypeBulletproof2)
      throw boost::archive::archive_exception(boost::archive::archive_exception::other_exception, "Unsupported rct type");
    // a & x.message; message is not serialized, as it can be reconstructed from the tx data
    // a & x.mixRing; mixRing is not serialized, as it can be reconstructed from the offsets
@ -337,7 +336,7 @@ namespace boost
    if (x.p.rangeSigs.empty())
      a & x.p.bulletproofs;
    a & x.p.MGs;
-    if (x.type == rct::RCTTypeBulletproof)
+    if (x.type == rct::RCTTypeBulletproof || x.type == rct::RCTTypeBulletproof2)
      a & x.p.pseudoOuts;
  }
 }
--- a/monero/cryptonote_basic/cryptonote_format_utils.cpp
+++ b/monero/cryptonote_basic/cryptonote_format_utils.cpp
@ -1054,7 +1054,7 @@ namespace cryptonote
    }
    blobdata bd = get_block_hashing_blob(b);
    const int cn_variant = b.major_version >= 7 ? b.major_version - 6 : 0;
-    crypto::cn_slow_hash(bd.data(), bd.size(), res, cn_variant);
+    crypto::cn_slow_hash(bd.data(), bd.size(), res, cn_variant, height);
    return true;
  }
  //---------------------------------------------------------------
--- a/monero/cryptonote_basic/hardfork.cpp
+++ b/monero/cryptonote_basic/hardfork.cpp
@ -305,6 +305,29 @@ bool HardFork::rescan_from_chain_height(uint64_t height)
  return rescan_from_block_height(height - 1);
 }

+/**
+ * Updates the version window and the current fork index after nblocks blocks
+ * have been popped from the chain.
+ *
+ * For every height from the old top down to the new chain height, one entry is
+ * dropped from the back of `versions` and the version the db reports for that
+ * height is pushed to the front. The fork index is then re-derived from the
+ * fork heights alone (voting is not taken into account).
+ *
+ * @param nblocks number of blocks that were popped; must be greater than 0,
+ *                otherwise CHECK_AND_ASSERT_THROW_MES throws
+ */
+void HardFork::on_block_popped(uint64_t nblocks)
+{
+  CHECK_AND_ASSERT_THROW_MES(nblocks > 0, "nblocks must be greater than 0");
+
+  CRITICAL_REGION_LOCAL(lock);
+
+  const uint64_t new_chain_height = db.height();
+  const uint64_t old_chain_height = new_chain_height + nblocks;
+
+  // Visit heights old_chain_height - 1 down to new_chain_height. The
+  // "height-- > bound" form also terminates when new_chain_height is 0, where
+  // a "height >= new_chain_height" test on an unsigned counter can never fail.
+  for (uint64_t height = old_chain_height; height-- > new_chain_height; )
+  {
+    versions.pop_back();
+    versions.push_front(db.get_hard_fork_version(height));
+  }
+
+  // Height the fork lookup is based on: one below the new chain height, i.e.
+  // the value the loop counter used to end on. An empty chain maps to 0 rather
+  // than to a wrapped-around huge value.
+  const uint64_t top_height = new_chain_height > 0 ? new_chain_height - 1 : 0;
+
+  // does not take voting into account
+  for (current_fork_index = heights.size() - 1; current_fork_index > 0; --current_fork_index)
+    if (top_height >= heights[current_fork_index].height)
+      break;
+}
+
 int HardFork::get_voted_fork_index(uint64_t height) const
 {
  CRITICAL_REGION_LOCAL(lock);
--- a/monero/cryptonote_basic/hardfork.h
+++ b/monero/cryptonote_basic/hardfork.h
@ -149,6 +149,16 @@ namespace cryptonote
    bool reorganize_from_block_height(uint64_t height);
    bool reorganize_from_chain_height(uint64_t height);

+    /**
+     * @brief called when one or more blocks are popped from the blockchain
+     *
+     * The current fork will be updated by looking up the db,
+     * which is much cheaper than recomputing everything
+     *
+     * @param nblocks the number of blocks that were popped (must be greater than 0)
+     */
+    void on_block_popped(uint64_t nblocks);
+
    /**
     * @brief returns current state at the given time
     *
--- a/monero/cryptonote_core/blockchain.h
+++ b/monero/cryptonote_core/blockchain.h
@ -37,6 +37,7 @@
 #include <boost/multi_index/global_fun.hpp>
 #include <boost/multi_index/hashed_index.hpp>
 #include <boost/multi_index/member.hpp>
+#include <boost/circular_buffer.hpp>
 #include <atomic>
 #include <unordered_map>
 #include <unordered_set>
@ -611,6 +612,13 @@ namespace cryptonote
     */
    uint64_t get_current_cumulative_block_weight_limit() const;

+    /**
+     * @brief gets the long term block weight for a new block
+     *
+     * @return the long term block weight
+     */
+    uint64_t get_next_long_term_block_weight(uint64_t block_weight) const;
+
    /**
     * @brief gets the block weight median based on recent blocks (same window as for the limit)
     *
@ -710,10 +718,17 @@ namespace cryptonote
    /**
     * @brief sets a block notify object to call for every new block
     *
-     * @param notify the notify object to cal at every new block
+     * @param notify the notify object to call at every new block
     */
    void set_block_notify(const std::shared_ptr<tools::Notify> &notify) { m_block_notify = notify; }

+    /**
+     * @brief sets a reorg notify object to call for every reorg
+     *
+     * @param notify the notify object to call at every reorg
+     */
+    void set_reorg_notify(const std::shared_ptr<tools::Notify> &notify) { m_reorg_notify = notify; }
+
    /**
     * @brief Put DB in safe sync mode
     */
@ -952,7 +967,14 @@ namespace cryptonote
     */
    void on_new_tx_from_block(const cryptonote::transaction &tx);

+    /**
+     * @brief returns the timestamps of the last N blocks
+     */
+    std::vector<time_t> get_last_block_timestamps(unsigned int blocks) const;
+
+#ifndef IN_UNIT_TESTS
  private:
+#endif

    // TODO: evaluate whether or not each of these typedefs are left over from blockchain_storage
    typedef std::unordered_map<crypto::hash, size_t> blocks_by_id_index;
@ -1005,6 +1027,8 @@ namespace cryptonote
    std::vector<uint64_t> m_timestamps;
    std::vector<difficulty_type> m_difficulties;
    uint64_t m_timestamps_and_difficulties_height;
+    uint64_t m_long_term_block_weights_window;
+    uint64_t m_long_term_effective_median_block_weight;

    epee::critical_section m_difficulty_lock;
    crypto::hash m_difficulty_for_next_block_top_hash;
@ -1042,6 +1066,7 @@ namespace cryptonote
    bool m_btc_valid;

    std::shared_ptr<tools::Notify> m_block_notify;
+    std::shared_ptr<tools::Notify> m_reorg_notify;

    /**
     * @brief collects the keys for all outputs being "spent" as an input
@ -1237,7 +1262,7 @@ namespace cryptonote
     * @param sz return-by-reference the list of weights
     * @param count the number of blocks to get weights for
     */
-    void get_last_n_blocks_weights(std::vector<size_t>& weights, size_t count) const;
+    void get_last_n_blocks_weights(std::vector<uint64_t>& weights, size_t count) const;

    /**
     * @brief checks if a transaction is unlocked (its outputs spendable)
@ -1336,9 +1361,11 @@ namespace cryptonote
    /**
     * @brief calculate the block weight limit for the next block to be added
     *
+     * @param long_term_effective_median_block_weight optionally return that value
+     *
     * @return true
     */
-    bool update_next_cumulative_weight_limit();
+    bool update_next_cumulative_weight_limit(uint64_t *long_term_effective_median_block_weight = NULL);
    void return_tx_to_pool(std::vector<transaction> &txs);

    /**
--- a/monero/cryptonote_core/cryptonote_core.h
+++ b/monero/cryptonote_core/cryptonote_core.h
@ -55,6 +55,7 @@ namespace cryptonote
 {
   struct test_options {
     const std::pair<uint8_t, uint64_t> *hard_forks;
+     const size_t long_term_block_weight_window;
   };

  extern const command_line::arg_descriptor<std::string, false, true, 2> arg_data_dir;
@ -945,6 +946,13 @@ namespace cryptonote
      */
     bool check_disk_space();

+     /**
+      * @brief checks block rate, and warns if it's too slow
+      *
+      * @return true on success, false otherwise
+      */
+     bool check_block_rate();
+
     bool m_test_drop_download = true; //!< whether or not to drop incoming blocks (for testing)

     uint64_t m_test_drop_download_height = 0; //!< height under which to drop incoming blocks, if doing so
@ -969,6 +977,7 @@ namespace cryptonote
     epee::math_helper::once_a_time_seconds<60*2, false> m_txpool_auto_relayer; //!< interval for checking re-relaying txpool transactions
     epee::math_helper::once_a_time_seconds<60*60*12, true> m_check_updates_interval; //!< interval for checking for new versions
     epee::math_helper::once_a_time_seconds<60*10, true> m_check_disk_space_interval; //!< interval for checking for disk space
+     epee::math_helper::once_a_time_seconds<90, false> m_block_rate_interval; //!< interval for checking block rate

     std::atomic<bool> m_starter_message_showed; //!< has the "daemon will sync now" message been shown?

@ -1005,6 +1014,8 @@ namespace cryptonote

     bool m_fluffy_blocks_enabled;
     bool m_offline;
+
+     std::shared_ptr<tools::Notify> m_block_rate_notify;
   };
 }

--- a/monero/cryptonote_core/cryptonote_tx_utils.cpp
+++ b/monero/cryptonote_core/cryptonote_tx_utils.cpp
@ -195,7 +195,7 @@ namespace cryptonote
    return addr.m_view_public_key;
  }
  //---------------------------------------------------------------
-  bool construct_tx_with_tx_key(const account_keys& sender_account_keys, const std::unordered_map<crypto::public_key, subaddress_index>& subaddresses, std::vector<tx_source_entry>& sources, std::vector<tx_destination_entry>& destinations, const boost::optional<cryptonote::account_public_address>& change_addr, std::vector<uint8_t> extra, transaction& tx, uint64_t unlock_time, const crypto::secret_key &tx_key, const std::vector<crypto::secret_key> &additional_tx_keys, bool rct, rct::RangeProofType range_proof_type, rct::multisig_out *msout, bool shuffle_outs)
+  bool construct_tx_with_tx_key(const account_keys& sender_account_keys, const std::unordered_map<crypto::public_key, subaddress_index>& subaddresses, std::vector<tx_source_entry>& sources, std::vector<tx_destination_entry>& destinations, const boost::optional<cryptonote::account_public_address>& change_addr, std::vector<uint8_t> extra, transaction& tx, uint64_t unlock_time, const crypto::secret_key &tx_key, const std::vector<crypto::secret_key> &additional_tx_keys, bool rct, const rct::RCTConfig &rct_config, rct::multisig_out *msout, bool shuffle_outs)
  {
    hw::device &hwdev = sender_account_keys.get_device();

@ -223,13 +223,15 @@ namespace cryptonote
    std::vector<tx_extra_field> tx_extra_fields;
    if (parse_tx_extra(tx.extra, tx_extra_fields))
    {
+      bool add_dummy_payment_id = true;
      tx_extra_nonce extra_nonce;
      if (find_tx_extra_field_by_type(tx_extra_fields, extra_nonce))
      {
-        crypto::hash8 payment_id = null_hash8;
-        if (get_encrypted_payment_id_from_tx_extra_nonce(extra_nonce.nonce, payment_id))
+        crypto::hash payment_id = null_hash;
+        crypto::hash8 payment_id8 = null_hash8;
+        if (get_encrypted_payment_id_from_tx_extra_nonce(extra_nonce.nonce, payment_id8))
        {
-          LOG_PRINT_L2("Encrypting payment id " << payment_id);
+          LOG_PRINT_L2("Encrypting payment id " << payment_id8);
          crypto::public_key view_key_pub = get_destination_view_key_pub(destinations, change_addr);
          if (view_key_pub == null_pkey)
          {
@ -237,21 +239,53 @@ namespace cryptonote
            return false;
          }

-          if (!hwdev.encrypt_payment_id(payment_id, view_key_pub, tx_key))
+          if (!hwdev.encrypt_payment_id(payment_id8, view_key_pub, tx_key))
          {
            LOG_ERROR("Failed to encrypt payment id");
            return false;
          }

          std::string extra_nonce;
-          set_encrypted_payment_id_to_tx_extra_nonce(extra_nonce, payment_id);
+          set_encrypted_payment_id_to_tx_extra_nonce(extra_nonce, payment_id8);
          remove_field_from_tx_extra(tx.extra, typeid(tx_extra_nonce));
          if (!add_extra_nonce_to_tx_extra(tx.extra, extra_nonce))
          {
            LOG_ERROR("Failed to add encrypted payment id to tx extra");
            return false;
          }
-          LOG_PRINT_L1("Encrypted payment ID: " << payment_id);
+          LOG_PRINT_L1("Encrypted payment ID: " << payment_id8);
+          add_dummy_payment_id = false;
+        }
+        else if (get_payment_id_from_tx_extra_nonce(extra_nonce.nonce, payment_id))
+        {
+          add_dummy_payment_id = false;
+        }
+      }
+
+      // we don't add one if we've got more than the usual 1 destination plus change
+      if (destinations.size() > 2)
+        add_dummy_payment_id = false;
+
+      if (add_dummy_payment_id)
+      {
+        // if we have neither long nor short payment id, add a dummy short one,
+        // this should end up being the vast majority of txes as time goes on
+        std::string extra_nonce;
+        crypto::hash8 payment_id8 = null_hash8;
+        crypto::public_key view_key_pub = get_destination_view_key_pub(destinations, change_addr);
+        if (view_key_pub == null_pkey)
+        {
+          LOG_ERROR("Failed to get key to encrypt dummy payment id with");
+        }
+        else
+        {
+          hwdev.encrypt_payment_id(payment_id8, view_key_pub, tx_key);
+          set_encrypted_payment_id_to_tx_extra_nonce(extra_nonce, payment_id8);
+          if (!add_extra_nonce_to_tx_extra(tx.extra, extra_nonce))
+          {
+            LOG_ERROR("Failed to add dummy encrypted payment id to tx extra");
+            // continue anyway
+          }
        }
      }
    }
@ -368,49 +402,12 @@ namespace cryptonote
    for(const tx_destination_entry& dst_entr: destinations)
    {
      CHECK_AND_ASSERT_MES(dst_entr.amount > 0 || tx.version > 1, false, "Destination with wrong amount: " << dst_entr.amount);
-      crypto::key_derivation derivation;
      crypto::public_key out_eph_public_key;

-      // make additional tx pubkey if necessary
-      keypair additional_txkey;
-      if (need_additional_txkeys)
-      {
-        additional_txkey.sec = additional_tx_keys[output_index];
-        if (dst_entr.is_subaddress)
-          additional_txkey.pub = rct::rct2pk(hwdev.scalarmultKey(rct::pk2rct(dst_entr.addr.m_spend_public_key), rct::sk2rct(additional_txkey.sec)));
-        else
-          additional_txkey.pub = rct::rct2pk(hwdev.scalarmultBase(rct::sk2rct(additional_txkey.sec)));
-      }
-
-      bool r;
-      if (change_addr && dst_entr.addr == *change_addr)
-      {
-        // sending change to yourself; derivation = a*R
-        r = hwdev.generate_key_derivation(txkey_pub, sender_account_keys.m_view_secret_key, derivation);
-        CHECK_AND_ASSERT_MES(r, false, "at creation outs: failed to generate_key_derivation(" << txkey_pub << ", " << sender_account_keys.m_view_secret_key << ")");
-      }
-      else
-      {
-        // sending to the recipient; derivation = r*A (or s*C in the subaddress scheme)
-        r = hwdev.generate_key_derivation(dst_entr.addr.m_view_public_key, dst_entr.is_subaddress && need_additional_txkeys ? additional_txkey.sec : tx_key, derivation);
-        CHECK_AND_ASSERT_MES(r, false, "at creation outs: failed to generate_key_derivation(" << dst_entr.addr.m_view_public_key << ", " << (dst_entr.is_subaddress && need_additional_txkeys ? additional_txkey.sec : tx_key) << ")");
-      }
-
-      if (need_additional_txkeys)
-      {
-        additional_tx_public_keys.push_back(additional_txkey.pub);
-      }
-
-      if (tx.version > 1)
-      {
-        crypto::secret_key scalar1;
-        hwdev.derivation_to_scalar(derivation, output_index, scalar1);
-        amount_keys.push_back(rct::sk2rct(scalar1));
-      }
-      r = hwdev.derive_public_key(derivation, output_index, dst_entr.addr.m_spend_public_key, out_eph_public_key);
-      CHECK_AND_ASSERT_MES(r, false, "at creation outs: failed to derive_public_key(" << derivation << ", " << output_index << ", "<< dst_entr.addr.m_spend_public_key << ")");
-
-      hwdev.add_output_key_mapping(dst_entr.addr.m_view_public_key, dst_entr.addr.m_spend_public_key, dst_entr.is_subaddress, output_index, amount_keys.back(), out_eph_public_key);
+      hwdev.generate_output_ephemeral_keys(tx.version,sender_account_keys, txkey_pub, tx_key,
+                                           dst_entr, change_addr, output_index,
+                                           need_additional_txkeys, additional_tx_keys,
+                                           additional_tx_public_keys, amount_keys, out_eph_public_key);

      tx_out out;
      out.amount = dst_entr.amount;
@ -491,7 +488,7 @@ namespace cryptonote

      // the non-simple version is slightly smaller, but assumes all real inputs
      // are on the same index, so can only be used if there just one ring.
-      bool use_simple_rct = sources.size() > 1 || range_proof_type != rct::RangeProofBorromean;
+      bool use_simple_rct = sources.size() > 1 || rct_config.range_proof_type != rct::RangeProofBorromean;

      if (!use_simple_rct)
      {
@ -589,9 +586,9 @@ namespace cryptonote
      get_transaction_prefix_hash(tx, tx_prefix_hash);
      rct::ctkeyV outSk;
      if (use_simple_rct)
-        tx.rct_signatures = rct::genRctSimple(rct::hash2rct(tx_prefix_hash), inSk, destinations, inamounts, outamounts, amount_in - amount_out, mixRing, amount_keys, msout ? &kLRki : NULL, msout, index, outSk, range_proof_type, hwdev);
+        tx.rct_signatures = rct::genRctSimple(rct::hash2rct(tx_prefix_hash), inSk, destinations, inamounts, outamounts, amount_in - amount_out, mixRing, amount_keys, msout ? &kLRki : NULL, msout, index, outSk, rct_config, hwdev);
      else
-        tx.rct_signatures = rct::genRct(rct::hash2rct(tx_prefix_hash), inSk, destinations, outamounts, mixRing, amount_keys, msout ? &kLRki[0] : NULL, msout, sources[0].real_output, outSk, hwdev); // same index assumption
+        tx.rct_signatures = rct::genRct(rct::hash2rct(tx_prefix_hash), inSk, destinations, outamounts, mixRing, amount_keys, msout ? &kLRki[0] : NULL, msout, sources[0].real_output, outSk, rct_config, hwdev); // same index assumption
      memwipe(inSk.data(), inSk.size() * sizeof(rct::ctkey));

      CHECK_AND_ASSERT_MES(tx.vout.size() == outSk.size(), false, "outSk size does not match vout");
@ -604,7 +601,7 @@ namespace cryptonote
    return true;
  }
  //---------------------------------------------------------------
-  bool construct_tx_and_get_tx_key(const account_keys& sender_account_keys, const std::unordered_map<crypto::public_key, subaddress_index>& subaddresses, std::vector<tx_source_entry>& sources, std::vector<tx_destination_entry>& destinations, const boost::optional<cryptonote::account_public_address>& change_addr, std::vector<uint8_t> extra, transaction& tx, uint64_t unlock_time, crypto::secret_key &tx_key, std::vector<crypto::secret_key> &additional_tx_keys, bool rct, rct::RangeProofType range_proof_type, rct::multisig_out *msout)
+  bool construct_tx_and_get_tx_key(const account_keys& sender_account_keys, const std::unordered_map<crypto::public_key, subaddress_index>& subaddresses, std::vector<tx_source_entry>& sources, std::vector<tx_destination_entry>& destinations, const boost::optional<cryptonote::account_public_address>& change_addr, std::vector<uint8_t> extra, transaction& tx, uint64_t unlock_time, crypto::secret_key &tx_key, std::vector<crypto::secret_key> &additional_tx_keys, bool rct, const rct::RCTConfig &rct_config, rct::multisig_out *msout)
  {
    hw::device &hwdev = sender_account_keys.get_device();
    hwdev.open_tx(tx_key);
@ -622,7 +619,7 @@ namespace cryptonote
        additional_tx_keys.push_back(keypair::generate(sender_account_keys.get_device()).sec);
    }

-    bool r = construct_tx_with_tx_key(sender_account_keys, subaddresses, sources, destinations, change_addr, extra, tx, unlock_time, tx_key, additional_tx_keys, rct, range_proof_type, msout);
+    bool r = construct_tx_with_tx_key(sender_account_keys, subaddresses, sources, destinations, change_addr, extra, tx, unlock_time, tx_key, additional_tx_keys, rct, rct_config, msout);
    hwdev.close_tx();
    return r;
  }
@ -634,7 +631,7 @@ namespace cryptonote
     crypto::secret_key tx_key;
     std::vector<crypto::secret_key> additional_tx_keys;
     std::vector<tx_destination_entry> destinations_copy = destinations;
-     return construct_tx_and_get_tx_key(sender_account_keys, subaddresses, sources, destinations_copy, change_addr, extra, tx, unlock_time, tx_key, additional_tx_keys, false, rct::RangeProofBorromean, NULL);
+     return construct_tx_and_get_tx_key(sender_account_keys, subaddresses, sources, destinations_copy, change_addr, extra, tx, unlock_time, tx_key, additional_tx_keys, false, { rct::RangeProofBorromean, 0}, NULL);
  }
  //---------------------------------------------------------------
  bool generate_genesis_block(
--- a/monero/cryptonote_core/cryptonote_tx_utils.h
+++ b/monero/cryptonote_core/cryptonote_tx_utils.h
@ -73,25 +73,36 @@ namespace cryptonote

  struct tx_destination_entry
  {
+    std::string original; // only the (o, a, ad, is_subaddress) constructor sets this; presumably the destination as originally supplied as text -- TODO confirm
    uint64_t amount;                    //money
    account_public_address addr;        //destination address
    bool is_subaddress;
+    bool is_integrated; // every constructor below initialises this to false

-    tx_destination_entry() : amount(0), addr(AUTO_VAL_INIT(addr)), is_subaddress(false) { }
-    tx_destination_entry(uint64_t a, const account_public_address &ad, bool is_subaddress) : amount(a), addr(ad), is_subaddress(is_subaddress) { }
+    tx_destination_entry() : amount(0), addr(AUTO_VAL_INIT(addr)), is_subaddress(false), is_integrated(false) { }
+    tx_destination_entry(uint64_t a, const account_public_address &ad, bool is_subaddress) : amount(a), addr(ad), is_subaddress(is_subaddress), is_integrated(false) { }
+    tx_destination_entry(const std::string &o, uint64_t a, const account_public_address &ad, bool is_subaddress) : original(o), amount(a), addr(ad), is_subaddress(is_subaddress), is_integrated(false) { }

    BEGIN_SERIALIZE_OBJECT()
+      FIELD(original) // serialized first, ahead of amount
      VARINT_FIELD(amount)
      FIELD(addr)
      FIELD(is_subaddress)
+      FIELD(is_integrated) // serialized last, after is_subaddress
    END_SERIALIZE()
  };

  //---------------------------------------------------------------
  crypto::public_key get_destination_view_key_pub(const std::vector<tx_destination_entry> &destinations, const boost::optional<cryptonote::account_public_address>& change_addr);
  bool construct_tx(const account_keys& sender_account_keys, std::vector<tx_source_entry> &sources, const std::vector<tx_destination_entry>& destinations, const boost::optional<cryptonote::account_public_address>& change_addr, std::vector<uint8_t> extra, transaction& tx, uint64_t unlock_time);
-  bool construct_tx_with_tx_key(const account_keys& sender_account_keys, const std::unordered_map<crypto::public_key, subaddress_index>& subaddresses, std::vector<tx_source_entry>& sources, std::vector<tx_destination_entry>& destinations, const boost::optional<cryptonote::account_public_address>& change_addr, std::vector<uint8_t> extra, transaction& tx, uint64_t unlock_time, const crypto::secret_key &tx_key, const std::vector<crypto::secret_key> &additional_tx_keys, bool rct = false, rct::RangeProofType  range_proof_type = rct::RangeProofBorromean, rct::multisig_out *msout = NULL, bool shuffle_outs = true);
-  bool construct_tx_and_get_tx_key(const account_keys& sender_account_keys, const std::unordered_map<crypto::public_key, subaddress_index>& subaddresses, std::vector<tx_source_entry>& sources, std::vector<tx_destination_entry>& destinations, const boost::optional<cryptonote::account_public_address>& change_addr, std::vector<uint8_t> extra, transaction& tx, uint64_t unlock_time, crypto::secret_key &tx_key, std::vector<crypto::secret_key> &additional_tx_keys, bool rct = false, rct::RangeProofType  range_proof_type = rct::RangeProofBorromean, rct::multisig_out *msout = NULL);
+  bool construct_tx_with_tx_key(const account_keys& sender_account_keys, const std::unordered_map<crypto::public_key, subaddress_index>& subaddresses, std::vector<tx_source_entry>& sources, std::vector<tx_destination_entry>& destinations, const boost::optional<cryptonote::account_public_address>& change_addr, std::vector<uint8_t> extra, transaction& tx, uint64_t unlock_time, const crypto::secret_key &tx_key, const std::vector<crypto::secret_key> &additional_tx_keys, bool rct = false, const rct::RCTConfig &rct_config = { rct::RangeProofBorromean, 0 }, rct::multisig_out *msout = NULL, bool shuffle_outs = true);
+  bool construct_tx_and_get_tx_key(const account_keys& sender_account_keys, const std::unordered_map<crypto::public_key, subaddress_index>& subaddresses, std::vector<tx_source_entry>& sources, std::vector<tx_destination_entry>& destinations, const boost::optional<cryptonote::account_public_address>& change_addr, std::vector<uint8_t> extra, transaction& tx, uint64_t unlock_time, crypto::secret_key &tx_key, std::vector<crypto::secret_key> &additional_tx_keys, bool rct = false, const rct::RCTConfig &rct_config = { rct::RangeProofBorromean, 0 }, rct::multisig_out *msout = NULL);
+  bool generate_output_ephemeral_keys(const size_t tx_version, const cryptonote::account_keys &sender_account_keys, const crypto::public_key &txkey_pub,  const crypto::secret_key &tx_key,
+                                      const cryptonote::tx_destination_entry &dst_entr, const boost::optional<cryptonote::account_public_address> &change_addr, const size_t output_index,
+                                      const bool &need_additional_txkeys, const std::vector<crypto::secret_key> &additional_tx_keys,
+                                      std::vector<crypto::public_key> &additional_tx_public_keys,
+                                      std::vector<rct::key> &amount_keys,
+                                      crypto::public_key &out_eph_public_key) ;

  bool generate_genesis_block(
      block& bl
@ -102,7 +113,7 @@ namespace cryptonote
 }

 BOOST_CLASS_VERSION(cryptonote::tx_source_entry, 1)
-BOOST_CLASS_VERSION(cryptonote::tx_destination_entry, 1)
+BOOST_CLASS_VERSION(cryptonote::tx_destination_entry, 2)

 namespace boost
 {
@ -132,6 +143,13 @@ namespace boost
      if (ver < 1)
        return;
      a & x.is_subaddress;
+      if (ver < 2)
+      {
+        x.is_integrated = false;
+        return;
+      }
+      a & x.original;
+      a & x.is_integrated;
    }
  }
 }
--- a/monero/device/device.hpp
+++ b/monero/device/device.hpp
@ -68,6 +68,7 @@ namespace cryptonote
    struct account_public_address;
    struct account_keys;
    struct subaddress_index;
+    struct tx_destination_entry;
 }

 namespace hw {
@ -188,12 +189,15 @@ namespace hw {
            return encrypt_payment_id(payment_id, public_key, secret_key);
        }

-        virtual bool  ecdhEncode(rct::ecdhTuple & unmasked, const rct::key & sharedSec) = 0;
-        virtual bool  ecdhDecode(rct::ecdhTuple & masked, const rct::key & sharedSec) = 0;
-
-        virtual bool  add_output_key_mapping(const crypto::public_key &Aout, const crypto::public_key &Bout, const bool is_subaddress, const size_t real_output_index,
-                                             const rct::key &amount_key,  const crypto::public_key &out_eph_public_key) = 0;
+        virtual bool  ecdhEncode(rct::ecdhTuple & unmasked, const rct::key & sharedSec, bool short_amount) = 0;
+        virtual bool  ecdhDecode(rct::ecdhTuple & masked, const rct::key & sharedSec, bool short_amount) = 0;

+        virtual bool  generate_output_ephemeral_keys(const size_t tx_version, const cryptonote::account_keys &sender_account_keys, const crypto::public_key &txkey_pub,  const crypto::secret_key &tx_key,
+                                                     const cryptonote::tx_destination_entry &dst_entr, const boost::optional<cryptonote::account_public_address> &change_addr, const size_t output_index,
+                                                     const bool &need_additional_txkeys, const std::vector<crypto::secret_key> &additional_tx_keys,
+                                                     std::vector<crypto::public_key> &additional_tx_public_keys,
+                                                     std::vector<rct::key> &amount_keys,
+                                                     crypto::public_key &out_eph_public_key) = 0;

        virtual bool  mlsag_prehash(const std::string &blob, size_t inputs_size, size_t outputs_size, const rct::keyV &hashes, const rct::ctkeyV &outPk, rct::key &prehash) = 0;
        virtual bool  mlsag_prepare(const rct::key &H, const rct::key &xx, rct::key &a, rct::key &aG, rct::key &aHP, rct::key &rvII) = 0;
--- a/monero/epee/src/mlocker.cpp
+++ b/monero/epee/src/mlocker.cpp
@ -47,7 +47,6 @@ static size_t query_page_size()
    MERROR("Failed to determine page size");
    return 0;
  }
-  MINFO("Page size: " << ret);
  return ret;
 #else
 #warning Missing query_page_size implementation
@ -84,13 +83,13 @@ namespace epee

  boost::mutex &mlocker::mutex()
  {
-    static boost::mutex vmutex;
-    return vmutex;
+    static boost::mutex *vmutex = new boost::mutex(); // allocated once and never deleted in this function; presumably so the mutex is not destroyed during static destruction -- TODO confirm
+    return *vmutex; // callers still receive a reference, as before
  }
  std::map<size_t, unsigned int> &mlocker::map()
  {
-    static std::map<size_t, unsigned int> vmap;
-    return vmap;
+    static std::map<size_t, unsigned int> *vmap = new std::map<size_t, unsigned int>(); // allocated once and never deleted in this function; presumably so the map is not destroyed during static destruction -- TODO confirm
+    return *vmap; // callers still receive a reference, as before
  }

  size_t mlocker::get_page_size()
--- a/monero/ringct/rctOps.cpp
+++ b/monero/ringct/rctOps.cpp
@ -487,18 +487,58 @@ namespace rct {

    //Elliptic Curve Diffie Helman: encodes and decodes the amount b and mask a
    // where C= aG + bH
-    void ecdhEncode(ecdhTuple & unmasked, const key & sharedSec) {
-        key sharedSec1 = hash_to_scalar(sharedSec);
-        key sharedSec2 = hash_to_scalar(sharedSec1);
-        //encode
-        sc_add(unmasked.mask.bytes, unmasked.mask.bytes, sharedSec1.bytes);
-        sc_add(unmasked.amount.bytes, unmasked.amount.bytes, sharedSec2.bytes);
+    static key ecdhHash(const key &k) // Returns cn_fast_hash over the 6-byte tag "amount" followed by the raw bytes of k.
+    {
+        char data[38]; // 6 tag bytes + sizeof(k); assumes sizeof(key) == 32 -- TODO confirm
+        rct::key hash;
+        memcpy(data, "amount", 6); // copies the tag without its NUL terminator
+        memcpy(data + 6, &k, sizeof(k));
+        cn_fast_hash(hash, data, sizeof(data));
+        return hash;
     }
-    void ecdhDecode(ecdhTuple & masked, const key & sharedSec) {
-        key sharedSec1 = hash_to_scalar(sharedSec);
-        key sharedSec2 = hash_to_scalar(sharedSec1);
+    static void xor8(key &v, const key &k) // XORs the first 8 bytes of k into v in place; the remaining bytes of v are not touched.
+    {
+        for (int i = 0; i < 8; ++i)
+            v.bytes[i] ^= k.bytes[i];
+    }
+    key genCommitmentMask(const key &sk) // Returns hash_to_scalar over the 15-byte tag "commitment_mask" followed by the raw bytes of sk.
+    {
+        char data[15 + sizeof(key)]; // tag bytes followed by key bytes
+        memcpy(data, "commitment_mask", 15); // copies the tag without its NUL terminator
+        memcpy(data + 15, &sk, sizeof(sk));
+        key scalar;
+        hash_to_scalar(scalar, data, sizeof(data));
+        return scalar;
+    }
+
+    void ecdhEncode(ecdhTuple & unmasked, const key & sharedSec, bool v2) { // Encodes unmasked in place from sharedSec; v2 selects the 8-byte XOR scheme, otherwise the scalar-addition scheme.
+        //encode
+        if (v2)
+        {
+          unmasked.mask = zero(); // v2: the mask field is cleared instead of being encoded
+          xor8(unmasked.amount, ecdhHash(sharedSec)); // v2: only the first 8 bytes of amount are XORed with the hash
+        }
+        else
+        {
+          key sharedSec1 = hash_to_scalar(sharedSec);
+          key sharedSec2 = hash_to_scalar(sharedSec1); // second value is derived by hashing the first
+          sc_add(unmasked.mask.bytes, unmasked.mask.bytes, sharedSec1.bytes); // presumably mask += sharedSec1 as scalars -- TODO confirm sc_add semantics
+          sc_add(unmasked.amount.bytes, unmasked.amount.bytes, sharedSec2.bytes); // presumably amount += sharedSec2 as scalars -- TODO confirm
+        }
+    }
+    void ecdhDecode(ecdhTuple & masked, const key & sharedSec, bool v2) { // Decodes masked in place from sharedSec; v2 must match the flag used when the tuple was encoded.
        //decode
-        sc_sub(masked.mask.bytes, masked.mask.bytes, sharedSec1.bytes);
-        sc_sub(masked.amount.bytes, masked.amount.bytes, sharedSec2.bytes);
+        if (v2)
+        {
+          masked.mask = genCommitmentMask(sharedSec); // v2: the mask is recomputed from sharedSec rather than read from the tuple
+          xor8(masked.amount, ecdhHash(sharedSec)); // same XOR as the v2 branch of ecdhEncode, so it restores the first 8 bytes of amount
+        }
+        else
+        {
+          key sharedSec1 = hash_to_scalar(sharedSec);
+          key sharedSec2 = hash_to_scalar(sharedSec1); // second value is derived by hashing the first
+          sc_sub(masked.mask.bytes, masked.mask.bytes, sharedSec1.bytes); // presumably mask -= sharedSec1 as scalars -- TODO confirm sc_sub semantics
+          sc_sub(masked.amount.bytes, masked.amount.bytes, sharedSec2.bytes); // presumably amount -= sharedSec2 as scalars -- TODO confirm
+        }
     }
 }
--- a/monero/ringct/rctOps.h
+++ b/monero/ringct/rctOps.h
@ -182,7 +182,8 @@ namespace rct {

    //Elliptic Curve Diffie Helman: encodes and decodes the amount b and mask a
    // where C= aG + bH
-    void ecdhEncode(ecdhTuple & unmasked, const key & sharedSec);
-    void ecdhDecode(ecdhTuple & masked, const key & sharedSec);
+    key genCommitmentMask(const key &sk);
+    void ecdhEncode(ecdhTuple & unmasked, const key & sharedSec, bool v2);
+    void ecdhDecode(ecdhTuple & masked, const key & sharedSec, bool v2);
 }
 #endif  /* RCTOPS_H */
--- a/monero/ringct/rctSigs.cpp
+++ b/monero/ringct/rctSigs.cpp
@ -45,18 +45,12 @@ using namespace std;
 #define CHECK_AND_ASSERT_MES_L1(expr, ret, message) {if(!(expr)) {MCERROR("verify", message); return ret;}}

 namespace rct {
-    Bulletproof proveRangeBulletproof(key &C, key &mask, uint64_t amount)
+    Bulletproof proveRangeBulletproof(keyV &C, keyV &masks, const std::vector<uint64_t> &amounts, const std::vector<key> &sk)
    {
-        mask = rct::skGen();
-        Bulletproof proof = bulletproof_PROVE(amount, mask);
-        CHECK_AND_ASSERT_THROW_MES(proof.V.size() == 1, "V has not exactly one element");
-        C = proof.V[0];
-        return proof;
-    }
-
-    Bulletproof proveRangeBulletproof(keyV &C, keyV &masks, const std::vector<uint64_t> &amounts)
-    {
-        masks = rct::skvGen(amounts.size());
+        CHECK_AND_ASSERT_THROW_MES(amounts.size() == sk.size(), "Invalid amounts/sk sizes");
+        masks.resize(amounts.size());
+        for (size_t i = 0; i < masks.size(); ++i)
+            masks[i] = genCommitmentMask(sk[i]);
        Bulletproof proof = bulletproof_PROVE(amounts, masks);
        CHECK_AND_ASSERT_THROW_MES(proof.V.size() == amounts.size(), "V does not have the expected size");
        C = proof.V;
@ -391,7 +385,7 @@ namespace rct {
      hashes.push_back(hash2rct(h));

      keyV kv;
-      if (rv.type == RCTTypeBulletproof)
+      if (rv.type == RCTTypeBulletproof || rv.type == RCTTypeBulletproof2)
      {
        kv.reserve((6*2+9) * rv.p.bulletproofs.size());
        for (const auto &p: rv.p.bulletproofs)
@ -652,7 +646,7 @@ namespace rct {
    //   must know the destination private key to find the correct amount, else will return a random number
    //   Note: For txn fees, the last index in the amounts vector should contain that
    //   Thus the amounts vector will be "one" longer than the destinations vectort
-    rctSig genRct(const key &message, const ctkeyV & inSk, const keyV & destinations, const vector<xmr_amount> & amounts, const ctkeyM &mixRing, const keyV &amount_keys, const multisig_kLRki *kLRki, multisig_out *msout, unsigned int index, ctkeyV &outSk, hw::device &hwdev) {
+    rctSig genRct(const key &message, const ctkeyV & inSk, const keyV & destinations, const vector<xmr_amount> & amounts, const ctkeyM &mixRing, const keyV &amount_keys, const multisig_kLRki *kLRki, multisig_out *msout, unsigned int index, ctkeyV &outSk, const RCTConfig &rct_config, hw::device &hwdev) {
        CHECK_AND_ASSERT_THROW_MES(amounts.size() == destinations.size() || amounts.size() == destinations.size() + 1, "Different number of amounts/destinations");
        CHECK_AND_ASSERT_THROW_MES(amount_keys.size() == destinations.size(), "Different number of amount_keys/destinations");
        CHECK_AND_ASSERT_THROW_MES(index < mixRing.size(), "Bad index into mixRing");
@ -682,7 +676,7 @@ namespace rct {
            //mask amount and mask
            rv.ecdhInfo[i].mask = copy(outSk[i].mask);
            rv.ecdhInfo[i].amount = d2h(amounts[i]);
-            hwdev.ecdhEncode(rv.ecdhInfo[i], amount_keys[i]);
+            hwdev.ecdhEncode(rv.ecdhInfo[i], amount_keys[i], rv.type == RCTTypeBulletproof2);
        }

        //set txn fee
@ -703,18 +697,18 @@ namespace rct {
        return rv;
    }

-    rctSig genRct(const key &message, const ctkeyV & inSk, const ctkeyV  & inPk, const keyV & destinations, const vector<xmr_amount> & amounts, const keyV &amount_keys, const multisig_kLRki *kLRki, multisig_out *msout, const int mixin, hw::device &hwdev) {
+    rctSig genRct(const key &message, const ctkeyV & inSk, const ctkeyV  & inPk, const keyV & destinations, const vector<xmr_amount> & amounts, const keyV &amount_keys, const multisig_kLRki *kLRki, multisig_out *msout, const int mixin, const RCTConfig &rct_config, hw::device &hwdev) { // Overload taking inPk/mixin: obtains mixRing and index from populateFromBlockchain, then delegates to the mixRing overload.
        unsigned int index;
        ctkeyM mixRing;
        ctkeyV outSk;
        tie(mixRing, index) = populateFromBlockchain(inPk, mixin);
-        return genRct(message, inSk, destinations, amounts, mixRing, amount_keys, kLRki, msout, index, outSk, hwdev);
+        return genRct(message, inSk, destinations, amounts, mixRing, amount_keys, kLRki, msout, index, outSk, rct_config, hwdev); // rct_config is forwarded unchanged; outSk is a local and is not handed back to the caller
    }
    
    //RCT simple    
    //for post-rct only
-    rctSig genRctSimple(const key &message, const ctkeyV & inSk, const keyV & destinations, const vector<xmr_amount> &inamounts, const vector<xmr_amount> &outamounts, xmr_amount txnFee, const ctkeyM & mixRing, const keyV &amount_keys, const std::vector<multisig_kLRki> *kLRki, multisig_out *msout, const std::vector<unsigned int> & index, ctkeyV &outSk, RangeProofType range_proof_type, hw::device &hwdev) {
-        const bool bulletproof = range_proof_type != RangeProofBorromean;
+    rctSig genRctSimple(const key &message, const ctkeyV & inSk, const keyV & destinations, const vector<xmr_amount> &inamounts, const vector<xmr_amount> &outamounts, xmr_amount txnFee, const ctkeyM & mixRing, const keyV &amount_keys, const std::vector<multisig_kLRki> *kLRki, multisig_out *msout, const std::vector<unsigned int> & index, ctkeyV &outSk, const RCTConfig &rct_config, hw::device &hwdev) {
+        const bool bulletproof = rct_config.range_proof_type != RangeProofBorromean;
        CHECK_AND_ASSERT_THROW_MES(inamounts.size() > 0, "Empty inamounts");
        CHECK_AND_ASSERT_THROW_MES(inamounts.size() == inSk.size(), "Different number of inamounts/inSk");
        CHECK_AND_ASSERT_THROW_MES(outamounts.size() == destinations.size(), "Different number of amounts/destinations");
@ -730,7 +724,7 @@ namespace rct {
        }

        rctSig rv;
-        rv.type = bulletproof ? RCTTypeBulletproof : RCTTypeSimple;
+        rv.type = bulletproof ? (rct_config.bp_version == 0 || rct_config.bp_version >= 2 ? RCTTypeBulletproof2 : RCTTypeBulletproof) : RCTTypeSimple;
        rv.message = message;
        rv.outPk.resize(destinations.size());
        if (!bulletproof)
@ -759,10 +753,11 @@ namespace rct {
            std::vector<uint64_t> proof_amounts;
            size_t n_amounts = outamounts.size();
            size_t amounts_proved = 0;
-            if (range_proof_type == RangeProofPaddedBulletproof)
+            if (rct_config.range_proof_type == RangeProofPaddedBulletproof)
            {
                rct::keyV C, masks;
-                rv.p.bulletproofs.push_back(proveRangeBulletproof(C, masks, outamounts));
+                const std::vector<key> keys(amount_keys.begin(), amount_keys.end());
+                rv.p.bulletproofs.push_back(proveRangeBulletproof(C, masks, outamounts, keys));
                #ifdef DBG
                CHECK_AND_ASSERT_THROW_MES(verBulletproof(rv.p.bulletproofs.back()), "verBulletproof failed on newly created proof");
                #endif
@ -775,14 +770,17 @@ namespace rct {
            else while (amounts_proved < n_amounts)
            {
                size_t batch_size = 1;
-                if (range_proof_type == RangeProofMultiOutputBulletproof)
+                if (rct_config.range_proof_type == RangeProofMultiOutputBulletproof)
                  while (batch_size * 2 + amounts_proved <= n_amounts && batch_size * 2 <= BULLETPROOF_MAX_OUTPUTS)
                    batch_size *= 2;
                rct::keyV C, masks;
                std::vector<uint64_t> batch_amounts(batch_size);
                for (i = 0; i < batch_size; ++i)
                  batch_amounts[i] = outamounts[i + amounts_proved];
-                rv.p.bulletproofs.push_back(proveRangeBulletproof(C, masks, batch_amounts));
+                std::vector<key> keys(batch_size);
+                for (size_t j = 0; j < batch_size; ++j)
+                  keys[j] = amount_keys[amounts_proved + j];
+                rv.p.bulletproofs.push_back(proveRangeBulletproof(C, masks, batch_amounts, keys));
            #ifdef DBG
                CHECK_AND_ASSERT_THROW_MES(verBulletproof(rv.p.bulletproofs.back()), "verBulletproof failed on newly created proof");
            #endif
@ -803,7 +801,7 @@ namespace rct {
            //mask amount and mask
            rv.ecdhInfo[i].mask = copy(outSk[i].mask);
            rv.ecdhInfo[i].amount = d2h(outamounts[i]);
-            hwdev.ecdhEncode(rv.ecdhInfo[i], amount_keys[i]);
+            hwdev.ecdhEncode(rv.ecdhInfo[i], amount_keys[i], rv.type == RCTTypeBulletproof2);
        }
            
        //set txn fee
@ -835,7 +833,7 @@ namespace rct {
        return rv;
    }

-    rctSig genRctSimple(const key &message, const ctkeyV & inSk, const ctkeyV & inPk, const keyV & destinations, const vector<xmr_amount> &inamounts, const vector<xmr_amount> &outamounts, const keyV &amount_keys, const std::vector<multisig_kLRki> *kLRki, multisig_out *msout, xmr_amount txnFee, unsigned int mixin, hw::device &hwdev) {
+    rctSig genRctSimple(const key &message, const ctkeyV & inSk, const ctkeyV & inPk, const keyV & destinations, const vector<xmr_amount> &inamounts, const vector<xmr_amount> &outamounts, const keyV &amount_keys, const std::vector<multisig_kLRki> *kLRki, multisig_out *msout, xmr_amount txnFee, unsigned int mixin, const RCTConfig &rct_config, hw::device &hwdev) {
        std::vector<unsigned int> index;
        index.resize(inPk.size());
        ctkeyM mixRing;
@ -845,7 +843,7 @@ namespace rct {
          mixRing[i].resize(mixin+1);
          index[i] = populateFromBlockchainSimple(mixRing[i], inPk[i], mixin);
        }
-        return genRctSimple(message, inSk, destinations, inamounts, outamounts, txnFee, mixRing, amount_keys, kLRki, msout, index, outSk, RangeProofBorromean, hwdev);
+        return genRctSimple(message, inSk, destinations, inamounts, outamounts, txnFee, mixRing, amount_keys, kLRki, msout, index, outSk, rct_config, hwdev);
    }

    //RingCT protocol
@ -935,7 +933,8 @@ namespace rct {
        {
          CHECK_AND_ASSERT_MES(rvp, false, "rctSig pointer is NULL");
          const rctSig &rv = *rvp;
-          CHECK_AND_ASSERT_MES(rv.type == RCTTypeSimple || rv.type == RCTTypeBulletproof, false, "verRctSemanticsSimple called on non simple rctSig");
+          CHECK_AND_ASSERT_MES(rv.type == RCTTypeSimple || rv.type == RCTTypeBulletproof || rv.type == RCTTypeBulletproof2,
+              false, "verRctSemanticsSimple called on non simple rctSig");
          const bool bulletproof = is_rct_bulletproof(rv.type);
          if (bulletproof)
          {
@ -1034,7 +1033,8 @@ namespace rct {
      {
        PERF_TIMER(verRctNonSemanticsSimple);

-        CHECK_AND_ASSERT_MES(rv.type == RCTTypeSimple || rv.type == RCTTypeBulletproof, false, "verRctNonSemanticsSimple called on non simple rctSig");
+        CHECK_AND_ASSERT_MES(rv.type == RCTTypeSimple || rv.type == RCTTypeBulletproof || rv.type == RCTTypeBulletproof2,
+            false, "verRctNonSemanticsSimple called on non simple rctSig");
        const bool bulletproof = is_rct_bulletproof(rv.type);
        // semantics check is early, and mixRing/MGs aren't resolved yet
        if (bulletproof)
@ -1100,7 +1100,7 @@ namespace rct {

        //mask amount and mask
        ecdhTuple ecdh_info = rv.ecdhInfo[i];
-        hwdev.ecdhDecode(ecdh_info, sk);
+        hwdev.ecdhDecode(ecdh_info, sk, rv.type == RCTTypeBulletproof2);
        mask = ecdh_info.mask;
        key amount = ecdh_info.amount;
        key C = rv.outPk[i].mask;
@ -1124,13 +1124,13 @@ namespace rct {
    }

    xmr_amount decodeRctSimple(const rctSig & rv, const key & sk, unsigned int i, key &mask, hw::device &hwdev) {
-        CHECK_AND_ASSERT_MES(rv.type == RCTTypeSimple || rv.type == RCTTypeBulletproof, false, "decodeRct called on non simple rctSig");
+        CHECK_AND_ASSERT_MES(rv.type == RCTTypeSimple || rv.type == RCTTypeBulletproof || rv.type == RCTTypeBulletproof2, false, "decodeRct called on non simple rctSig");
        CHECK_AND_ASSERT_THROW_MES(i < rv.ecdhInfo.size(), "Bad index");
        CHECK_AND_ASSERT_THROW_MES(rv.outPk.size() == rv.ecdhInfo.size(), "Mismatched sizes of rv.outPk and rv.ecdhInfo");

        //mask amount and mask
        ecdhTuple ecdh_info = rv.ecdhInfo[i];
-        hwdev.ecdhDecode(ecdh_info, sk);
+        hwdev.ecdhDecode(ecdh_info, sk, rv.type == RCTTypeBulletproof2);
        mask = ecdh_info.mask;
        key amount = ecdh_info.amount;
        key C = rv.outPk[i].mask;
@ -1154,7 +1154,7 @@ namespace rct {
    }

    bool signMultisig(rctSig &rv, const std::vector<unsigned int> &indices, const keyV &k, const multisig_out &msout, const key &secret_key) {
-        CHECK_AND_ASSERT_MES(rv.type == RCTTypeFull || rv.type == RCTTypeSimple || rv.type == RCTTypeBulletproof,
+        CHECK_AND_ASSERT_MES(rv.type == RCTTypeFull || rv.type == RCTTypeSimple || rv.type == RCTTypeBulletproof || rv.type == RCTTypeBulletproof2,
            false, "unsupported rct type");
        CHECK_AND_ASSERT_MES(indices.size() == k.size(), false, "Mismatched k/indices sizes");
        CHECK_AND_ASSERT_MES(k.size() == rv.p.MGs.size(), false, "Mismatched k/MGs size");
--- a/monero/ringct/rctSigs.h
+++ b/monero/ringct/rctSigs.h
@ -119,10 +119,10 @@ namespace rct {
    //decodeRct: (c.f. https://eprint.iacr.org/2015/1098 section 5.1.1)
    //   uses the attached ecdh info to find the amounts represented by each output commitment
    //   must know the destination private key to find the correct amount, else will return a random number
-    rctSig genRct(const key &message, const ctkeyV & inSk, const keyV & destinations, const std::vector<xmr_amount> & amounts, const ctkeyM &mixRing, const keyV &amount_keys, const multisig_kLRki *kLRki, multisig_out *msout, unsigned int index, ctkeyV &outSk, hw::device &hwdev);
-    rctSig genRct(const key &message, const ctkeyV & inSk, const ctkeyV  & inPk, const keyV & destinations, const std::vector<xmr_amount> & amounts, const keyV &amount_keys, const multisig_kLRki *kLRki, multisig_out *msout, const int mixin, hw::device &hwdev);
-    rctSig genRctSimple(const key & message, const ctkeyV & inSk, const ctkeyV & inPk, const keyV & destinations, const std::vector<xmr_amount> & inamounts, const std::vector<xmr_amount> & outamounts, const keyV &amount_keys, const std::vector<multisig_kLRki> *kLRki, multisig_out *msout, xmr_amount txnFee, unsigned int mixin, hw::device &hwdev);
-    rctSig genRctSimple(const key & message, const ctkeyV & inSk, const keyV & destinations, const std::vector<xmr_amount> & inamounts, const std::vector<xmr_amount> & outamounts, xmr_amount txnFee, const ctkeyM & mixRing, const keyV &amount_keys, const std::vector<multisig_kLRki> *kLRki, multisig_out *msout, const std::vector<unsigned int> & index, ctkeyV &outSk, RangeProofType range_proof_type, hw::device &hwdev);
+    rctSig genRct(const key &message, const ctkeyV & inSk, const keyV & destinations, const std::vector<xmr_amount> & amounts, const ctkeyM &mixRing, const keyV &amount_keys, const multisig_kLRki *kLRki, multisig_out *msout, unsigned int index, ctkeyV &outSk, const RCTConfig &rct_config, hw::device &hwdev);
+    rctSig genRct(const key &message, const ctkeyV & inSk, const ctkeyV  & inPk, const keyV & destinations, const std::vector<xmr_amount> & amounts, const keyV &amount_keys, const multisig_kLRki *kLRki, multisig_out *msout, const int mixin, const RCTConfig &rct_config, hw::device &hwdev);
+    rctSig genRctSimple(const key & message, const ctkeyV & inSk, const ctkeyV & inPk, const keyV & destinations, const std::vector<xmr_amount> & inamounts, const std::vector<xmr_amount> & outamounts, const keyV &amount_keys, const std::vector<multisig_kLRki> *kLRki, multisig_out *msout, xmr_amount txnFee, unsigned int mixin, const RCTConfig &rct_config, hw::device &hwdev);
+    rctSig genRctSimple(const key & message, const ctkeyV & inSk, const keyV & destinations, const std::vector<xmr_amount> & inamounts, const std::vector<xmr_amount> & outamounts, xmr_amount txnFee, const ctkeyM & mixRing, const keyV &amount_keys, const std::vector<multisig_kLRki> *kLRki, multisig_out *msout, const std::vector<unsigned int> & index, ctkeyV &outSk, const RCTConfig &rct_config, hw::device &hwdev);
    bool verRct(const rctSig & rv, bool semantics);
    static inline bool verRct(const rctSig & rv) { return verRct(rv, true) && verRct(rv, false); }
    bool verRctSemanticsSimple(const rctSig & rv);
--- a/monero/ringct/rctTypes.cpp
+++ b/monero/ringct/rctTypes.cpp
@ -217,6 +217,7 @@ namespace rct {
        {
            case RCTTypeSimple:
            case RCTTypeBulletproof:
+            case RCTTypeBulletproof2:
                return true;
            default:
                return false;
@ -228,6 +229,7 @@ namespace rct {
        switch (type)
        {
            case RCTTypeBulletproof:
+            case RCTTypeBulletproof2:
                return true;
            default:
                return false;
--- a/monero/ringct/rctTypes.h
+++ b/monero/ringct/rctTypes.h
@ -128,7 +128,7 @@ namespace rct {
        key senderPk;

        BEGIN_SERIALIZE_OBJECT()
-          FIELD(mask)
+          FIELD(mask) // not saved from v2 BPs
          FIELD(amount)
          // FIELD(senderPk) // not serialized, as we do not use it in monero currently
        END_SERIALIZE()
@ -230,8 +230,13 @@ namespace rct {
      RCTTypeFull = 1,
      RCTTypeSimple = 2,
      RCTTypeBulletproof = 3,
+      RCTTypeBulletproof2 = 4,
    };
    enum RangeProofType { RangeProofBorromean, RangeProofBulletproof, RangeProofMultiOutputBulletproof, RangeProofPaddedBulletproof };
+    struct RCTConfig { // settings bundle taken by genRct/genRctSimple in place of the bare RangeProofType argument
+      RangeProofType range_proof_type; // one of the RangeProofType enumerators declared just above
+      int bp_version; // NOTE(review): presumably the bulletproof version to generate; valid values are not visible here, confirm against genRct
+    };
    struct rctSigBase {
        uint8_t type;
        key message;
@ -248,7 +253,7 @@ namespace rct {
          FIELD(type)
          if (type == RCTTypeNull)
            return true;
-          if (type != RCTTypeFull && type != RCTTypeSimple && type != RCTTypeBulletproof)
+          if (type != RCTTypeFull && type != RCTTypeSimple && type != RCTTypeBulletproof && type != RCTTypeBulletproof2)
            return false;
          VARINT_FIELD(txnFee)
          // inputs/outputs not saved, only here for serialization help
@ -277,7 +282,19 @@ namespace rct {
            return false;
          for (size_t i = 0; i < outputs; ++i)
          {
-            FIELDS(ecdhInfo[i])
+            if (type == RCTTypeBulletproof2)
+            {
+              ar.begin_object();
+              if (!typename Archive<W>::is_saving())
+                memset(ecdhInfo[i].amount.bytes, 0, sizeof(ecdhInfo[i].amount.bytes));
+              crypto::hash8 &amount = (crypto::hash8&)ecdhInfo[i].amount;
+              FIELD(amount);
+              ar.end_object();
+            }
+            else
+            {
+              FIELDS(ecdhInfo[i])
+            }
            if (outputs - i > 1)
              ar.delimit_array();
          }
@ -309,12 +326,15 @@ namespace rct {
        {
          if (type == RCTTypeNull)
            return true;
-          if (type != RCTTypeFull && type != RCTTypeSimple && type != RCTTypeBulletproof)
+          if (type != RCTTypeFull && type != RCTTypeSimple && type != RCTTypeBulletproof && type != RCTTypeBulletproof2)
            return false;
-          if (type == RCTTypeBulletproof)
+          if (type == RCTTypeBulletproof || type == RCTTypeBulletproof2)
          {
            uint32_t nbp = bulletproofs.size();
-            FIELD(nbp)
+            if (type == RCTTypeBulletproof2)
+              VARINT_FIELD(nbp)
+            else
+              FIELD(nbp)
            ar.tag("bp");
            ar.begin_array();
            if (nbp > outputs)
@ -350,7 +370,7 @@ namespace rct {
          ar.begin_array();
          // we keep a byte for size of MGs, because we don't know whether this is
          // a simple or full rct signature, and it's starting to annoy the hell out of me
-          size_t mg_elements = (type == RCTTypeSimple || type == RCTTypeBulletproof) ? inputs : 1;
+          size_t mg_elements = (type == RCTTypeSimple || type == RCTTypeBulletproof || type == RCTTypeBulletproof2) ? inputs : 1;
          PREPARE_CUSTOM_VECTOR_SERIALIZATION(mg_elements, MGs);
          if (MGs.size() != mg_elements)
            return false;
@ -368,7 +388,7 @@ namespace rct {
            for (size_t j = 0; j < mixin + 1; ++j)
            {
              ar.begin_array();
-              size_t mg_ss2_elements = ((type == RCTTypeSimple || type == RCTTypeBulletproof) ? 1 : inputs) + 1;
+              size_t mg_ss2_elements = ((type == RCTTypeSimple || type == RCTTypeBulletproof || type == RCTTypeBulletproof2) ? 1 : inputs) + 1;
              PREPARE_CUSTOM_VECTOR_SERIALIZATION(mg_ss2_elements, MGs[i].ss[j]);
              if (MGs[i].ss[j].size() != mg_ss2_elements)
                return false;
@ -394,7 +414,7 @@ namespace rct {
               ar.delimit_array();
          }
          ar.end_array();
-          if (type == RCTTypeBulletproof)
+          if (type == RCTTypeBulletproof || type == RCTTypeBulletproof2)
          {
            ar.tag("pseudoOuts");
            ar.begin_array();
@ -418,12 +438,12 @@ namespace rct {

        keyV& get_pseudo_outs()
        {
-          return type == RCTTypeBulletproof ? p.pseudoOuts : pseudoOuts;
+          return type == RCTTypeBulletproof || type == RCTTypeBulletproof2 ? p.pseudoOuts : pseudoOuts; // both bulletproof types use p.pseudoOuts; every other type uses pseudoOuts
        }

        keyV const& get_pseudo_outs() const
        {
-          return type == RCTTypeBulletproof ? p.pseudoOuts : pseudoOuts;
+          return type == RCTTypeBulletproof || type == RCTTypeBulletproof2 ? p.pseudoOuts : pseudoOuts; // const overload: same selection, both bulletproof types use p.pseudoOuts
        }
    };

--- a/src/pool.c
+++ b/src/pool.c
@ -226,7 +226,7 @@ static char * stratum_new_proxy_job_body(int json_id, const char *client_id, con
        const block_template_t *block_template, const char *template_blob,
        uint64_t target, bool response);
 static char * stratum_new_job_body(int json_id, const char *client_id, const char *job_id,
-        const char *blob, uint64_t target, bool response);
+        const char *blob, uint64_t target, uint64_t height, bool response);
 static char * stratum_new_error_body(int json_id, const char *error);
 static char * stratum_new_status_body(int json_id, const char *status);
 static void client_add(int fd, struct bufferevent *bev);
@ -1074,7 +1074,7 @@ stratum_new_proxy_job_body(int json_id, const char *client_id, const char *job_i

 static char *
 stratum_new_job_body(int json_id, const char *client_id, const char *job_id,
-        const char *blob, uint64_t target, bool response)
+        const char *blob, uint64_t target, uint64_t height, bool response)
 {
    char *body = calloc(CLIENT_BODY_MAX, sizeof(char));

@ -1084,14 +1084,16 @@ stratum_new_job_body(int json_id, const char *client_id, const char *job_id,
    if (response)
    {
        snprintf(body, CLIENT_BODY_MAX, "{\"id\":%d,\"jsonrpc\":\"2.0\",\"error\":null,\"result\""
-                ":{\"id\":\"%.32s\",\"job\":{\"blob\":\"%s\",\"job_id\":\"%.32s\",\"target\":\"%.8s\"},"
-                "\"status\":\"OK\"}}\n", json_id, client_id, blob, job_id, target_hex);
+                ":{\"id\":\"%.32s\",\"job\":{"
+                "\"blob\":\"%s\",\"job_id\":\"%.32s\",\"target\":\"%.8s\",\"height\":%"PRIu64"},"
+                "\"status\":\"OK\"}}\n", json_id, client_id, blob, job_id, target_hex, height);
    }
    else
    {
        snprintf(body, CLIENT_BODY_MAX, "{\"id\":%d,\"jsonrpc\":\"2.0\",\"method\":\"job\",\"params\""
-                ":{\"id\":\"%.32s\",\"blob\":\"%s\",\"job_id\":\"%.32s\",\"target\":\"%.8s\"}}\n", 
-                json_id, client_id, blob, job_id, target_hex);
+                ":{\"id\":\"%.32s\",\"blob\":\"%s\",\"job_id\":\"%.32s\",\"target\":\"%.8s\","
+                "\"height\":%"PRIu64"}}\n", 
+                json_id, client_id, blob, job_id, target_hex, height);
    }
    return body;
 }
@ -1743,7 +1745,7 @@ client_send_job(client_t *client, bool response)
    if (!client->is_proxy)
    {
        body = stratum_new_job_body(client->json_id, client->client_id, job_id,
-            job->blob, target, response);
+            job->blob, target, bt->height, response);
    }
    else
    {
@ -1956,7 +1958,9 @@ client_on_submit(json_object *message, client_t *client)
    /* Hash and compare */
    char result_hash[32];
    char submitted_hash[32];
-    get_hash(hashing_blob, hashing_blob_size, (char**)&result_hash);
+    uint8_t major_version = (uint8_t)block[0];
+    const int cn_variant = major_version >= 7 ? major_version - 6 : 0;
+    get_hash(hashing_blob, hashing_blob_size, (char**)&result_hash, cn_variant, bt->height);
    hex_to_bin(result_hex, submitted_hash, 32);

    if (memcmp(submitted_hash, result_hash, 32) != 0)
--- a/src/xmr.cpp
+++ b/src/xmr.cpp
@ -94,9 +94,9 @@ int parse_address(const char *input, uint64_t *prefix)
    return rv ? 0 : -1;
 }

-void get_hash(const char *input, const size_t in_size, char **output)
+void get_hash(const char *input, const size_t in_size, char **output, int variant, uint64_t height) // hashes in_size bytes of input via crypto::cn_slow_hash; variant and height are forwarded unchanged
 {
-    crypto::cn_slow_hash(input, in_size, reinterpret_cast<crypto::hash&>(*output), 2);
+    crypto::cn_slow_hash(input, in_size, reinterpret_cast<crypto::hash&>(*output), variant, height); // the storage `output` points at is itself viewed as the crypto::hash (no pointer is followed); variant was previously fixed at 2. NOTE(review): presumably callers must pass the address of a buffer at least sizeof(crypto::hash) bytes long - confirm
 }

 bool check_hash(const char* hash, uint64_t difficulty)
--- a/src/xmr.h
+++ b/src/xmr.h
@ -40,7 +40,7 @@ extern "C" {
 int get_hashing_blob(const char *input, const size_t in_size, char **output, size_t *out_size);
 int construct_block_blob(const char *block_data, uint64_t nonce, char **blob);
 int parse_address(const char *input, uint64_t *prefix);
-void get_hash(const char *input, const size_t in_size, char **output);
+void get_hash(const char *input, const size_t in_size, char **output, int variant, uint64_t height);
 bool check_hash(const char* hash, uint64_t difficulty);

 #ifdef __cplusplus