JIT compiler for ARMv8 (#125)

JIT compiler for ARMv8
2024-08-15 00:23:14 +00:00 · 2019-09-22 21:06:22 +02:00 · 2019-09-22 21:06:22 +02:00 · c6468a3816
commit c6468a3816
parent 5fb26fc607
8 changed files with 1794 additions and 28 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -116,6 +116,12 @@ endif()

 # ARMv8
 if (ARM_ID STREQUAL "aarch64" OR ARM_ID STREQUAL "arm64" OR ARM_ID STREQUAL "armv8-a")
+  list(APPEND randomx_sources
+    src/jit_compiler_a64_static.S
+    src/jit_compiler_a64.cpp)
+  # cheat because cmake and ccache hate each other
+  set_property(SOURCE src/jit_compiler_a64_static.S PROPERTY LANGUAGE C)
+
  if(ARCH STREQUAL "native")
    add_flag("-march=native")
  else()
--- a/src/common.hpp
+++ b/src/common.hpp
@ -119,7 +119,7 @@ namespace randomx {
 	class JitCompilerX86;
 	using JitCompiler = JitCompilerX86;
 #elif defined(__aarch64__)
-	#define RANDOMX_HAVE_COMPILER 0
+	#define RANDOMX_HAVE_COMPILER 1
 	class JitCompilerA64;
 	using JitCompiler = JitCompilerA64;
 #else
--- a/src/intrin_portable.h
+++ b/src/intrin_portable.h
@ -385,7 +385,14 @@ FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
 typedef uint8x16_t rx_vec_i128;
 typedef float64x2_t rx_vec_f128;

-#define rx_aligned_alloc(size, align) aligned_alloc(align, size)
+// Allocates `size` bytes whose address is a multiple of `align`.
+// `align` must satisfy posix_memalign's requirements (a power of two that is
+// also a multiple of sizeof(void*)). Returns a null pointer when the
+// allocation fails; release the memory with rx_aligned_free().
+inline void* rx_aligned_alloc(size_t size, size_t align) {
+	void* p = nullptr;
+	if (posix_memalign(&p, align, size) != 0)
+		return nullptr;
+
+	return p;
+}
+
 #define rx_aligned_free(a) free(a)

 inline void rx_prefetch_nta(void* ptr) {
--- a/src/jit_compiler_a64.cpp
+++ b/src/jit_compiler_a64.cpp
--- a/src/jit_compiler_a64.hpp
+++ b/src/jit_compiler_a64.hpp
@ -1,5 +1,6 @@
 /*
 Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+Copyright (c) 2019, SChernykh    <https://github.com/SChernykh>

 All rights reserved.

@ -32,45 +33,96 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <vector>
 #include <stdexcept>
 #include "common.hpp"
+#include "jit_compiler_a64_static.hpp"

 namespace randomx {

 	class Program;
 	class ProgramConfiguration;
 	class SuperscalarProgram;
+	class Instruction;
+
+	// Pointer to a JitCompilerA64 member function that generates code for one
+	// VM instruction. The uint32_t& argument is presumably the current write
+	// offset into the code buffer - confirm in jit_compiler_a64.cpp.
+	typedef void(JitCompilerA64::*InstructionGeneratorA64)(Instruction&, uint32_t&);

 	class JitCompilerA64 {
 	public:
-		JitCompilerA64() {
-			throw std::runtime_error("ARM64 JIT compiler is not implemented yet.");
-		}
-		void generateProgram(Program&, ProgramConfiguration&) {
+		// NOTE(review): the class declares a destructor and holds the raw
+		// pointer `code`, but declares no copy/move operations - confirm that
+		// instances are never copied.
+		JitCompilerA64();
+		~JitCompilerA64();
+
+		// Code generation entry points; defined in jit_compiler_a64.cpp.
+		void generateProgram(Program&, ProgramConfiguration&);
+		void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);

-		}
-		void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) {
-			
-		}
 		template<size_t N>
-		void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &) {
+		void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &);

-		}
-		void generateDatasetInitCode() {
+		// Empty in this implementation.
+		void generateDatasetInitCode() {}

+		// The start of the code buffer is reinterpreted as the program entry point.
+		ProgramFunc* getProgramFunc() { return reinterpret_cast<ProgramFunc*>(code); }
+		DatasetInitFunc* getDatasetInitFunc();
+		// Returns the raw code buffer pointer.
+		uint8_t* getCode() { return code; }
+		size_t getCodeSize();
+
+		// Presumably switch the memory protection of the code buffer - confirm
+		// in jit_compiler_a64.cpp.
+		void enableWriting();
+		void enableExecution();
+		void enableAll();
+
+	private:
+		// Table of instruction generators with 256 entries.
+		static InstructionGeneratorA64 engine[256];
+		uint32_t reg_changed_offset[8];
+		// Buffer holding the generated machine code.
+		uint8_t* code;
+		uint32_t literalPos;
+		uint32_t num32bitLiterals;
+
+		// Stores the 32-bit value `val` at code + codePos and advances codePos
+		// by 4 bytes. The store goes through a cast pointer, so the target
+		// address is assumed to be suitably aligned for uint32_t.
+		static void emit32(uint32_t val, uint8_t* code, uint32_t& codePos)
+		{
+			*(uint32_t*)(code + codePos) = val;
+			codePos += sizeof(val);
 		}
-		ProgramFunc* getProgramFunc() {
-			return nullptr;
+
+		// Stores the 64-bit value `val` at code + codePos and advances codePos
+		// by 8 bytes (same alignment assumption as emit32).
+		static void emit64(uint64_t val, uint8_t* code, uint32_t& codePos)
+		{
+			*(uint64_t*)(code + codePos) = val;
+			codePos += sizeof(val);
 		}
-		DatasetInitFunc* getDatasetInitFunc() {
-			return nullptr;
-		}
-		uint8_t* getCode() {
-			return nullptr;
-		}
-		size_t getCodeSize() {
-			return 0;
-		}
-		void enableWriting() {}
-		void enableExecution() {}
-		void enableAll() {}
+
+		// Emit helpers; each writes into `code` at `codePos` and updates codePos.
+		void emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, uint32_t& codePos);
+		void emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, uint8_t* code, uint32_t& codePos);
+
+		template<uint32_t tmp_reg>
+		void emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos);
+
+		template<uint32_t tmp_reg_fp>
+		void emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos);
+
+		// One generator per VM instruction. All share the signature of
+		// InstructionGeneratorA64, presumably so they can be stored in `engine`.
+		void h_IADD_RS(Instruction&, uint32_t&);
+		void h_IADD_M(Instruction&, uint32_t&);
+		void h_ISUB_R(Instruction&, uint32_t&);
+		void h_ISUB_M(Instruction&, uint32_t&);
+		void h_IMUL_R(Instruction&, uint32_t&);
+		void h_IMUL_M(Instruction&, uint32_t&);
+		void h_IMULH_R(Instruction&, uint32_t&);
+		void h_IMULH_M(Instruction&, uint32_t&);
+		void h_ISMULH_R(Instruction&, uint32_t&);
+		void h_ISMULH_M(Instruction&, uint32_t&);
+		void h_IMUL_RCP(Instruction&, uint32_t&);
+		void h_INEG_R(Instruction&, uint32_t&);
+		void h_IXOR_R(Instruction&, uint32_t&);
+		void h_IXOR_M(Instruction&, uint32_t&);
+		void h_IROR_R(Instruction&, uint32_t&);
+		void h_IROL_R(Instruction&, uint32_t&);
+		void h_ISWAP_R(Instruction&, uint32_t&);
+		void h_FSWAP_R(Instruction&, uint32_t&);
+		void h_FADD_R(Instruction&, uint32_t&);
+		void h_FADD_M(Instruction&, uint32_t&);
+		void h_FSUB_R(Instruction&, uint32_t&);
+		void h_FSUB_M(Instruction&, uint32_t&);
+		void h_FSCAL_R(Instruction&, uint32_t&);
+		void h_FMUL_R(Instruction&, uint32_t&);
+		void h_FDIV_M(Instruction&, uint32_t&);
+		void h_FSQRT_R(Instruction&, uint32_t&);
+		void h_CBRANCH(Instruction&, uint32_t&);
+		void h_CFROUND(Instruction&, uint32_t&);
+		void h_ISTORE(Instruction&, uint32_t&);
+		void h_NOP(Instruction&, uint32_t&);
 	};
-}
+}
--- a/src/jit_compiler_a64_static.S
+++ b/src/jit_compiler_a64_static.S
@ -0,0 +1,579 @@
+# Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+# Copyright (c) 2019, SChernykh    <https://github.com/SChernykh>
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# 	* Redistributions of source code must retain the above copyright
+# 	  notice, this list of conditions and the following disclaimer.
+# 	* Redistributions in binary form must reproduce the above copyright
+# 	  notice, this list of conditions and the following disclaimer in the
+# 	  documentation and/or other materials provided with the distribution.
+# 	* Neither the name of the copyright holder nor the
+# 	  names of its contributors may be used to endorse or promote products
+# 	  derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+	.arch armv8-a
+	.text
+	.global	randomx_program_aarch64
+	.global	randomx_program_aarch64_main_loop
+	.global	randomx_program_aarch64_vm_instructions
+	.global randomx_program_aarch64_imul_rcp_literals_end
+	.global	randomx_program_aarch64_vm_instructions_end
+	.global randomx_program_aarch64_cacheline_align_mask1
+	.global randomx_program_aarch64_cacheline_align_mask2
+	.global randomx_program_aarch64_update_spMix1
+	.global randomx_program_aarch64_vm_instructions_end_light
+	.global randomx_program_aarch64_light_cacheline_align_mask
+	.global randomx_program_aarch64_light_dataset_offset
+	.global randomx_init_dataset_aarch64
+	.global randomx_init_dataset_aarch64_end
+	.global randomx_calc_dataset_item_aarch64
+	.global randomx_calc_dataset_item_aarch64_prefetch
+	.global randomx_calc_dataset_item_aarch64_mix
+	.global randomx_calc_dataset_item_aarch64_store_result
+	.global randomx_calc_dataset_item_aarch64_end
+
+#include "configuration.h"
+
+# Register allocation
+
+# x0  -> pointer to reg buffer and then literal for IMUL_RCP
+# x1  -> pointer to mem buffer and then to dataset
+# x2  -> pointer to scratchpad
+# x3  -> loop counter
+# x4  -> "r0"
+# x5  -> "r1"
+# x6  -> "r2"
+# x7  -> "r3"
+# x8  -> fpcr (reversed bits)
+# x9  -> mx, ma
+# x10 -> spMix1
+# x11 -> literal for IMUL_RCP
+# x12 -> "r4"
+# x13 -> "r5"
+# x14 -> "r6"
+# x15 -> "r7"
+# x16 -> spAddr0
+# x17 -> spAddr1
+# x18 -> temporary
+# x19 -> temporary
+# x20 -> literal for IMUL_RCP
+# x21 -> literal for IMUL_RCP
+# x22 -> literal for IMUL_RCP
+# x23 -> literal for IMUL_RCP
+# x24 -> literal for IMUL_RCP
+# x25 -> literal for IMUL_RCP
+# x26 -> literal for IMUL_RCP
+# x27 -> literal for IMUL_RCP
+# x28 -> literal for IMUL_RCP
+# x29 -> literal for IMUL_RCP
+# x30 -> literal for IMUL_RCP
+
+# v0-v15 -> store 32-bit literals
+# v16 -> "f0"
+# v17 -> "f1"
+# v18 -> "f2"
+# v19 -> "f3"
+# v20 -> "e0"
+# v21 -> "e1"
+# v22 -> "e2"
+# v23 -> "e3"
+# v24 -> "a0"
+# v25 -> "a1"
+# v26 -> "a2"
+# v27 -> "a3"
+# v28 -> temporary
+# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
+# v30 -> E 'or' mask  = 0x3*00000000******3*00000000******
+# v31 -> scale mask   = 0x80f000000000000080f0000000000000
+
+# Entry point of the JIT-compiled program. The C prototype declared in
+# jit_compiler_a64_static.hpp is
+#   void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations);
+# so on entry x0 = reg, x1 = mem, x2 = scratchpad and x3 = iterations.
+randomx_program_aarch64:
+	# Save registers into a frame of 192 bytes: x19-x30 and d8-d15 (callee-saved
+	# under AAPCS64) plus x8 and x16-x18, which are stored and restored as well
+	sub	sp, sp, 192
+	stp	x16, x17, [sp]
+	stp	x18, x19, [sp, 16]
+	stp	x20, x21, [sp, 32]
+	stp	x22, x23, [sp, 48]
+	stp	x24, x25, [sp, 64]
+	stp	x26, x27, [sp, 80]
+	stp	x28, x29, [sp, 96]
+	stp	x8, x30, [sp, 112]
+	stp	d8, d9, [sp, 128]
+	stp	d10, d11, [sp, 144]
+	stp	d12, d13, [sp, 160]
+	stp	d14, d15, [sp, 176]
+
+	# Zero the integer VM registers r0-r7 (x4-x7 and x12-x15)
+	mov	x4, xzr
+	mov	x5, xzr
+	mov	x6, xzr
+	mov	x7, xzr
+	mov	x12, xzr
+	mov	x13, xzr
+	mov	x14, xzr
+	mov	x15, xzr
+
+	# Load mx/ma (first 64-bit word of the mem buffer, into x9) and the dataset
+	# pointer (second word, which replaces the mem pointer in x1)
+	ldp	x9, x1, [x1]
+
+	# Load initial spMix value (a copy of the mx/ma word)
+	mov	x10, x9
+
+	# Load group A registers (a0-a3 = v24-v27) from bytes 192-255 of the reg buffer
+	ldp	q24, q25, [x0, 192]
+	ldp	q26, q27, [x0, 224]
+
+	# Load E 'and' mask into both 64-bit lanes of v29 (x16 is used as scratch)
+	mov	x16, 0x00FFFFFFFFFFFFFF
+	ins	v29.d[0], x16
+	ins	v29.d[1], x16
+
+	# Load E 'or' mask (stored in reg.f[0], read here from offset 64 of the reg buffer)
+	ldr	q30, [x0, 64]
+
+	# Load scale mask (0x80f0000000000000 in both 64-bit lanes of v31)
+	mov	x16, 0x80f0000000000000
+	ins	v31.d[0], x16
+	ins	v31.d[1], x16
+
+	# Read fpcr and keep it bit-reversed in x8
+	mrs	x8, fpcr
+	rbit	x8, x8
+
+	# Save x0 (the reg buffer pointer): x0 is reused for an IMUL_RCP literal
+	# below and is popped again after the main loop
+	str	x0, [sp, -16]!
+
+	# Read literals (PC-relative loads from the literal pool placed after the
+	# buffer for generated instructions)
+	ldr	x0, literal_x0
+	ldr	x11, literal_x11
+	ldr	x20, literal_x20
+	ldr	x21, literal_x21
+	ldr	x22, literal_x22
+	ldr	x23, literal_x23
+	ldr	x24, literal_x24
+	ldr	x25, literal_x25
+	ldr	x26, literal_x26
+	ldr	x27, literal_x27
+	ldr	x28, literal_x28
+	ldr	x29, literal_x29
+	ldr	x30, literal_x30
+
+	# v0-v15 receive the 32-bit literals, 16 bytes per register. Only d8-d15
+	# were saved above: AAPCS64 requires a callee to preserve just the low
+	# 64 bits of v8-v15
+	ldr	q0, literal_v0
+	ldr	q1, literal_v1
+	ldr	q2, literal_v2
+	ldr	q3, literal_v3
+	ldr	q4, literal_v4
+	ldr	q5, literal_v5
+	ldr	q6, literal_v6
+	ldr	q7, literal_v7
+	ldr	q8, literal_v8
+	ldr	q9, literal_v9
+	ldr	q10, literal_v10
+	ldr	q11, literal_v11
+	ldr	q12, literal_v12
+	ldr	q13, literal_v13
+	ldr	q14, literal_v14
+	ldr	q15, literal_v15
+
+# Top of the main loop: derives the two scratchpad addresses from spMix1 (x10)
+# and loads the integer, F and E register groups from the scratchpad.
+randomx_program_aarch64_main_loop:
+	# spAddr0 = spMix1 & ScratchpadL3Mask64;
+	# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
+	lsr	x18, x10, 32
+
+	# Actual mask will be inserted by JIT compiler
+	# (the immediate 1 is only a placeholder; writing w16/w17 also clears the
+	# upper 32 bits of x16/x17)
+	and	w16, w10, 1
+	and	w17, w18, 1
+
+	# x16 = scratchpad + spAddr0
+	# x17 = scratchpad + spAddr1
+	add	x16, x16, x2
+	add	x17, x17, x2
+
+	# xor integer registers with scratchpad data (spAddr0)
+	# (64 bytes, read through x18/x19 as temporaries)
+	ldp	x18, x19, [x16]
+	eor	x4, x4, x18
+	eor	x5, x5, x19
+	ldp	x18, x19, [x16, 16]
+	eor	x6, x6, x18
+	eor	x7, x7, x19
+	ldp	x18, x19, [x16, 32]
+	eor	x12, x12, x18
+	eor	x13, x13, x19
+	ldp	x18, x19, [x16, 48]
+	eor	x14, x14, x18
+	eor	x15, x15, x19
+
+	# Load group F registers (spAddr1)
+	# Each ldpsw loads two 32-bit integers sign-extended to 64 bits (bytes 0-31
+	# in total); scvtf then converts both lanes to double precision
+	ldpsw	x18, x19, [x17]
+	ins	v16.d[0], x18
+	ins	v16.d[1], x19
+	ldpsw	x18, x19, [x17, 8]
+	ins	v17.d[0], x18
+	ins	v17.d[1], x19
+	ldpsw	x18, x19, [x17, 16]
+	ins	v18.d[0], x18
+	ins	v18.d[1], x19
+	ldpsw	x18, x19, [x17, 24]
+	ins	v19.d[0], x18
+	ins	v19.d[1], x19
+	scvtf	v16.2d, v16.2d
+	scvtf	v17.2d, v17.2d
+	scvtf	v18.2d, v18.2d
+	scvtf	v19.2d, v19.2d
+
+	# Load group E registers (spAddr1)
+	# Same conversion for bytes 32-63, followed by the E 'and' mask (v29) and
+	# the E 'or' mask (v30)
+	ldpsw	x18, x19, [x17, 32]
+	ins	v20.d[0], x18
+	ins	v20.d[1], x19
+	ldpsw	x18, x19, [x17, 40]
+	ins	v21.d[0], x18
+	ins	v21.d[1], x19
+	ldpsw	x18, x19, [x17, 48]
+	ins	v22.d[0], x18
+	ins	v22.d[1], x19
+	ldpsw	x18, x19, [x17, 56]
+	ins	v23.d[0], x18
+	ins	v23.d[1], x19
+	scvtf	v20.2d, v20.2d
+	scvtf	v21.2d, v21.2d
+	scvtf	v22.2d, v22.2d
+	scvtf	v23.2d, v23.2d
+	and	v20.16b, v20.16b, v29.16b
+	and	v21.16b, v21.16b, v29.16b
+	and	v22.16b, v22.16b, v29.16b
+	and	v23.16b, v23.16b, v29.16b
+	orr	v20.16b, v20.16b, v30.16b
+	orr	v21.16b, v21.16b, v30.16b
+	orr	v22.16b, v22.16b, v30.16b
+	orr	v23.16b, v23.16b, v30.16b
+	# Execute VM instructions
+# Start of the buffer that receives the generated VM instructions, followed by
+# the literal pools read by the prologue.
+randomx_program_aarch64_vm_instructions:
+
+	# buffer for generated instructions
+	# FDIV_M is the largest instruction taking up to 12 ARMv8 instructions
+	# (.fill reserves RANDOMX_PROGRAM_SIZE*12 zeroed 4-byte words)
+	.fill RANDOMX_PROGRAM_SIZE*12,4,0
+
+# Literal pool for the general-purpose registers: one zeroed 8-byte slot per
+# register, read by the "ldr xN, literal_xN" sequence in the prologue.
+# NOTE(review): the slots are zero here; presumably the JIT compiler writes the
+# real values - confirm in jit_compiler_a64.cpp.
+literal_x0:  .fill 1,8,0
+literal_x11: .fill 1,8,0
+literal_x20: .fill 1,8,0
+literal_x21: .fill 1,8,0
+literal_x22: .fill 1,8,0
+literal_x23: .fill 1,8,0
+literal_x24: .fill 1,8,0
+literal_x25: .fill 1,8,0
+literal_x26: .fill 1,8,0
+literal_x27: .fill 1,8,0
+literal_x28: .fill 1,8,0
+literal_x29: .fill 1,8,0
+literal_x30: .fill 1,8,0
+randomx_program_aarch64_imul_rcp_literals_end:
+
+# Literal pool for v0-v15: one zeroed 16-byte slot (two 8-byte words) per
+# register, read by the "ldr qN, literal_vN" sequence in the prologue.
+literal_v0:  .fill 2,8,0
+literal_v1:  .fill 2,8,0
+literal_v2:  .fill 2,8,0
+literal_v3:  .fill 2,8,0
+literal_v4:  .fill 2,8,0
+literal_v5:  .fill 2,8,0
+literal_v6:  .fill 2,8,0
+literal_v7:  .fill 2,8,0
+literal_v8:  .fill 2,8,0
+literal_v9:  .fill 2,8,0
+literal_v10: .fill 2,8,0
+literal_v11: .fill 2,8,0
+literal_v12: .fill 2,8,0
+literal_v13: .fill 2,8,0
+literal_v14: .fill 2,8,0
+literal_v15: .fill 2,8,0
+
+# Loop tail for the program that reads from a dataset: dataset prefetch and
+# read, scratchpad write-back, loop branch, then the function epilogue.
+randomx_program_aarch64_vm_instructions_end:
+
+	# mx ^= r[readReg2] ^ r[readReg3];
+	# x18 is expected to already hold r[readReg2] ^ r[readReg3] - presumably
+	# computed by code the JIT compiler emits before this point (confirm in
+	# jit_compiler_a64.cpp)
+	eor	x9, x9, x18
+
+	# Calculate dataset pointer for dataset prefetch
+	# (w18 = low 32 bits of x9; the upper half of x18 is cleared)
+	mov	w18, w9
+randomx_program_aarch64_cacheline_align_mask1:
+	# Actual mask will be inserted by JIT compiler
+	and	x18, x18, 1
+	add	x18, x18, x1
+
+	# Prefetch dataset data
+	prfm	pldl2strm, [x18]
+
+	# mx <-> ma
+	# (rotating by 32 swaps the two 32-bit halves of x9)
+	ror	x9, x9, 32
+
+	# Calculate dataset pointer for dataset read
+	mov	w10, w9
+randomx_program_aarch64_cacheline_align_mask2:
+	# Actual mask will be inserted by JIT compiler
+	and	x10, x10, 1
+	add	x10, x10, x1
+
+# Not exported with .global; also the branch target of the light-mode tail,
+# which arrives here with x10 pointing at a computed 64-byte item.
+randomx_program_aarch64_xor_with_dataset_line:
+	# xor integer registers with dataset data
+	ldp	x18, x19, [x10]
+	eor	x4, x4, x18
+	eor	x5, x5, x19
+	ldp	x18, x19, [x10, 16]
+	eor	x6, x6, x18
+	eor	x7, x7, x19
+	ldp	x18, x19, [x10, 32]
+	eor	x12, x12, x18
+	eor	x13, x13, x19
+	ldp	x18, x19, [x10, 48]
+	eor	x14, x14, x18
+	eor	x15, x15, x19
+
+randomx_program_aarch64_update_spMix1:
+	# JIT compiler will replace it with "eor x10, config.readReg0, config.readReg1"
+	eor	x10, x0, x0
+
+	# Store integer registers to scratchpad (spAddr1)
+	stp	x4, x5, [x17, 0]
+	stp	x6, x7, [x17, 16]
+	stp	x12, x13, [x17, 32]
+	stp	x14, x15, [x17, 48]
+
+	# xor group F and group E registers
+	eor	v16.16b, v16.16b, v20.16b
+	eor	v17.16b, v17.16b, v21.16b
+	eor	v18.16b, v18.16b, v22.16b
+	eor	v19.16b, v19.16b, v23.16b
+
+	# Store FP registers to scratchpad (spAddr0)
+	stp	q16, q17, [x16, 0]
+	stp	q18, q19, [x16, 32]
+
+	# Decrement the iteration counter (x3) and loop until it reaches zero
+	subs	x3, x3, 1
+	bne	randomx_program_aarch64_main_loop
+	
+	# Restore x0 (the reg buffer pointer pushed in the prologue)
+	ldr	x0, [sp], 16
+
+	# Store integer registers (r0-r7 to bytes 0-63 of the reg buffer)
+	stp	x4, x5, [x0, 0]
+	stp	x6, x7, [x0, 16]
+	stp	x12, x13, [x0, 32]
+	stp	x14, x15, [x0, 48]
+
+	# Store FP registers (v16-v19 to bytes 64-127, v20-v23 to bytes 128-191)
+	stp	q16, q17, [x0, 64]
+	stp	q18, q19, [x0, 96]
+	stp	q20, q21, [x0, 128]
+	stp	q22, q23, [x0, 160]
+
+	# Restore the registers saved in the prologue and release the frame
+	ldp	x16, x17, [sp]
+	ldp	x18, x19, [sp, 16]
+	ldp	x20, x21, [sp, 32]
+	ldp	x22, x23, [sp, 48]
+	ldp	x24, x25, [sp, 64]
+	ldp	x26, x27, [sp, 80]
+	ldp	x28, x29, [sp, 96]
+	ldp	x8, x30, [sp, 112]
+	ldp	d8, d9, [sp, 128]
+	ldp	d10, d11, [sp, 144]
+	ldp	d12, d13, [sp, 160]
+	ldp	d14, d15, [sp, 176]
+	add	sp, sp, 192
+
+	ret
+
+# Alternative loop tail that computes the item on the fly by calling
+# randomx_calc_dataset_item_aarch64 (x1 is treated as the cache pointer here).
+# Frame of 96 bytes: [sp, 0..63] receives the computed item,
+# [sp, 64..95] holds the saved x0, x1, x2 and x30.
+randomx_program_aarch64_vm_instructions_end_light:
+	sub	sp, sp, 96
+	stp	x0, x1, [sp, 64]
+	stp	x2, x30, [sp, 80]
+
+	# mx ^= r[readReg2] ^ r[readReg3];
+	eor	x9, x9, x18
+
+	# mx <-> ma
+	ror	x9, x9, 32
+
+	# x0 -> pointer to cache memory
+	mov	x0, x1
+
+	# x1 -> pointer to output
+	mov	x1, sp
+
+randomx_program_aarch64_light_cacheline_align_mask:
+	# Actual mask will be inserted by JIT compiler
+	and	w2, w9, 1
+
+	# x2 -> item number
+	# (masked byte offset divided by 64)
+	lsr	x2, x2, 6
+
+randomx_program_aarch64_light_dataset_offset:
+	# Apply dataset offset (filled in by JIT compiler)
+	# (both immediates are zero placeholders)
+	add	x2, x2, 0
+	add	x2, x2, 0
+
+	bl	randomx_calc_dataset_item_aarch64
+
+	# x10 -> computed item at the bottom of the frame; then restore x0-x2/x30
+	# and release the frame.
+	# NOTE(review): sp is raised before xor_with_dataset_line reads
+	# [x10, 0..63], so the item then lies below sp - confirm nothing (such as a
+	# signal handler) can overwrite that area in between
+	mov	x10, sp
+	ldp	x0, x1, [sp, 64]
+	ldp	x2, x30, [sp, 80]
+	add	sp, sp, 96
+
+	b	randomx_program_aarch64_xor_with_dataset_line
+
+
+
+# Input parameters
+#
+# x0 -> pointer to cache
+# x1 -> pointer to dataset memory at startItem
+# x2 -> start item
+# x3 -> end item
+
+# Fills consecutive dataset items (see "Input parameters" above) by calling
+# randomx_calc_dataset_item_aarch64 once per item and advancing the output
+# pointer by 64 bytes each time.
+randomx_init_dataset_aarch64:
+	# Save x30 (return address)
+	# (it is overwritten by the bl in the loop below)
+	str	x30, [sp, -16]!
+
+	# Load pointer to cache memory
+	# (read from the first 8 bytes of the object x0 points to)
+	ldr	x0, [x0]
+
+randomx_init_dataset_aarch64_main_loop:
+	# The exit test (x2 != x3) runs after the increment, so the body executes
+	# at least once; the start item is expected to be below the end item
+	bl	randomx_calc_dataset_item_aarch64
+	add	x1, x1, 64
+	add	x2, x2, 1
+	cmp	x2, x3
+	bne	randomx_init_dataset_aarch64_main_loop
+
+	# Restore x30 (return address)
+	ldr	x30, [sp], 16
+
+	ret
+
+randomx_init_dataset_aarch64_end:
+
+# Input parameters
+#
+# x0 -> pointer to cache memory
+# x1 -> pointer to output
+# x2 -> item number
+#
+# Register allocation
+#
+# x0-x7 -> output value (calculated dataset item)
+# x8 -> pointer to cache memory
+# x9 -> pointer to output
+# x10 -> registerValue
+# x11 -> mixBlock
+# x12 -> temporary
+# x13 -> temporary
+
+# Computes one 64-byte item (see "Input parameters" above) and stores it at the
+# output pointer. x0-x13 are saved on entry and restored before ret, so the
+# result is returned through memory only.
+randomx_calc_dataset_item_aarch64:
+	# Save x0-x13 (frame of 112 bytes)
+	sub	sp, sp, 112
+	stp	x0, x1, [sp]
+	stp	x2, x3, [sp, 16]
+	stp	x4, x5, [sp, 32]
+	stp	x6, x7, [sp, 48]
+	stp	x8, x9, [sp, 64]
+	stp	x10, x11, [sp, 80]
+	stp	x12, x13, [sp, 96]
+
+	# Keep the arguments in x8-x10 so that x0-x7 can hold rl[0]-rl[7]
+	mov	x8, x0
+	mov	x9, x1
+	mov	x10, x2
+
+	# rl[0] = (itemNumber + 1) * superscalarMul0;
+	# (madd computes x2 * x12 + x12)
+	ldr	x12, superscalarMul0
+	madd	x0, x2, x12, x12
+
+	# rl[1] = rl[0] ^ superscalarAdd1;
+	ldr	x12, superscalarAdd1
+	eor	x1, x0, x12
+
+	# rl[2] = rl[0] ^ superscalarAdd2;
+	ldr	x12, superscalarAdd2
+	eor	x2, x0, x12
+
+	# rl[3] = rl[0] ^ superscalarAdd3;
+	ldr	x12, superscalarAdd3
+	eor	x3, x0, x12
+
+	# rl[4] = rl[0] ^ superscalarAdd4;
+	ldr	x12, superscalarAdd4
+	eor	x4, x0, x12
+
+	# rl[5] = rl[0] ^ superscalarAdd5;
+	ldr	x12, superscalarAdd5
+	eor	x5, x0, x12
+
+	# rl[6] = rl[0] ^ superscalarAdd6;
+	ldr	x12, superscalarAdd6
+	eor	x6, x0, x12
+
+	# rl[7] = rl[0] ^ superscalarAdd7;
+	ldr	x12, superscalarAdd7
+	eor	x7, x0, x12
+
+	# Jump over the inline constants below
+	b	randomx_calc_dataset_item_aarch64_prefetch
+
+# 64-bit constants read by the PC-relative ldr instructions above
+superscalarMul0: .quad 6364136223846793005
+superscalarAdd1: .quad 9298411001130361340
+superscalarAdd2: .quad 12065312585734608966
+superscalarAdd3: .quad 9306329213124626780
+superscalarAdd4: .quad 5281919268842080866
+superscalarAdd5: .quad 10536153434571861004
+superscalarAdd6: .quad 3398623926847679864
+superscalarAdd7: .quad 9549104520008361294
+
+# Prefetch -> SuperScalar hash -> Mix will be repeated N times
+
+# Cache line selection and prefetch (x10 = registerValue, x8 = cache memory).
+randomx_calc_dataset_item_aarch64_prefetch:
+	# Actual mask will be inserted by JIT compiler
+	# x11 (mixBlock) = cache memory + (registerValue & mask) * 64
+	and	x11, x10, 1
+	add	x11, x8, x11, lsl 6
+	prfm	pldl2strm, [x11]
+
+	# Generated SuperScalar hash program goes here
+
+# XOR the 64-byte cache line at x11 (mixBlock) into x0-x7.
+randomx_calc_dataset_item_aarch64_mix:
+	ldp	x12, x13, [x11]
+	eor	x0, x0, x12
+	eor	x1, x1, x13
+	ldp	x12, x13, [x11, 16]
+	eor	x2, x2, x12
+	eor	x3, x3, x13
+	ldp	x12, x13, [x11, 32]
+	eor	x4, x4, x12
+	eor	x5, x5, x13
+	ldp	x12, x13, [x11, 48]
+	eor	x6, x6, x12
+	eor	x7, x7, x13
+
+# Write x0-x7 to the output pointer (x9), then restore x0-x13 and return.
+randomx_calc_dataset_item_aarch64_store_result:
+	stp	x0, x1, [x9]
+	stp	x2, x3, [x9, 16]
+	stp	x4, x5, [x9, 32]
+	stp	x6, x7, [x9, 48]
+
+	ldp	x0, x1, [sp]
+	ldp	x2, x3, [sp, 16]
+	ldp	x4, x5, [sp, 32]
+	ldp	x6, x7, [sp, 48]
+	ldp	x8, x9, [sp, 64]
+	ldp	x10, x11, [sp, 80]
+	ldp	x12, x13, [sp, 96]
+	add	sp, sp, 112
+
+	ret
+
+randomx_calc_dataset_item_aarch64_end:
--- a/src/jit_compiler_a64_static.hpp
+++ b/src/jit_compiler_a64_static.hpp
@ -0,0 +1,51 @@
+/*
+Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+Copyright (c) 2019, SChernykh    <https://github.com/SChernykh>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <cstdint>
+
+// Symbols exported by jit_compiler_a64_static.S.
+// randomx_program_aarch64 is declared with its real prototype. The other
+// names are labels inside the assembly (patch sites, section boundaries and
+// routines that take their arguments in registers as documented in the .S
+// file); they are declared as functions without parameters, presumably so
+// that C++ code can take their addresses.
+extern "C" {
+	void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations);
+	void randomx_program_aarch64_main_loop();
+	void randomx_program_aarch64_vm_instructions();
+	void randomx_program_aarch64_imul_rcp_literals_end();
+	void randomx_program_aarch64_vm_instructions_end();
+	void randomx_program_aarch64_cacheline_align_mask1();
+	void randomx_program_aarch64_cacheline_align_mask2();
+	void randomx_program_aarch64_update_spMix1();
+	void randomx_program_aarch64_vm_instructions_end_light();
+	void randomx_program_aarch64_light_cacheline_align_mask();
+	void randomx_program_aarch64_light_dataset_offset();
+	void randomx_init_dataset_aarch64();
+	void randomx_init_dataset_aarch64_end();
+	void randomx_calc_dataset_item_aarch64();
+	void randomx_calc_dataset_item_aarch64_prefetch();
+	void randomx_calc_dataset_item_aarch64_mix();
+	void randomx_calc_dataset_item_aarch64_store_result();
+	void randomx_calc_dataset_item_aarch64_end();
+}
--- a/src/vm_compiled.cpp
+++ b/src/vm_compiled.cpp
@ -63,6 +63,9 @@ namespace randomx {

 	template<class Allocator, bool softAes, bool secureJit>
 	void CompiledVm<Allocator, softAes, secureJit>::execute() {
+#ifdef __aarch64__
+		// The AArch64 program prologue loads its E 'or' mask from reg.f
+		// ("ldr q30, [x0, 64]" in jit_compiler_a64_static.S), so the mask is
+		// copied there before the compiled program is called.
+		memcpy(reg.f, config.eMask, sizeof(config.eMask));
+#endif
 		compiler.getProgramFunc()(reg, mem, scratchpad, RANDOMX_PROGRAM_ITERATIONS);
 	}