mirror of
				https://git.wownero.com/wownero/RandomWOW.git
				synced 2024-08-15 00:23:14 +00:00 
			
		
		
		
	
							parent
							
								
									5fb26fc607
								
							
						
					
					
						commit
						c6468a3816
					
				
					 8 changed files with 1794 additions and 28 deletions
				
			
		|  | @ -116,6 +116,12 @@ endif() | |||
| 
 | ||||
| # ARMv8 | ||||
| if (ARM_ID STREQUAL "aarch64" OR ARM_ID STREQUAL "arm64" OR ARM_ID STREQUAL "armv8-a") | ||||
|   list(APPEND randomx_sources | ||||
|     src/jit_compiler_a64_static.S | ||||
|     src/jit_compiler_a64.cpp) | ||||
|   # cheat because cmake and ccache hate each other | ||||
|   set_property(SOURCE src/jit_compiler_a64_static.S PROPERTY LANGUAGE C) | ||||
| 
 | ||||
|   if(ARCH STREQUAL "native") | ||||
|     add_flag("-march=native") | ||||
|   else() | ||||
|  |  | |||
|  | @ -119,7 +119,7 @@ namespace randomx { | |||
| 	class JitCompilerX86; | ||||
| 	using JitCompiler = JitCompilerX86; | ||||
| #elif defined(__aarch64__) | ||||
| 	#define RANDOMX_HAVE_COMPILER 0 | ||||
| 	#define RANDOMX_HAVE_COMPILER 1 | ||||
| 	class JitCompilerA64; | ||||
| 	using JitCompiler = JitCompilerA64; | ||||
| #else | ||||
|  |  | |||
|  | @ -385,7 +385,14 @@ FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { | |||
| typedef uint8x16_t rx_vec_i128; | ||||
| typedef float64x2_t rx_vec_f128; | ||||
| 
 | ||||
| #define rx_aligned_alloc(size, align) aligned_alloc(align, size) | ||||
| inline void* rx_aligned_alloc(size_t size, size_t align) { | ||||
| 	void* p; | ||||
| 	if (posix_memalign(&p, align, size) == 0) | ||||
| 		return p; | ||||
| 
 | ||||
| 	return 0; | ||||
| }; | ||||
| 
 | ||||
| #define rx_aligned_free(a) free(a) | ||||
| 
 | ||||
| inline void rx_prefetch_nta(void* ptr) { | ||||
|  |  | |||
							
								
								
									
										1068
									
								
								src/jit_compiler_a64.cpp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1068
									
								
								src/jit_compiler_a64.cpp
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							|  | @ -1,5 +1,6 @@ | |||
| /*
 | ||||
| Copyright (c) 2018-2019, tevador <tevador@gmail.com> | ||||
| Copyright (c) 2019, SChernykh    <https://github.com/SChernykh>
 | ||||
| 
 | ||||
| All rights reserved. | ||||
| 
 | ||||
|  | @ -32,45 +33,96 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <vector> | ||||
| #include <stdexcept> | ||||
| #include "common.hpp" | ||||
| #include "jit_compiler_a64_static.hpp" | ||||
| 
 | ||||
| namespace randomx { | ||||
| 
 | ||||
| 	class Program; | ||||
| 	class ProgramConfiguration; | ||||
| 	class SuperscalarProgram; | ||||
| 	class Instruction; | ||||
| 
 | ||||
| 	typedef void(JitCompilerA64::*InstructionGeneratorA64)(Instruction&, uint32_t&); | ||||
| 
 | ||||
| 	class JitCompilerA64 { | ||||
| 	public: | ||||
| 		JitCompilerA64() { | ||||
| 			throw std::runtime_error("ARM64 JIT compiler is not implemented yet."); | ||||
| 		} | ||||
| 		void generateProgram(Program&, ProgramConfiguration&) { | ||||
| 		JitCompilerA64(); | ||||
| 		~JitCompilerA64(); | ||||
| 
 | ||||
| 		void generateProgram(Program&, ProgramConfiguration&); | ||||
| 		void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); | ||||
| 
 | ||||
| 		} | ||||
| 		void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) { | ||||
| 			 | ||||
| 		} | ||||
| 		template<size_t N> | ||||
| 		void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &) { | ||||
| 		void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &); | ||||
| 
 | ||||
| 		} | ||||
| 		void generateDatasetInitCode() { | ||||
| 		void generateDatasetInitCode() {} | ||||
| 
 | ||||
| 		ProgramFunc* getProgramFunc() { return reinterpret_cast<ProgramFunc*>(code); } | ||||
| 		DatasetInitFunc* getDatasetInitFunc(); | ||||
| 		uint8_t* getCode() { return code; } | ||||
| 		size_t getCodeSize(); | ||||
| 
 | ||||
| 		void enableWriting(); | ||||
| 		void enableExecution(); | ||||
| 		void enableAll(); | ||||
| 
 | ||||
| 	private: | ||||
| 		static InstructionGeneratorA64 engine[256]; | ||||
| 		uint32_t reg_changed_offset[8]; | ||||
| 		uint8_t* code; | ||||
| 		uint32_t literalPos; | ||||
| 		uint32_t num32bitLiterals; | ||||
| 
 | ||||
| 		static void emit32(uint32_t val, uint8_t* code, uint32_t& codePos) | ||||
| 		{ | ||||
| 			*(uint32_t*)(code + codePos) = val; | ||||
| 			codePos += sizeof(val); | ||||
| 		} | ||||
| 		ProgramFunc* getProgramFunc() { | ||||
| 			return nullptr; | ||||
| 
 | ||||
| 		static void emit64(uint64_t val, uint8_t* code, uint32_t& codePos) | ||||
| 		{ | ||||
| 			*(uint64_t*)(code + codePos) = val; | ||||
| 			codePos += sizeof(val); | ||||
| 		} | ||||
| 		DatasetInitFunc* getDatasetInitFunc() { | ||||
| 			return nullptr; | ||||
| 		} | ||||
| 		uint8_t* getCode() { | ||||
| 			return nullptr; | ||||
| 		} | ||||
| 		size_t getCodeSize() { | ||||
| 			return 0; | ||||
| 		} | ||||
| 		void enableWriting() {} | ||||
| 		void enableExecution() {} | ||||
| 		void enableAll() {} | ||||
| 
 | ||||
| 		void emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, uint32_t& codePos); | ||||
| 		void emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, uint8_t* code, uint32_t& codePos); | ||||
| 
 | ||||
| 		template<uint32_t tmp_reg> | ||||
| 		void emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos); | ||||
| 
 | ||||
| 		template<uint32_t tmp_reg_fp> | ||||
| 		void emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos); | ||||
| 
 | ||||
| 		void h_IADD_RS(Instruction&, uint32_t&); | ||||
| 		void h_IADD_M(Instruction&, uint32_t&); | ||||
| 		void h_ISUB_R(Instruction&, uint32_t&); | ||||
| 		void h_ISUB_M(Instruction&, uint32_t&); | ||||
| 		void h_IMUL_R(Instruction&, uint32_t&); | ||||
| 		void h_IMUL_M(Instruction&, uint32_t&); | ||||
| 		void h_IMULH_R(Instruction&, uint32_t&); | ||||
| 		void h_IMULH_M(Instruction&, uint32_t&); | ||||
| 		void h_ISMULH_R(Instruction&, uint32_t&); | ||||
| 		void h_ISMULH_M(Instruction&, uint32_t&); | ||||
| 		void h_IMUL_RCP(Instruction&, uint32_t&); | ||||
| 		void h_INEG_R(Instruction&, uint32_t&); | ||||
| 		void h_IXOR_R(Instruction&, uint32_t&); | ||||
| 		void h_IXOR_M(Instruction&, uint32_t&); | ||||
| 		void h_IROR_R(Instruction&, uint32_t&); | ||||
| 		void h_IROL_R(Instruction&, uint32_t&); | ||||
| 		void h_ISWAP_R(Instruction&, uint32_t&); | ||||
| 		void h_FSWAP_R(Instruction&, uint32_t&); | ||||
| 		void h_FADD_R(Instruction&, uint32_t&); | ||||
| 		void h_FADD_M(Instruction&, uint32_t&); | ||||
| 		void h_FSUB_R(Instruction&, uint32_t&); | ||||
| 		void h_FSUB_M(Instruction&, uint32_t&); | ||||
| 		void h_FSCAL_R(Instruction&, uint32_t&); | ||||
| 		void h_FMUL_R(Instruction&, uint32_t&); | ||||
| 		void h_FDIV_M(Instruction&, uint32_t&); | ||||
| 		void h_FSQRT_R(Instruction&, uint32_t&); | ||||
| 		void h_CBRANCH(Instruction&, uint32_t&); | ||||
| 		void h_CFROUND(Instruction&, uint32_t&); | ||||
| 		void h_ISTORE(Instruction&, uint32_t&); | ||||
| 		void h_NOP(Instruction&, uint32_t&); | ||||
| 	}; | ||||
| } | ||||
| } | ||||
|  |  | |||
							
								
								
									
										579
									
								
								src/jit_compiler_a64_static.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										579
									
								
								src/jit_compiler_a64_static.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,579 @@ | |||
| # Copyright (c) 2018-2019, tevador <tevador@gmail.com>
 | ||||
| # Copyright (c) 2019, SChernykh    <https://github.com/SChernykh> | ||||
| # | ||||
| # All rights reserved. | ||||
| # | ||||
| # Redistribution and use in source and binary forms, with or without | ||||
| # modification, are permitted provided that the following conditions are met: | ||||
| # 	* Redistributions of source code must retain the above copyright | ||||
| # 	  notice, this list of conditions and the following disclaimer. | ||||
| # 	* Redistributions in binary form must reproduce the above copyright | ||||
| # 	  notice, this list of conditions and the following disclaimer in the | ||||
| # 	  documentation and/or other materials provided with the distribution. | ||||
| # 	* Neither the name of the copyright holder nor the | ||||
| # 	  names of its contributors may be used to endorse or promote products | ||||
| # 	  derived from this software without specific prior written permission. | ||||
| # | ||||
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||||
| # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||||
| # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||||
| # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||||
| # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | ||||
| # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||||
| # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| 
 | ||||
| 	.arch armv8-a | ||||
| 	.text | ||||
| 	.global	randomx_program_aarch64
 | ||||
| 	.global	randomx_program_aarch64_main_loop
 | ||||
| 	.global	randomx_program_aarch64_vm_instructions
 | ||||
| 	.global randomx_program_aarch64_imul_rcp_literals_end
 | ||||
| 	.global	randomx_program_aarch64_vm_instructions_end
 | ||||
| 	.global randomx_program_aarch64_cacheline_align_mask1
 | ||||
| 	.global randomx_program_aarch64_cacheline_align_mask2
 | ||||
| 	.global randomx_program_aarch64_update_spMix1
 | ||||
| 	.global randomx_program_aarch64_vm_instructions_end_light
 | ||||
| 	.global randomx_program_aarch64_light_cacheline_align_mask
 | ||||
| 	.global randomx_program_aarch64_light_dataset_offset
 | ||||
| 	.global randomx_init_dataset_aarch64
 | ||||
| 	.global randomx_init_dataset_aarch64_end
 | ||||
| 	.global randomx_calc_dataset_item_aarch64
 | ||||
| 	.global randomx_calc_dataset_item_aarch64_prefetch
 | ||||
| 	.global randomx_calc_dataset_item_aarch64_mix
 | ||||
| 	.global randomx_calc_dataset_item_aarch64_store_result
 | ||||
| 	.global randomx_calc_dataset_item_aarch64_end
 | ||||
| 
 | ||||
| #include "configuration.h" | ||||
| 
 | ||||
| # Register allocation | ||||
| 
 | ||||
| # x0  -> pointer to reg buffer and then literal for IMUL_RCP | ||||
| # x1  -> pointer to mem buffer and then to dataset | ||||
| # x2  -> pointer to scratchpad | ||||
| # x3  -> loop counter | ||||
| # x4  -> "r0" | ||||
| # x5  -> "r1" | ||||
| # x6  -> "r2" | ||||
| # x7  -> "r3" | ||||
| # x8  -> fpcr (reversed bits) | ||||
| # x9  -> mx, ma | ||||
| # x10 -> spMix1 | ||||
| # x11 -> literal for IMUL_RCP | ||||
| # x12 -> "r4" | ||||
| # x13 -> "r5" | ||||
| # x14 -> "r6" | ||||
| # x15 -> "r7" | ||||
| # x16 -> spAddr0 | ||||
| # x17 -> spAddr1 | ||||
| # x18 -> temporary | ||||
| # x19 -> temporary | ||||
| # x20 -> literal for IMUL_RCP | ||||
| # x21 -> literal for IMUL_RCP | ||||
| # x22 -> literal for IMUL_RCP | ||||
| # x23 -> literal for IMUL_RCP | ||||
| # x24 -> literal for IMUL_RCP | ||||
| # x25 -> literal for IMUL_RCP | ||||
| # x26 -> literal for IMUL_RCP | ||||
| # x27 -> literal for IMUL_RCP | ||||
| # x28 -> literal for IMUL_RCP | ||||
| # x29 -> literal for IMUL_RCP | ||||
| # x30 -> literal for IMUL_RCP | ||||
| 
 | ||||
| # v0-v15 -> store 32-bit literals | ||||
| # v16 -> "f0" | ||||
| # v17 -> "f1" | ||||
| # v18 -> "f2" | ||||
| # v19 -> "f3" | ||||
| # v20 -> "e0" | ||||
| # v21 -> "e1" | ||||
| # v22 -> "e2" | ||||
| # v23 -> "e3" | ||||
| # v24 -> "a0" | ||||
| # v25 -> "a1" | ||||
| # v26 -> "a2" | ||||
| # v27 -> "a3" | ||||
| # v28 -> temporary | ||||
| # v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff | ||||
| # v30 -> E 'or' mask  = 0x3*00000000******3*00000000****** | ||||
| # v31 -> scale mask   = 0x81f000000000000081f0000000000000 | ||||
| 
 | ||||
| randomx_program_aarch64: | ||||
| 	# Save callee-saved registers | ||||
| 	sub	sp, sp, 192 | ||||
| 	stp	x16, x17, [sp] | ||||
| 	stp	x18, x19, [sp, 16] | ||||
| 	stp	x20, x21, [sp, 32] | ||||
| 	stp	x22, x23, [sp, 48] | ||||
| 	stp	x24, x25, [sp, 64] | ||||
| 	stp	x26, x27, [sp, 80] | ||||
| 	stp	x28, x29, [sp, 96] | ||||
| 	stp	x8, x30, [sp, 112] | ||||
| 	stp	d8, d9, [sp, 128] | ||||
| 	stp	d10, d11, [sp, 144] | ||||
| 	stp	d12, d13, [sp, 160] | ||||
| 	stp	d14, d15, [sp, 176] | ||||
| 
 | ||||
| 	# Zero integer registers | ||||
| 	mov	x4, xzr | ||||
| 	mov	x5, xzr | ||||
| 	mov	x6, xzr | ||||
| 	mov	x7, xzr | ||||
| 	mov	x12, xzr | ||||
| 	mov	x13, xzr | ||||
| 	mov	x14, xzr | ||||
| 	mov	x15, xzr | ||||
| 
 | ||||
| 	# Load ma, mx and dataset pointer | ||||
| 	ldp	x9, x1, [x1] | ||||
| 
 | ||||
| 	# Load initial spMix value | ||||
| 	mov	x10, x9 | ||||
| 
 | ||||
| 	# Load group A registers | ||||
| 	ldp	q24, q25, [x0, 192] | ||||
| 	ldp	q26, q27, [x0, 224] | ||||
| 
 | ||||
| 	# Load E 'and' mask | ||||
| 	mov	x16, 0x00FFFFFFFFFFFFFF | ||||
| 	ins	v29.d[0], x16 | ||||
| 	ins	v29.d[1], x16 | ||||
| 
 | ||||
| 	# Load E 'or' mask (stored in reg.f[0]) | ||||
| 	ldr	q30, [x0, 64] | ||||
| 
 | ||||
| 	# Load scale mask | ||||
| 	mov	x16, 0x80f0000000000000 | ||||
| 	ins	v31.d[0], x16 | ||||
| 	ins	v31.d[1], x16 | ||||
| 
 | ||||
| 	# Read fpcr | ||||
| 	mrs	x8, fpcr | ||||
| 	rbit	x8, x8 | ||||
| 
 | ||||
| 	# Save x0 | ||||
| 	str	x0, [sp, -16]! | ||||
| 
 | ||||
| 	# Read literals | ||||
| 	ldr	x0, literal_x0 | ||||
| 	ldr	x11, literal_x11 | ||||
| 	ldr	x20, literal_x20 | ||||
| 	ldr	x21, literal_x21 | ||||
| 	ldr	x22, literal_x22 | ||||
| 	ldr	x23, literal_x23 | ||||
| 	ldr	x24, literal_x24 | ||||
| 	ldr	x25, literal_x25 | ||||
| 	ldr	x26, literal_x26 | ||||
| 	ldr	x27, literal_x27 | ||||
| 	ldr	x28, literal_x28 | ||||
| 	ldr	x29, literal_x29 | ||||
| 	ldr	x30, literal_x30 | ||||
| 
 | ||||
| 	ldr	q0, literal_v0 | ||||
| 	ldr	q1, literal_v1 | ||||
| 	ldr	q2, literal_v2 | ||||
| 	ldr	q3, literal_v3 | ||||
| 	ldr	q4, literal_v4 | ||||
| 	ldr	q5, literal_v5 | ||||
| 	ldr	q6, literal_v6 | ||||
| 	ldr	q7, literal_v7 | ||||
| 	ldr	q8, literal_v8 | ||||
| 	ldr	q9, literal_v9 | ||||
| 	ldr	q10, literal_v10 | ||||
| 	ldr	q11, literal_v11 | ||||
| 	ldr	q12, literal_v12 | ||||
| 	ldr	q13, literal_v13 | ||||
| 	ldr	q14, literal_v14 | ||||
| 	ldr	q15, literal_v15 | ||||
| 
 | ||||
| randomx_program_aarch64_main_loop: | ||||
| 	# spAddr0 = spMix1 & ScratchpadL3Mask64;
 | ||||
| 	# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
 | ||||
| 	lsr	x18, x10, 32 | ||||
| 
 | ||||
| 	# Actual mask will be inserted by JIT compiler | ||||
| 	and	w16, w10, 1 | ||||
| 	and	w17, w18, 1 | ||||
| 
 | ||||
| 	# x16 = scratchpad + spAddr0 | ||||
| 	# x17 = scratchpad + spAddr1 | ||||
| 	add	x16, x16, x2 | ||||
| 	add	x17, x17, x2 | ||||
| 
 | ||||
| 	# xor integer registers with scratchpad data (spAddr0) | ||||
| 	ldp	x18, x19, [x16] | ||||
| 	eor	x4, x4, x18 | ||||
| 	eor	x5, x5, x19 | ||||
| 	ldp	x18, x19, [x16, 16] | ||||
| 	eor	x6, x6, x18 | ||||
| 	eor	x7, x7, x19 | ||||
| 	ldp	x18, x19, [x16, 32] | ||||
| 	eor	x12, x12, x18 | ||||
| 	eor	x13, x13, x19 | ||||
| 	ldp	x18, x19, [x16, 48] | ||||
| 	eor	x14, x14, x18 | ||||
| 	eor	x15, x15, x19 | ||||
| 
 | ||||
| 	# Load group F registers (spAddr1) | ||||
| 	ldpsw	x18, x19, [x17] | ||||
| 	ins	v16.d[0], x18 | ||||
| 	ins	v16.d[1], x19 | ||||
| 	ldpsw	x18, x19, [x17, 8] | ||||
| 	ins	v17.d[0], x18 | ||||
| 	ins	v17.d[1], x19 | ||||
| 	ldpsw	x18, x19, [x17, 16] | ||||
| 	ins	v18.d[0], x18 | ||||
| 	ins	v18.d[1], x19 | ||||
| 	ldpsw	x18, x19, [x17, 24] | ||||
| 	ins	v19.d[0], x18 | ||||
| 	ins	v19.d[1], x19 | ||||
| 	scvtf	v16.2d, v16.2d | ||||
| 	scvtf	v17.2d, v17.2d | ||||
| 	scvtf	v18.2d, v18.2d | ||||
| 	scvtf	v19.2d, v19.2d | ||||
| 
 | ||||
| 	# Load group E registers (spAddr1) | ||||
| 	ldpsw	x18, x19, [x17, 32] | ||||
| 	ins	v20.d[0], x18 | ||||
| 	ins	v20.d[1], x19 | ||||
| 	ldpsw	x18, x19, [x17, 40] | ||||
| 	ins	v21.d[0], x18 | ||||
| 	ins	v21.d[1], x19 | ||||
| 	ldpsw	x18, x19, [x17, 48] | ||||
| 	ins	v22.d[0], x18 | ||||
| 	ins	v22.d[1], x19 | ||||
| 	ldpsw	x18, x19, [x17, 56] | ||||
| 	ins	v23.d[0], x18 | ||||
| 	ins	v23.d[1], x19 | ||||
| 	scvtf	v20.2d, v20.2d | ||||
| 	scvtf	v21.2d, v21.2d | ||||
| 	scvtf	v22.2d, v22.2d | ||||
| 	scvtf	v23.2d, v23.2d | ||||
| 	and	v20.16b, v20.16b, v29.16b | ||||
| 	and	v21.16b, v21.16b, v29.16b | ||||
| 	and	v22.16b, v22.16b, v29.16b | ||||
| 	and	v23.16b, v23.16b, v29.16b | ||||
| 	orr	v20.16b, v20.16b, v30.16b | ||||
| 	orr	v21.16b, v21.16b, v30.16b | ||||
| 	orr	v22.16b, v22.16b, v30.16b | ||||
| 	orr	v23.16b, v23.16b, v30.16b | ||||
| 
 | ||||
| 	# Execute VM instructions | ||||
| randomx_program_aarch64_vm_instructions: | ||||
| 
 | ||||
| 	# buffer for generated instructions | ||||
| 	# FDIV_M is the largest instruction taking up to 12 ARMv8 instructions | ||||
| 	.fill RANDOMX_PROGRAM_SIZE*12,4,0 | ||||
| 
 | ||||
| literal_x0:  .fill 1,8,0 | ||||
| literal_x11: .fill 1,8,0 | ||||
| literal_x20: .fill 1,8,0 | ||||
| literal_x21: .fill 1,8,0 | ||||
| literal_x22: .fill 1,8,0 | ||||
| literal_x23: .fill 1,8,0 | ||||
| literal_x24: .fill 1,8,0 | ||||
| literal_x25: .fill 1,8,0 | ||||
| literal_x26: .fill 1,8,0 | ||||
| literal_x27: .fill 1,8,0 | ||||
| literal_x28: .fill 1,8,0 | ||||
| literal_x29: .fill 1,8,0 | ||||
| literal_x30: .fill 1,8,0 | ||||
| randomx_program_aarch64_imul_rcp_literals_end: | ||||
| 
 | ||||
| literal_v0:  .fill 2,8,0 | ||||
| literal_v1:  .fill 2,8,0 | ||||
| literal_v2:  .fill 2,8,0 | ||||
| literal_v3:  .fill 2,8,0 | ||||
| literal_v4:  .fill 2,8,0 | ||||
| literal_v5:  .fill 2,8,0 | ||||
| literal_v6:  .fill 2,8,0 | ||||
| literal_v7:  .fill 2,8,0 | ||||
| literal_v8:  .fill 2,8,0 | ||||
| literal_v9:  .fill 2,8,0 | ||||
| literal_v10: .fill 2,8,0 | ||||
| literal_v11: .fill 2,8,0 | ||||
| literal_v12: .fill 2,8,0 | ||||
| literal_v13: .fill 2,8,0 | ||||
| literal_v14: .fill 2,8,0 | ||||
| literal_v15: .fill 2,8,0 | ||||
| 
 | ||||
| randomx_program_aarch64_vm_instructions_end: | ||||
| 
 | ||||
| 	# mx ^= r[readReg2] ^ r[readReg3];
 | ||||
| 	eor	x9, x9, x18 | ||||
| 
 | ||||
| 	# Calculate dataset pointer for dataset prefetch | ||||
| 	mov	w18, w9 | ||||
| randomx_program_aarch64_cacheline_align_mask1: | ||||
| 	# Actual mask will be inserted by JIT compiler | ||||
| 	and	x18, x18, 1 | ||||
| 	add	x18, x18, x1 | ||||
| 
 | ||||
| 	# Prefetch dataset data | ||||
| 	prfm	pldl2strm, [x18] | ||||
| 
 | ||||
| 	# mx <-> ma | ||||
| 	ror	x9, x9, 32 | ||||
| 
 | ||||
| 	# Calculate dataset pointer for dataset read | ||||
| 	mov	w10, w9 | ||||
| randomx_program_aarch64_cacheline_align_mask2: | ||||
| 	# Actual mask will be inserted by JIT compiler | ||||
| 	and	x10, x10, 1 | ||||
| 	add	x10, x10, x1 | ||||
| 
 | ||||
| randomx_program_aarch64_xor_with_dataset_line: | ||||
| 	# xor integer registers with dataset data | ||||
| 	ldp	x18, x19, [x10] | ||||
| 	eor	x4, x4, x18 | ||||
| 	eor	x5, x5, x19 | ||||
| 	ldp	x18, x19, [x10, 16] | ||||
| 	eor	x6, x6, x18 | ||||
| 	eor	x7, x7, x19 | ||||
| 	ldp	x18, x19, [x10, 32] | ||||
| 	eor	x12, x12, x18 | ||||
| 	eor	x13, x13, x19 | ||||
| 	ldp	x18, x19, [x10, 48] | ||||
| 	eor	x14, x14, x18 | ||||
| 	eor	x15, x15, x19 | ||||
| 
 | ||||
| randomx_program_aarch64_update_spMix1: | ||||
| 	# JIT compiler will replace it with "eor x10, config.readReg0, config.readReg1" | ||||
| 	eor	x10, x0, x0 | ||||
| 
 | ||||
| 	# Store integer registers to scratchpad (spAddr1) | ||||
| 	stp	x4, x5, [x17, 0] | ||||
| 	stp	x6, x7, [x17, 16] | ||||
| 	stp	x12, x13, [x17, 32] | ||||
| 	stp	x14, x15, [x17, 48] | ||||
| 
 | ||||
| 	# xor group F and group E registers | ||||
| 	eor	v16.16b, v16.16b, v20.16b | ||||
| 	eor	v17.16b, v17.16b, v21.16b | ||||
| 	eor	v18.16b, v18.16b, v22.16b | ||||
| 	eor	v19.16b, v19.16b, v23.16b | ||||
| 
 | ||||
| 	# Store FP registers to scratchpad (spAddr0) | ||||
| 	stp	q16, q17, [x16, 0] | ||||
| 	stp	q18, q19, [x16, 32] | ||||
| 
 | ||||
| 	subs	x3, x3, 1 | ||||
| 	bne	randomx_program_aarch64_main_loop | ||||
| 	 | ||||
| 	# Restore x0 | ||||
| 	ldr	x0, [sp], 16 | ||||
| 
 | ||||
| 	# Store integer registers | ||||
| 	stp	x4, x5, [x0, 0] | ||||
| 	stp	x6, x7, [x0, 16] | ||||
| 	stp	x12, x13, [x0, 32] | ||||
| 	stp	x14, x15, [x0, 48] | ||||
| 
 | ||||
| 	# Store FP registers | ||||
| 	stp	q16, q17, [x0, 64] | ||||
| 	stp	q18, q19, [x0, 96] | ||||
| 	stp	q20, q21, [x0, 128] | ||||
| 	stp	q22, q23, [x0, 160] | ||||
| 
 | ||||
| 	# Restore callee-saved registers | ||||
| 	ldp	x16, x17, [sp] | ||||
| 	ldp	x18, x19, [sp, 16] | ||||
| 	ldp	x20, x21, [sp, 32] | ||||
| 	ldp	x22, x23, [sp, 48] | ||||
| 	ldp	x24, x25, [sp, 64] | ||||
| 	ldp	x26, x27, [sp, 80] | ||||
| 	ldp	x28, x29, [sp, 96] | ||||
| 	ldp	x8, x30, [sp, 112] | ||||
| 	ldp	d8, d9, [sp, 128] | ||||
| 	ldp	d10, d11, [sp, 144] | ||||
| 	ldp	d12, d13, [sp, 160] | ||||
| 	ldp	d14, d15, [sp, 176] | ||||
| 	add	sp, sp, 192 | ||||
| 
 | ||||
| 	ret | ||||
| 
 | ||||
| randomx_program_aarch64_vm_instructions_end_light: | ||||
| 	sub	sp, sp, 96 | ||||
| 	stp	x0, x1, [sp, 64] | ||||
| 	stp	x2, x30, [sp, 80] | ||||
| 
 | ||||
| 	# mx ^= r[readReg2] ^ r[readReg3];
 | ||||
| 	eor	x9, x9, x18 | ||||
| 
 | ||||
| 	# mx <-> ma | ||||
| 	ror	x9, x9, 32 | ||||
| 
 | ||||
| 	# x0 -> pointer to cache memory | ||||
| 	mov	x0, x1 | ||||
| 
 | ||||
| 	# x1 -> pointer to output | ||||
| 	mov	x1, sp | ||||
| 
 | ||||
| randomx_program_aarch64_light_cacheline_align_mask: | ||||
| 	# Actual mask will be inserted by JIT compiler | ||||
| 	and	w2, w9, 1 | ||||
| 
 | ||||
| 	# x2 -> item number | ||||
| 	lsr	x2, x2, 6 | ||||
| 
 | ||||
| randomx_program_aarch64_light_dataset_offset: | ||||
| 	# Apply dataset offset (filled in by JIT compiler) | ||||
| 	add	x2, x2, 0 | ||||
| 	add	x2, x2, 0 | ||||
| 
 | ||||
| 	bl	randomx_calc_dataset_item_aarch64 | ||||
| 
 | ||||
| 	mov	x10, sp | ||||
| 	ldp	x0, x1, [sp, 64] | ||||
| 	ldp	x2, x30, [sp, 80] | ||||
| 	add	sp, sp, 96 | ||||
| 
 | ||||
| 	b	randomx_program_aarch64_xor_with_dataset_line | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| # Input parameters | ||||
| # | ||||
| # x0 -> pointer to cache | ||||
| # x1 -> pointer to dataset memory at startItem | ||||
| # x2 -> start item | ||||
| # x3 -> end item | ||||
| 
 | ||||
| randomx_init_dataset_aarch64: | ||||
| 	# Save x30 (return address) | ||||
| 	str	x30, [sp, -16]! | ||||
| 
 | ||||
| 	# Load pointer to cache memory | ||||
| 	ldr	x0, [x0] | ||||
| 
 | ||||
| randomx_init_dataset_aarch64_main_loop: | ||||
| 	bl	randomx_calc_dataset_item_aarch64 | ||||
| 	add	x1, x1, 64 | ||||
| 	add	x2, x2, 1 | ||||
| 	cmp	x2, x3 | ||||
| 	bne	randomx_init_dataset_aarch64_main_loop | ||||
| 
 | ||||
| 	# Restore x30 (return address) | ||||
| 	ldr	x30, [sp], 16 | ||||
| 
 | ||||
| 	ret | ||||
| 
 | ||||
| randomx_init_dataset_aarch64_end: | ||||
| 
 | ||||
| # Input parameters | ||||
| # | ||||
| # x0 -> pointer to cache memory | ||||
| # x1 -> pointer to output | ||||
| # x2 -> item number | ||||
| # | ||||
| # Register allocation | ||||
| # | ||||
| # x0-x7 -> output value (calculated dataset item) | ||||
| # x8 -> pointer to cache memory | ||||
| # x9 -> pointer to output | ||||
| # x10 -> registerValue | ||||
| # x11 -> mixBlock | ||||
| # x12 -> temporary | ||||
| # x13 -> temporary | ||||
| 
 | ||||
| randomx_calc_dataset_item_aarch64: | ||||
| 	sub	sp, sp, 112 | ||||
| 	stp	x0, x1, [sp] | ||||
| 	stp	x2, x3, [sp, 16] | ||||
| 	stp	x4, x5, [sp, 32] | ||||
| 	stp	x6, x7, [sp, 48] | ||||
| 	stp	x8, x9, [sp, 64] | ||||
| 	stp	x10, x11, [sp, 80] | ||||
| 	stp	x12, x13, [sp, 96] | ||||
| 
 | ||||
| 	mov	x8, x0 | ||||
| 	mov	x9, x1 | ||||
| 	mov	x10, x2 | ||||
| 
 | ||||
| 	# rl[0] = (itemNumber + 1) * superscalarMul0;
 | ||||
| 	ldr	x12, superscalarMul0 | ||||
| 	madd	x0, x2, x12, x12 | ||||
| 
 | ||||
| 	# rl[1] = rl[0] ^ superscalarAdd1;
 | ||||
| 	ldr	x12, superscalarAdd1 | ||||
| 	eor	x1, x0, x12 | ||||
| 
 | ||||
| 	# rl[2] = rl[0] ^ superscalarAdd2;
 | ||||
| 	ldr	x12, superscalarAdd2 | ||||
| 	eor	x2, x0, x12 | ||||
| 
 | ||||
| 	# rl[3] = rl[0] ^ superscalarAdd3;
 | ||||
| 	ldr	x12, superscalarAdd3 | ||||
| 	eor	x3, x0, x12 | ||||
| 
 | ||||
| 	# rl[4] = rl[0] ^ superscalarAdd4;
 | ||||
| 	ldr	x12, superscalarAdd4 | ||||
| 	eor	x4, x0, x12 | ||||
| 
 | ||||
| 	# rl[5] = rl[0] ^ superscalarAdd5;
 | ||||
| 	ldr	x12, superscalarAdd5 | ||||
| 	eor	x5, x0, x12 | ||||
| 
 | ||||
| 	# rl[6] = rl[0] ^ superscalarAdd6;
 | ||||
| 	ldr	x12, superscalarAdd6 | ||||
| 	eor	x6, x0, x12 | ||||
| 
 | ||||
| 	# rl[7] = rl[0] ^ superscalarAdd7;
 | ||||
| 	ldr	x12, superscalarAdd7 | ||||
| 	eor	x7, x0, x12 | ||||
| 
 | ||||
| 	b	randomx_calc_dataset_item_aarch64_prefetch | ||||
| 
 | ||||
| superscalarMul0: .quad 6364136223846793005 | ||||
| superscalarAdd1: .quad 9298411001130361340 | ||||
| superscalarAdd2: .quad 12065312585734608966 | ||||
| superscalarAdd3: .quad 9306329213124626780 | ||||
| superscalarAdd4: .quad 5281919268842080866 | ||||
| superscalarAdd5: .quad 10536153434571861004 | ||||
| superscalarAdd6: .quad 3398623926847679864 | ||||
| superscalarAdd7: .quad 9549104520008361294 | ||||
| 
 | ||||
| # Prefetch -> SuperScalar hash -> Mix will be repeated N times | ||||
| 
 | ||||
| randomx_calc_dataset_item_aarch64_prefetch: | ||||
| 	# Actual mask will be inserted by JIT compiler | ||||
| 	and	x11, x10, 1 | ||||
| 	add	x11, x8, x11, lsl 6 | ||||
| 	prfm	pldl2strm, [x11] | ||||
| 
 | ||||
| 	# Generated SuperScalar hash program goes here | ||||
| 
 | ||||
| randomx_calc_dataset_item_aarch64_mix: | ||||
| 	ldp	x12, x13, [x11] | ||||
| 	eor	x0, x0, x12 | ||||
| 	eor	x1, x1, x13 | ||||
| 	ldp	x12, x13, [x11, 16] | ||||
| 	eor	x2, x2, x12 | ||||
| 	eor	x3, x3, x13 | ||||
| 	ldp	x12, x13, [x11, 32] | ||||
| 	eor	x4, x4, x12 | ||||
| 	eor	x5, x5, x13 | ||||
| 	ldp	x12, x13, [x11, 48] | ||||
| 	eor	x6, x6, x12 | ||||
| 	eor	x7, x7, x13 | ||||
| 
 | ||||
| randomx_calc_dataset_item_aarch64_store_result: | ||||
| 	stp	x0, x1, [x9] | ||||
| 	stp	x2, x3, [x9, 16] | ||||
| 	stp	x4, x5, [x9, 32] | ||||
| 	stp	x6, x7, [x9, 48] | ||||
| 
 | ||||
| 	ldp	x0, x1, [sp] | ||||
| 	ldp	x2, x3, [sp, 16] | ||||
| 	ldp	x4, x5, [sp, 32] | ||||
| 	ldp	x6, x7, [sp, 48] | ||||
| 	ldp	x8, x9, [sp, 64] | ||||
| 	ldp	x10, x11, [sp, 80] | ||||
| 	ldp	x12, x13, [sp, 96] | ||||
| 	add	sp, sp, 112 | ||||
| 
 | ||||
| 	ret | ||||
| 
 | ||||
| randomx_calc_dataset_item_aarch64_end: | ||||
							
								
								
									
										51
									
								
								src/jit_compiler_a64_static.hpp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										51
									
								
								src/jit_compiler_a64_static.hpp
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,51 @@ | |||
| /*
 | ||||
| Copyright (c) 2018-2019, tevador <tevador@gmail.com> | ||||
| Copyright (c) 2019, SChernykh    <https://github.com/SChernykh>
 | ||||
| 
 | ||||
| All rights reserved. | ||||
| 
 | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are met: | ||||
| 	* Redistributions of source code must retain the above copyright | ||||
| 	  notice, this list of conditions and the following disclaimer. | ||||
| 	* Redistributions in binary form must reproduce the above copyright | ||||
| 	  notice, this list of conditions and the following disclaimer in the | ||||
| 	  documentation and/or other materials provided with the distribution. | ||||
| 	* Neither the name of the copyright holder nor the | ||||
| 	  names of its contributors may be used to endorse or promote products | ||||
| 	  derived from this software without specific prior written permission. | ||||
| 
 | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||||
| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||||
| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||||
| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||||
| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||||
| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| */ | ||||
| 
 | ||||
| #pragma once | ||||
| 
 | ||||
| extern "C" { | ||||
| 	void randomx_program_aarch64(void* reg, void* mem, void* scratchpad, uint64_t iterations); | ||||
| 	void randomx_program_aarch64_main_loop(); | ||||
| 	void randomx_program_aarch64_vm_instructions(); | ||||
| 	void randomx_program_aarch64_imul_rcp_literals_end(); | ||||
| 	void randomx_program_aarch64_vm_instructions_end(); | ||||
| 	void randomx_program_aarch64_cacheline_align_mask1(); | ||||
| 	void randomx_program_aarch64_cacheline_align_mask2(); | ||||
| 	void randomx_program_aarch64_update_spMix1(); | ||||
| 	void randomx_program_aarch64_vm_instructions_end_light(); | ||||
| 	void randomx_program_aarch64_light_cacheline_align_mask(); | ||||
| 	void randomx_program_aarch64_light_dataset_offset(); | ||||
| 	void randomx_init_dataset_aarch64(); | ||||
| 	void randomx_init_dataset_aarch64_end(); | ||||
| 	void randomx_calc_dataset_item_aarch64(); | ||||
| 	void randomx_calc_dataset_item_aarch64_prefetch(); | ||||
| 	void randomx_calc_dataset_item_aarch64_mix(); | ||||
| 	void randomx_calc_dataset_item_aarch64_store_result(); | ||||
| 	void randomx_calc_dataset_item_aarch64_end(); | ||||
| } | ||||
|  | @ -63,6 +63,9 @@ namespace randomx { | |||
| 
 | ||||
| 	template<class Allocator, bool softAes, bool secureJit> | ||||
| 	void CompiledVm<Allocator, softAes, secureJit>::execute() { | ||||
| #ifdef __aarch64__ | ||||
| 		memcpy(reg.f, config.eMask, sizeof(config.eMask)); | ||||
| #endif | ||||
| 		compiler.getProgramFunc()(reg, mem, scratchpad, RANDOMX_PROGRAM_ITERATIONS); | ||||
| 	} | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue