;# Copyright (c) 2018 tevador
;#
;# This file is part of RandomX.
;#
;# RandomX is free software: you can redistribute it and/or modify
;# it under the terms of the GNU General Public License as published by
;# the Free Software Foundation, either version 3 of the License, or
;# (at your option) any later version.
;#
;# RandomX is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;# GNU General Public License for more details.
;#
;# You should have received a copy of the GNU General Public License
;# along with RandomX.  If not, see <http://www.gnu.org/licenses/>.

;# Syntax: MASM (Intel operand order). Argument registers and the set of
;# saved registers follow the Microsoft x64 calling convention.
;# NOTE(review): the guard below presumably evaluates true only when the
;# assembler knows the rax register, i.e. in 64-bit builds - confirm.
IFDEF RAX

;# ---------------------------------------------------------------------
;# Named constants
;# ---------------------------------------------------------------------
RX_SCRATCHPAD_MASK	EQU 2097088	;# 1FFFC0h: keeps bits 6..20, i.e. a 64-byte aligned offset below 2 MiB
RX_LINE_MASK		EQU -64		;# clears the low 6 bits (64-byte cache line alignment)
RX_XMM_SAVE_SIZE	EQU 160		;# 10 XMM registers * 16 bytes

_RANDOMX_EXECUTE_PROGRAM SEGMENT PAGE READ EXECUTE

PUBLIC executeProgram

;# ---------------------------------------------------------------------
;# executeProgram
;#
;# Runs the code pulled in from program.inc inside a counted loop and
;# stores the resulting VM registers into the register file.
;#
;# IN  rcx = RegisterFile& registerFile
;#             +0   .. +56   r0-r7  (written at rx_finish)
;#             +64  .. +112  f0-f3  (written at rx_finish)
;#             +128 .. +176  e0-e3  (written at rx_finish)
;#             +192 .. +240  a0-a3  (read at start-up)
;#           accessed with aligned moves, so it must be 16-byte aligned
;# IN  rdx = pointer to a structure: qword [rdx] = "mx"/"ma",
;#           qword [rdx+8] = dataset pointer
;# IN  r8  = uint8_t* scratchpad (written with movapd at offsets that
;#           are multiples of 64, so the base must be 16-byte aligned)
;# IN  r9  = loop counter (only the low 32 bits are counted down)
;#
;# All registers pushed/saved in the prologue are restored before ret.
;# ---------------------------------------------------------------------
executeProgram PROC
	;# REGISTER ALLOCATION:
	;# rax     -> temporary
	;# rbx     -> "ic"
	;# rcx     -> temporary
	;# rdx     -> temporary
	;# rsi     -> scratchpad pointer
	;# rdi     -> dataset pointer
	;# rbp     -> "ma", "mx"
	;# rsp     -> stack pointer
	;# r8      -> "r0"
	;# r9      -> "r1"
	;# r10     -> "r2"
	;# r11     -> "r3"
	;# r12     -> "r4"
	;# r13     -> "r5"
	;# r14     -> "r6"
	;# r15     -> "r7"
	;# xmm0    -> "f0"
	;# xmm1    -> "f1"
	;# xmm2    -> "f2"
	;# xmm3    -> "f3"
	;# xmm4    -> "e0"
	;# xmm5    -> "e1"
	;# xmm6    -> "e2"
	;# xmm7    -> "e3"
	;# xmm8    -> "a0"
	;# xmm9    -> "a1"
	;# xmm10   -> "a2"
	;# xmm11   -> "a3"
	;# xmm12   -> temporary
	;# xmm13   -> mantissa mask = 0x000fffffffffffff000fffffffffffff
	;# xmm14   -> exponent 2**-240 = 0x30f000000000000030f0000000000000
	;# xmm15   -> scale mask = 0x81f000000000000081f0000000000000

	;# ---- prologue: preserve integer registers ----
	push rbx
	push rbp
	push rdi
	push rsi
	push r12
	push r13
	push r14
	push r15

	;# ---- prologue: preserve xmm6-xmm15 in one 160-byte area ----
	;# Unaligned stores are used, so the area does not have to be
	;# 16-byte aligned.
	sub rsp, RX_XMM_SAVE_SIZE
	movdqu xmmword ptr [rsp+144], xmm6
	movdqu xmmword ptr [rsp+128], xmm7
	movdqu xmmword ptr [rsp+112], xmm8
	movdqu xmmword ptr [rsp+96], xmm9
	movdqu xmmword ptr [rsp+80], xmm10
	movdqu xmmword ptr [rsp+64], xmm11
	movdqu xmmword ptr [rsp+48], xmm12
	movdqu xmmword ptr [rsp+32], xmm13
	movdqu xmmword ptr [rsp+16], xmm14
	movdqu xmmword ptr [rsp+0], xmm15

	;# ---- unpack the function arguments ----
	push rcx                           ;# keep registerFile on the stack until rx_finish
	mov rbp, qword ptr [rdx]           ;# "mx", "ma"
	mov rdi, qword ptr [rdx+8]         ;# uint8_t* dataset
	mov rsi, r8                        ;# uint8_t* scratchpad
	mov rbx, r9                        ;# loop counter

	mov rax, rbp                       ;# seed for the first scratchpad address computation

	;# ---- clear the integer registers r0-r7 ----
	xor r8, r8
	xor r9, r9
	xor r10, r10
	xor r11, r11
	xor r12, r12
	xor r13, r13
	xor r14, r14
	xor r15, r15

	;# ---- load the constant registers ----
	;# rcx is biased by 120 so that a0-a3 (registerFile+192 .. +240)
	;# are addressed as rcx+72 .. rcx+120.
	lea rcx, [rcx+120]
	movapd xmm8, xmmword ptr [rcx+72]      ;# a0
	movapd xmm9, xmmword ptr [rcx+88]      ;# a1
	movapd xmm10, xmmword ptr [rcx+104]    ;# a2
	movapd xmm11, xmmword ptr [rcx+120]    ;# a3
	movapd xmm13, xmmword ptr [mantissaMask]
	movapd xmm14, xmmword ptr [exp240]
	movapd xmm15, xmmword ptr [scaleMask]

	jmp program_begin                  ;# skip over the constant pool

	;# ---- constant pool (same bytes as the values listed for xmm13-xmm15) ----
	ALIGN 64
mantissaMask:
	dq 000fffffffffffffh, 000fffffffffffffh
exp240:
	dq 30f0000000000000h, 30f0000000000000h
scaleMask:
	dq 81f0000000000000h, 81f0000000000000h

	;# ---- main loop: one pass per count in ebx ----
	ALIGN 64
program_begin:
	;# rax holds "mx"/"ma" on the first pass and zero on later passes
	xor rax, r8                        ;# mix in r0
	xor rax, r10                       ;# mix in r2
	mov rdx, rax                       ;# copy; its upper half is used further down

	;# first scratchpad line: address from the low 32 bits
	and eax, RX_SCRATCHPAD_MASK
	lea rcx, [rsi+rax]
	push rcx                           ;# remembered for the write-back after the program
	xor r8,  qword ptr [rcx+0]
	xor r9,  qword ptr [rcx+8]
	xor r10, qword ptr [rcx+16]
	xor r11, qword ptr [rcx+24]
	xor r12, qword ptr [rcx+32]
	xor r13, qword ptr [rcx+40]
	xor r14, qword ptr [rcx+48]
	xor r15, qword ptr [rcx+56]

	;# second scratchpad line: address from the upper 32 bits
	ror rdx, 32
	and edx, RX_SCRATCHPAD_MASK
	lea rcx, [rsi+rdx]
	push rcx                           ;# remembered for the write-back after the program
	;# each qword holds two 32-bit integers that become two doubles
	cvtdq2pd xmm0, qword ptr [rcx+0]
	cvtdq2pd xmm1, qword ptr [rcx+8]
	cvtdq2pd xmm2, qword ptr [rcx+16]
	cvtdq2pd xmm3, qword ptr [rcx+24]
	cvtdq2pd xmm4, qword ptr [rcx+32]
	cvtdq2pd xmm5, qword ptr [rcx+40]
	cvtdq2pd xmm6, qword ptr [rcx+48]
	cvtdq2pd xmm7, qword ptr [rcx+56]
	;# e0-e3: keep the bits selected by the mantissa mask ...
	andps xmm4, xmm13
	andps xmm5, xmm13
	andps xmm6, xmm13
	andps xmm7, xmm13
	;# ... and OR in the exponent constant
	orps xmm4, xmm14
	orps xmm5, xmm14
	orps xmm6, xmm14
	orps xmm7, xmm14

	;# 256 instructions
	include program.inc

IF 1
	;# ---- dataset access ----
	mov eax, r12d                      ;# eax = low half of r4 (upper half of rax is cleared)
	xor eax, r15d                      ;# ... xor low half of r7
	xor rbp, rax                       ;# modify "mx" (low 32 bits of rbp only)
	xor eax, eax                       ;# rax = 0 for the address mix at program_begin
	and rbp, RX_LINE_MASK              ;# align "mx" to the start of a cache line
	mov edx, ebp                       ;# edx = mx
	prefetchnta byte ptr [rdi+rdx]
	ror rbp, 32                        ;# swap "ma" and "mx"
	mov edx, ebp                       ;# edx = ma
	lea rcx, [rdi+rdx]                 ;# dataset cache line
	xor r8,  qword ptr [rcx+0]
	xor r9,  qword ptr [rcx+8]
	xor r10, qword ptr [rcx+16]
	xor r11, qword ptr [rcx+24]
	xor r12, qword ptr [rcx+32]
	xor r13, qword ptr [rcx+40]
	xor r14, qword ptr [rcx+48]
	xor r15, qword ptr [rcx+56]

	;# ---- write-back: r0-r7 go to the most recently pushed address,
	;# ---- i.e. the line the floating point values were read from
	pop rcx
	mov qword ptr [rcx+0], r8
	mov qword ptr [rcx+8], r9
	mov qword ptr [rcx+16], r10
	mov qword ptr [rcx+24], r11
	mov qword ptr [rcx+32], r12
	mov qword ptr [rcx+40], r13
	mov qword ptr [rcx+48], r14
	mov qword ptr [rcx+56], r15

	;# ---- write-back: f = f xor e goes to the address pushed first,
	;# ---- i.e. the line the integer values were read from
	pop rcx
	xorpd xmm0, xmm4
	xorpd xmm1, xmm5
	xorpd xmm2, xmm6
	xorpd xmm3, xmm7
	movapd xmmword ptr [rcx+0], xmm0
	movapd xmmword ptr [rcx+16], xmm1
	movapd xmmword ptr [rcx+32], xmm2
	movapd xmmword ptr [rcx+48], xmm3
ELSE
	;# memcpy trace from stack to scratchpad
	;# (not assembled while the condition above is the constant 1)
	mov rax, rsi                       ;# park rsi/rdi, movsq needs them
	mov rdx, rdi
	cld
	mov rsi, rsp
	mov rdi, rax
	mov rcx, 1024
	rep movsq                          ;# 1024 qwords = 8192 bytes
	add rsp, 8192
	pop rcx
	pop rcx
	mov rsi, rax
	mov rdi, rdx
ENDIF
	sub ebx, 1                         ;# 32-bit loop counter
	jnz program_begin

rx_finish:
	;# ---- save VM register values into the register file ----
	pop rcx                            ;# registerFile pushed during argument setup
	mov qword ptr [rcx+0], r8
	mov qword ptr [rcx+8], r9
	mov qword ptr [rcx+16], r10
	mov qword ptr [rcx+24], r11
	mov qword ptr [rcx+32], r12
	mov qword ptr [rcx+40], r13
	mov qword ptr [rcx+48], r14
	mov qword ptr [rcx+56], r15
	movdqa xmmword ptr [rcx+64], xmm0      ;# f0
	movdqa xmmword ptr [rcx+80], xmm1      ;# f1
	movdqa xmmword ptr [rcx+96], xmm2      ;# f2
	movdqa xmmword ptr [rcx+112], xmm3     ;# f3
	lea rcx, [rcx+64]                      ;# advance so e0-e3 use the same small offsets
	movdqa xmmword ptr [rcx+64], xmm4      ;# e0 -> registerFile+128
	movdqa xmmword ptr [rcx+80], xmm5      ;# e1 -> registerFile+144
	movdqa xmmword ptr [rcx+96], xmm6      ;# e2 -> registerFile+160
	movdqa xmmword ptr [rcx+112], xmm7     ;# e3 -> registerFile+176

	;# ---- epilogue: restore xmm6-xmm15 (mirror of the prologue) ----
	movdqu xmm15, xmmword ptr [rsp+0]
	movdqu xmm14, xmmword ptr [rsp+16]
	movdqu xmm13, xmmword ptr [rsp+32]
	movdqu xmm12, xmmword ptr [rsp+48]
	movdqu xmm11, xmmword ptr [rsp+64]
	movdqu xmm10, xmmword ptr [rsp+80]
	movdqu xmm9, xmmword ptr [rsp+96]
	movdqu xmm8, xmmword ptr [rsp+112]
	movdqu xmm7, xmmword ptr [rsp+128]
	movdqu xmm6, xmmword ptr [rsp+144]
	add rsp, RX_XMM_SAVE_SIZE

	;# ---- epilogue: restore integer registers in reverse push order ----
	pop r15
	pop r14
	pop r13
	pop r12
	pop rsi
	pop rdi
	pop rbp
	pop rbx

	;# return
	ret

TransformAddress MACRO reg32, reg64
;# Transforms the address in the register so that the transformed address
;# lies in a different cache line than the original address (mod 2^N).
;# This is done to prevent a load-store dependency.
;# There are 3 different transformations that can be used: x -> 9*x+C, x -> x+C, x -> x^C
;# Only the x -> x+C form is active; the other two are kept commented out.
	;lea reg32, [reg64+reg64*8+127] ;# C = -119 -110 -101 -92 -83 -74 -65 -55 -46 -37 -28 -19 -10 -1 9 18 27 36 45 54 63 73 82 91 100 109 118 127
	;# NOTE(review): the raw byte 64 (40h) lands directly in front of the add;
	;# presumably an empty REX prefix used to pad the instruction - confirm.
	db 64
	add reg32, -39                  ;# C = all except -7 to +7
	;xor reg32, -8                  ;# C = all except 0 to 7
ENDM

	;# -----------------------------------------------------------------
	;# rx_read
	;# NOTE(review): nothing visible in this file calls rx_read; whether
	;# program.inc does is not visible here. It fetches the dataset base
	;# from [rdi], while the main loop uses rdi itself as the base -
	;# confirm which convention callers expect.
	;# -----------------------------------------------------------------
	ALIGN 64
rx_read:
;# IN     eax = random 32-bit address
;# GLOBAL rdi = address of the dataset address
;# GLOBAL rsi = address of the scratchpad
;# GLOBAL rbp = low 32 bits = "mx", high 32 bits = "ma"
;# MODIFY rcx, rdx
	TransformAddress eax, rax          ;# TransformAddress function
	mov rcx, qword ptr [rdi]           ;# load the dataset address
	xor rbp, rax                       ;# modify "mx"
	;# prefetch cacheline "mx"
	and rbp, RX_LINE_MASK              ;# align "mx" to the start of a cache line
	mov edx, ebp                       ;# edx = mx
	prefetchnta byte ptr [rcx+rdx]
	;# read cacheline "ma"
	ror rbp, 32                        ;# swap "ma" and "mx"
	mov edx, ebp                       ;# edx = ma
	lea rcx, [rcx+rdx]                 ;# dataset cache line
	xor r8,  qword ptr [rcx+0]
	xor r9,  qword ptr [rcx+8]
	xor r10, qword ptr [rcx+16]
	xor r11, qword ptr [rcx+24]
	xor r12, qword ptr [rcx+32]
	xor r13, qword ptr [rcx+40]
	xor r14, qword ptr [rcx+48]
	xor r15, qword ptr [rcx+56]
	ret

executeProgram ENDP

_RANDOMX_EXECUTE_PROGRAM ENDS

ENDIF

END