// Copyright (c) 2018 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl

// ROTL_SSE rotates all 4 32 bit values of the XMM register v
// left by n bits using SSE2 instructions (0 <= n <= 32).
// The XMM register t is used as a temp. register.
#define ROTL_SSE(n, t, v) \
	MOVO  v, t;       \
	PSLLL $n, t;      \
	PSRLL $(32-n), v; \
	PXOR  t, v

// ROTL_AVX rotates all 4/8 32 bit values of the AVX/AVX2 register v
// left by n bits using AVX/AVX2 instructions (0 <= n <= 32).
// The AVX/AVX2 register t is used as a temp. register.
#define ROTL_AVX(n, t, v) \
	VPSLLD $n, v, t;      \
	VPSRLD $(32-n), v, v; \
	VPXOR  v, t, v

// CHACHA_QROUND_SSE2 performs a ChaCha quarter-round using the
// 4 XMM registers v0, v1, v2 and v3. It uses only ROTL_SSE for
// rotations. The XMM register t is used as a temp. register.
#define CHACHA_QROUND_SSE2(v0, v1, v2, v3, t) \
	PADDL v1, v0;        \
	PXOR  v0, v3;        \
	ROTL_SSE(16, t, v3); \
	PADDL v3, v2;        \
	PXOR  v2, v1;        \
	ROTL_SSE(12, t, v1); \
	PADDL v1, v0;        \
	PXOR  v0, v3;        \
	ROTL_SSE(8, t, v3);  \
	PADDL v3, v2;        \
	PXOR  v2, v1;        \
	ROTL_SSE(7, t, v1)

// CHACHA_QROUND_SSSE3 performs a ChaCha quarter-round using the
// 4 XMM registers v0, v1, v2 and v3. It uses PSHUFB for 8/16 bit
// rotations. The XMM register t is used as a temp. register.
//
// r16 holds the PSHUFB constant for a 16 bit left rotate.
// r8 holds the PSHUFB constant for a 8 bit left rotate.
#define CHACHA_QROUND_SSSE3(v0, v1, v2, v3, t, r16, r8) \
	PADDL  v1, v0;       \
	PXOR   v0, v3;       \
	PSHUFB r16, v3;      \
	PADDL  v3, v2;       \
	PXOR   v2, v1;       \
	ROTL_SSE(12, t, v1); \
	PADDL  v1, v0;       \
	PXOR   v0, v3;       \
	PSHUFB r8, v3;       \
	PADDL  v3, v2;       \
	PXOR   v2, v1;       \
	ROTL_SSE(7, t, v1)

// CHACHA_QROUND_AVX performs a ChaCha quarter-round using the
// 4 AVX/AVX2 registers v0, v1, v2 and v3. It uses VPSHUFB for 8/16 bit
// rotations. The AVX/AVX2 register t is used as a temp. register.
//
// r16 holds the VPSHUFB constant for a 16 bit left rotate.
// r8 holds the VPSHUFB constant for a 8 bit left rotate.
#define CHACHA_QROUND_AVX(v0, v1, v2, v3, t, r16, r8) \
	VPADDD  v0, v1, v0;  \
	VPXOR   v3, v0, v3;  \
	VPSHUFB r16, v3, v3; \
	VPADDD  v2, v3, v2;  \
	VPXOR   v1, v2, v1;  \
	ROTL_AVX(12, t, v1); \
	VPADDD  v0, v1, v0;  \
	VPXOR   v3, v0, v3;  \
	VPSHUFB r8, v3, v3;  \
	VPADDD  v2, v3, v2;  \
	VPXOR   v1, v2, v1;  \
	ROTL_AVX(7, t, v1)

// CHACHA_SHUFFLE_SSE performs a ChaCha shuffle using the
// 3 XMM registers v1, v2 and v3. The inverse shuffle is
// performed by switching v1 and v3: CHACHA_SHUFFLE_SSE(v3, v2, v1).
#define CHACHA_SHUFFLE_SSE(v1, v2, v3) \
	PSHUFL $0x39, v1, v1; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v3, v3

// CHACHA_SHUFFLE_AVX performs a ChaCha shuffle using the
// 3 AVX/AVX2 registers v1, v2 and v3. The inverse shuffle is
// performed by switching v1 and v3: CHACHA_SHUFFLE_AVX(v3, v2, v1).
#define CHACHA_SHUFFLE_AVX(v1, v2, v3) \
	VPSHUFD $0x39, v1, v1; \
	VPSHUFD $0x4E, v2, v2; \
	VPSHUFD $0x93, v3, v3
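
// For illustration, a complete ChaCha double round over a state kept
// row-wise in X0 - X3 (one 4x32 bit state row per XMM register) could be
// built from these macros as sketched below. The register choice X0 - X4
// and the surrounding rounds loop are assumptions of this sketch, not
// requirements of the macros:
//
//	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) // column round on all 4 columns at once
//	CHACHA_SHUFFLE_SSE(X1, X2, X3)         // rotate rows so the diagonals become columns
//	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) // diagonal round
//	CHACHA_SHUFFLE_SSE(X3, X2, X1)         // inverse shuffle restores the row order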

// XOR_SSE extracts 4x16 byte vectors from src at
// off, xors all vectors with the corresponding XMM
// register (v0 - v3) and writes the result to dst
// at off.
// The XMM register t is used as a temp. register.
#define XOR_SSE(dst, src, off, v0, v1, v2, v3, t) \
	MOVOU 0+off(src), t;  \
	PXOR  v0, t;          \
	MOVOU t, 0+off(dst);  \
	MOVOU 16+off(src), t; \
	PXOR  v1, t;          \
	MOVOU t, 16+off(dst); \
	MOVOU 32+off(src), t; \
	PXOR  v2, t;          \
	MOVOU t, 32+off(dst); \
	MOVOU 48+off(src), t; \
	PXOR  v3, t;          \
	MOVOU t, 48+off(dst)

// XOR_AVX extracts 4x16 byte vectors from src at
// off, xors all vectors with the corresponding AVX
// register (v0 - v3) and writes the result to dst
// at off.
// The XMM register t is used as a temp. register.
#define XOR_AVX(dst, src, off, v0, v1, v2, v3, t) \
	VPXOR   0+off(src), v0, t;  \
	VMOVDQU t, 0+off(dst);      \
	VPXOR   16+off(src), v1, t; \
	VMOVDQU t, 16+off(dst);     \
	VPXOR   32+off(src), v2, t; \
	VMOVDQU t, 32+off(dst);     \
	VPXOR   48+off(src), v3, t; \
	VMOVDQU t, 48+off(dst)

// XOR_AVX2 extracts 4x32 byte vectors from src at off,
// xors them with the 128 bit lanes of the AVX2 registers
// v0 - v3 (recombined with VPERM2I128, lower lanes first,
// then upper lanes) and writes the result to dst at off.
// The AVX2 registers t0 and t1 are used as temp. registers.
#define XOR_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
	VMOVDQU    (0+off)(src), t0;  \
	VPERM2I128 $32, v1, v0, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (0+off)(dst);  \
	VMOVDQU    (32+off)(src), t0; \
	VPERM2I128 $32, v3, v2, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (32+off)(dst); \
	VMOVDQU    (64+off)(src), t0; \
	VPERM2I128 $49, v1, v0, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (64+off)(dst); \
	VMOVDQU    (96+off)(src), t0; \
	VPERM2I128 $49, v3, v2, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (96+off)(dst)

// XOR_UPPER_AVX2 works like XOR_AVX2, but only processes
// the first 2x32 bytes, i.e. the lower 128 bit lanes of
// the AVX2 registers v0 - v3.
// The AVX2 registers t0 and t1 are used as temp. registers.
#define XOR_UPPER_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
	VMOVDQU    (0+off)(src), t0;  \
	VPERM2I128 $32, v1, v0, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (0+off)(dst);  \
	VMOVDQU    (32+off)(src), t0; \
	VPERM2I128 $32, v3, v2, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (32+off)(dst)

// EXTRACT_LOWER extracts the upper 128 bit lanes of the
// AVX2 registers v0 - v3 and writes them as 2x32 bytes
// to dst.
// The AVX2 register t0 is used as a temp. register.
#define EXTRACT_LOWER(dst, v0, v1, v2, v3, t0) \
	VPERM2I128 $49, v1, v0, t0; \
	VMOVDQU    t0, 0(dst);      \
	VPERM2I128 $49, v3, v2, t0; \
	VMOVDQU    t0, 32(dst)
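
// Similarly, xoring one 64 byte keystream block held in X0 - X3 into a
// plaintext buffer could use XOR_SSE as sketched below. The registers SI
// (src) and DI (dst), the offset 0 and X4 as temp. register are
// assumptions of this sketch:
//
//	XOR_SSE(DI, SI, 0, X0, X1, X2, X3, X4)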