
// +build !noasm,go1.10

// hwaccel_amd64.s - AMD64 optimized routines.
//
// To the extent possible under law, Yawning Angel has waived all copyright
// and related or neighboring rights to the software, using the Creative
// Commons "CC0" public domain dedication. See LICENSE or
// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
#include "textflag.h"
// func cpuidAmd64(cpuidParams *uint32)
TEXT ·cpuidAmd64(SB), NOSPLIT, $0-8
MOVQ cpuidParams+0(FP), R15
MOVL 0(R15), AX
MOVL 8(R15), CX
CPUID
MOVL AX, 0(R15)
MOVL BX, 4(R15)
MOVL CX, 8(R15)
MOVL DX, 12(R15)
RET
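// cpuidParams is a scratch buffer of four 32-bit words: on entry, offset 0
// holds the requested CPUID leaf (EAX) and offset 8 the sub-leaf (ECX); on
// return, offsets 0, 4, 8 and 12 hold EAX, EBX, ECX and EDX. The Go caller
// presumably passes the address of the first element of a [4]uint32.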
// func xgetbv0Amd64(xcrVec *uint32)
TEXT ·xgetbv0Amd64(SB), NOSPLIT, $0-8
MOVQ xcrVec+0(FP), BX
XORL CX, CX
XGETBV
MOVL AX, 0(BX)
MOVL DX, 4(BX)
RET
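// xcrVec receives XCR0 as two 32-bit halves: EAX (bits 31:0) at offset 0 and
// EDX (bits 63:32) at offset 4. XGETBV with ECX = 0 reports which register
// state the OS saves and restores; the feature-detection code presumably
// checks the SSE and AVX state bits (XCR0 bits 1 and 2) before selecting the
// AVX2 routines below.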
// Routines taken from the `avx2` implementation, converted to Go's assembly
// dialect. I do this in lieu of cutting myself to see if I still can feel
// pain.
//
// The conversion is mostly direct except:
// * Instead of aligned loads, unaligned loads are used, as there is no
// meaningful difference on modern Intel systems, and it's not immediately
// obvious to me how Go will align global data.
// * The polyvec_pointwise_acc family of routines takes vectors of pointers
// due to the different internal memory layout of a polyvec.
// * The constants are renamed slightly.
// Note:
// * These must be kept in sync with the values in params.go.
// Currently assumes Q = 7681, Q_INV = 57857.
// * Caution: these values are little-endian, so they will look different
// from avx2/consts.c.
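// For reference, the packed constants below are (all derived from Q = 7681):
// * q_x16:      0x1e01 = 7681 (Q)
// * q2_x16:     0x3c02 = 15362 (2*Q)
// * qinv_x16:   0xe201 = 57857 (Q^-1 mod 2^16, for Montgomery reduction)
// * montsq_x16: 0x15c1 = 5569 ((2^16)^2 mod Q, to enter the Montgomery domain)
// * v_x16:      0x4442 = 17474 (ceil(2^27 / Q), the Barrett constant)
// * f_x16:      0x0100 = 256 (final scaling factor in the inverse NTT)
// * low_mask:   0x1fff (2^13 - 1, for the forward NTT's partial reduction)
// * lowdword:   0x0000ffff (dword-to-word repacking in the inverse NTT)
// * vpshufb_idx: byte shuffle that swaps adjacent 16-bit words
// * mask11, mask0f: bit-counting masks for the eta = 4 binomial sampler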
DATA ·vpshufb_idx<>+0x00(SB)/8, $0x0504070601000302
DATA ·vpshufb_idx<>+0x08(SB)/8, $0x0d0c0f0e09080b0a
DATA ·vpshufb_idx<>+0x10(SB)/8, $0x0504070601000302
DATA ·vpshufb_idx<>+0x18(SB)/8, $0x0d0c0f0e09080b0a
GLOBL ·vpshufb_idx<>(SB), (NOPTR+RODATA), $32
DATA ·low_mask<>+0x00(SB)/8, $0x1fff1fff1fff1fff
DATA ·low_mask<>+0x08(SB)/8, $0x1fff1fff1fff1fff
DATA ·low_mask<>+0x10(SB)/8, $0x1fff1fff1fff1fff
DATA ·low_mask<>+0x18(SB)/8, $0x1fff1fff1fff1fff
GLOBL ·low_mask<>(SB), (NOPTR+RODATA), $32
DATA ·lowdword<>+0x00(SB)/8, $0x0000ffff0000ffff
DATA ·lowdword<>+0x08(SB)/8, $0x0000ffff0000ffff
DATA ·lowdword<>+0x10(SB)/8, $0x0000ffff0000ffff
DATA ·lowdword<>+0x18(SB)/8, $0x0000ffff0000ffff
GLOBL ·lowdword<>(SB), (NOPTR+RODATA), $32
DATA ·q_x16<>+0x00(SB)/8, $0x1e011e011e011e01
DATA ·q_x16<>+0x08(SB)/8, $0x1e011e011e011e01
DATA ·q_x16<>+0x10(SB)/8, $0x1e011e011e011e01
DATA ·q_x16<>+0x18(SB)/8, $0x1e011e011e011e01
GLOBL ·q_x16<>(SB), (NOPTR+RODATA), $32
DATA ·q2_x16<>+0x00(SB)/8, $0x3c023c023c023c02
DATA ·q2_x16<>+0x08(SB)/8, $0x3c023c023c023c02
DATA ·q2_x16<>+0x10(SB)/8, $0x3c023c023c023c02
DATA ·q2_x16<>+0x18(SB)/8, $0x3c023c023c023c02
GLOBL ·q2_x16<>(SB), (NOPTR+RODATA), $32
DATA ·qinv_x16<>+0x00(SB)/8, $0xe201e201e201e201
DATA ·qinv_x16<>+0x08(SB)/8, $0xe201e201e201e201
DATA ·qinv_x16<>+0x10(SB)/8, $0xe201e201e201e201
DATA ·qinv_x16<>+0x18(SB)/8, $0xe201e201e201e201
GLOBL ·qinv_x16<>(SB), (NOPTR+RODATA), $32
DATA ·f_x16<>+0x00(SB)/8, $0x0100010001000100
DATA ·f_x16<>+0x08(SB)/8, $0x0100010001000100
DATA ·f_x16<>+0x10(SB)/8, $0x0100010001000100
DATA ·f_x16<>+0x18(SB)/8, $0x0100010001000100
GLOBL ·f_x16<>(SB), (NOPTR+RODATA), $32
DATA ·v_x16<>+0x00(SB)/8, $0x4442444244424442
DATA ·v_x16<>+0x08(SB)/8, $0x4442444244424442
DATA ·v_x16<>+0x10(SB)/8, $0x4442444244424442
DATA ·v_x16<>+0x18(SB)/8, $0x4442444244424442
GLOBL ·v_x16<>(SB), (NOPTR+RODATA), $32
DATA ·montsq_x16<>+0x00(SB)/8, $0x15c115c115c115c1
DATA ·montsq_x16<>+0x08(SB)/8, $0x15c115c115c115c1
DATA ·montsq_x16<>+0x10(SB)/8, $0x15c115c115c115c1
DATA ·montsq_x16<>+0x18(SB)/8, $0x15c115c115c115c1
GLOBL ·montsq_x16<>(SB), (NOPTR+RODATA), $32
DATA ·mask11<>+0x00(SB)/8, $0x1111111111111111
DATA ·mask11<>+0x08(SB)/8, $0x1111111111111111
DATA ·mask11<>+0x10(SB)/8, $0x1111111111111111
DATA ·mask11<>+0x18(SB)/8, $0x1111111111111111
GLOBL ·mask11<>(SB), (NOPTR+RODATA), $32
DATA ·mask0f<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA ·mask0f<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA ·mask0f<>+0x10(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA ·mask0f<>+0x18(SB)/8, $0x0f0f0f0f0f0f0f0f
GLOBL ·mask0f<>(SB), (NOPTR+RODATA), $32
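// All of the NTT kernels below operate on 16 packed words at a time and
// reuse a small set of arithmetic idioms:
// * "mul" + "reduce" is a signed Montgomery multiplication: with
//   lo = (a * zeta) mod 2^16 (VPMULLW) and hi = (a * zeta) >> 16 (VPMULHW),
//   it computes m = (lo * QINV) mod 2^16 and t = hi - ((m * Q) >> 16), so
//   that t == a * zeta * 2^-16 (mod Q), with |t| < Q for bounded inputs.
// * "reduce 2" in the forward NTT uses 2^13 == 2^9 - 1 (mod 7681):
//   a == (a & 0x1fff) - (a >> 13) + ((a >> 13) << 9) (mod Q).
// * "reduce 2" in the inverse NTT is a Barrett reduction with
//   v = ceil(2^27 / Q): a -= ((a * v) >> 27) * Q.
// * "reduce 3" adds Q unconditionally, plus 2*Q for negative inputs, to move
//   values to a non-negative representative.
//
// A scalar Go sketch of the Montgomery step, for exposition only (the names
// are illustrative and not part of this package):
//
//   const q int32 = 7681
//   const qInv int16 = -7679 // 57857 as a signed 16-bit value
//
//   // montgomeryReduce returns a value congruent to a * 2^-16 (mod q),
//   // lying in (-q, q), for |a| < q * 2^15.
//   func montgomeryReduce(a int32) int16 {
//           m := int16(a) * qInv                 // low 16 bits of a * QINV
//           return int16((a - int32(m)*q) >> 16) // (a - m*q) / 2^16
//   }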
// func nttAVX2(inout, zetas *uint16)
TEXT ·nttAVX2(SB), NOSPLIT, $0-16
MOVQ inout+0(FP), DI
MOVQ zetas+8(FP), SI
VMOVDQU ·qinv_x16<>(SB), Y0
VMOVDQU ·q_x16<>(SB), Y1
VMOVDQU ·low_mask<>(SB), Y2
// zetas
VMOVDQU (SI), Y3
// first round
// load
VMOVDQU (DI), Y4
VMOVDQU 32(DI), Y5
VMOVDQU 64(DI), Y6
VMOVDQU 96(DI), Y7
VMOVDQU 256(DI), Y8
VMOVDQU 288(DI), Y9
VMOVDQU 320(DI), Y10
VMOVDQU 352(DI), Y11
// level 0
// mul
VPMULLW Y3, Y8, Y12
VPMULHW Y3, Y8, Y8
VPMULLW Y3, Y9, Y13
VPMULHW Y3, Y9, Y9
VPMULLW Y3, Y10, Y14
VPMULHW Y3, Y10, Y10
VPMULLW Y3, Y11, Y15
VPMULHW Y3, Y11, Y11
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y8, Y12
VPSUBW Y13, Y9, Y13
VPSUBW Y14, Y10, Y14
VPSUBW Y15, Y11, Y15
// update
VPSUBW Y12, Y4, Y8
VPSUBW Y13, Y5, Y9
VPSUBW Y14, Y6, Y10
VPSUBW Y15, Y7, Y11
VPADDW Y12, Y4, Y4
VPADDW Y13, Y5, Y5
VPADDW Y14, Y6, Y6
VPADDW Y15, Y7, Y7
// store
VMOVDQU Y4, (DI)
VMOVDQU Y5, 32(DI)
VMOVDQU Y6, 64(DI)
VMOVDQU Y7, 96(DI)
VMOVDQU Y8, 256(DI)
VMOVDQU Y9, 288(DI)
VMOVDQU Y10, 320(DI)
VMOVDQU Y11, 352(DI)
ADDQ $128, DI
// second round
// load
VMOVDQU (DI), Y4
VMOVDQU 32(DI), Y5
VMOVDQU 64(DI), Y6
VMOVDQU 96(DI), Y7
VMOVDQU 256(DI), Y8
VMOVDQU 288(DI), Y9
VMOVDQU 320(DI), Y10
VMOVDQU 352(DI), Y11
// level 0
// mul
VPMULLW Y3, Y8, Y12
VPMULHW Y3, Y8, Y8
VPMULLW Y3, Y9, Y13
VPMULHW Y3, Y9, Y9
VPMULLW Y3, Y10, Y14
VPMULHW Y3, Y10, Y10
VPMULLW Y3, Y11, Y15
VPMULHW Y3, Y11, Y11
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y8, Y12
VPSUBW Y13, Y9, Y13
VPSUBW Y14, Y10, Y14
VPSUBW Y15, Y11, Y15
// update
VPSUBW Y12, Y4, Y8
VPSUBW Y13, Y5, Y9
VPSUBW Y14, Y6, Y10
VPSUBW Y15, Y7, Y11
VPADDW Y12, Y4, Y4
VPADDW Y13, Y5, Y5
VPADDW Y14, Y6, Y6
VPADDW Y15, Y7, Y7
// store
VMOVDQU Y4, (DI)
VMOVDQU Y5, 32(DI)
VMOVDQU Y6, 64(DI)
VMOVDQU Y7, 96(DI)
VMOVDQU Y8, 256(DI)
VMOVDQU Y9, 288(DI)
VMOVDQU Y10, 320(DI)
VMOVDQU Y11, 352(DI)
SUBQ $128, DI
// first round
// zetas
VMOVDQU 32(SI), Y3
// load
VMOVDQU (DI), Y4
VMOVDQU 32(DI), Y5
VMOVDQU 64(DI), Y6
VMOVDQU 96(DI), Y7
VMOVDQU 128(DI), Y8
VMOVDQU 160(DI), Y9
VMOVDQU 192(DI), Y10
VMOVDQU 224(DI), Y11
// level 1
// mul
VPMULLW Y3, Y8, Y12
VPMULHW Y3, Y8, Y8
VPMULLW Y3, Y9, Y13
VPMULHW Y3, Y9, Y9
VPMULLW Y3, Y10, Y14
VPMULHW Y3, Y10, Y10
VPMULLW Y3, Y11, Y15
VPMULHW Y3, Y11, Y11
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y8, Y12
VPSUBW Y13, Y9, Y13
VPSUBW Y14, Y10, Y14
VPSUBW Y15, Y11, Y15
// update
VPSUBW Y12, Y4, Y8
VPSUBW Y13, Y5, Y9
VPSUBW Y14, Y6, Y10
VPSUBW Y15, Y7, Y11
VPADDW Y12, Y4, Y4
VPADDW Y13, Y5, Y5
VPADDW Y14, Y6, Y6
VPADDW Y15, Y7, Y7
// level 2
// zetas
VMOVDQU 96(SI), Y15
VMOVDQU 128(SI), Y3
// mul
VPMULLW Y15, Y6, Y12
VPMULHW Y15, Y6, Y6
VPMULLW Y15, Y7, Y13
VPMULHW Y15, Y7, Y7
VPMULLW Y3, Y10, Y14
VPMULHW Y3, Y10, Y10
VPMULLW Y3, Y11, Y15
VPMULHW Y3, Y11, Y11
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y6, Y12
VPSUBW Y13, Y7, Y13
VPSUBW Y14, Y10, Y14
VPSUBW Y15, Y11, Y15
// update
VPSUBW Y12, Y4, Y6
VPSUBW Y13, Y5, Y7
VPSUBW Y14, Y8, Y10
VPSUBW Y15, Y9, Y11
VPADDW Y12, Y4, Y4
VPADDW Y13, Y5, Y5
VPADDW Y14, Y8, Y8
VPADDW Y15, Y9, Y9
// level 3
// zetas
VMOVDQU 224(SI), Y13
VMOVDQU 256(SI), Y14
VMOVDQU 288(SI), Y15
VMOVDQU 320(SI), Y3
// mul
VPMULLW Y13, Y5, Y12
VPMULHW Y13, Y5, Y5
VPMULLW Y14, Y7, Y13
VPMULHW Y14, Y7, Y7
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
VPMULLW Y3, Y11, Y15
VPMULHW Y3, Y11, Y11
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y5, Y12
VPSUBW Y13, Y7, Y13
VPSUBW Y14, Y9, Y14
VPSUBW Y15, Y11, Y15
// reduce 2
VPSRAW $13, Y4, Y5
VPSRAW $13, Y6, Y7
VPSRAW $13, Y8, Y9
VPSRAW $13, Y10, Y11
VPAND Y2, Y4, Y4
VPAND Y2, Y6, Y6
VPAND Y2, Y8, Y8
VPAND Y2, Y10, Y10
VPSUBW Y5, Y4, Y4
VPSUBW Y7, Y6, Y6
VPSUBW Y9, Y8, Y8
VPSUBW Y11, Y10, Y10
VPSLLW $9, Y5, Y5
VPSLLW $9, Y7, Y7
VPSLLW $9, Y9, Y9
VPSLLW $9, Y11, Y11
VPADDW Y5, Y4, Y4
VPADDW Y7, Y6, Y6
VPADDW Y9, Y8, Y8
VPADDW Y11, Y10, Y10
// update
VPSUBW Y12, Y4, Y5
VPSUBW Y13, Y6, Y7
VPSUBW Y14, Y8, Y9
VPSUBW Y15, Y10, Y11
VPADDW Y12, Y4, Y4
VPADDW Y13, Y6, Y6
VPADDW Y14, Y8, Y8
VPADDW Y15, Y10, Y10
// level 4
// zetas
VMOVDQU 480(SI), Y12
VMOVDQU 512(SI), Y13
VMOVDQU 544(SI), Y14
VMOVDQU 576(SI), Y15
// shuffle
VPERM2I128 $0x02, Y4, Y5, Y3
VPERM2I128 $0x13, Y4, Y5, Y4
VPERM2I128 $0x02, Y6, Y7, Y5
VPERM2I128 $0x13, Y6, Y7, Y6
VPERM2I128 $0x02, Y8, Y9, Y7
VPERM2I128 $0x13, Y8, Y9, Y8
VPERM2I128 $0x02, Y10, Y11, Y9
VPERM2I128 $0x13, Y10, Y11, Y10
// mul
VPMULLW Y12, Y4, Y11
VPMULHW Y12, Y4, Y4
VPMULLW Y13, Y6, Y12
VPMULHW Y13, Y6, Y6
VPMULLW Y14, Y8, Y13
VPMULHW Y14, Y8, Y8
VPMULLW Y15, Y10, Y14
VPMULHW Y15, Y10, Y10
// reduce
VPMULLW Y0, Y11, Y11
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y11, Y11
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y11, Y4, Y11
VPSUBW Y12, Y6, Y12
VPSUBW Y13, Y8, Y13
VPSUBW Y14, Y10, Y14
// update
VPSUBW Y11, Y3, Y4
VPSUBW Y12, Y5, Y6
VPSUBW Y13, Y7, Y8
VPSUBW Y14, Y9, Y10
VPADDW Y11, Y3, Y3
VPADDW Y12, Y5, Y5
VPADDW Y13, Y7, Y7
VPADDW Y14, Y9, Y9
// level 5
// zetas
VMOVDQU 736(SI), Y12
VMOVDQU 768(SI), Y13
VMOVDQU 800(SI), Y14
VMOVDQU 832(SI), Y15
// shuffle
VSHUFPD $0x00, Y4, Y3, Y11
VSHUFPD $0x0F, Y4, Y3, Y3
VSHUFPD $0x00, Y6, Y5, Y4
VSHUFPD $0x0F, Y6, Y5, Y5
VSHUFPD $0x00, Y8, Y7, Y6
VSHUFPD $0x0F, Y8, Y7, Y7
VSHUFPD $0x00, Y10, Y9, Y8
VSHUFPD $0x0F, Y10, Y9, Y9
// mul
VPMULLW Y12, Y3, Y10
VPMULHW Y12, Y3, Y3
VPMULLW Y13, Y5, Y12
VPMULHW Y13, Y5, Y5
VPMULLW Y14, Y7, Y13
VPMULHW Y14, Y7, Y7
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
// reduce
VPMULLW Y0, Y10, Y10
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y10, Y10
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y10, Y3, Y10
VPSUBW Y12, Y5, Y12
VPSUBW Y13, Y7, Y13
VPSUBW Y14, Y9, Y14
// update
VPSUBW Y10, Y11, Y3
VPSUBW Y12, Y4, Y5
VPSUBW Y13, Y6, Y7
VPSUBW Y14, Y8, Y9
VPADDW Y10, Y11, Y10
VPADDW Y12, Y4, Y4
VPADDW Y13, Y6, Y6
VPADDW Y14, Y8, Y8
// level 6
// shuffle
VPSHUFD $0xB1, Y10, Y12
VPSHUFD $0xB1, Y3, Y13
VPSHUFD $0xB1, Y4, Y14
VPSHUFD $0xB1, Y5, Y15
VPBLENDD $0x55, Y10, Y13, Y10
VPBLENDD $0xAA, Y3, Y12, Y3
VPBLENDD $0x55, Y4, Y15, Y4
VPBLENDD $0xAA, Y5, Y14, Y5
VPSHUFD $0xB1, Y6, Y12
VPSHUFD $0xB1, Y7, Y13
VPSHUFD $0xB1, Y8, Y14
VPSHUFD $0xB1, Y9, Y15
VPBLENDD $0x55, Y6, Y13, Y6
VPBLENDD $0xAA, Y7, Y12, Y7
VPBLENDD $0x55, Y8, Y15, Y8
VPBLENDD $0xAA, Y9, Y14, Y9
// zetas
VMOVDQU 992(SI), Y12
VMOVDQU 1024(SI), Y13
VMOVDQU 1056(SI), Y14
VMOVDQU 1088(SI), Y15
// mul
VPMULLW Y12, Y3, Y11
VPMULHW Y12, Y3, Y3
VPMULLW Y13, Y5, Y12
VPMULHW Y13, Y5, Y5
VPMULLW Y14, Y7, Y13
VPMULHW Y14, Y7, Y7
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
// reduce
VPMULLW Y0, Y11, Y11
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y11, Y11
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y11, Y3, Y11
VPSUBW Y12, Y5, Y12
VPSUBW Y13, Y7, Y13
VPSUBW Y14, Y9, Y14
// reduce 2
VPSRAW $13, Y10, Y3
VPSRAW $13, Y4, Y5
VPSRAW $13, Y6, Y7
VPSRAW $13, Y8, Y9
VPAND Y2, Y10, Y10
VPAND Y2, Y4, Y4
VPAND Y2, Y6, Y6
VPAND Y2, Y8, Y8
VPSUBW Y3, Y10, Y10
VPSUBW Y5, Y4, Y4
VPSUBW Y7, Y6, Y6
VPSUBW Y9, Y8, Y8
VPSLLW $9, Y3, Y3
VPSLLW $9, Y5, Y5
VPSLLW $9, Y7, Y7
VPSLLW $9, Y9, Y9
VPADDW Y3, Y10, Y10
VPADDW Y5, Y4, Y4
VPADDW Y7, Y6, Y6
VPADDW Y9, Y8, Y8
// update
VPSUBW Y11, Y10, Y3
VPSUBW Y12, Y4, Y5
VPSUBW Y13, Y6, Y7
VPSUBW Y14, Y8, Y9
VPADDW Y11, Y10, Y10
VPADDW Y12, Y4, Y4
VPADDW Y13, Y6, Y6
VPADDW Y14, Y8, Y8
// level 7
// shuffle
VMOVDQU ·vpshufb_idx<>(SB), Y15
VPSHUFB Y15, Y10, Y11
VPSHUFB Y15, Y3, Y12
VPSHUFB Y15, Y4, Y13
VPSHUFB Y15, Y5, Y14
VPBLENDW $0x55, Y10, Y12, Y10
VPBLENDW $0xAA, Y3, Y11, Y3
VPBLENDW $0x55, Y4, Y14, Y4
VPBLENDW $0xAA, Y5, Y13, Y5
VPSHUFB Y15, Y6, Y11
VPSHUFB Y15, Y7, Y12
VPSHUFB Y15, Y8, Y13
VPSHUFB Y15, Y9, Y14
VPBLENDW $0x55, Y6, Y12, Y6
VPBLENDW $0xAA, Y7, Y11, Y7
VPBLENDW $0x55, Y8, Y14, Y8
VPBLENDW $0xAA, Y9, Y13, Y9
// zetas
VMOVDQU 1248(SI), Y12
VMOVDQU 1280(SI), Y13
VMOVDQU 1312(SI), Y14
VMOVDQU 1344(SI), Y15
// mul
VPMULLW Y12, Y3, Y11
VPMULHW Y12, Y3, Y3
VPMULLW Y13, Y5, Y12
VPMULHW Y13, Y5, Y5
VPMULLW Y14, Y7, Y13
VPMULHW Y14, Y7, Y7
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
// reduce
VPMULLW Y0, Y11, Y11
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y11, Y11
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y11, Y3, Y11
VPSUBW Y12, Y5, Y12
VPSUBW Y13, Y7, Y13
VPSUBW Y14, Y9, Y14
// reduce 3
VMOVDQU ·q2_x16<>(SB), Y15
VPSRAW $15, Y10, Y3
VPSRAW $15, Y4, Y5
VPSRAW $15, Y6, Y7
VPSRAW $15, Y8, Y9
VPAND Y15, Y3, Y3
VPAND Y15, Y5, Y5
VPAND Y15, Y7, Y7
VPAND Y15, Y9, Y9
VPADDW Y1, Y10, Y10
VPADDW Y1, Y4, Y4
VPADDW Y1, Y6, Y6
VPADDW Y1, Y8, Y8
VPADDW Y3, Y10, Y10
VPADDW Y5, Y4, Y4
VPADDW Y7, Y6, Y6
VPADDW Y9, Y8, Y8
// update
VPSUBW Y11, Y10, Y3
VPSUBW Y12, Y4, Y5
VPSUBW Y13, Y6, Y7
VPSUBW Y14, Y8, Y9
VPADDW Y11, Y10, Y10
VPADDW Y12, Y4, Y4
VPADDW Y13, Y6, Y6
VPADDW Y14, Y8, Y8
// reorder
VPUNPCKLWD Y3, Y10, Y12
VPUNPCKHWD Y3, Y10, Y13
VPUNPCKLWD Y5, Y4, Y14
VPUNPCKHWD Y5, Y4, Y15
VPUNPCKLWD Y7, Y6, Y3
VPUNPCKHWD Y7, Y6, Y4
VPUNPCKLWD Y9, Y8, Y5
VPUNPCKHWD Y9, Y8, Y6
VPERM2I128 $0x20, Y13, Y12, Y11
VPERM2I128 $0x31, Y13, Y12, Y12
VPERM2I128 $0x20, Y15, Y14, Y13
VPERM2I128 $0x31, Y15, Y14, Y14
VPERM2I128 $0x20, Y4, Y3, Y15
VPERM2I128 $0x31, Y4, Y3, Y3
VPERM2I128 $0x20, Y6, Y5, Y4
VPERM2I128 $0x31, Y6, Y5, Y5
// store
VMOVDQU Y11, (DI)
VMOVDQU Y12, 32(DI)
VMOVDQU Y13, 64(DI)
VMOVDQU Y14, 96(DI)
VMOVDQU Y15, 128(DI)
VMOVDQU Y3, 160(DI)
VMOVDQU Y4, 192(DI)
VMOVDQU Y5, 224(DI)
ADDQ $256, DI
// second round
// zetas
VMOVDQU 64(SI), Y3
// load
VMOVDQU (DI), Y4
VMOVDQU 32(DI), Y5
VMOVDQU 64(DI), Y6
VMOVDQU 96(DI), Y7
VMOVDQU 128(DI), Y8
VMOVDQU 160(DI), Y9
VMOVDQU 192(DI), Y10
VMOVDQU 224(DI), Y11
// level 1
// mul
VPMULLW Y3, Y8, Y12
VPMULHW Y3, Y8, Y8
VPMULLW Y3, Y9, Y13
VPMULHW Y3, Y9, Y9
VPMULLW Y3, Y10, Y14
VPMULHW Y3, Y10, Y10
VPMULLW Y3, Y11, Y15
VPMULHW Y3, Y11, Y11
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y8, Y12
VPSUBW Y13, Y9, Y13
VPSUBW Y14, Y10, Y14
VPSUBW Y15, Y11, Y15
// update
VPSUBW Y12, Y4, Y8
VPSUBW Y13, Y5, Y9
VPSUBW Y14, Y6, Y10
VPSUBW Y15, Y7, Y11
VPADDW Y12, Y4, Y4
VPADDW Y13, Y5, Y5
VPADDW Y14, Y6, Y6
VPADDW Y15, Y7, Y7
// level 2
// zetas
VMOVDQU 160(SI), Y15
VMOVDQU 192(SI), Y3
// mul
VPMULLW Y15, Y6, Y12
VPMULHW Y15, Y6, Y6
VPMULLW Y15, Y7, Y13
VPMULHW Y15, Y7, Y7
VPMULLW Y3, Y10, Y14
VPMULHW Y3, Y10, Y10
VPMULLW Y3, Y11, Y15
VPMULHW Y3, Y11, Y11
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y6, Y12
VPSUBW Y13, Y7, Y13
VPSUBW Y14, Y10, Y14
VPSUBW Y15, Y11, Y15
// update
VPSUBW Y12, Y4, Y6
VPSUBW Y13, Y5, Y7
VPSUBW Y14, Y8, Y10
VPSUBW Y15, Y9, Y11
VPADDW Y12, Y4, Y4
VPADDW Y13, Y5, Y5
VPADDW Y14, Y8, Y8
VPADDW Y15, Y9, Y9
// level 3
// zetas
VMOVDQU 352(SI), Y13
VMOVDQU 384(SI), Y14
VMOVDQU 416(SI), Y15
VMOVDQU 448(SI), Y3
// mul
VPMULLW Y13, Y5, Y12
VPMULHW Y13, Y5, Y5
VPMULLW Y14, Y7, Y13
VPMULHW Y14, Y7, Y7
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
VPMULLW Y3, Y11, Y15
VPMULHW Y3, Y11, Y11
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y5, Y12
VPSUBW Y13, Y7, Y13
VPSUBW Y14, Y9, Y14
VPSUBW Y15, Y11, Y15
// reduce 2
VPSRAW $13, Y4, Y5
VPSRAW $13, Y6, Y7
VPSRAW $13, Y8, Y9
VPSRAW $13, Y10, Y11
VPAND Y2, Y4, Y4
VPAND Y2, Y6, Y6
VPAND Y2, Y8, Y8
VPAND Y2, Y10, Y10
VPSUBW Y5, Y4, Y4
VPSUBW Y7, Y6, Y6
VPSUBW Y9, Y8, Y8
VPSUBW Y11, Y10, Y10
VPSLLW $9, Y5, Y5
VPSLLW $9, Y7, Y7
VPSLLW $9, Y9, Y9
VPSLLW $9, Y11, Y11
VPADDW Y5, Y4, Y4
VPADDW Y7, Y6, Y6
VPADDW Y9, Y8, Y8
VPADDW Y11, Y10, Y10
// update
VPSUBW Y12, Y4, Y5
VPSUBW Y13, Y6, Y7
VPSUBW Y14, Y8, Y9
VPSUBW Y15, Y10, Y11
VPADDW Y12, Y4, Y4
VPADDW Y13, Y6, Y6
VPADDW Y14, Y8, Y8
VPADDW Y15, Y10, Y10
// level 4
// zetas
VMOVDQU 608(SI), Y12
VMOVDQU 640(SI), Y13
VMOVDQU 672(SI), Y14
VMOVDQU 704(SI), Y15
// shuffle
VPERM2I128 $0x02, Y4, Y5, Y3
VPERM2I128 $0x13, Y4, Y5, Y4
VPERM2I128 $0x02, Y6, Y7, Y5
VPERM2I128 $0x13, Y6, Y7, Y6
VPERM2I128 $0x02, Y8, Y9, Y7
VPERM2I128 $0x13, Y8, Y9, Y8
VPERM2I128 $0x02, Y10, Y11, Y9
VPERM2I128 $0x13, Y10, Y11, Y10
// mul
VPMULLW Y12, Y4, Y11
VPMULHW Y12, Y4, Y4
VPMULLW Y13, Y6, Y12
VPMULHW Y13, Y6, Y6
VPMULLW Y14, Y8, Y13
VPMULHW Y14, Y8, Y8
VPMULLW Y15, Y10, Y14
VPMULHW Y15, Y10, Y10
// reduce
VPMULLW Y0, Y11, Y11
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y11, Y11
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y11, Y4, Y11
VPSUBW Y12, Y6, Y12
VPSUBW Y13, Y8, Y13
VPSUBW Y14, Y10, Y14
// update
VPSUBW Y11, Y3, Y4
VPSUBW Y12, Y5, Y6
VPSUBW Y13, Y7, Y8
VPSUBW Y14, Y9, Y10
VPADDW Y11, Y3, Y3
VPADDW Y12, Y5, Y5
VPADDW Y13, Y7, Y7
VPADDW Y14, Y9, Y9
// level 5
// zetas
VMOVDQU 864(SI), Y12
VMOVDQU 896(SI), Y13
VMOVDQU 928(SI), Y14
VMOVDQU 960(SI), Y15
// shuffle
VSHUFPD $0x00, Y4, Y3, Y11
VSHUFPD $0x0F, Y4, Y3, Y3
VSHUFPD $0x00, Y6, Y5, Y4
VSHUFPD $0x0F, Y6, Y5, Y5
VSHUFPD $0x00, Y8, Y7, Y6
VSHUFPD $0x0F, Y8, Y7, Y7
VSHUFPD $0x00, Y10, Y9, Y8
VSHUFPD $0x0F, Y10, Y9, Y9
// mul
VPMULLW Y12, Y3, Y10
VPMULHW Y12, Y3, Y3
VPMULLW Y13, Y5, Y12
VPMULHW Y13, Y5, Y5
VPMULLW Y14, Y7, Y13
VPMULHW Y14, Y7, Y7
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
// reduce
VPMULLW Y0, Y10, Y10
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y10, Y10
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y10, Y3, Y10
VPSUBW Y12, Y5, Y12
VPSUBW Y13, Y7, Y13
VPSUBW Y14, Y9, Y14
// update
VPSUBW Y10, Y11, Y3
VPSUBW Y12, Y4, Y5
VPSUBW Y13, Y6, Y7
VPSUBW Y14, Y8, Y9
VPADDW Y10, Y11, Y10
VPADDW Y12, Y4, Y4
VPADDW Y13, Y6, Y6
VPADDW Y14, Y8, Y8
// level 6
// shuffle
VPSHUFD $0xB1, Y10, Y12
VPSHUFD $0xB1, Y3, Y13
VPSHUFD $0xB1, Y4, Y14
VPSHUFD $0xB1, Y5, Y15
VPBLENDD $0x55, Y10, Y13, Y10
VPBLENDD $0xAA, Y3, Y12, Y3
VPBLENDD $0x55, Y4, Y15, Y4
VPBLENDD $0xAA, Y5, Y14, Y5
VPSHUFD $0xB1, Y6, Y12
VPSHUFD $0xB1, Y7, Y13
VPSHUFD $0xB1, Y8, Y14
VPSHUFD $0xB1, Y9, Y15
VPBLENDD $0x55, Y6, Y13, Y6
VPBLENDD $0xAA, Y7, Y12, Y7
VPBLENDD $0x55, Y8, Y15, Y8
VPBLENDD $0xAA, Y9, Y14, Y9
// zetas
VMOVDQU 1120(SI), Y12
VMOVDQU 1152(SI), Y13
VMOVDQU 1184(SI), Y14
VMOVDQU 1216(SI), Y15
// mul
VPMULLW Y12, Y3, Y11
VPMULHW Y12, Y3, Y3
VPMULLW Y13, Y5, Y12
VPMULHW Y13, Y5, Y5
VPMULLW Y14, Y7, Y13
VPMULHW Y14, Y7, Y7
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
// reduce
VPMULLW Y0, Y11, Y11
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y11, Y11
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y11, Y3, Y11
VPSUBW Y12, Y5, Y12
VPSUBW Y13, Y7, Y13
VPSUBW Y14, Y9, Y14
// reduce 2
VPSRAW $13, Y10, Y3
VPSRAW $13, Y4, Y5
VPSRAW $13, Y6, Y7
VPSRAW $13, Y8, Y9
VPAND Y2, Y10, Y10
VPAND Y2, Y4, Y4
VPAND Y2, Y6, Y6
VPAND Y2, Y8, Y8
VPSUBW Y3, Y10, Y10
VPSUBW Y5, Y4, Y4
VPSUBW Y7, Y6, Y6
VPSUBW Y9, Y8, Y8
VPSLLW $9, Y3, Y3
VPSLLW $9, Y5, Y5
VPSLLW $9, Y7, Y7
VPSLLW $9, Y9, Y9
VPADDW Y3, Y10, Y10
VPADDW Y5, Y4, Y4
VPADDW Y7, Y6, Y6
VPADDW Y9, Y8, Y8
// update
VPSUBW Y11, Y10, Y3
VPSUBW Y12, Y4, Y5
VPSUBW Y13, Y6, Y7
VPSUBW Y14, Y8, Y9
VPADDW Y11, Y10, Y10
VPADDW Y12, Y4, Y4
VPADDW Y13, Y6, Y6
VPADDW Y14, Y8, Y8
// level 7
// shuffle
VMOVDQU ·vpshufb_idx<>(SB), Y15
VPSHUFB Y15, Y10, Y11
VPSHUFB Y15, Y3, Y12
VPSHUFB Y15, Y4, Y13
VPSHUFB Y15, Y5, Y14
VPBLENDW $0x55, Y10, Y12, Y10
VPBLENDW $0xAA, Y3, Y11, Y3
VPBLENDW $0x55, Y4, Y14, Y4
VPBLENDW $0xAA, Y5, Y13, Y5
VPSHUFB Y15, Y6, Y11
VPSHUFB Y15, Y7, Y12
VPSHUFB Y15, Y8, Y13
VPSHUFB Y15, Y9, Y14
VPBLENDW $0x55, Y6, Y12, Y6
VPBLENDW $0xAA, Y7, Y11, Y7
VPBLENDW $0x55, Y8, Y14, Y8
VPBLENDW $0xAA, Y9, Y13, Y9
// zetas
VMOVDQU 1376(SI), Y12
VMOVDQU 1408(SI), Y13
VMOVDQU 1440(SI), Y14
VMOVDQU 1472(SI), Y15
// mul
VPMULLW Y12, Y3, Y11
VPMULHW Y12, Y3, Y3
VPMULLW Y13, Y5, Y12
VPMULHW Y13, Y5, Y5
VPMULLW Y14, Y7, Y13
VPMULHW Y14, Y7, Y7
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
// reduce
VPMULLW Y0, Y11, Y11
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y11, Y11
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y11, Y3, Y11
VPSUBW Y12, Y5, Y12
VPSUBW Y13, Y7, Y13
VPSUBW Y14, Y9, Y14
// reduce 3
VMOVDQU ·q2_x16<>(SB), Y15
VPSRAW $15, Y10, Y3
VPSRAW $15, Y4, Y5
VPSRAW $15, Y6, Y7
VPSRAW $15, Y8, Y9
VPAND Y15, Y3, Y3
VPAND Y15, Y5, Y5
VPAND Y15, Y7, Y7
VPAND Y15, Y9, Y9
VPADDW Y1, Y10, Y10
VPADDW Y1, Y4, Y4
VPADDW Y1, Y6, Y6
VPADDW Y1, Y8, Y8
VPADDW Y3, Y10, Y10
VPADDW Y5, Y4, Y4
VPADDW Y7, Y6, Y6
VPADDW Y9, Y8, Y8
// update
VPSUBW Y11, Y10, Y3
VPSUBW Y12, Y4, Y5
VPSUBW Y13, Y6, Y7
VPSUBW Y14, Y8, Y9
VPADDW Y11, Y10, Y10
VPADDW Y12, Y4, Y4
VPADDW Y13, Y6, Y6
VPADDW Y14, Y8, Y8
// reorder
VPUNPCKLWD Y3, Y10, Y12
VPUNPCKHWD Y3, Y10, Y13
VPUNPCKLWD Y5, Y4, Y14
VPUNPCKHWD Y5, Y4, Y15
VPUNPCKLWD Y7, Y6, Y3
VPUNPCKHWD Y7, Y6, Y4
VPUNPCKLWD Y9, Y8, Y5
VPUNPCKHWD Y9, Y8, Y6
VPERM2I128 $0x20, Y13, Y12, Y11
VPERM2I128 $0x31, Y13, Y12, Y12
VPERM2I128 $0x20, Y15, Y14, Y13
VPERM2I128 $0x31, Y15, Y14, Y14
VPERM2I128 $0x20, Y4, Y3, Y15
VPERM2I128 $0x31, Y4, Y3, Y3
VPERM2I128 $0x20, Y6, Y5, Y4
VPERM2I128 $0x31, Y6, Y5, Y5
// store
VMOVDQU Y11, (DI)
VMOVDQU Y12, 32(DI)
VMOVDQU Y13, 64(DI)
VMOVDQU Y14, 96(DI)
VMOVDQU Y15, 128(DI)
VMOVDQU Y3, 160(DI)
VMOVDQU Y4, 192(DI)
VMOVDQU Y5, 224(DI)
VZEROUPPER
RET
// Go 1.10's VPERMQ support expects the imm8 to be an `int8` instead of a
// `uint8`. While this is fixed in master, use the signed representation for
// now, until it is reasonable to expect versions with the fix to be widely
// available.
//
// See: https://github.com/golang/go/issues/24378
#define invntt_VPERMQ_IDX $-40 // $0xd8
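// (0xd8 = 0b11011000 selects the qword order 0, 2, 1, 3; reinterpreted as a
// signed byte, that bit pattern is 216 - 256 = -40.)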
// func invnttAVX2(inout, omegas *uint16)
TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
MOVQ inout+0(FP), DI
MOVQ omegas+8(FP), SI
VMOVDQU ·qinv_x16<>(SB), Y0
VMOVDQU ·q_x16<>(SB), Y1
VMOVDQU ·v_x16<>(SB), Y2
MOVQ SI, R8
// first round
// load
VMOVDQU (DI), Y4
VMOVDQU 32(DI), Y5
VMOVDQU 64(DI), Y6
VMOVDQU 96(DI), Y7
VMOVDQU 128(DI), Y8
VMOVDQU 160(DI), Y9
VMOVDQU 192(DI), Y10
VMOVDQU 224(DI), Y11
// reorder
VMOVDQU ·lowdword<>(SB), Y3
VPAND Y3, Y4, Y12
VPAND Y3, Y5, Y13
VPAND Y3, Y6, Y14
VPAND Y3, Y7, Y15
VPSRLD $16, Y4, Y4
VPSRLD $16, Y5, Y5
VPSRLD $16, Y6, Y6
VPSRLD $16, Y7, Y7
VPACKUSDW Y5, Y4, Y5
VPACKUSDW Y13, Y12, Y4
VPACKUSDW Y7, Y6, Y7
VPACKUSDW Y15, Y14, Y6
VPERMQ invntt_VPERMQ_IDX, Y4, Y4
VPERMQ invntt_VPERMQ_IDX, Y5, Y5
VPERMQ invntt_VPERMQ_IDX, Y6, Y6
VPERMQ invntt_VPERMQ_IDX, Y7, Y7
VPAND Y3, Y8, Y12
VPAND Y3, Y9, Y13
VPAND Y3, Y10, Y14
VPAND Y3, Y11, Y15
VPSRLD $16, Y8, Y8
VPSRLD $16, Y9, Y9
VPSRLD $16, Y10, Y10
VPSRLD $16, Y11, Y11
VPACKUSDW Y9, Y8, Y9
VPACKUSDW Y13, Y12, Y8
VPACKUSDW Y11, Y10, Y11
VPACKUSDW Y15, Y14, Y10
VPERMQ invntt_VPERMQ_IDX, Y8, Y8
VPERMQ invntt_VPERMQ_IDX, Y9, Y9
VPERMQ invntt_VPERMQ_IDX, Y10, Y10
VPERMQ invntt_VPERMQ_IDX, Y11, Y11
// level 0
// update
VPSUBW Y5, Y4, Y12
VPSUBW Y7, Y6, Y13
VPSUBW Y9, Y8, Y14
VPSUBW Y11, Y10, Y15
VPADDW Y4, Y5, Y4
VPADDW Y6, Y7, Y6
VPADDW Y8, Y9, Y8
VPADDW Y10, Y11, Y10
// zetas
VMOVDQU (R8), Y7
VMOVDQU 32(R8), Y9
VMOVDQU 64(R8), Y11
VMOVDQU 96(R8), Y3
// mul
VPMULLW Y7, Y12, Y5
VPMULHW Y7, Y12, Y12
VPMULLW Y9, Y13, Y7
VPMULHW Y9, Y13, Y13
VPMULLW Y11, Y14, Y9
VPMULHW Y11, Y14, Y14
VPMULLW Y3, Y15, Y11
VPMULHW Y3, Y15, Y15
// reduce
VPMULLW Y0, Y5, Y5
VPMULLW Y0, Y7, Y7
VPMULLW Y0, Y9, Y9
VPMULLW Y0, Y11, Y11
VPMULHW Y1, Y5, Y5
VPMULHW Y1, Y7, Y7
VPMULHW Y1, Y9, Y9
VPMULHW Y1, Y11, Y11
VPSUBW Y5, Y12, Y5
VPSUBW Y7, Y13, Y7
VPSUBW Y9, Y14, Y9
VPSUBW Y11, Y15, Y11
// level 1
// shuffle
VMOVDQU ·vpshufb_idx<>(SB), Y3
VPSHUFB Y3, Y4, Y12
VPSHUFB Y3, Y5, Y13
VPSHUFB Y3, Y6, Y14
VPSHUFB Y3, Y7, Y15
VPBLENDW $0x55, Y4, Y13, Y4
VPBLENDW $0xAA, Y5, Y12, Y5
VPBLENDW $0x55, Y6, Y15, Y6
VPBLENDW $0xAA, Y7, Y14, Y7
VPSHUFB Y3, Y8, Y12
VPSHUFB Y3, Y9, Y13
VPSHUFB Y3, Y10, Y14
VPSHUFB Y3, Y11, Y15
VPBLENDW $0x55, Y8, Y13, Y8
VPBLENDW $0xAA, Y9, Y12, Y9
VPBLENDW $0x55, Y10, Y15, Y10
VPBLENDW $0xAA, Y11, Y14, Y11
// update
VPSUBW Y5, Y4, Y12
VPSUBW Y7, Y6, Y13
VPSUBW Y9, Y8, Y14
VPSUBW Y11, Y10, Y15
VPADDW Y4, Y5, Y4
VPADDW Y6, Y7, Y6
VPADDW Y8, Y9, Y8
VPADDW Y10, Y11, Y10
// zetas
VMOVDQU 256(R8), Y7
VMOVDQU 288(R8), Y9
VMOVDQU 320(R8), Y11
VMOVDQU 352(R8), Y3
// mul
VPMULLW Y7, Y12, Y5
VPMULHW Y7, Y12, Y12
VPMULLW Y9, Y13, Y7
VPMULHW Y9, Y13, Y13
VPMULLW Y11, Y14, Y9
VPMULHW Y11, Y14, Y14
VPMULLW Y3, Y15, Y11
VPMULHW Y3, Y15, Y15
// reduce
VPMULLW Y0, Y5, Y5
VPMULLW Y0, Y7, Y7
VPMULLW Y0, Y9, Y9
VPMULLW Y0, Y11, Y11
VPMULHW Y1, Y5, Y5
VPMULHW Y1, Y7, Y7
VPMULHW Y1, Y9, Y9
VPMULHW Y1, Y11, Y11
VPSUBW Y5, Y12, Y5
VPSUBW Y7, Y13, Y7
VPSUBW Y9, Y14, Y9
VPSUBW Y11, Y15, Y11
// reduce 2
VPMULHW Y2, Y4, Y12
VPMULHW Y2, Y6, Y13
VPMULHW Y2, Y8, Y14
VPMULHW Y2, Y10, Y15
VPSRAW $11, Y12, Y12
VPSRAW $11, Y13, Y13
VPSRAW $11, Y14, Y14
VPSRAW $11, Y15, Y15
VPMULLW Y1, Y12, Y12
VPMULLW Y1, Y13, Y13
VPMULLW Y1, Y14, Y14
VPMULLW Y1, Y15, Y15
VPSUBW Y12, Y4, Y4
VPSUBW Y13, Y6, Y6
VPSUBW Y14, Y8, Y8
VPSUBW Y15, Y10, Y10
// level 2
// shuffle
VPSHUFD $0xB1, Y4, Y12
VPSHUFD $0xB1, Y5, Y13
VPSHUFD $0xB1, Y6, Y14
VPSHUFD $0xB1, Y7, Y15
VPBLENDD $0x55, Y4, Y13, Y4
VPBLENDD $0xAA, Y5, Y12, Y5
VPBLENDD $0x55, Y6, Y15, Y6
VPBLENDD $0xAA, Y7, Y14, Y7
VPSHUFD $0xB1, Y8, Y12
VPSHUFD $0xB1, Y9, Y13
VPSHUFD $0xB1, Y10, Y14
VPSHUFD $0xB1, Y11, Y15
VPBLENDD $0x55, Y8, Y13, Y8
VPBLENDD $0xAA, Y9, Y12, Y9
VPBLENDD $0x55, Y10, Y15, Y10
VPBLENDD $0xAA, Y11, Y14, Y11
// update
VPSUBW Y5, Y4, Y12
VPSUBW Y7, Y6, Y13
VPSUBW Y9, Y8, Y14
VPSUBW Y11, Y10, Y15
VPADDW Y4, Y5, Y4
VPADDW Y6, Y7, Y6
VPADDW Y8, Y9, Y8
VPADDW Y10, Y11, Y10
// zetas
VMOVDQU 512(R8), Y7
VMOVDQU 544(R8), Y9
VMOVDQU 576(R8), Y11
VMOVDQU 608(R8), Y3
// mul
VPMULLW Y7, Y12, Y5
VPMULHW Y7, Y12, Y12
VPMULLW Y9, Y13, Y7
VPMULHW Y9, Y13, Y13
VPMULLW Y11, Y14, Y9
VPMULHW Y11, Y14, Y14
VPMULLW Y3, Y15, Y11
VPMULHW Y3, Y15, Y15
// reduce
VPMULLW Y0, Y5, Y5
VPMULLW Y0, Y7, Y7
VPMULLW Y0, Y9, Y9
VPMULLW Y0, Y11, Y11
VPMULHW Y1, Y5, Y5
VPMULHW Y1, Y7, Y7
VPMULHW Y1, Y9, Y9
VPMULHW Y1, Y11, Y11
VPSUBW Y5, Y12, Y5
VPSUBW Y7, Y13, Y7
VPSUBW Y9, Y14, Y9
VPSUBW Y11, Y15, Y11
// level 3
// shuffle
VSHUFPD $0x00, Y5, Y4, Y3
VSHUFPD $0x0F, Y5, Y4, Y4
VSHUFPD $0x00, Y7, Y6, Y5
VSHUFPD $0x0F, Y7, Y6, Y6
VSHUFPD $0x00, Y9, Y8, Y7
VSHUFPD $0x0F, Y9, Y8, Y8
VSHUFPD $0x00, Y11, Y10, Y9
VSHUFPD $0x0F, Y11, Y10, Y10
// update
VPSUBW Y4, Y3, Y12
VPSUBW Y6, Y5, Y13
VPSUBW Y8, Y7, Y14
VPSUBW Y10, Y9, Y15
VPADDW Y3, Y4, Y3
VPADDW Y5, Y6, Y5
VPADDW Y7, Y8, Y7
VPADDW Y9, Y10, Y9
// zetas
VMOVDQU 768(R8), Y6
VMOVDQU 800(R8), Y8
VMOVDQU 832(R8), Y10
VMOVDQU 864(R8), Y11
// mul
VPMULLW Y6, Y12, Y4
VPMULHW Y6, Y12, Y12
VPMULLW Y8, Y13, Y6
VPMULHW Y8, Y13, Y13
VPMULLW Y10, Y14, Y8
VPMULHW Y10, Y14, Y14
VPMULLW Y11, Y15, Y10
VPMULHW Y11, Y15, Y15
// reduce
VPMULLW Y0, Y4, Y4
VPMULLW Y0, Y6, Y6
VPMULLW Y0, Y8, Y8
VPMULLW Y0, Y10, Y10
VPMULHW Y1, Y4, Y4
VPMULHW Y1, Y6, Y6
VPMULHW Y1, Y8, Y8
VPMULHW Y1, Y10, Y10
VPSUBW Y4, Y12, Y4
VPSUBW Y6, Y13, Y6
VPSUBW Y8, Y14, Y8
VPSUBW Y10, Y15, Y10
// reduce 2
VPMULHW Y2, Y3, Y12
VPMULHW Y2, Y5, Y13
VPMULHW Y2, Y7, Y14
VPMULHW Y2, Y9, Y15
VPSRAW $11, Y12, Y12
VPSRAW $11, Y13, Y13
VPSRAW $11, Y14, Y14
VPSRAW $11, Y15, Y15
VPMULLW Y1, Y12, Y12
VPMULLW Y1, Y13, Y13
VPMULLW Y1, Y14, Y14
VPMULLW Y1, Y15, Y15
VPSUBW Y12, Y3, Y3
VPSUBW Y13, Y5, Y5
VPSUBW Y14, Y7, Y7
VPSUBW Y15, Y9, Y9
// level 4
// shuffle
VPERM2I128 $0x02, Y3, Y4, Y11
VPERM2I128 $0x13, Y3, Y4, Y3
VPERM2I128 $0x02, Y5, Y6, Y4
VPERM2I128 $0x13, Y5, Y6, Y5
VPERM2I128 $0x02, Y7, Y8, Y6
VPERM2I128 $0x13, Y7, Y8, Y7
VPERM2I128 $0x02, Y9, Y10, Y8
VPERM2I128 $0x13, Y9, Y10, Y9
// update
VMOVDQA Y11, Y12
VMOVDQA Y4, Y13
VMOVDQA Y6, Y14
VMOVDQA Y8, Y15
VPADDW Y11, Y3, Y10
VPADDW Y4, Y5, Y4
VPADDW Y6, Y7, Y6
VPADDW Y8, Y9, Y8
VPSUBW Y3, Y12, Y3
VPSUBW Y5, Y13, Y5
VPSUBW Y7, Y14, Y7
VPSUBW Y9, Y15, Y9
// zetas
VMOVDQU 1024(R8), Y12
VMOVDQU 1056(R8), Y13
VMOVDQU 1088(R8), Y14
VMOVDQU 1120(R8), Y15
// mul
VPMULLW Y12, Y3, Y11
VPMULHW Y12, Y3, Y3
VPMULLW Y13, Y5, Y12
VPMULHW Y13, Y5, Y5
VPMULLW Y14, Y7, Y13
VPMULHW Y14, Y7, Y7
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
// reduce
VPMULLW Y0, Y11, Y11
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y11, Y11
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y11, Y3, Y3
VPSUBW Y12, Y5, Y5
VPSUBW Y13, Y7, Y7
VPSUBW Y14, Y9, Y9
// level 5
// update
VMOVDQA Y10, Y12
VMOVDQA Y3, Y13
VMOVDQA Y6, Y14
VMOVDQA Y7, Y15
VPADDW Y10, Y4, Y10
VPADDW Y3, Y5, Y3
VPADDW Y6, Y8, Y6
VPADDW Y7, Y9, Y7
VPSUBW Y4, Y12, Y4
VPSUBW Y5, Y13, Y5
VPSUBW Y8, Y14, Y8
VPSUBW Y9, Y15, Y9
// zetas
VMOVDQU 1280(SI), Y14
VMOVDQU 1312(SI), Y15
// mul
VPMULLW Y14, Y4, Y11
VPMULLW Y14, Y5, Y12
VPMULLW Y15, Y8, Y13
VPMULHW Y14, Y4, Y4
VPMULHW Y14, Y5, Y5
VPMULHW Y15, Y8, Y8
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
// reduce
VPMULLW Y0, Y11, Y11
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y11, Y11
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y11, Y4, Y4
VPSUBW Y12, Y5, Y5
VPSUBW Y13, Y8, Y8
VPSUBW Y14, Y9, Y9
// reduce 2
VPMULHW Y2, Y10, Y12
VPMULHW Y2, Y6, Y13
VPSRAW $11, Y12, Y12
VPSRAW $11, Y13, Y13
VPMULLW Y1, Y12, Y12
VPMULLW Y1, Y13, Y13
VPSUBW Y12, Y10, Y10
VPSUBW Y13, Y6, Y6
// level 6
// update
VMOVDQA Y10, Y12
VMOVDQA Y3, Y13
VMOVDQA Y4, Y14
VMOVDQA Y5, Y15
VPADDW Y10, Y6, Y10
VPADDW Y3, Y7, Y3
VPADDW Y4, Y8, Y4
VPADDW Y5, Y9, Y5
VPSUBW Y6, Y12, Y6
VPSUBW Y7, Y13, Y7
VPSUBW Y8, Y14, Y8
VPSUBW Y9, Y15, Y9
// zetas
VMOVDQU 1408(SI), Y15
// mul
VPMULLW Y15, Y6, Y11
VPMULLW Y15, Y7, Y12
VPMULLW Y15, Y8, Y13
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y6, Y6
VPMULHW Y15, Y7, Y7
VPMULHW Y15, Y8, Y8
VPMULHW Y15, Y9, Y9
// reduce
VPMULLW Y0, Y11, Y11
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y11, Y11
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y11, Y6, Y6
VPSUBW Y12, Y7, Y7
VPSUBW Y13, Y8, Y8
VPSUBW Y14, Y9, Y9
// reduce 2
VPMULHW Y2, Y3, Y12
VPSRAW $11, Y12, Y12
VPMULLW Y1, Y12, Y12
VPSUBW Y12, Y3, Y3
// store
VMOVDQU Y10, (DI)
VMOVDQU Y3, 32(DI)
VMOVDQU Y4, 64(DI)
VMOVDQU Y5, 96(DI)
VMOVDQU Y6, 128(DI)
VMOVDQU Y7, 160(DI)
VMOVDQU Y8, 192(DI)
VMOVDQU Y9, 224(DI)
ADDQ $256, DI
ADDQ $128, R8
// second round
// load
VMOVDQU (DI), Y4
VMOVDQU 32(DI), Y5
VMOVDQU 64(DI), Y6
VMOVDQU 96(DI), Y7
VMOVDQU 128(DI), Y8
VMOVDQU 160(DI), Y9
VMOVDQU 192(DI), Y10
VMOVDQU 224(DI), Y11
// reorder
VMOVDQU ·lowdword<>(SB), Y3
VPAND Y3, Y4, Y12
VPAND Y3, Y5, Y13
VPAND Y3, Y6, Y14
VPAND Y3, Y7, Y15
VPSRLD $16, Y4, Y4
VPSRLD $16, Y5, Y5
VPSRLD $16, Y6, Y6
VPSRLD $16, Y7, Y7
VPACKUSDW Y5, Y4, Y5
VPACKUSDW Y13, Y12, Y4
VPACKUSDW Y7, Y6, Y7
VPACKUSDW Y15, Y14, Y6
VPERMQ invntt_VPERMQ_IDX, Y4, Y4
VPERMQ invntt_VPERMQ_IDX, Y5, Y5
VPERMQ invntt_VPERMQ_IDX, Y6, Y6
VPERMQ invntt_VPERMQ_IDX, Y7, Y7
VPAND Y3, Y8, Y12
VPAND Y3, Y9, Y13
VPAND Y3, Y10, Y14
VPAND Y3, Y11, Y15
VPSRLD $16, Y8, Y8
VPSRLD $16, Y9, Y9
VPSRLD $16, Y10, Y10
VPSRLD $16, Y11, Y11
VPACKUSDW Y9, Y8, Y9
VPACKUSDW Y13, Y12, Y8
VPACKUSDW Y11, Y10, Y11
VPACKUSDW Y15, Y14, Y10
VPERMQ invntt_VPERMQ_IDX, Y8, Y8
VPERMQ invntt_VPERMQ_IDX, Y9, Y9
VPERMQ invntt_VPERMQ_IDX, Y10, Y10
VPERMQ invntt_VPERMQ_IDX, Y11, Y11
// level 0
// update
VMOVDQA Y4, Y12
VMOVDQA Y6, Y13
VMOVDQA Y8, Y14
VMOVDQA Y10, Y15
VPADDW Y4, Y5, Y4
VPADDW Y6, Y7, Y6
VPADDW Y8, Y9, Y8
VPADDW Y10, Y11, Y10
VPSUBW Y5, Y12, Y5
VPSUBW Y7, Y13, Y7
VPSUBW Y9, Y14, Y9
VPSUBW Y11, Y15, Y11
// zetas
VMOVDQU (R8), Y13
VMOVDQU 32(R8), Y14
VMOVDQU 64(R8), Y15
VMOVDQU 96(R8), Y3
// mul
VPMULLW Y13, Y5, Y12
VPMULHW Y13, Y5, Y5
VPMULLW Y14, Y7, Y13
VPMULHW Y14, Y7, Y7
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
VPMULLW Y3, Y11, Y15
VPMULHW Y3, Y11, Y11
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y5, Y5
VPSUBW Y13, Y7, Y7
VPSUBW Y14, Y9, Y9
VPSUBW Y15, Y11, Y11
// level 1
// shuffle
VMOVDQU ·vpshufb_idx<>(SB), Y3
VPSHUFB Y3, Y4, Y12
VPSHUFB Y3, Y5, Y13
VPSHUFB Y3, Y6, Y14
VPSHUFB Y3, Y7, Y15
VPBLENDW $0x55, Y4, Y13, Y4
VPBLENDW $0xAA, Y5, Y12, Y5
VPBLENDW $0x55, Y6, Y15, Y6
VPBLENDW $0xAA, Y7, Y14, Y7
VPSHUFB Y3, Y8, Y12
VPSHUFB Y3, Y9, Y13
VPSHUFB Y3, Y10, Y14
VPSHUFB Y3, Y11, Y15
VPBLENDW $0x55, Y8, Y13, Y8
VPBLENDW $0xAA, Y9, Y12, Y9
VPBLENDW $0x55, Y10, Y15, Y10
VPBLENDW $0xAA, Y11, Y14, Y11
// update
VMOVDQA Y4, Y12
VMOVDQA Y6, Y13
VMOVDQA Y8, Y14
VMOVDQA Y10, Y15
VPADDW Y4, Y5, Y4
VPADDW Y6, Y7, Y6
VPADDW Y8, Y9, Y8
VPADDW Y10, Y11, Y10
VPSUBW Y5, Y12, Y5
VPSUBW Y7, Y13, Y7
VPSUBW Y9, Y14, Y9
VPSUBW Y11, Y15, Y11
// zetas
VMOVDQU 256(R8), Y13
VMOVDQU 288(R8), Y14
VMOVDQU 320(R8), Y15
VMOVDQU 352(R8), Y3
// mul
VPMULLW Y13, Y5, Y12
VPMULHW Y13, Y5, Y5
VPMULLW Y14, Y7, Y13
VPMULHW Y14, Y7, Y7
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
VPMULLW Y3, Y11, Y15
VPMULHW Y3, Y11, Y11
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y5, Y5
VPSUBW Y13, Y7, Y7
VPSUBW Y14, Y9, Y9
VPSUBW Y15, Y11, Y11
// reduce 2
VPMULHW Y2, Y4, Y12
VPMULHW Y2, Y6, Y13
VPMULHW Y2, Y8, Y14
VPMULHW Y2, Y10, Y15
VPSRAW $11, Y12, Y12
VPSRAW $11, Y13, Y13
VPSRAW $11, Y14, Y14
VPSRAW $11, Y15, Y15
VPMULLW Y1, Y12, Y12
VPMULLW Y1, Y13, Y13
VPMULLW Y1, Y14, Y14
VPMULLW Y1, Y15, Y15
VPSUBW Y12, Y4, Y4
VPSUBW Y13, Y6, Y6
VPSUBW Y14, Y8, Y8
VPSUBW Y15, Y10, Y10
// level 2
// shuffle
VPSHUFD $0xB1, Y4, Y12
VPSHUFD $0xB1, Y5, Y13
VPSHUFD $0xB1, Y6, Y14
VPSHUFD $0xB1, Y7, Y15
VPBLENDD $0x55, Y4, Y13, Y4
VPBLENDD $0xAA, Y5, Y12, Y5
VPBLENDD $0x55, Y6, Y15, Y6
VPBLENDD $0xAA, Y7, Y14, Y7
VPSHUFD $0xB1, Y8, Y12
VPSHUFD $0xB1, Y9, Y13
VPSHUFD $0xB1, Y10, Y14
VPSHUFD $0xB1, Y11, Y15
VPBLENDD $0x55, Y8, Y13, Y8
VPBLENDD $0xAA, Y9, Y12, Y9
VPBLENDD $0x55, Y10, Y15, Y10
VPBLENDD $0xAA, Y11, Y14, Y11
// update
VMOVDQA Y4, Y12
VMOVDQA Y6, Y13
VMOVDQA Y8, Y14
VMOVDQA Y10, Y15
VPADDW Y4, Y5, Y4
VPADDW Y6, Y7, Y6
VPADDW Y8, Y9, Y8
VPADDW Y10, Y11, Y10
VPSUBW Y5, Y12, Y5
VPSUBW Y7, Y13, Y7
VPSUBW Y9, Y14, Y9
VPSUBW Y11, Y15, Y11
// zetas
VMOVDQU 512(R8), Y13
VMOVDQU 544(R8), Y14
VMOVDQU 576(R8), Y15
VMOVDQU 608(R8), Y3
// mul
VPMULLW Y13, Y5, Y12
VPMULHW Y13, Y5, Y5
VPMULLW Y14, Y7, Y13
VPMULHW Y14, Y7, Y7
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
VPMULLW Y3, Y11, Y15
VPMULHW Y3, Y11, Y11
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y5, Y5
VPSUBW Y13, Y7, Y7
VPSUBW Y14, Y9, Y9
VPSUBW Y15, Y11, Y11
// level 3
// shuffle
VSHUFPD $0x00, Y5, Y4, Y3
VSHUFPD $0x0F, Y5, Y4, Y4
VSHUFPD $0x00, Y7, Y6, Y5
VSHUFPD $0x0F, Y7, Y6, Y6
VSHUFPD $0x00, Y9, Y8, Y7
VSHUFPD $0x0F, Y9, Y8, Y8
VSHUFPD $0x00, Y11, Y10, Y9
VSHUFPD $0x0F, Y11, Y10, Y10
// update
VMOVDQA Y3, Y12
VMOVDQA Y5, Y13
VMOVDQA Y7, Y14
VMOVDQA Y9, Y15
VPADDW Y3, Y4, Y3
VPADDW Y5, Y6, Y5
VPADDW Y7, Y8, Y7
VPADDW Y9, Y10, Y9
VPSUBW Y4, Y12, Y4
VPSUBW Y6, Y13, Y6
VPSUBW Y8, Y14, Y8
VPSUBW Y10, Y15, Y10
// zetas
VMOVDQU 768(R8), Y12
VMOVDQU 800(R8), Y13
VMOVDQU 832(R8), Y14
VMOVDQU 864(R8), Y15
// mul
VPMULLW Y12, Y4, Y11
VPMULHW Y12, Y4, Y4
VPMULLW Y13, Y6, Y12
VPMULHW Y13, Y6, Y6
VPMULLW Y14, Y8, Y13
VPMULHW Y14, Y8, Y8
VPMULLW Y15, Y10, Y14
VPMULHW Y15, Y10, Y10
// reduce
VPMULLW Y0, Y11, Y11
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y11, Y11
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y11, Y4, Y4
VPSUBW Y12, Y6, Y6
VPSUBW Y13, Y8, Y8
VPSUBW Y14, Y10, Y10
// reduce 2
VPMULHW Y2, Y3, Y12
VPMULHW Y2, Y5, Y13
VPMULHW Y2, Y7, Y14
VPMULHW Y2, Y9, Y15
VPSRAW $11, Y12, Y12
VPSRAW $11, Y13, Y13
VPSRAW $11, Y14, Y14
VPSRAW $11, Y15, Y15
VPMULLW Y1, Y12, Y12
VPMULLW Y1, Y13, Y13
VPMULLW Y1, Y14, Y14
VPMULLW Y1, Y15, Y15
VPSUBW Y12, Y3, Y3
VPSUBW Y13, Y5, Y5
VPSUBW Y14, Y7, Y7
VPSUBW Y15, Y9, Y9
// level 4
// shuffle
VPERM2I128 $0x02, Y3, Y4, Y11
VPERM2I128 $0x13, Y3, Y4, Y3
VPERM2I128 $0x02, Y5, Y6, Y4
VPERM2I128 $0x13, Y5, Y6, Y5
VPERM2I128 $0x02, Y7, Y8, Y6
VPERM2I128 $0x13, Y7, Y8, Y7
VPERM2I128 $0x02, Y9, Y10, Y8
VPERM2I128 $0x13, Y9, Y10, Y9
// update
VMOVDQA Y11, Y12
VMOVDQA Y4, Y13
VMOVDQA Y6, Y14
VMOVDQA Y8, Y15
VPADDW Y11, Y3, Y10
VPADDW Y4, Y5, Y4
VPADDW Y6, Y7, Y6
VPADDW Y8, Y9, Y8
VPSUBW Y3, Y12, Y3
VPSUBW Y5, Y13, Y5
VPSUBW Y7, Y14, Y7
VPSUBW Y9, Y15, Y9
// zetas
VMOVDQU 1024(R8), Y12
VMOVDQU 1056(R8), Y13
VMOVDQU 1088(R8), Y14
VMOVDQU 1120(R8), Y15
// mul
VPMULLW Y12, Y3, Y11
VPMULHW Y12, Y3, Y3
VPMULLW Y13, Y5, Y12
VPMULHW Y13, Y5, Y5
VPMULLW Y14, Y7, Y13
VPMULHW Y14, Y7, Y7
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
// reduce
VPMULLW Y0, Y11, Y11
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y11, Y11
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y11, Y3, Y3
VPSUBW Y12, Y5, Y5
VPSUBW Y13, Y7, Y7
VPSUBW Y14, Y9, Y9
// level 5
// update
VMOVDQA Y10, Y12
VMOVDQA Y3, Y13
VMOVDQA Y6, Y14
VMOVDQA Y7, Y15
VPADDW Y10, Y4, Y10
VPADDW Y3, Y5, Y3
VPADDW Y6, Y8, Y6
VPADDW Y7, Y9, Y7
VPSUBW Y4, Y12, Y4
VPSUBW Y5, Y13, Y5
VPSUBW Y8, Y14, Y8
VPSUBW Y9, Y15, Y9
// zetas
VMOVDQU 1344(SI), Y14
VMOVDQU 1376(SI), Y15
// mul
VPMULLW Y14, Y4, Y11
VPMULLW Y14, Y5, Y12
VPMULLW Y15, Y8, Y13
VPMULHW Y14, Y4, Y4
VPMULHW Y14, Y5, Y5
VPMULHW Y15, Y8, Y8
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y9, Y9
// reduce
VPMULLW Y0, Y11, Y11
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y11, Y11
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y11, Y4, Y4
VPSUBW Y12, Y5, Y5
VPSUBW Y13, Y8, Y8
VPSUBW Y14, Y9, Y9
// reduce 2
VPMULHW Y2, Y10, Y12
VPMULHW Y2, Y6, Y13
VPSRAW $11, Y12, Y12
VPSRAW $11, Y13, Y13
VPMULLW Y1, Y12, Y12
VPMULLW Y1, Y13, Y13
VPSUBW Y12, Y10, Y10
VPSUBW Y13, Y6, Y6
// level 6
// update
VMOVDQA Y10, Y12
VMOVDQA Y3, Y13
VMOVDQA Y4, Y14
VMOVDQA Y5, Y15
VPADDW Y10, Y6, Y10
VPADDW Y3, Y7, Y3
VPADDW Y4, Y8, Y4
VPADDW Y5, Y9, Y5
VPSUBW Y6, Y12, Y6
VPSUBW Y7, Y13, Y7
VPSUBW Y8, Y14, Y8
VPSUBW Y9, Y15, Y9
// zetas
VMOVDQU 1440(SI), Y15
// mul
VPMULLW Y15, Y6, Y11
VPMULLW Y15, Y7, Y12
VPMULLW Y15, Y8, Y13
VPMULLW Y15, Y9, Y14
VPMULHW Y15, Y6, Y6
VPMULHW Y15, Y7, Y7
VPMULHW Y15, Y8, Y8
VPMULHW Y15, Y9, Y9
// reduce
VPMULLW Y0, Y11, Y11
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULHW Y1, Y11, Y11
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPSUBW Y11, Y6, Y6
VPSUBW Y12, Y7, Y7
VPSUBW Y13, Y8, Y8
VPSUBW Y14, Y9, Y9
// reduce 2
VPMULHW Y2, Y3, Y12
VPSRAW $11, Y12, Y12
VPMULLW Y1, Y12, Y12
VPSUBW Y12, Y3, Y3
// store
VMOVDQU Y10, (DI)
VMOVDQU Y3, 32(DI)
VMOVDQU Y4, 64(DI)
VMOVDQU Y5, 96(DI)
VMOVDQU Y6, 128(DI)
VMOVDQU Y7, 160(DI)
VMOVDQU Y8, 192(DI)
VMOVDQU Y9, 224(DI)
SUBQ $256, DI
// f (final scaling factor for the last level of the inverse NTT)
VMOVDQU ·f_x16<>(SB), Y2
// first round
// load
VMOVDQU (DI), Y4
VMOVDQU 32(DI), Y5
VMOVDQU 64(DI), Y6
VMOVDQU 96(DI), Y7
VMOVDQU 256(DI), Y8
VMOVDQU 288(DI), Y9
VMOVDQU 320(DI), Y10
VMOVDQU 352(DI), Y11
// level 7
// update
VMOVDQA Y4, Y12
VMOVDQA Y5, Y13
VMOVDQA Y6, Y14
VMOVDQA Y7, Y15
VPADDW Y4, Y8, Y4
VPADDW Y5, Y9, Y5
VPADDW Y6, Y10, Y6
VPADDW Y7, Y11, Y7
VPSUBW Y8, Y12, Y8
VPSUBW Y9, Y13, Y9
VPSUBW Y10, Y14, Y10
VPSUBW Y11, Y15, Y11
// zeta
VMOVDQU 1472(SI), Y3
// mul
VPMULLW Y3, Y8, Y12
VPMULLW Y3, Y9, Y13
VPMULLW Y3, Y10, Y14
VPMULLW Y3, Y11, Y15
VPMULHW Y3, Y8, Y8
VPMULHW Y3, Y9, Y9
VPMULHW Y3, Y10, Y10
VPMULHW Y3, Y11, Y11
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y8, Y8
VPSUBW Y13, Y9, Y9
VPSUBW Y14, Y10, Y10
VPSUBW Y15, Y11, Y11
VPADDW Y1, Y8, Y8
VPADDW Y1, Y9, Y9
VPADDW Y1, Y10, Y10
VPADDW Y1, Y11, Y11
// mul
VPMULLW Y2, Y4, Y12
VPMULLW Y2, Y5, Y13
VPMULLW Y2, Y6, Y14
VPMULLW Y2, Y7, Y15
VPMULHW Y2, Y4, Y4
VPMULHW Y2, Y5, Y5
VPMULHW Y2, Y6, Y6
VPMULHW Y2, Y7, Y7
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y4, Y4
VPSUBW Y13, Y5, Y5
VPSUBW Y14, Y6, Y6
VPSUBW Y15, Y7, Y7
VPADDW Y1, Y4, Y4
VPADDW Y1, Y5, Y5
VPADDW Y1, Y6, Y6
VPADDW Y1, Y7, Y7
// store
VMOVDQU Y4, (DI)
VMOVDQU Y5, 32(DI)
VMOVDQU Y6, 64(DI)
VMOVDQU Y7, 96(DI)
VMOVDQU Y8, 256(DI)
VMOVDQU Y9, 288(DI)
VMOVDQU Y10, 320(DI)
VMOVDQU Y11, 352(DI)
ADDQ $128, DI
// second round
// load
VMOVDQU (DI), Y4
VMOVDQU 32(DI), Y5
VMOVDQU 64(DI), Y6
VMOVDQU 96(DI), Y7
VMOVDQU 256(DI), Y8
VMOVDQU 288(DI), Y9
VMOVDQU 320(DI), Y10
VMOVDQU 352(DI), Y11
// zeta
VMOVDQU 1472(SI), Y3
// level 7
// update
VMOVDQA Y4, Y12
VMOVDQA Y5, Y13
VMOVDQA Y6, Y14
VMOVDQA Y7, Y15
VPADDW Y4, Y8, Y4
VPADDW Y5, Y9, Y5
VPADDW Y6, Y10, Y6
VPADDW Y7, Y11, Y7
VPSUBW Y8, Y12, Y8
VPSUBW Y9, Y13, Y9
VPSUBW Y10, Y14, Y10
VPSUBW Y11, Y15, Y11
// mul
VPMULLW Y3, Y8, Y12
VPMULLW Y3, Y9, Y13
VPMULLW Y3, Y10, Y14
VPMULLW Y3, Y11, Y15
VPMULHW Y3, Y8, Y8
VPMULHW Y3, Y9, Y9
VPMULHW Y3, Y10, Y10
VPMULHW Y3, Y11, Y11
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y8, Y8
VPSUBW Y13, Y9, Y9
VPSUBW Y14, Y10, Y10
VPSUBW Y15, Y11, Y11
VPADDW Y1, Y8, Y8
VPADDW Y1, Y9, Y9
VPADDW Y1, Y10, Y10
VPADDW Y1, Y11, Y11
// mul
VPMULLW Y2, Y4, Y12
VPMULLW Y2, Y5, Y13
VPMULLW Y2, Y6, Y14
VPMULLW Y2, Y7, Y15
VPMULHW Y2, Y4, Y4
VPMULHW Y2, Y5, Y5
VPMULHW Y2, Y6, Y6
VPMULHW Y2, Y7, Y7
// reduce
VPMULLW Y0, Y12, Y12
VPMULLW Y0, Y13, Y13
VPMULLW Y0, Y14, Y14
VPMULLW Y0, Y15, Y15
VPMULHW Y1, Y12, Y12
VPMULHW Y1, Y13, Y13
VPMULHW Y1, Y14, Y14
VPMULHW Y1, Y15, Y15
VPSUBW Y12, Y4, Y4
VPSUBW Y13, Y5, Y5
VPSUBW Y14, Y6, Y6
VPSUBW Y15, Y7, Y7
VPADDW Y1, Y4, Y4
VPADDW Y1, Y5, Y5
VPADDW Y1, Y6, Y6
VPADDW Y1, Y7, Y7
// store
VMOVDQU Y4, (DI)
VMOVDQU Y5, 32(DI)
VMOVDQU Y6, 64(DI)
VMOVDQU Y7, 96(DI)
VMOVDQU Y8, 256(DI)
VMOVDQU Y9, 288(DI)
VMOVDQU Y10, 320(DI)
VMOVDQU Y11, 352(DI)
VZEROUPPER
RET
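// The pointwiseAccK{2,3,4}AVX2 routines compute dst = sum(a[i] * b[i]) over
// K polynomials in the NTT domain. As noted at the top of the file, a and b
// are vectors of per-polynomial pointers rather than contiguous polyvecs, so
// each pointer is loaded into its own register below. Every a[i] block is
// first multiplied by montsq_x16, which Montgomery-reduces it to a*2^16 mod Q,
// so that the subsequent a*b Montgomery multiplication cancels the 2^-16
// factor; the accumulated sums are Barrett-reduced with v_x16 before being
// stored.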
// func pointwiseAccK2AVX2(dst *uint16, a, b **uint16)
TEXT ·pointwiseAccK2AVX2(SB), NOSPLIT, $0-24
MOVQ dst+0(FP), DI
MOVQ a+8(FP), SI
MOVQ b+16(FP), DX
VMOVDQU ·qinv_x16<>(SB), Y0
VMOVDQU ·q_x16<>(SB), Y1
VMOVDQU ·montsq_x16<>(SB), Y2
XORQ AX, AX
XORQ BX, BX
MOVQ 8(SI), R8 // a[1]
MOVQ (SI), SI // a[0]
MOVQ 8(DX), R11 // b[1]
MOVQ (DX), DX // b[0]
looptop2:
// load a
VMOVDQU (SI)(BX*1), Y4
VMOVDQU 32(SI)(BX*1), Y5
VMOVDQU 64(SI)(BX*1), Y6
VMOVDQU (R8)(BX*1), Y7
VMOVDQU 32(R8)(BX*1), Y8
VMOVDQU 64(R8)(BX*1), Y9
// mul montsq
VPMULLW Y2, Y4, Y3
VPMULHW Y2, Y4, Y10
VPMULLW Y2, Y5, Y4
VPMULHW Y2, Y5, Y11
VPMULLW Y2, Y6, Y5
VPMULHW Y2, Y6, Y12
VPMULLW Y2, Y7, Y6
VPMULHW Y2, Y7, Y13
VPMULLW Y2, Y8, Y7
VPMULHW Y2, Y8, Y14
VPMULLW Y2, Y9, Y8
VPMULHW Y2, Y9, Y15
// reduce
VPMULLW Y0, Y3, Y3
VPMULLW Y0, Y4, Y4
VPMULLW Y0, Y5, Y5
VPMULLW Y0, Y6, Y6
VPMULLW Y0, Y7, Y7
VPMULLW Y0, Y8, Y8
VPMULHW Y1, Y3, Y3
VPMULHW Y1, Y4, Y4
VPMULHW Y1, Y5, Y5
VPMULHW Y1, Y6, Y6
VPMULHW Y1, Y7, Y7
VPMULHW Y1, Y8, Y8
VPSUBW Y3, Y10, Y3
VPSUBW Y4, Y11, Y4
VPSUBW Y5, Y12, Y5
VPSUBW Y6, Y13, Y6
VPSUBW Y7, Y14, Y7
VPSUBW Y8, Y15, Y8
// load b
VMOVDQU (DX)(BX*1), Y9
VMOVDQU 32(DX)(BX*1), Y10
VMOVDQU 64(DX)(BX*1), Y11
VMOVDQU (R11)(BX*1), Y12
VMOVDQU 32(R11)(BX*1), Y13
VMOVDQU 64(R11)(BX*1), Y14
// mul
VPMULLW Y3, Y9, Y15
VPMULHW Y3, Y9, Y9
VPMULLW Y4, Y10, Y3
VPMULHW Y4, Y10, Y10
VPMULLW Y5, Y11, Y4
VPMULHW Y5, Y11, Y11
VPMULLW Y6, Y12, Y5
VPMULHW Y6, Y12, Y12
VPMULLW Y7, Y13, Y6
VPMULHW Y7, Y13, Y13
VPMULLW Y8, Y14, Y7
VPMULHW Y8, Y14, Y14
// reduce
VPMULLW Y0, Y15, Y15
VPMULLW Y0, Y3, Y3
VPMULLW Y0, Y4, Y4
VPMULLW Y0, Y5, Y5
VPMULLW Y0, Y6, Y6
VPMULLW Y0, Y7, Y7
VPMULHW Y1, Y15, Y15
VPMULHW Y1, Y3, Y3
VPMULHW Y1, Y4, Y4
VPMULHW Y1, Y5, Y5
VPMULHW Y1, Y6, Y6
VPMULHW Y1, Y7, Y7
VPSUBW Y15, Y9, Y15
VPSUBW Y3, Y10, Y3
VPSUBW Y4, Y11, Y4
VPSUBW Y5, Y12, Y5
VPSUBW Y6, Y13, Y6
VPSUBW Y7, Y14, Y7
// add
VPADDW Y15, Y5, Y5
VPADDW Y3, Y6, Y6
VPADDW Y4, Y7, Y7
// reduce 2
VMOVDQU ·v_x16<>(SB), Y3
VPMULHW Y3, Y5, Y8
VPMULHW Y3, Y6, Y9
VPMULHW Y3, Y7, Y10
VPSRAW $11, Y8, Y8
VPSRAW $11, Y9, Y9
VPSRAW $11, Y10, Y10
VPMULLW Y1, Y8, Y8
VPMULLW Y1, Y9, Y9
VPMULLW Y1, Y10, Y10
VPSUBW Y8, Y5, Y5
VPSUBW Y9, Y6, Y6
VPSUBW Y10, Y7, Y7
// store
VMOVDQU Y5, (DI)(BX*1)
VMOVDQU Y6, 32(DI)(BX*1)
VMOVDQU Y7, 64(DI)(BX*1)
ADDQ $1, AX
ADDQ $96, BX
CMPQ AX, $5
JB looptop2
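// The loop above covers 240 of the 256 coefficients per polynomial (five
// iterations of 48 words each); the remaining 16 coefficients are handled
// below without unrolling.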
// load
VMOVDQU (SI)(BX*1), Y4
VMOVDQU (R8)(BX*1), Y7
VMOVDQU (DX)(BX*1), Y9
VMOVDQU (R11)(BX*1), Y12
// mul montsq
VPMULLW Y2, Y4, Y3
VPMULHW Y2, Y4, Y10
VPMULLW Y2, Y7, Y6
VPMULHW Y2, Y7, Y13
// reduce
VPMULLW Y0, Y3, Y3
VPMULLW Y0, Y6, Y6
VPMULHW Y1, Y3, Y3
VPMULHW Y1, Y6, Y6
VPSUBW Y3, Y10, Y3
VPSUBW Y6, Y13, Y6
// mul
VPMULLW Y3, Y9, Y15
VPMULHW Y3, Y9, Y9
VPMULLW Y6, Y12, Y5
VPMULHW Y6, Y12, Y12
// reduce
VPMULLW Y0, Y15, Y15
VPMULLW Y0, Y5, Y5
VPMULHW Y1, Y15, Y15
VPMULHW Y1, Y5, Y5
VPSUBW Y15, Y9, Y15
VPSUBW Y5, Y12, Y5
// add
VPADDW Y15, Y5, Y5
// reduce 2
VMOVDQU ·v_x16<>(SB), Y3
VPMULHW Y3, Y5, Y8
VPSRAW $11, Y8, Y8
VPMULLW Y1, Y8, Y8
VPSUBW Y8, Y5, Y5
// store
VMOVDQU Y5, (DI)(BX*1)
VZEROUPPER
RET
// func pointwiseAccK3AVX2(dst *uint16, a, b **uint16)
TEXT ·pointwiseAccK3AVX2(SB), NOSPLIT, $0-24
MOVQ dst+0(FP), DI
MOVQ a+8(FP), SI
MOVQ b+16(FP), DX
VMOVDQU ·qinv_x16<>(SB), Y0
VMOVDQU ·q_x16<>(SB), Y1
VMOVDQU ·montsq_x16<>(SB), Y2
XORQ AX, AX
XORQ BX, BX
MOVQ 16(SI), R9 // a[2]
MOVQ 8(SI), R8 // a[1]
MOVQ (SI), SI // a[0]
MOVQ 16(DX), R12 // b[2]
MOVQ 8(DX), R11 // b[1]
MOVQ (DX), DX // b[0]
looptop3:
// load a
VMOVDQU (SI)(BX*1), Y4
VMOVDQU 32(SI)(BX*1), Y5
VMOVDQU (R8)(BX*1), Y6
VMOVDQU 32(R8)(BX*1), Y7
VMOVDQU (R9)(BX*1), Y8
VMOVDQU 32(R9)(BX*1), Y9
// mul montsq
VPMULLW Y2, Y4, Y3
VPMULHW Y2, Y4, Y10
VPMULLW Y2, Y5, Y4
VPMULHW Y2, Y5, Y11
VPMULLW Y2, Y6, Y5
VPMULHW Y2, Y6, Y12
VPMULLW Y2, Y7, Y6
VPMULHW Y2, Y7, Y13
VPMULLW Y2, Y8, Y7
VPMULHW Y2, Y8, Y14
VPMULLW Y2, Y9, Y8
VPMULHW Y2, Y9, Y15
// reduce
VPMULLW Y0, Y3, Y3
VPMULLW Y0, Y4, Y4
VPMULLW Y0, Y5, Y5
VPMULLW Y0, Y6, Y6
VPMULLW Y0, Y7, Y7
VPMULLW Y0, Y8, Y8
VPMULHW Y1, Y3, Y3
VPMULHW Y1, Y4, Y4
VPMULHW Y1, Y5, Y5
VPMULHW Y1, Y6, Y6
VPMULHW Y1, Y7, Y7
VPMULHW Y1, Y8, Y8
VPSUBW Y3, Y10, Y3
VPSUBW Y4, Y11, Y4
VPSUBW Y5, Y12, Y5
VPSUBW Y6, Y13, Y6
VPSUBW Y7, Y14, Y7
VPSUBW Y8, Y15, Y8
// load b
VMOVDQU (DX)(BX*1), Y9
VMOVDQU 32(DX)(BX*1), Y10
VMOVDQU (R11)(BX*1), Y11
VMOVDQU 32(R11)(BX*1), Y12
VMOVDQU (R12)(BX*1), Y13
VMOVDQU 32(R12)(BX*1), Y14
// mul
VPMULLW Y3, Y9, Y15
VPMULHW Y3, Y9, Y9
VPMULLW Y4, Y10, Y3
VPMULHW Y4, Y10, Y10
VPMULLW Y5, Y11, Y4
VPMULHW Y5, Y11, Y11
VPMULLW Y6, Y12, Y5
VPMULHW Y6, Y12, Y12
VPMULLW Y7, Y13, Y6
VPMULHW Y7, Y13, Y13
VPMULLW Y8, Y14, Y7
VPMULHW Y8, Y14, Y14
// reduce
VPMULLW Y0, Y15, Y15
VPMULLW Y0, Y3, Y3
VPMULLW Y0, Y4, Y4
VPMULLW Y0, Y5, Y5
VPMULLW Y0, Y6, Y6
VPMULLW Y0, Y7, Y7
VPMULHW Y1, Y15, Y15
VPMULHW Y1, Y3, Y3
VPMULHW Y1, Y4, Y4
VPMULHW Y1, Y5, Y5
VPMULHW Y1, Y6, Y6
VPMULHW Y1, Y7, Y7
VPSUBW Y15, Y9, Y15
VPSUBW Y3, Y10, Y3
VPSUBW Y4, Y11, Y4
VPSUBW Y5, Y12, Y5
VPSUBW Y6, Y13, Y6
VPSUBW Y7, Y14, Y7
// add
VPADDW Y15, Y4, Y4
VPADDW Y3, Y5, Y5
VPADDW Y4, Y6, Y6
VPADDW Y5, Y7, Y7
// reduce 2
VMOVDQU ·v_x16<>(SB), Y3
VPMULHW Y3, Y6, Y8
VPMULHW Y3, Y7, Y9
VPSRAW $11, Y8, Y8
VPSRAW $11, Y9, Y9
VPMULLW Y1, Y8, Y8
VPMULLW Y1, Y9, Y9
VPSUBW Y8, Y6, Y6
VPSUBW Y9, Y7, Y7
// store
VMOVDQU Y6, (DI)(BX*1)
VMOVDQU Y7, 32(DI)(BX*1)
ADDQ $1, AX
ADDQ $64, BX
CMPQ AX, $8
JB looptop3
VZEROUPPER
RET
// func pointwiseAccK4AVX2(dst *uint16, a, b **uint16)
TEXT ·pointwiseAccK4AVX2(SB), NOSPLIT, $0-24
MOVQ dst+0(FP), DI
MOVQ a+8(FP), SI
MOVQ b+16(FP), DX
VMOVDQU ·qinv_x16<>(SB), Y0
VMOVDQU ·q_x16<>(SB), Y1
VMOVDQU ·montsq_x16<>(SB), Y2
VMOVDQU ·v_x16<>(SB), Y3
XORQ AX, AX
XORQ BX, BX
MOVQ 24(SI), R10 // a[3]
MOVQ 16(SI), R9 // a[2]
MOVQ 8(SI), R8 // a[1]
MOVQ (SI), SI // a[0]
MOVQ 24(DX), R13 // b[3]
MOVQ 16(DX), R12 // b[2]
MOVQ 8(DX), R11 // b[1]
MOVQ (DX), DX // b[0]
looptop4:
// load a
VMOVDQU (SI)(BX*1), Y6
VMOVDQU (R8)(BX*1), Y7
VMOVDQU (R9)(BX*1), Y8
VMOVDQU (R10)(BX*1), Y9
// mul montsq
VPMULLW Y2, Y6, Y5
VPMULHW Y2, Y6, Y10
VPMULLW Y2, Y7, Y6
VPMULHW Y2, Y7, Y11
VPMULLW Y2, Y8, Y7
VPMULHW Y2, Y8, Y12
VPMULLW Y2, Y9, Y8
VPMULHW Y2, Y9, Y13
// reduce
VPMULLW Y0, Y5, Y5
VPMULLW Y0, Y6, Y6
VPMULLW Y0, Y7, Y7
VPMULLW Y0, Y8, Y8
VPMULHW Y1, Y5, Y5
VPMULHW Y1, Y6, Y6
VPMULHW Y1, Y7, Y7
VPMULHW Y1, Y8, Y8
VPSUBW Y5, Y10, Y5
VPSUBW Y6, Y11, Y6
VPSUBW Y7, Y12, Y7
VPSUBW Y8, Y13, Y8
// load b
VMOVDQU (DX)(BX*1), Y9
VMOVDQU (R11)(BX*1), Y10
VMOVDQU (R12)(BX*1), Y11
VMOVDQU (R13)(BX*1), Y12
// mul
VPMULLW Y5, Y9, Y4
VPMULHW Y5, Y9, Y9
VPMULLW Y6, Y10, Y5
VPMULHW Y6, Y10, Y10
VPMULLW Y7, Y11, Y6
VPMULHW Y7, Y11, Y11
VPMULLW Y8, Y12, Y7
VPMULHW Y8, Y12, Y12
// reduce
VPMULLW Y0, Y4, Y4
VPMULLW Y0, Y5, Y5
VPMULLW Y0, Y6, Y6
VPMULLW Y0, Y7, Y7
VPMULHW Y1, Y4, Y4
VPMULHW Y1, Y5, Y5
VPMULHW Y1, Y6, Y6
VPMULHW Y1, Y7, Y7
VPSUBW Y4, Y9, Y4
VPSUBW Y5, Y10, Y5
VPSUBW Y6, Y11, Y6
VPSUBW Y7, Y12, Y7
// add
VPADDW Y4, Y5, Y5
VPADDW Y5, Y6, Y6
VPADDW Y6, Y7, Y7
// reduce 2
VPMULHW Y3, Y7, Y8
VPSRAW $11, Y8, Y8
VPMULLW Y1, Y8, Y8
VPSUBW Y8, Y7, Y8
// store
VMOVDQU Y8, (DI)(BX*1)
ADDQ $1, AX
ADDQ $32, BX
CMPQ AX, $16
JB looptop4
VZEROUPPER
RET
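// cbdEta4AVX2 samples coefficients from a centered binomial distribution
// with eta = 4: each input byte contributes popcount(low nibble) minus
// popcount(high nibble). The mask11 accumulation below sums the four bits of
// each nibble in place, mask0f separates the two nibble counts, and the
// signed byte differences are sign-extended to words and offset by Q so the
// stored coefficients are non-negative.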
// func cbdEta4AVX2(dst *uint16, b *byte)
TEXT ·cbdEta4AVX2(SB), NOSPLIT, $0-16
MOVQ dst+0(FP), DI
MOVQ b+8(FP), SI
VMOVDQU ·mask11<>(SB), Y0
VMOVDQU ·mask0f<>(SB), Y1
VMOVDQU ·q_x16<>(SB), Y2
MOVQ $256, DX
looptop:
VMOVUPD 0(SI), Y3
VPAND Y3, Y0, Y4
VPSRLW $1, Y3, Y3
VPAND Y3, Y0, Y5
VPADDB Y5, Y4, Y4
VPSRLW $1, Y3, Y3
VPAND Y3, Y0, Y5
VPADDB Y5, Y4, Y4
VPSRLW $1, Y3, Y3
VPAND Y3, Y0, Y3
VPADDB Y3, Y4, Y3
VPSRLW $4, Y3, Y4
VPAND Y3, Y1, Y3
VPAND Y4, Y1, Y4
VPSUBB Y4, Y3, Y3
VPMOVSXBW X3, Y4
VPADDW Y2, Y4, Y4
VMOVUPD Y4, 0(DI)
VPERM2F128 $0x21, Y3, Y3, Y3
VPMOVSXBW X3, Y4
VPADDW Y2, Y4, Y4
VMOVUPD Y4, 32(DI)
ADDQ $64, DI
ADDQ $32, SI
SUBQ $32, DX
JA looptop
VZEROUPPER
RET