// +build !noasm,go1.10

// hwaccel_amd64.s - AMD64 optimized routines.
//
// To the extent possible under law, Yawning Angel has waived all copyright
// and related or neighboring rights to the software, using the Creative
// Commons "CC0" public domain dedication. See LICENSE or
// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.

#include "textflag.h"

// func cpuidAmd64(cpuidParams *uint32)
TEXT ·cpuidAmd64(SB), NOSPLIT, $0-8
MOVQ cpuidParams+0(FP), R15
MOVL 0(R15), AX
MOVL 8(R15), CX
CPUID
MOVL AX, 0(R15)
MOVL BX, 4(R15)
MOVL CX, 8(R15)
MOVL DX, 12(R15)
RET

// func xgetbv0Amd64(xcrVec *uint32)
TEXT ·xgetbv0Amd64(SB), NOSPLIT, $0-8
MOVQ xcrVec+0(FP), BX
XORL CX, CX
XGETBV
MOVL AX, 0(BX)
MOVL DX, 4(BX)
RET
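
// Example (illustrative only): one way the Go side could drive the two probes
// above to decide whether the AVX2 code paths are usable. The wrapper name and
// the exact detection policy are assumptions, not part of this file; only
// cpuidAmd64 and xgetbv0Amd64 are defined here, and real callers should also
// check the OSXSAVE/AVX bits from CPUID leaf 1 before issuing XGETBV.
//
//	// hypothetical Go-side sketch
//	func supportsAVX2() bool {
//		// cpuidAmd64 reads EAX from slot 0 and ECX from slot 2, then writes
//		// EAX, EBX, ECX, EDX back to slots 0..3.
//		regs := [4]uint32{7, 0, 0, 0} // leaf 7, sub-leaf 0
//		cpuidAmd64(&regs[0])
//		if regs[1]&(1<<5) == 0 { // EBX bit 5: AVX2
//			return false
//		}
//		var xcr0 [2]uint32
//		xgetbv0Amd64(&xcr0[0]) // XCR0
//		return xcr0[0]&0x6 == 0x6 // XMM and YMM state enabled by the OS
//	}
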
// Routines taken from the `avx2` implementation, converted to Go's assembly
// dialect. I do this in lieu of cutting myself to see if I still can feel
// pain.
//
// The conversion is mostly direct except:
//  * Instead of aligned loads, unaligned loads are used, as there is no
//    meaningful difference on modern Intel systems, and it's not immediately
//    obvious to me how Go will align global data.
//  * The polyvec_pointwise_acc family of routines take vectors of pointers
//    due to the different internal memory layout of a polyvec (a sketch of
//    the resulting calling convention appears just before pointwiseAccK2AVX2
//    below).
//  * The constants are renamed slightly.

// Note:
//  * These must be kept in sync with the values in params.go.
//    Currently assumes Q = 7681, Q_INV = 57857 (a standalone sanity check
//    follows the constant definitions below).
//  * Caution: these are little endian, so things will look different from
//    avx2/consts.c.

DATA ·vpshufb_idx<>+0x00(SB)/8, $0x0504070601000302
DATA ·vpshufb_idx<>+0x08(SB)/8, $0x0d0c0f0e09080b0a
DATA ·vpshufb_idx<>+0x10(SB)/8, $0x0504070601000302
DATA ·vpshufb_idx<>+0x18(SB)/8, $0x0d0c0f0e09080b0a
GLOBL ·vpshufb_idx<>(SB), (NOPTR+RODATA), $32

DATA ·low_mask<>+0x00(SB)/8, $0x1fff1fff1fff1fff
DATA ·low_mask<>+0x08(SB)/8, $0x1fff1fff1fff1fff
DATA ·low_mask<>+0x10(SB)/8, $0x1fff1fff1fff1fff
DATA ·low_mask<>+0x18(SB)/8, $0x1fff1fff1fff1fff
GLOBL ·low_mask<>(SB), (NOPTR+RODATA), $32

DATA ·lowdword<>+0x00(SB)/8, $0x0000ffff0000ffff
DATA ·lowdword<>+0x08(SB)/8, $0x0000ffff0000ffff
DATA ·lowdword<>+0x10(SB)/8, $0x0000ffff0000ffff
DATA ·lowdword<>+0x18(SB)/8, $0x0000ffff0000ffff
GLOBL ·lowdword<>(SB), (NOPTR+RODATA), $32

DATA ·q_x16<>+0x00(SB)/8, $0x1e011e011e011e01
DATA ·q_x16<>+0x08(SB)/8, $0x1e011e011e011e01
DATA ·q_x16<>+0x10(SB)/8, $0x1e011e011e011e01
DATA ·q_x16<>+0x18(SB)/8, $0x1e011e011e011e01
GLOBL ·q_x16<>(SB), (NOPTR+RODATA), $32

DATA ·q2_x16<>+0x00(SB)/8, $0x3c023c023c023c02
DATA ·q2_x16<>+0x08(SB)/8, $0x3c023c023c023c02
DATA ·q2_x16<>+0x10(SB)/8, $0x3c023c023c023c02
DATA ·q2_x16<>+0x18(SB)/8, $0x3c023c023c023c02
GLOBL ·q2_x16<>(SB), (NOPTR+RODATA), $32

DATA ·qinv_x16<>+0x00(SB)/8, $0xe201e201e201e201
DATA ·qinv_x16<>+0x08(SB)/8, $0xe201e201e201e201
DATA ·qinv_x16<>+0x10(SB)/8, $0xe201e201e201e201
DATA ·qinv_x16<>+0x18(SB)/8, $0xe201e201e201e201
GLOBL ·qinv_x16<>(SB), (NOPTR+RODATA), $32

DATA ·f_x16<>+0x00(SB)/8, $0x0100010001000100
DATA ·f_x16<>+0x08(SB)/8, $0x0100010001000100
DATA ·f_x16<>+0x10(SB)/8, $0x0100010001000100
DATA ·f_x16<>+0x18(SB)/8, $0x0100010001000100
GLOBL ·f_x16<>(SB), (NOPTR+RODATA), $32

DATA ·v_x16<>+0x00(SB)/8, $0x4442444244424442
DATA ·v_x16<>+0x08(SB)/8, $0x4442444244424442
DATA ·v_x16<>+0x10(SB)/8, $0x4442444244424442
DATA ·v_x16<>+0x18(SB)/8, $0x4442444244424442
GLOBL ·v_x16<>(SB), (NOPTR+RODATA), $32

DATA ·montsq_x16<>+0x00(SB)/8, $0x15c115c115c115c1
DATA ·montsq_x16<>+0x08(SB)/8, $0x15c115c115c115c1
DATA ·montsq_x16<>+0x10(SB)/8, $0x15c115c115c115c1
DATA ·montsq_x16<>+0x18(SB)/8, $0x15c115c115c115c1
GLOBL ·montsq_x16<>(SB), (NOPTR+RODATA), $32

DATA ·mask11<>+0x00(SB)/8, $0x1111111111111111
DATA ·mask11<>+0x08(SB)/8, $0x1111111111111111
DATA ·mask11<>+0x10(SB)/8, $0x1111111111111111
DATA ·mask11<>+0x18(SB)/8, $0x1111111111111111
GLOBL ·mask11<>(SB), (NOPTR+RODATA), $32

DATA ·mask0f<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA ·mask0f<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA ·mask0f<>+0x10(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA ·mask0f<>+0x18(SB)/8, $0x0f0f0f0f0f0f0f0f
GLOBL ·mask0f<>(SB), (NOPTR+RODATA), $32
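
// The broadcast word in each *_x16 constant above follows from Q = 7681 and
// Q_INV = 57857 as noted earlier: q_x16 is 0x1e01 = 7681, q2_x16 is 2*7681,
// qinv_x16 is 0xe201 = 57857 with q*qinv = 1 mod 2^16, montsq_x16 is
// (2^16 mod q)^2 mod q, and v_x16 is 2^27/q rounded to nearest. A standalone
// check (illustrative only, not part of this package):
//
//	package main
//
//	import "fmt"
//
//	func main() {
//		const q, qinv = 7681, 57857
//		fmt.Println(q == 0x1e01)             // q_x16
//		fmt.Println(2*q == 0x3c02)           // q2_x16
//		fmt.Println(q*qinv%(1<<16) == 1)     // qinv_x16 is q^-1 mod 2^16
//		r := (1 << 16) % q                   // Montgomery factor R mod q
//		fmt.Println(r*r%q == 0x15c1)         // montsq_x16 = R^2 mod q
//		fmt.Println((1<<27+q/2)/q == 0x4442) // v_x16 = round(2^27 / q)
//	}
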
// func nttAVX2(inout, zetas *uint16)
|
|
TEXT ·nttAVX2(SB), NOSPLIT, $0-16
|
|
MOVQ inout+0(FP), DI
|
|
MOVQ zetas+8(FP), SI
|
|
|
|
VMOVDQU ·qinv_x16<>(SB), Y0
|
|
VMOVDQU ·q_x16<>(SB), Y1
|
|
VMOVDQU ·low_mask<>(SB), Y2
|
|
|
|
// zetas
|
|
VMOVDQU (SI), Y3
|
|
|
|
// first round
|
|
// load
|
|
VMOVDQU (DI), Y4
|
|
VMOVDQU 32(DI), Y5
|
|
VMOVDQU 64(DI), Y6
|
|
VMOVDQU 96(DI), Y7
|
|
VMOVDQU 256(DI), Y8
|
|
VMOVDQU 288(DI), Y9
|
|
VMOVDQU 320(DI), Y10
|
|
VMOVDQU 352(DI), Y11
|
|
|
|
// level 0
|
|
// mul
|
|
VPMULLW Y3, Y8, Y12
|
|
VPMULHW Y3, Y8, Y8
|
|
VPMULLW Y3, Y9, Y13
|
|
VPMULHW Y3, Y9, Y9
|
|
VPMULLW Y3, Y10, Y14
|
|
VPMULHW Y3, Y10, Y10
|
|
VPMULLW Y3, Y11, Y15
|
|
VPMULHW Y3, Y11, Y11
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y8, Y12
|
|
VPSUBW Y13, Y9, Y13
|
|
VPSUBW Y14, Y10, Y14
|
|
VPSUBW Y15, Y11, Y15
|
|
|
|
// update
|
|
VPSUBW Y12, Y4, Y8
|
|
VPSUBW Y13, Y5, Y9
|
|
VPSUBW Y14, Y6, Y10
|
|
VPSUBW Y15, Y7, Y11
|
|
VPADDW Y12, Y4, Y4
|
|
VPADDW Y13, Y5, Y5
|
|
VPADDW Y14, Y6, Y6
|
|
VPADDW Y15, Y7, Y7
|
|
|
|
// store
|
|
VMOVDQU Y4, (DI)
|
|
VMOVDQU Y5, 32(DI)
|
|
VMOVDQU Y6, 64(DI)
|
|
VMOVDQU Y7, 96(DI)
|
|
VMOVDQU Y8, 256(DI)
|
|
VMOVDQU Y9, 288(DI)
|
|
VMOVDQU Y10, 320(DI)
|
|
VMOVDQU Y11, 352(DI)
|
|
|
|
ADDQ $128, DI
|
|
|
|
// second round
|
|
// load
|
|
VMOVDQU (DI), Y4
|
|
VMOVDQU 32(DI), Y5
|
|
VMOVDQU 64(DI), Y6
|
|
VMOVDQU 96(DI), Y7
|
|
VMOVDQU 256(DI), Y8
|
|
VMOVDQU 288(DI), Y9
|
|
VMOVDQU 320(DI), Y10
|
|
VMOVDQU 352(DI), Y11
|
|
|
|
// level 0
|
|
// mul
|
|
VPMULLW Y3, Y8, Y12
|
|
VPMULHW Y3, Y8, Y8
|
|
VPMULLW Y3, Y9, Y13
|
|
VPMULHW Y3, Y9, Y9
|
|
VPMULLW Y3, Y10, Y14
|
|
VPMULHW Y3, Y10, Y10
|
|
VPMULLW Y3, Y11, Y15
|
|
VPMULHW Y3, Y11, Y11
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y8, Y12
|
|
VPSUBW Y13, Y9, Y13
|
|
VPSUBW Y14, Y10, Y14
|
|
VPSUBW Y15, Y11, Y15
|
|
|
|
// update
|
|
VPSUBW Y12, Y4, Y8
|
|
VPSUBW Y13, Y5, Y9
|
|
VPSUBW Y14, Y6, Y10
|
|
VPSUBW Y15, Y7, Y11
|
|
VPADDW Y12, Y4, Y4
|
|
VPADDW Y13, Y5, Y5
|
|
VPADDW Y14, Y6, Y6
|
|
VPADDW Y15, Y7, Y7
|
|
|
|
// store
|
|
VMOVDQU Y4, (DI)
|
|
VMOVDQU Y5, 32(DI)
|
|
VMOVDQU Y6, 64(DI)
|
|
VMOVDQU Y7, 96(DI)
|
|
VMOVDQU Y8, 256(DI)
|
|
VMOVDQU Y9, 288(DI)
|
|
VMOVDQU Y10, 320(DI)
|
|
VMOVDQU Y11, 352(DI)
|
|
|
|
SUBQ $128, DI
|
|
|
|
// first round
|
|
// zetas
|
|
VMOVDQU 32(SI), Y3
|
|
|
|
// load
|
|
VMOVDQU (DI), Y4
|
|
VMOVDQU 32(DI), Y5
|
|
VMOVDQU 64(DI), Y6
|
|
VMOVDQU 96(DI), Y7
|
|
VMOVDQU 128(DI), Y8
|
|
VMOVDQU 160(DI), Y9
|
|
VMOVDQU 192(DI), Y10
|
|
VMOVDQU 224(DI), Y11
|
|
|
|
// level 1
|
|
// mul
|
|
VPMULLW Y3, Y8, Y12
|
|
VPMULHW Y3, Y8, Y8
|
|
VPMULLW Y3, Y9, Y13
|
|
VPMULHW Y3, Y9, Y9
|
|
VPMULLW Y3, Y10, Y14
|
|
VPMULHW Y3, Y10, Y10
|
|
VPMULLW Y3, Y11, Y15
|
|
VPMULHW Y3, Y11, Y11
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y8, Y12
|
|
VPSUBW Y13, Y9, Y13
|
|
VPSUBW Y14, Y10, Y14
|
|
VPSUBW Y15, Y11, Y15
|
|
|
|
// update
|
|
VPSUBW Y12, Y4, Y8
|
|
VPSUBW Y13, Y5, Y9
|
|
VPSUBW Y14, Y6, Y10
|
|
VPSUBW Y15, Y7, Y11
|
|
VPADDW Y12, Y4, Y4
|
|
VPADDW Y13, Y5, Y5
|
|
VPADDW Y14, Y6, Y6
|
|
VPADDW Y15, Y7, Y7
|
|
|
|
// level 2
|
|
// zetas
|
|
VMOVDQU 96(SI), Y15
|
|
VMOVDQU 128(SI), Y3
|
|
|
|
// mul
|
|
VPMULLW Y15, Y6, Y12
|
|
VPMULHW Y15, Y6, Y6
|
|
VPMULLW Y15, Y7, Y13
|
|
VPMULHW Y15, Y7, Y7
|
|
VPMULLW Y3, Y10, Y14
|
|
VPMULHW Y3, Y10, Y10
|
|
VPMULLW Y3, Y11, Y15
|
|
VPMULHW Y3, Y11, Y11
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y6, Y12
|
|
VPSUBW Y13, Y7, Y13
|
|
VPSUBW Y14, Y10, Y14
|
|
VPSUBW Y15, Y11, Y15
|
|
|
|
// update
|
|
VPSUBW Y12, Y4, Y6
|
|
VPSUBW Y13, Y5, Y7
|
|
VPSUBW Y14, Y8, Y10
|
|
VPSUBW Y15, Y9, Y11
|
|
VPADDW Y12, Y4, Y4
|
|
VPADDW Y13, Y5, Y5
|
|
VPADDW Y14, Y8, Y8
|
|
VPADDW Y15, Y9, Y9
|
|
|
|
// level 3
|
|
// zetas
|
|
VMOVDQU 224(SI), Y13
|
|
VMOVDQU 256(SI), Y14
|
|
VMOVDQU 288(SI), Y15
|
|
VMOVDQU 320(SI), Y3
|
|
|
|
// mul
|
|
VPMULLW Y13, Y5, Y12
|
|
VPMULHW Y13, Y5, Y5
|
|
VPMULLW Y14, Y7, Y13
|
|
VPMULHW Y14, Y7, Y7
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
VPMULLW Y3, Y11, Y15
|
|
VPMULHW Y3, Y11, Y11
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y5, Y12
|
|
VPSUBW Y13, Y7, Y13
|
|
VPSUBW Y14, Y9, Y14
|
|
VPSUBW Y15, Y11, Y15
|
|
|
|
// reduce 2
|
|
VPSRAW $13, Y4, Y5
|
|
VPSRAW $13, Y6, Y7
|
|
VPSRAW $13, Y8, Y9
|
|
VPSRAW $13, Y10, Y11
|
|
VPAND Y2, Y4, Y4
|
|
VPAND Y2, Y6, Y6
|
|
VPAND Y2, Y8, Y8
|
|
VPAND Y2, Y10, Y10
|
|
VPSUBW Y5, Y4, Y4
|
|
VPSUBW Y7, Y6, Y6
|
|
VPSUBW Y9, Y8, Y8
|
|
VPSUBW Y11, Y10, Y10
|
|
VPSLLW $9, Y5, Y5
|
|
VPSLLW $9, Y7, Y7
|
|
VPSLLW $9, Y9, Y9
|
|
VPSLLW $9, Y11, Y11
|
|
VPADDW Y5, Y4, Y4
|
|
VPADDW Y7, Y6, Y6
|
|
VPADDW Y9, Y8, Y8
|
|
VPADDW Y11, Y10, Y10
|
|
|
|
// update
|
|
VPSUBW Y12, Y4, Y5
|
|
VPSUBW Y13, Y6, Y7
|
|
VPSUBW Y14, Y8, Y9
|
|
VPSUBW Y15, Y10, Y11
|
|
VPADDW Y12, Y4, Y4
|
|
VPADDW Y13, Y6, Y6
|
|
VPADDW Y14, Y8, Y8
|
|
VPADDW Y15, Y10, Y10
|
|
|
|
// level 4
|
|
// zetas
|
|
VMOVDQU 480(SI), Y12
|
|
VMOVDQU 512(SI), Y13
|
|
VMOVDQU 544(SI), Y14
|
|
VMOVDQU 576(SI), Y15
|
|
|
|
// shuffle
|
|
VPERM2I128 $0x02, Y4, Y5, Y3
|
|
VPERM2I128 $0x13, Y4, Y5, Y4
|
|
VPERM2I128 $0x02, Y6, Y7, Y5
|
|
VPERM2I128 $0x13, Y6, Y7, Y6
|
|
VPERM2I128 $0x02, Y8, Y9, Y7
|
|
VPERM2I128 $0x13, Y8, Y9, Y8
|
|
VPERM2I128 $0x02, Y10, Y11, Y9
|
|
VPERM2I128 $0x13, Y10, Y11, Y10
|
|
|
|
// mul
|
|
VPMULLW Y12, Y4, Y11
|
|
VPMULHW Y12, Y4, Y4
|
|
VPMULLW Y13, Y6, Y12
|
|
VPMULHW Y13, Y6, Y6
|
|
VPMULLW Y14, Y8, Y13
|
|
VPMULHW Y14, Y8, Y8
|
|
VPMULLW Y15, Y10, Y14
|
|
VPMULHW Y15, Y10, Y10
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y11, Y11
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y11, Y4, Y11
|
|
VPSUBW Y12, Y6, Y12
|
|
VPSUBW Y13, Y8, Y13
|
|
VPSUBW Y14, Y10, Y14
|
|
|
|
// update
|
|
VPSUBW Y11, Y3, Y4
|
|
VPSUBW Y12, Y5, Y6
|
|
VPSUBW Y13, Y7, Y8
|
|
VPSUBW Y14, Y9, Y10
|
|
VPADDW Y11, Y3, Y3
|
|
VPADDW Y12, Y5, Y5
|
|
VPADDW Y13, Y7, Y7
|
|
VPADDW Y14, Y9, Y9
|
|
|
|
// level 5
|
|
// zetas
|
|
VMOVDQU 736(SI), Y12
|
|
VMOVDQU 768(SI), Y13
|
|
VMOVDQU 800(SI), Y14
|
|
VMOVDQU 832(SI), Y15
|
|
|
|
// shuffle
|
|
VSHUFPD $0x00, Y4, Y3, Y11
|
|
VSHUFPD $0x0F, Y4, Y3, Y3
|
|
VSHUFPD $0x00, Y6, Y5, Y4
|
|
VSHUFPD $0x0F, Y6, Y5, Y5
|
|
VSHUFPD $0x00, Y8, Y7, Y6
|
|
VSHUFPD $0x0F, Y8, Y7, Y7
|
|
VSHUFPD $0x00, Y10, Y9, Y8
|
|
VSHUFPD $0x0F, Y10, Y9, Y9
|
|
|
|
// mul
|
|
VPMULLW Y12, Y3, Y10
|
|
VPMULHW Y12, Y3, Y3
|
|
VPMULLW Y13, Y5, Y12
|
|
VPMULHW Y13, Y5, Y5
|
|
VPMULLW Y14, Y7, Y13
|
|
VPMULHW Y14, Y7, Y7
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y10, Y10
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y10, Y10
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y10, Y3, Y10
|
|
VPSUBW Y12, Y5, Y12
|
|
VPSUBW Y13, Y7, Y13
|
|
VPSUBW Y14, Y9, Y14
|
|
|
|
// update
|
|
VPSUBW Y10, Y11, Y3
|
|
VPSUBW Y12, Y4, Y5
|
|
VPSUBW Y13, Y6, Y7
|
|
VPSUBW Y14, Y8, Y9
|
|
VPADDW Y10, Y11, Y10
|
|
VPADDW Y12, Y4, Y4
|
|
VPADDW Y13, Y6, Y6
|
|
VPADDW Y14, Y8, Y8
|
|
|
|
// level 6
|
|
// shuffle
|
|
VPSHUFD $0xB1, Y10, Y12
|
|
VPSHUFD $0xB1, Y3, Y13
|
|
VPSHUFD $0xB1, Y4, Y14
|
|
VPSHUFD $0xB1, Y5, Y15
|
|
VPBLENDD $0x55, Y10, Y13, Y10
|
|
VPBLENDD $0xAA, Y3, Y12, Y3
|
|
VPBLENDD $0x55, Y4, Y15, Y4
|
|
VPBLENDD $0xAA, Y5, Y14, Y5
|
|
VPSHUFD $0xB1, Y6, Y12
|
|
VPSHUFD $0xB1, Y7, Y13
|
|
VPSHUFD $0xB1, Y8, Y14
|
|
VPSHUFD $0xB1, Y9, Y15
|
|
VPBLENDD $0x55, Y6, Y13, Y6
|
|
VPBLENDD $0xAA, Y7, Y12, Y7
|
|
VPBLENDD $0x55, Y8, Y15, Y8
|
|
VPBLENDD $0xAA, Y9, Y14, Y9
|
|
|
|
// zetas
|
|
VMOVDQU 992(SI), Y12
|
|
VMOVDQU 1024(SI), Y13
|
|
VMOVDQU 1056(SI), Y14
|
|
VMOVDQU 1088(SI), Y15
|
|
|
|
// mul
|
|
VPMULLW Y12, Y3, Y11
|
|
VPMULHW Y12, Y3, Y3
|
|
VPMULLW Y13, Y5, Y12
|
|
VPMULHW Y13, Y5, Y5
|
|
VPMULLW Y14, Y7, Y13
|
|
VPMULHW Y14, Y7, Y7
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y11, Y11
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y11, Y3, Y11
|
|
VPSUBW Y12, Y5, Y12
|
|
VPSUBW Y13, Y7, Y13
|
|
VPSUBW Y14, Y9, Y14
|
|
|
|
// reduce 2
|
|
VPSRAW $13, Y10, Y3
|
|
VPSRAW $13, Y4, Y5
|
|
VPSRAW $13, Y6, Y7
|
|
VPSRAW $13, Y8, Y9
|
|
VPAND Y2, Y10, Y10
|
|
VPAND Y2, Y4, Y4
|
|
VPAND Y2, Y6, Y6
|
|
VPAND Y2, Y8, Y8
|
|
VPSUBW Y3, Y10, Y10
|
|
VPSUBW Y5, Y4, Y4
|
|
VPSUBW Y7, Y6, Y6
|
|
VPSUBW Y9, Y8, Y8
|
|
VPSLLW $9, Y3, Y3
|
|
VPSLLW $9, Y5, Y5
|
|
VPSLLW $9, Y7, Y7
|
|
VPSLLW $9, Y9, Y9
|
|
VPADDW Y3, Y10, Y10
|
|
VPADDW Y5, Y4, Y4
|
|
VPADDW Y7, Y6, Y6
|
|
VPADDW Y9, Y8, Y8
|
|
|
|
// update
|
|
VPSUBW Y11, Y10, Y3
|
|
VPSUBW Y12, Y4, Y5
|
|
VPSUBW Y13, Y6, Y7
|
|
VPSUBW Y14, Y8, Y9
|
|
VPADDW Y11, Y10, Y10
|
|
VPADDW Y12, Y4, Y4
|
|
VPADDW Y13, Y6, Y6
|
|
VPADDW Y14, Y8, Y8
|
|
|
|
// level 7
|
|
// shuffle
|
|
VMOVDQU ·vpshufb_idx<>(SB), Y15
|
|
VPSHUFB Y15, Y10, Y11
|
|
VPSHUFB Y15, Y3, Y12
|
|
VPSHUFB Y15, Y4, Y13
|
|
VPSHUFB Y15, Y5, Y14
|
|
VPBLENDW $0x55, Y10, Y12, Y10
|
|
VPBLENDW $0xAA, Y3, Y11, Y3
|
|
VPBLENDW $0x55, Y4, Y14, Y4
|
|
VPBLENDW $0xAA, Y5, Y13, Y5
|
|
VPSHUFB Y15, Y6, Y11
|
|
VPSHUFB Y15, Y7, Y12
|
|
VPSHUFB Y15, Y8, Y13
|
|
VPSHUFB Y15, Y9, Y14
|
|
VPBLENDW $0x55, Y6, Y12, Y6
|
|
VPBLENDW $0xAA, Y7, Y11, Y7
|
|
VPBLENDW $0x55, Y8, Y14, Y8
|
|
VPBLENDW $0xAA, Y9, Y13, Y9
|
|
|
|
// zetas
|
|
VMOVDQU 1248(SI), Y12
|
|
VMOVDQU 1280(SI), Y13
|
|
VMOVDQU 1312(SI), Y14
|
|
VMOVDQU 1344(SI), Y15
|
|
|
|
// mul
|
|
VPMULLW Y12, Y3, Y11
|
|
VPMULHW Y12, Y3, Y3
|
|
VPMULLW Y13, Y5, Y12
|
|
VPMULHW Y13, Y5, Y5
|
|
VPMULLW Y14, Y7, Y13
|
|
VPMULHW Y14, Y7, Y7
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y11, Y11
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y11, Y3, Y11
|
|
VPSUBW Y12, Y5, Y12
|
|
VPSUBW Y13, Y7, Y13
|
|
VPSUBW Y14, Y9, Y14
|
|
|
|
// reduce 3
|
|
VMOVDQU ·q2_x16<>(SB), Y15
|
|
VPSRAW $15, Y10, Y3
|
|
VPSRAW $15, Y4, Y5
|
|
VPSRAW $15, Y6, Y7
|
|
VPSRAW $15, Y8, Y9
|
|
VPAND Y15, Y3, Y3
|
|
VPAND Y15, Y5, Y5
|
|
VPAND Y15, Y7, Y7
|
|
VPAND Y15, Y9, Y9
|
|
VPADDW Y1, Y10, Y10
|
|
VPADDW Y1, Y4, Y4
|
|
VPADDW Y1, Y6, Y6
|
|
VPADDW Y1, Y8, Y8
|
|
VPADDW Y3, Y10, Y10
|
|
VPADDW Y5, Y4, Y4
|
|
VPADDW Y7, Y6, Y6
|
|
VPADDW Y9, Y8, Y8
|
|
|
|
// update
|
|
VPSUBW Y11, Y10, Y3
|
|
VPSUBW Y12, Y4, Y5
|
|
VPSUBW Y13, Y6, Y7
|
|
VPSUBW Y14, Y8, Y9
|
|
VPADDW Y11, Y10, Y10
|
|
VPADDW Y12, Y4, Y4
|
|
VPADDW Y13, Y6, Y6
|
|
VPADDW Y14, Y8, Y8
|
|
|
|
// reorder
|
|
VPUNPCKLWD Y3, Y10, Y12
|
|
VPUNPCKHWD Y3, Y10, Y13
|
|
VPUNPCKLWD Y5, Y4, Y14
|
|
VPUNPCKHWD Y5, Y4, Y15
|
|
VPUNPCKLWD Y7, Y6, Y3
|
|
VPUNPCKHWD Y7, Y6, Y4
|
|
VPUNPCKLWD Y9, Y8, Y5
|
|
VPUNPCKHWD Y9, Y8, Y6
|
|
VPERM2I128 $0x20, Y13, Y12, Y11
|
|
VPERM2I128 $0x31, Y13, Y12, Y12
|
|
VPERM2I128 $0x20, Y15, Y14, Y13
|
|
VPERM2I128 $0x31, Y15, Y14, Y14
|
|
VPERM2I128 $0x20, Y4, Y3, Y15
|
|
VPERM2I128 $0x31, Y4, Y3, Y3
|
|
VPERM2I128 $0x20, Y6, Y5, Y4
|
|
VPERM2I128 $0x31, Y6, Y5, Y5
|
|
|
|
// store
|
|
VMOVDQU Y11, (DI)
|
|
VMOVDQU Y12, 32(DI)
|
|
VMOVDQU Y13, 64(DI)
|
|
VMOVDQU Y14, 96(DI)
|
|
VMOVDQU Y15, 128(DI)
|
|
VMOVDQU Y3, 160(DI)
|
|
VMOVDQU Y4, 192(DI)
|
|
VMOVDQU Y5, 224(DI)
|
|
|
|
ADDQ $256, DI
|
|
|
|
// second round
|
|
// zetas
|
|
VMOVDQU 64(SI), Y3
|
|
|
|
// load
|
|
VMOVDQU (DI), Y4
|
|
VMOVDQU 32(DI), Y5
|
|
VMOVDQU 64(DI), Y6
|
|
VMOVDQU 96(DI), Y7
|
|
VMOVDQU 128(DI), Y8
|
|
VMOVDQU 160(DI), Y9
|
|
VMOVDQU 192(DI), Y10
|
|
VMOVDQU 224(DI), Y11
|
|
|
|
// level 1
|
|
// mul
|
|
VPMULLW Y3, Y8, Y12
|
|
VPMULHW Y3, Y8, Y8
|
|
VPMULLW Y3, Y9, Y13
|
|
VPMULHW Y3, Y9, Y9
|
|
VPMULLW Y3, Y10, Y14
|
|
VPMULHW Y3, Y10, Y10
|
|
VPMULLW Y3, Y11, Y15
|
|
VPMULHW Y3, Y11, Y11
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y8, Y12
|
|
VPSUBW Y13, Y9, Y13
|
|
VPSUBW Y14, Y10, Y14
|
|
VPSUBW Y15, Y11, Y15
|
|
|
|
// update
|
|
VPSUBW Y12, Y4, Y8
|
|
VPSUBW Y13, Y5, Y9
|
|
VPSUBW Y14, Y6, Y10
|
|
VPSUBW Y15, Y7, Y11
|
|
VPADDW Y12, Y4, Y4
|
|
VPADDW Y13, Y5, Y5
|
|
VPADDW Y14, Y6, Y6
|
|
VPADDW Y15, Y7, Y7
|
|
|
|
// level 2
|
|
// zetas
|
|
VMOVDQU 160(SI), Y15
|
|
VMOVDQU 192(SI), Y3
|
|
|
|
// mul
|
|
VPMULLW Y15, Y6, Y12
|
|
VPMULHW Y15, Y6, Y6
|
|
VPMULLW Y15, Y7, Y13
|
|
VPMULHW Y15, Y7, Y7
|
|
VPMULLW Y3, Y10, Y14
|
|
VPMULHW Y3, Y10, Y10
|
|
VPMULLW Y3, Y11, Y15
|
|
VPMULHW Y3, Y11, Y11
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y6, Y12
|
|
VPSUBW Y13, Y7, Y13
|
|
VPSUBW Y14, Y10, Y14
|
|
VPSUBW Y15, Y11, Y15
|
|
|
|
// update
|
|
VPSUBW Y12, Y4, Y6
|
|
VPSUBW Y13, Y5, Y7
|
|
VPSUBW Y14, Y8, Y10
|
|
VPSUBW Y15, Y9, Y11
|
|
VPADDW Y12, Y4, Y4
|
|
VPADDW Y13, Y5, Y5
|
|
VPADDW Y14, Y8, Y8
|
|
VPADDW Y15, Y9, Y9
|
|
|
|
// level 3
|
|
// zetas
|
|
VMOVDQU 352(SI), Y13
|
|
VMOVDQU 384(SI), Y14
|
|
VMOVDQU 416(SI), Y15
|
|
VMOVDQU 448(SI), Y3
|
|
|
|
// mul
|
|
VPMULLW Y13, Y5, Y12
|
|
VPMULHW Y13, Y5, Y5
|
|
VPMULLW Y14, Y7, Y13
|
|
VPMULHW Y14, Y7, Y7
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
VPMULLW Y3, Y11, Y15
|
|
VPMULHW Y3, Y11, Y11
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y5, Y12
|
|
VPSUBW Y13, Y7, Y13
|
|
VPSUBW Y14, Y9, Y14
|
|
VPSUBW Y15, Y11, Y15
|
|
|
|
// reduce 2
|
|
VPSRAW $13, Y4, Y5
|
|
VPSRAW $13, Y6, Y7
|
|
VPSRAW $13, Y8, Y9
|
|
VPSRAW $13, Y10, Y11
|
|
VPAND Y2, Y4, Y4
|
|
VPAND Y2, Y6, Y6
|
|
VPAND Y2, Y8, Y8
|
|
VPAND Y2, Y10, Y10
|
|
VPSUBW Y5, Y4, Y4
|
|
VPSUBW Y7, Y6, Y6
|
|
VPSUBW Y9, Y8, Y8
|
|
VPSUBW Y11, Y10, Y10
|
|
VPSLLW $9, Y5, Y5
|
|
VPSLLW $9, Y7, Y7
|
|
VPSLLW $9, Y9, Y9
|
|
VPSLLW $9, Y11, Y11
|
|
VPADDW Y5, Y4, Y4
|
|
VPADDW Y7, Y6, Y6
|
|
VPADDW Y9, Y8, Y8
|
|
VPADDW Y11, Y10, Y10
|
|
|
|
// update
|
|
VPSUBW Y12, Y4, Y5
|
|
VPSUBW Y13, Y6, Y7
|
|
VPSUBW Y14, Y8, Y9
|
|
VPSUBW Y15, Y10, Y11
|
|
VPADDW Y12, Y4, Y4
|
|
VPADDW Y13, Y6, Y6
|
|
VPADDW Y14, Y8, Y8
|
|
VPADDW Y15, Y10, Y10
|
|
|
|
// level 4
|
|
// zetas
|
|
VMOVDQU 608(SI), Y12
|
|
VMOVDQU 640(SI), Y13
|
|
VMOVDQU 672(SI), Y14
|
|
VMOVDQU 704(SI), Y15
|
|
|
|
// shuffle
|
|
VPERM2I128 $0x02, Y4, Y5, Y3
|
|
VPERM2I128 $0x13, Y4, Y5, Y4
|
|
VPERM2I128 $0x02, Y6, Y7, Y5
|
|
VPERM2I128 $0x13, Y6, Y7, Y6
|
|
VPERM2I128 $0x02, Y8, Y9, Y7
|
|
VPERM2I128 $0x13, Y8, Y9, Y8
|
|
VPERM2I128 $0x02, Y10, Y11, Y9
|
|
VPERM2I128 $0x13, Y10, Y11, Y10
|
|
|
|
// mul
|
|
VPMULLW Y12, Y4, Y11
|
|
VPMULHW Y12, Y4, Y4
|
|
VPMULLW Y13, Y6, Y12
|
|
VPMULHW Y13, Y6, Y6
|
|
VPMULLW Y14, Y8, Y13
|
|
VPMULHW Y14, Y8, Y8
|
|
VPMULLW Y15, Y10, Y14
|
|
VPMULHW Y15, Y10, Y10
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y11, Y11
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y11, Y4, Y11
|
|
VPSUBW Y12, Y6, Y12
|
|
VPSUBW Y13, Y8, Y13
|
|
VPSUBW Y14, Y10, Y14
|
|
|
|
// update
|
|
VPSUBW Y11, Y3, Y4
|
|
VPSUBW Y12, Y5, Y6
|
|
VPSUBW Y13, Y7, Y8
|
|
VPSUBW Y14, Y9, Y10
|
|
VPADDW Y11, Y3, Y3
|
|
VPADDW Y12, Y5, Y5
|
|
VPADDW Y13, Y7, Y7
|
|
VPADDW Y14, Y9, Y9
|
|
|
|
// level 5
|
|
// zetas
|
|
VMOVDQU 864(SI), Y12
|
|
VMOVDQU 896(SI), Y13
|
|
VMOVDQU 928(SI), Y14
|
|
VMOVDQU 960(SI), Y15
|
|
|
|
// shuffle
|
|
VSHUFPD $0x00, Y4, Y3, Y11
|
|
VSHUFPD $0x0F, Y4, Y3, Y3
|
|
VSHUFPD $0x00, Y6, Y5, Y4
|
|
VSHUFPD $0x0F, Y6, Y5, Y5
|
|
VSHUFPD $0x00, Y8, Y7, Y6
|
|
VSHUFPD $0x0F, Y8, Y7, Y7
|
|
VSHUFPD $0x00, Y10, Y9, Y8
|
|
VSHUFPD $0x0F, Y10, Y9, Y9
|
|
|
|
// mul
|
|
VPMULLW Y12, Y3, Y10
|
|
VPMULHW Y12, Y3, Y3
|
|
VPMULLW Y13, Y5, Y12
|
|
VPMULHW Y13, Y5, Y5
|
|
VPMULLW Y14, Y7, Y13
|
|
VPMULHW Y14, Y7, Y7
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y10, Y10
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y10, Y10
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y10, Y3, Y10
|
|
VPSUBW Y12, Y5, Y12
|
|
VPSUBW Y13, Y7, Y13
|
|
VPSUBW Y14, Y9, Y14
|
|
|
|
// update
|
|
VPSUBW Y10, Y11, Y3
|
|
VPSUBW Y12, Y4, Y5
|
|
VPSUBW Y13, Y6, Y7
|
|
VPSUBW Y14, Y8, Y9
|
|
VPADDW Y10, Y11, Y10
|
|
VPADDW Y12, Y4, Y4
|
|
VPADDW Y13, Y6, Y6
|
|
VPADDW Y14, Y8, Y8
|
|
|
|
// level 6
|
|
// shuffle
|
|
VPSHUFD $0xB1, Y10, Y12
|
|
VPSHUFD $0xB1, Y3, Y13
|
|
VPSHUFD $0xB1, Y4, Y14
|
|
VPSHUFD $0xB1, Y5, Y15
|
|
VPBLENDD $0x55, Y10, Y13, Y10
|
|
VPBLENDD $0xAA, Y3, Y12, Y3
|
|
VPBLENDD $0x55, Y4, Y15, Y4
|
|
VPBLENDD $0xAA, Y5, Y14, Y5
|
|
VPSHUFD $0xB1, Y6, Y12
|
|
VPSHUFD $0xB1, Y7, Y13
|
|
VPSHUFD $0xB1, Y8, Y14
|
|
VPSHUFD $0xB1, Y9, Y15
|
|
VPBLENDD $0x55, Y6, Y13, Y6
|
|
VPBLENDD $0xAA, Y7, Y12, Y7
|
|
VPBLENDD $0x55, Y8, Y15, Y8
|
|
VPBLENDD $0xAA, Y9, Y14, Y9
|
|
|
|
// zetas
|
|
VMOVDQU 1120(SI), Y12
|
|
VMOVDQU 1152(SI), Y13
|
|
VMOVDQU 1184(SI), Y14
|
|
VMOVDQU 1216(SI), Y15
|
|
|
|
// mul
|
|
VPMULLW Y12, Y3, Y11
|
|
VPMULHW Y12, Y3, Y3
|
|
VPMULLW Y13, Y5, Y12
|
|
VPMULHW Y13, Y5, Y5
|
|
VPMULLW Y14, Y7, Y13
|
|
VPMULHW Y14, Y7, Y7
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y11, Y11
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y11, Y3, Y11
|
|
VPSUBW Y12, Y5, Y12
|
|
VPSUBW Y13, Y7, Y13
|
|
VPSUBW Y14, Y9, Y14
|
|
|
|
// reduce 2
|
|
VPSRAW $13, Y10, Y3
|
|
VPSRAW $13, Y4, Y5
|
|
VPSRAW $13, Y6, Y7
|
|
VPSRAW $13, Y8, Y9
|
|
VPAND Y2, Y10, Y10
|
|
VPAND Y2, Y4, Y4
|
|
VPAND Y2, Y6, Y6
|
|
VPAND Y2, Y8, Y8
|
|
VPSUBW Y3, Y10, Y10
|
|
VPSUBW Y5, Y4, Y4
|
|
VPSUBW Y7, Y6, Y6
|
|
VPSUBW Y9, Y8, Y8
|
|
VPSLLW $9, Y3, Y3
|
|
VPSLLW $9, Y5, Y5
|
|
VPSLLW $9, Y7, Y7
|
|
VPSLLW $9, Y9, Y9
|
|
VPADDW Y3, Y10, Y10
|
|
VPADDW Y5, Y4, Y4
|
|
VPADDW Y7, Y6, Y6
|
|
VPADDW Y9, Y8, Y8
|
|
|
|
// update
|
|
VPSUBW Y11, Y10, Y3
|
|
VPSUBW Y12, Y4, Y5
|
|
VPSUBW Y13, Y6, Y7
|
|
VPSUBW Y14, Y8, Y9
|
|
VPADDW Y11, Y10, Y10
|
|
VPADDW Y12, Y4, Y4
|
|
VPADDW Y13, Y6, Y6
|
|
VPADDW Y14, Y8, Y8
|
|
|
|
// level 7
|
|
// shuffle
|
|
VMOVDQU ·vpshufb_idx<>(SB), Y15
|
|
VPSHUFB Y15, Y10, Y11
|
|
VPSHUFB Y15, Y3, Y12
|
|
VPSHUFB Y15, Y4, Y13
|
|
VPSHUFB Y15, Y5, Y14
|
|
VPBLENDW $0x55, Y10, Y12, Y10
|
|
VPBLENDW $0xAA, Y3, Y11, Y3
|
|
VPBLENDW $0x55, Y4, Y14, Y4
|
|
VPBLENDW $0xAA, Y5, Y13, Y5
|
|
VPSHUFB Y15, Y6, Y11
|
|
VPSHUFB Y15, Y7, Y12
|
|
VPSHUFB Y15, Y8, Y13
|
|
VPSHUFB Y15, Y9, Y14
|
|
VPBLENDW $0x55, Y6, Y12, Y6
|
|
VPBLENDW $0xAA, Y7, Y11, Y7
|
|
VPBLENDW $0x55, Y8, Y14, Y8
|
|
VPBLENDW $0xAA, Y9, Y13, Y9
|
|
|
|
// zetas
|
|
VMOVDQU 1376(SI), Y12
|
|
VMOVDQU 1408(SI), Y13
|
|
VMOVDQU 1440(SI), Y14
|
|
VMOVDQU 1472(SI), Y15
|
|
|
|
// mul
|
|
VPMULLW Y12, Y3, Y11
|
|
VPMULHW Y12, Y3, Y3
|
|
VPMULLW Y13, Y5, Y12
|
|
VPMULHW Y13, Y5, Y5
|
|
VPMULLW Y14, Y7, Y13
|
|
VPMULHW Y14, Y7, Y7
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y11, Y11
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y11, Y3, Y11
|
|
VPSUBW Y12, Y5, Y12
|
|
VPSUBW Y13, Y7, Y13
|
|
VPSUBW Y14, Y9, Y14
|
|
|
|
// reduce 3
|
|
VMOVDQU ·q2_x16<>(SB), Y15
|
|
VPSRAW $15, Y10, Y3
|
|
VPSRAW $15, Y4, Y5
|
|
VPSRAW $15, Y6, Y7
|
|
VPSRAW $15, Y8, Y9
|
|
VPAND Y15, Y3, Y3
|
|
VPAND Y15, Y5, Y5
|
|
VPAND Y15, Y7, Y7
|
|
VPAND Y15, Y9, Y9
|
|
VPADDW Y1, Y10, Y10
|
|
VPADDW Y1, Y4, Y4
|
|
VPADDW Y1, Y6, Y6
|
|
VPADDW Y1, Y8, Y8
|
|
VPADDW Y3, Y10, Y10
|
|
VPADDW Y5, Y4, Y4
|
|
VPADDW Y7, Y6, Y6
|
|
VPADDW Y9, Y8, Y8
|
|
|
|
// update
|
|
VPSUBW Y11, Y10, Y3
|
|
VPSUBW Y12, Y4, Y5
|
|
VPSUBW Y13, Y6, Y7
|
|
VPSUBW Y14, Y8, Y9
|
|
VPADDW Y11, Y10, Y10
|
|
VPADDW Y12, Y4, Y4
|
|
VPADDW Y13, Y6, Y6
|
|
VPADDW Y14, Y8, Y8
|
|
|
|
// reorder
|
|
VPUNPCKLWD Y3, Y10, Y12
|
|
VPUNPCKHWD Y3, Y10, Y13
|
|
VPUNPCKLWD Y5, Y4, Y14
|
|
VPUNPCKHWD Y5, Y4, Y15
|
|
VPUNPCKLWD Y7, Y6, Y3
|
|
VPUNPCKHWD Y7, Y6, Y4
|
|
VPUNPCKLWD Y9, Y8, Y5
|
|
VPUNPCKHWD Y9, Y8, Y6
|
|
VPERM2I128 $0x20, Y13, Y12, Y11
|
|
VPERM2I128 $0x31, Y13, Y12, Y12
|
|
VPERM2I128 $0x20, Y15, Y14, Y13
|
|
VPERM2I128 $0x31, Y15, Y14, Y14
|
|
VPERM2I128 $0x20, Y4, Y3, Y15
|
|
VPERM2I128 $0x31, Y4, Y3, Y3
|
|
VPERM2I128 $0x20, Y6, Y5, Y4
|
|
VPERM2I128 $0x31, Y6, Y5, Y5
|
|
|
|
// store
|
|
VMOVDQU Y11, (DI)
|
|
VMOVDQU Y12, 32(DI)
|
|
VMOVDQU Y13, 64(DI)
|
|
VMOVDQU Y14, 96(DI)
|
|
VMOVDQU Y15, 128(DI)
|
|
VMOVDQU Y3, 160(DI)
|
|
VMOVDQU Y4, 192(DI)
|
|
VMOVDQU Y5, 224(DI)
|
|
|
|
VZEROUPPER
|
|
RET
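
// Throughout nttAVX2 (and invnttAVX2 below) every "mul" block followed by a
// "reduce" block is a 16-lane Montgomery-style multiply by a zeta constant:
// VPMULLW/VPMULHW produce the low and high halves of the 32-bit products, and
// the reduce step folds the low half back in through qinv_x16 and q_x16. A
// scalar Go sketch of what one 16-bit lane computes (illustrative only; the
// function name is not from this package):
//
//	// montMul returns a value congruent to a*zeta*2^-16 mod q (within +/- q).
//	func montMul(a, zeta int16) int16 {
//		const q = 7681
//		const qinv = 57857 // q^-1 mod 2^16
//		prod := int32(a) * int32(zeta)
//		lo := int16(prod)                // VPMULLW zeta, a
//		hi := int16(prod >> 16)          // VPMULHW zeta, a
//		m := int16(int32(lo) * qinv)     // VPMULLW with qinv_x16
//		t := int16((int32(m) * q) >> 16) // VPMULHW with q_x16
//		return hi - t                    // VPSUBW
//	}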

// Go 1.10's VPERMQ support expects the imm8 to be an `int8`, instead of a
// `uint8`. While this is fixed in master, use the signed representation
// for now till it's reasonable to expect versions with the fix to be widely
// available.
//
// See: https://github.com/golang/go/issues/24378
#define invntt_VPERMQ_IDX $-40 // $0xd8
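// (0xd8 is 216; reinterpreted as a two's-complement int8 that is 216 - 256 =
// -40, so $-40 and $0xd8 encode the same immediate byte and therefore the
// same qword permutation.)
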
// func invnttAVX2(inout, omegas *uint16)
|
|
TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
|
|
MOVQ inout+0(FP), DI
|
|
MOVQ omegas+8(FP), SI
|
|
|
|
VMOVDQU ·qinv_x16<>(SB), Y0
|
|
VMOVDQU ·q_x16<>(SB), Y1
|
|
VMOVDQU ·v_x16<>(SB), Y2
|
|
|
|
MOVQ SI, R8
|
|
|
|
// first round
|
|
// load
|
|
VMOVDQU (DI), Y4
|
|
VMOVDQU 32(DI), Y5
|
|
VMOVDQU 64(DI), Y6
|
|
VMOVDQU 96(DI), Y7
|
|
VMOVDQU 128(DI), Y8
|
|
VMOVDQU 160(DI), Y9
|
|
VMOVDQU 192(DI), Y10
|
|
VMOVDQU 224(DI), Y11
|
|
|
|
// reorder
|
|
VMOVDQU ·lowdword<>(SB), Y3
|
|
VPAND Y3, Y4, Y12
|
|
VPAND Y3, Y5, Y13
|
|
VPAND Y3, Y6, Y14
|
|
VPAND Y3, Y7, Y15
|
|
VPSRLD $16, Y4, Y4
|
|
VPSRLD $16, Y5, Y5
|
|
VPSRLD $16, Y6, Y6
|
|
VPSRLD $16, Y7, Y7
|
|
VPACKUSDW Y5, Y4, Y5
|
|
VPACKUSDW Y13, Y12, Y4
|
|
VPACKUSDW Y7, Y6, Y7
|
|
VPACKUSDW Y15, Y14, Y6
|
|
VPERMQ invntt_VPERMQ_IDX, Y4, Y4
|
|
VPERMQ invntt_VPERMQ_IDX, Y5, Y5
|
|
VPERMQ invntt_VPERMQ_IDX, Y6, Y6
|
|
VPERMQ invntt_VPERMQ_IDX, Y7, Y7
|
|
VPAND Y3, Y8, Y12
|
|
VPAND Y3, Y9, Y13
|
|
VPAND Y3, Y10, Y14
|
|
VPAND Y3, Y11, Y15
|
|
VPSRLD $16, Y8, Y8
|
|
VPSRLD $16, Y9, Y9
|
|
VPSRLD $16, Y10, Y10
|
|
VPSRLD $16, Y11, Y11
|
|
VPACKUSDW Y9, Y8, Y9
|
|
VPACKUSDW Y13, Y12, Y8
|
|
VPACKUSDW Y11, Y10, Y11
|
|
VPACKUSDW Y15, Y14, Y10
|
|
VPERMQ invntt_VPERMQ_IDX, Y8, Y8
|
|
VPERMQ invntt_VPERMQ_IDX, Y9, Y9
|
|
VPERMQ invntt_VPERMQ_IDX, Y10, Y10
|
|
VPERMQ invntt_VPERMQ_IDX, Y11, Y11
|
|
|
|
// level 0
|
|
// update
|
|
VPSUBW Y5, Y4, Y12
|
|
VPSUBW Y7, Y6, Y13
|
|
VPSUBW Y9, Y8, Y14
|
|
VPSUBW Y11, Y10, Y15
|
|
VPADDW Y4, Y5, Y4
|
|
VPADDW Y6, Y7, Y6
|
|
VPADDW Y8, Y9, Y8
|
|
VPADDW Y10, Y11, Y10
|
|
|
|
// zetas
|
|
VMOVDQU (R8), Y7
|
|
VMOVDQU 32(R8), Y9
|
|
VMOVDQU 64(R8), Y11
|
|
VMOVDQU 96(R8), Y3
|
|
|
|
// mul
|
|
VPMULLW Y7, Y12, Y5
|
|
VPMULHW Y7, Y12, Y12
|
|
VPMULLW Y9, Y13, Y7
|
|
VPMULHW Y9, Y13, Y13
|
|
VPMULLW Y11, Y14, Y9
|
|
VPMULHW Y11, Y14, Y14
|
|
VPMULLW Y3, Y15, Y11
|
|
VPMULHW Y3, Y15, Y15
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y5, Y5
|
|
VPMULLW Y0, Y7, Y7
|
|
VPMULLW Y0, Y9, Y9
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULHW Y1, Y5, Y5
|
|
VPMULHW Y1, Y7, Y7
|
|
VPMULHW Y1, Y9, Y9
|
|
VPMULHW Y1, Y11, Y11
|
|
VPSUBW Y5, Y12, Y5
|
|
VPSUBW Y7, Y13, Y7
|
|
VPSUBW Y9, Y14, Y9
|
|
VPSUBW Y11, Y15, Y11
|
|
|
|
// level 1
|
|
// shuffle
|
|
VMOVDQU ·vpshufb_idx<>(SB), Y3
|
|
VPSHUFB Y3, Y4, Y12
|
|
VPSHUFB Y3, Y5, Y13
|
|
VPSHUFB Y3, Y6, Y14
|
|
VPSHUFB Y3, Y7, Y15
|
|
VPBLENDW $0x55, Y4, Y13, Y4
|
|
VPBLENDW $0xAA, Y5, Y12, Y5
|
|
VPBLENDW $0x55, Y6, Y15, Y6
|
|
VPBLENDW $0xAA, Y7, Y14, Y7
|
|
VPSHUFB Y3, Y8, Y12
|
|
VPSHUFB Y3, Y9, Y13
|
|
VPSHUFB Y3, Y10, Y14
|
|
VPSHUFB Y3, Y11, Y15
|
|
VPBLENDW $0x55, Y8, Y13, Y8
|
|
VPBLENDW $0xAA, Y9, Y12, Y9
|
|
VPBLENDW $0x55, Y10, Y15, Y10
|
|
VPBLENDW $0xAA, Y11, Y14, Y11
|
|
|
|
// update
|
|
VPSUBW Y5, Y4, Y12
|
|
VPSUBW Y7, Y6, Y13
|
|
VPSUBW Y9, Y8, Y14
|
|
VPSUBW Y11, Y10, Y15
|
|
VPADDW Y4, Y5, Y4
|
|
VPADDW Y6, Y7, Y6
|
|
VPADDW Y8, Y9, Y8
|
|
VPADDW Y10, Y11, Y10
|
|
|
|
// zetas
|
|
VMOVDQU 256(R8), Y7
|
|
VMOVDQU 288(R8), Y9
|
|
VMOVDQU 320(R8), Y11
|
|
VMOVDQU 352(R8), Y3
|
|
|
|
// mul
|
|
VPMULLW Y7, Y12, Y5
|
|
VPMULHW Y7, Y12, Y12
|
|
VPMULLW Y9, Y13, Y7
|
|
VPMULHW Y9, Y13, Y13
|
|
VPMULLW Y11, Y14, Y9
|
|
VPMULHW Y11, Y14, Y14
|
|
VPMULLW Y3, Y15, Y11
|
|
VPMULHW Y3, Y15, Y15
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y5, Y5
|
|
VPMULLW Y0, Y7, Y7
|
|
VPMULLW Y0, Y9, Y9
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULHW Y1, Y5, Y5
|
|
VPMULHW Y1, Y7, Y7
|
|
VPMULHW Y1, Y9, Y9
|
|
VPMULHW Y1, Y11, Y11
|
|
VPSUBW Y5, Y12, Y5
|
|
VPSUBW Y7, Y13, Y7
|
|
VPSUBW Y9, Y14, Y9
|
|
VPSUBW Y11, Y15, Y11
|
|
|
|
// reduce 2
|
|
VPMULHW Y2, Y4, Y12
|
|
VPMULHW Y2, Y6, Y13
|
|
VPMULHW Y2, Y8, Y14
|
|
VPMULHW Y2, Y10, Y15
|
|
VPSRAW $11, Y12, Y12
|
|
VPSRAW $11, Y13, Y13
|
|
VPSRAW $11, Y14, Y14
|
|
VPSRAW $11, Y15, Y15
|
|
VPMULLW Y1, Y12, Y12
|
|
VPMULLW Y1, Y13, Y13
|
|
VPMULLW Y1, Y14, Y14
|
|
VPMULLW Y1, Y15, Y15
|
|
VPSUBW Y12, Y4, Y4
|
|
VPSUBW Y13, Y6, Y6
|
|
VPSUBW Y14, Y8, Y8
|
|
VPSUBW Y15, Y10, Y10
|
|
|
|
// level 2
|
|
// shuffle
|
|
VPSHUFD $0xB1, Y4, Y12
|
|
VPSHUFD $0xB1, Y5, Y13
|
|
VPSHUFD $0xB1, Y6, Y14
|
|
VPSHUFD $0xB1, Y7, Y15
|
|
VPBLENDD $0x55, Y4, Y13, Y4
|
|
VPBLENDD $0xAA, Y5, Y12, Y5
|
|
VPBLENDD $0x55, Y6, Y15, Y6
|
|
VPBLENDD $0xAA, Y7, Y14, Y7
|
|
VPSHUFD $0xB1, Y8, Y12
|
|
VPSHUFD $0xB1, Y9, Y13
|
|
VPSHUFD $0xB1, Y10, Y14
|
|
VPSHUFD $0xB1, Y11, Y15
|
|
VPBLENDD $0x55, Y8, Y13, Y8
|
|
VPBLENDD $0xAA, Y9, Y12, Y9
|
|
VPBLENDD $0x55, Y10, Y15, Y10
|
|
VPBLENDD $0xAA, Y11, Y14, Y11
|
|
|
|
// update
|
|
VPSUBW Y5, Y4, Y12
|
|
VPSUBW Y7, Y6, Y13
|
|
VPSUBW Y9, Y8, Y14
|
|
VPSUBW Y11, Y10, Y15
|
|
VPADDW Y4, Y5, Y4
|
|
VPADDW Y6, Y7, Y6
|
|
VPADDW Y8, Y9, Y8
|
|
VPADDW Y10, Y11, Y10
|
|
|
|
// zetas
|
|
VMOVDQU 512(R8), Y7
|
|
VMOVDQU 544(R8), Y9
|
|
VMOVDQU 576(R8), Y11
|
|
VMOVDQU 608(R8), Y3
|
|
|
|
// mul
|
|
VPMULLW Y7, Y12, Y5
|
|
VPMULHW Y7, Y12, Y12
|
|
VPMULLW Y9, Y13, Y7
|
|
VPMULHW Y9, Y13, Y13
|
|
VPMULLW Y11, Y14, Y9
|
|
VPMULHW Y11, Y14, Y14
|
|
VPMULLW Y3, Y15, Y11
|
|
VPMULHW Y3, Y15, Y15
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y5, Y5
|
|
VPMULLW Y0, Y7, Y7
|
|
VPMULLW Y0, Y9, Y9
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULHW Y1, Y5, Y5
|
|
VPMULHW Y1, Y7, Y7
|
|
VPMULHW Y1, Y9, Y9
|
|
VPMULHW Y1, Y11, Y11
|
|
VPSUBW Y5, Y12, Y5
|
|
VPSUBW Y7, Y13, Y7
|
|
VPSUBW Y9, Y14, Y9
|
|
VPSUBW Y11, Y15, Y11
|
|
|
|
// level 3
|
|
// shuffle
|
|
VSHUFPD $0x00, Y5, Y4, Y3
|
|
VSHUFPD $0x0F, Y5, Y4, Y4
|
|
VSHUFPD $0x00, Y7, Y6, Y5
|
|
VSHUFPD $0x0F, Y7, Y6, Y6
|
|
VSHUFPD $0x00, Y9, Y8, Y7
|
|
VSHUFPD $0x0F, Y9, Y8, Y8
|
|
VSHUFPD $0x00, Y11, Y10, Y9
|
|
VSHUFPD $0x0F, Y11, Y10, Y10
|
|
|
|
// update
|
|
VPSUBW Y4, Y3, Y12
|
|
VPSUBW Y6, Y5, Y13
|
|
VPSUBW Y8, Y7, Y14
|
|
VPSUBW Y10, Y9, Y15
|
|
VPADDW Y3, Y4, Y3
|
|
VPADDW Y5, Y6, Y5
|
|
VPADDW Y7, Y8, Y7
|
|
VPADDW Y9, Y10, Y9
|
|
|
|
// zetas
|
|
VMOVDQU 768(R8), Y6
|
|
VMOVDQU 800(R8), Y8
|
|
VMOVDQU 832(R8), Y10
|
|
VMOVDQU 864(R8), Y11
|
|
|
|
// mul
|
|
VPMULLW Y6, Y12, Y4
|
|
VPMULHW Y6, Y12, Y12
|
|
VPMULLW Y8, Y13, Y6
|
|
VPMULHW Y8, Y13, Y13
|
|
VPMULLW Y10, Y14, Y8
|
|
VPMULHW Y10, Y14, Y14
|
|
VPMULLW Y11, Y15, Y10
|
|
VPMULHW Y11, Y15, Y15
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y4, Y4
|
|
VPMULLW Y0, Y6, Y6
|
|
VPMULLW Y0, Y8, Y8
|
|
VPMULLW Y0, Y10, Y10
|
|
VPMULHW Y1, Y4, Y4
|
|
VPMULHW Y1, Y6, Y6
|
|
VPMULHW Y1, Y8, Y8
|
|
VPMULHW Y1, Y10, Y10
|
|
VPSUBW Y4, Y12, Y4
|
|
VPSUBW Y6, Y13, Y6
|
|
VPSUBW Y8, Y14, Y8
|
|
VPSUBW Y10, Y15, Y10
|
|
|
|
// reduce 2
|
|
VPMULHW Y2, Y3, Y12
|
|
VPMULHW Y2, Y5, Y13
|
|
VPMULHW Y2, Y7, Y14
|
|
VPMULHW Y2, Y9, Y15
|
|
VPSRAW $11, Y12, Y12
|
|
VPSRAW $11, Y13, Y13
|
|
VPSRAW $11, Y14, Y14
|
|
VPSRAW $11, Y15, Y15
|
|
VPMULLW Y1, Y12, Y12
|
|
VPMULLW Y1, Y13, Y13
|
|
VPMULLW Y1, Y14, Y14
|
|
VPMULLW Y1, Y15, Y15
|
|
VPSUBW Y12, Y3, Y3
|
|
VPSUBW Y13, Y5, Y5
|
|
VPSUBW Y14, Y7, Y7
|
|
VPSUBW Y15, Y9, Y9
|
|
|
|
// level 4
|
|
// shuffle
|
|
VPERM2I128 $0x02, Y3, Y4, Y11
|
|
VPERM2I128 $0x13, Y3, Y4, Y3
|
|
VPERM2I128 $0x02, Y5, Y6, Y4
|
|
VPERM2I128 $0x13, Y5, Y6, Y5
|
|
VPERM2I128 $0x02, Y7, Y8, Y6
|
|
VPERM2I128 $0x13, Y7, Y8, Y7
|
|
VPERM2I128 $0x02, Y9, Y10, Y8
|
|
VPERM2I128 $0x13, Y9, Y10, Y9
|
|
|
|
// update
|
|
VMOVDQA Y11, Y12
|
|
VMOVDQA Y4, Y13
|
|
VMOVDQA Y6, Y14
|
|
VMOVDQA Y8, Y15
|
|
VPADDW Y11, Y3, Y10
|
|
VPADDW Y4, Y5, Y4
|
|
VPADDW Y6, Y7, Y6
|
|
VPADDW Y8, Y9, Y8
|
|
VPSUBW Y3, Y12, Y3
|
|
VPSUBW Y5, Y13, Y5
|
|
VPSUBW Y7, Y14, Y7
|
|
VPSUBW Y9, Y15, Y9
|
|
|
|
// zetas
|
|
VMOVDQU 1024(R8), Y12
|
|
VMOVDQU 1056(R8), Y13
|
|
VMOVDQU 1088(R8), Y14
|
|
VMOVDQU 1120(R8), Y15
|
|
|
|
// mul
|
|
VPMULLW Y12, Y3, Y11
|
|
VPMULHW Y12, Y3, Y3
|
|
VPMULLW Y13, Y5, Y12
|
|
VPMULHW Y13, Y5, Y5
|
|
VPMULLW Y14, Y7, Y13
|
|
VPMULHW Y14, Y7, Y7
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y11, Y11
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y11, Y3, Y3
|
|
VPSUBW Y12, Y5, Y5
|
|
VPSUBW Y13, Y7, Y7
|
|
VPSUBW Y14, Y9, Y9
|
|
|
|
// level 5
|
|
// update
|
|
VMOVDQA Y10, Y12
|
|
VMOVDQA Y3, Y13
|
|
VMOVDQA Y6, Y14
|
|
VMOVDQA Y7, Y15
|
|
VPADDW Y10, Y4, Y10
|
|
VPADDW Y3, Y5, Y3
|
|
VPADDW Y6, Y8, Y6
|
|
VPADDW Y7, Y9, Y7
|
|
VPSUBW Y4, Y12, Y4
|
|
VPSUBW Y5, Y13, Y5
|
|
VPSUBW Y8, Y14, Y8
|
|
VPSUBW Y9, Y15, Y9
|
|
|
|
// zetas
|
|
VMOVDQU 1280(SI), Y14
|
|
VMOVDQU 1312(SI), Y15
|
|
|
|
// mul
|
|
VPMULLW Y14, Y4, Y11
|
|
VPMULLW Y14, Y5, Y12
|
|
VPMULLW Y15, Y8, Y13
|
|
VPMULHW Y14, Y4, Y4
|
|
VPMULHW Y14, Y5, Y5
|
|
VPMULHW Y15, Y8, Y8
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y11, Y11
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y11, Y4, Y4
|
|
VPSUBW Y12, Y5, Y5
|
|
VPSUBW Y13, Y8, Y8
|
|
VPSUBW Y14, Y9, Y9
|
|
|
|
// reduce 2
|
|
VPMULHW Y2, Y10, Y12
|
|
VPMULHW Y2, Y6, Y13
|
|
VPSRAW $11, Y12, Y12
|
|
VPSRAW $11, Y13, Y13
|
|
VPMULLW Y1, Y12, Y12
|
|
VPMULLW Y1, Y13, Y13
|
|
VPSUBW Y12, Y10, Y10
|
|
VPSUBW Y13, Y6, Y6
|
|
|
|
// level 6
|
|
// update
|
|
VMOVDQA Y10, Y12
|
|
VMOVDQA Y3, Y13
|
|
VMOVDQA Y4, Y14
|
|
VMOVDQA Y5, Y15
|
|
VPADDW Y10, Y6, Y10
|
|
VPADDW Y3, Y7, Y3
|
|
VPADDW Y4, Y8, Y4
|
|
VPADDW Y5, Y9, Y5
|
|
VPSUBW Y6, Y12, Y6
|
|
VPSUBW Y7, Y13, Y7
|
|
VPSUBW Y8, Y14, Y8
|
|
VPSUBW Y9, Y15, Y9
|
|
|
|
// zetas
|
|
VMOVDQU 1408(SI), Y15
|
|
|
|
// mul
|
|
VPMULLW Y15, Y6, Y11
|
|
VPMULLW Y15, Y7, Y12
|
|
VPMULLW Y15, Y8, Y13
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y6, Y6
|
|
VPMULHW Y15, Y7, Y7
|
|
VPMULHW Y15, Y8, Y8
|
|
VPMULHW Y15, Y9, Y9
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y11, Y11
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y11, Y6, Y6
|
|
VPSUBW Y12, Y7, Y7
|
|
VPSUBW Y13, Y8, Y8
|
|
VPSUBW Y14, Y9, Y9
|
|
|
|
// reduce 2
|
|
VPMULHW Y2, Y3, Y12
|
|
VPSRAW $11, Y12, Y12
|
|
VPMULLW Y1, Y12, Y12
|
|
VPSUBW Y12, Y3, Y3
|
|
|
|
// store
|
|
VMOVDQU Y10, (DI)
|
|
VMOVDQU Y3, 32(DI)
|
|
VMOVDQU Y4, 64(DI)
|
|
VMOVDQU Y5, 96(DI)
|
|
VMOVDQU Y6, 128(DI)
|
|
VMOVDQU Y7, 160(DI)
|
|
VMOVDQU Y8, 192(DI)
|
|
VMOVDQU Y9, 224(DI)
|
|
|
|
ADDQ $256, DI
|
|
ADDQ $128, R8
|
|
|
|
// second round
|
|
// load
|
|
VMOVDQU (DI), Y4
|
|
VMOVDQU 32(DI), Y5
|
|
VMOVDQU 64(DI), Y6
|
|
VMOVDQU 96(DI), Y7
|
|
VMOVDQU 128(DI), Y8
|
|
VMOVDQU 160(DI), Y9
|
|
VMOVDQU 192(DI), Y10
|
|
VMOVDQU 224(DI), Y11
|
|
|
|
// reorder
|
|
VMOVDQU ·lowdword<>(SB), Y3
|
|
VPAND Y3, Y4, Y12
|
|
VPAND Y3, Y5, Y13
|
|
VPAND Y3, Y6, Y14
|
|
VPAND Y3, Y7, Y15
|
|
VPSRLD $16, Y4, Y4
|
|
VPSRLD $16, Y5, Y5
|
|
VPSRLD $16, Y6, Y6
|
|
VPSRLD $16, Y7, Y7
|
|
VPACKUSDW Y5, Y4, Y5
|
|
VPACKUSDW Y13, Y12, Y4
|
|
VPACKUSDW Y7, Y6, Y7
|
|
VPACKUSDW Y15, Y14, Y6
|
|
VPERMQ invntt_VPERMQ_IDX, Y4, Y4
|
|
VPERMQ invntt_VPERMQ_IDX, Y5, Y5
|
|
VPERMQ invntt_VPERMQ_IDX, Y6, Y6
|
|
VPERMQ invntt_VPERMQ_IDX, Y7, Y7
|
|
VPAND Y3, Y8, Y12
|
|
VPAND Y3, Y9, Y13
|
|
VPAND Y3, Y10, Y14
|
|
VPAND Y3, Y11, Y15
|
|
VPSRLD $16, Y8, Y8
|
|
VPSRLD $16, Y9, Y9
|
|
VPSRLD $16, Y10, Y10
|
|
VPSRLD $16, Y11, Y11
|
|
VPACKUSDW Y9, Y8, Y9
|
|
VPACKUSDW Y13, Y12, Y8
|
|
VPACKUSDW Y11, Y10, Y11
|
|
VPACKUSDW Y15, Y14, Y10
|
|
VPERMQ invntt_VPERMQ_IDX, Y8, Y8
|
|
VPERMQ invntt_VPERMQ_IDX, Y9, Y9
|
|
VPERMQ invntt_VPERMQ_IDX, Y10, Y10
|
|
VPERMQ invntt_VPERMQ_IDX, Y11, Y11
|
|
|
|
// level 0
|
|
// update
|
|
VMOVDQA Y4, Y12
|
|
VMOVDQA Y6, Y13
|
|
VMOVDQA Y8, Y14
|
|
VMOVDQA Y10, Y15
|
|
VPADDW Y4, Y5, Y4
|
|
VPADDW Y6, Y7, Y6
|
|
VPADDW Y8, Y9, Y8
|
|
VPADDW Y10, Y11, Y10
|
|
VPSUBW Y5, Y12, Y5
|
|
VPSUBW Y7, Y13, Y7
|
|
VPSUBW Y9, Y14, Y9
|
|
VPSUBW Y11, Y15, Y11
|
|
|
|
// zetas
|
|
VMOVDQU (R8), Y13
|
|
VMOVDQU 32(R8), Y14
|
|
VMOVDQU 64(R8), Y15
|
|
VMOVDQU 96(R8), Y3
|
|
|
|
// mul
|
|
VPMULLW Y13, Y5, Y12
|
|
VPMULHW Y13, Y5, Y5
|
|
VPMULLW Y14, Y7, Y13
|
|
VPMULHW Y14, Y7, Y7
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
VPMULLW Y3, Y11, Y15
|
|
VPMULHW Y3, Y11, Y11
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y5, Y5
|
|
VPSUBW Y13, Y7, Y7
|
|
VPSUBW Y14, Y9, Y9
|
|
VPSUBW Y15, Y11, Y11
|
|
|
|
// level 1
|
|
// shuffle
|
|
VMOVDQU ·vpshufb_idx<>(SB), Y3
|
|
VPSHUFB Y3, Y4, Y12
|
|
VPSHUFB Y3, Y5, Y13
|
|
VPSHUFB Y3, Y6, Y14
|
|
VPSHUFB Y3, Y7, Y15
|
|
VPBLENDW $0x55, Y4, Y13, Y4
|
|
VPBLENDW $0xAA, Y5, Y12, Y5
|
|
VPBLENDW $0x55, Y6, Y15, Y6
|
|
VPBLENDW $0xAA, Y7, Y14, Y7
|
|
VPSHUFB Y3, Y8, Y12
|
|
VPSHUFB Y3, Y9, Y13
|
|
VPSHUFB Y3, Y10, Y14
|
|
VPSHUFB Y3, Y11, Y15
|
|
VPBLENDW $0x55, Y8, Y13, Y8
|
|
VPBLENDW $0xAA, Y9, Y12, Y9
|
|
VPBLENDW $0x55, Y10, Y15, Y10
|
|
VPBLENDW $0xAA, Y11, Y14, Y11
|
|
|
|
// update
|
|
VMOVDQA Y4, Y12
|
|
VMOVDQA Y6, Y13
|
|
VMOVDQA Y8, Y14
|
|
VMOVDQA Y10, Y15
|
|
VPADDW Y4, Y5, Y4
|
|
VPADDW Y6, Y7, Y6
|
|
VPADDW Y8, Y9, Y8
|
|
VPADDW Y10, Y11, Y10
|
|
VPSUBW Y5, Y12, Y5
|
|
VPSUBW Y7, Y13, Y7
|
|
VPSUBW Y9, Y14, Y9
|
|
VPSUBW Y11, Y15, Y11
|
|
|
|
// zetas
|
|
VMOVDQU 256(R8), Y13
|
|
VMOVDQU 288(R8), Y14
|
|
VMOVDQU 320(R8), Y15
|
|
VMOVDQU 352(R8), Y3
|
|
|
|
// mul
|
|
VPMULLW Y13, Y5, Y12
|
|
VPMULHW Y13, Y5, Y5
|
|
VPMULLW Y14, Y7, Y13
|
|
VPMULHW Y14, Y7, Y7
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
VPMULLW Y3, Y11, Y15
|
|
VPMULHW Y3, Y11, Y11
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y5, Y5
|
|
VPSUBW Y13, Y7, Y7
|
|
VPSUBW Y14, Y9, Y9
|
|
VPSUBW Y15, Y11, Y11
|
|
|
|
// reduce 2
|
|
VPMULHW Y2, Y4, Y12
|
|
VPMULHW Y2, Y6, Y13
|
|
VPMULHW Y2, Y8, Y14
|
|
VPMULHW Y2, Y10, Y15
|
|
VPSRAW $11, Y12, Y12
|
|
VPSRAW $11, Y13, Y13
|
|
VPSRAW $11, Y14, Y14
|
|
VPSRAW $11, Y15, Y15
|
|
VPMULLW Y1, Y12, Y12
|
|
VPMULLW Y1, Y13, Y13
|
|
VPMULLW Y1, Y14, Y14
|
|
VPMULLW Y1, Y15, Y15
|
|
VPSUBW Y12, Y4, Y4
|
|
VPSUBW Y13, Y6, Y6
|
|
VPSUBW Y14, Y8, Y8
|
|
VPSUBW Y15, Y10, Y10
|
|
|
|
// level 2
|
|
// shuffle
|
|
VPSHUFD $0xB1, Y4, Y12
|
|
VPSHUFD $0xB1, Y5, Y13
|
|
VPSHUFD $0xB1, Y6, Y14
|
|
VPSHUFD $0xB1, Y7, Y15
|
|
VPBLENDD $0x55, Y4, Y13, Y4
|
|
VPBLENDD $0xAA, Y5, Y12, Y5
|
|
VPBLENDD $0x55, Y6, Y15, Y6
|
|
VPBLENDD $0xAA, Y7, Y14, Y7
|
|
VPSHUFD $0xB1, Y8, Y12
|
|
VPSHUFD $0xB1, Y9, Y13
|
|
VPSHUFD $0xB1, Y10, Y14
|
|
VPSHUFD $0xB1, Y11, Y15
|
|
VPBLENDD $0x55, Y8, Y13, Y8
|
|
VPBLENDD $0xAA, Y9, Y12, Y9
|
|
VPBLENDD $0x55, Y10, Y15, Y10
|
|
VPBLENDD $0xAA, Y11, Y14, Y11
|
|
|
|
// update
|
|
VMOVDQA Y4, Y12
|
|
VMOVDQA Y6, Y13
|
|
VMOVDQA Y8, Y14
|
|
VMOVDQA Y10, Y15
|
|
VPADDW Y4, Y5, Y4
|
|
VPADDW Y6, Y7, Y6
|
|
VPADDW Y8, Y9, Y8
|
|
VPADDW Y10, Y11, Y10
|
|
VPSUBW Y5, Y12, Y5
|
|
VPSUBW Y7, Y13, Y7
|
|
VPSUBW Y9, Y14, Y9
|
|
VPSUBW Y11, Y15, Y11
|
|
|
|
// zetas
|
|
VMOVDQU 512(R8), Y13
|
|
VMOVDQU 544(R8), Y14
|
|
VMOVDQU 576(R8), Y15
|
|
VMOVDQU 608(R8), Y3
|
|
|
|
// mul
|
|
VPMULLW Y13, Y5, Y12
|
|
VPMULHW Y13, Y5, Y5
|
|
VPMULLW Y14, Y7, Y13
|
|
VPMULHW Y14, Y7, Y7
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
VPMULLW Y3, Y11, Y15
|
|
VPMULHW Y3, Y11, Y11
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y5, Y5
|
|
VPSUBW Y13, Y7, Y7
|
|
VPSUBW Y14, Y9, Y9
|
|
VPSUBW Y15, Y11, Y11
|
|
|
|
// level 3
|
|
// shuffle
|
|
VSHUFPD $0x00, Y5, Y4, Y3
|
|
VSHUFPD $0x0F, Y5, Y4, Y4
|
|
VSHUFPD $0x00, Y7, Y6, Y5
|
|
VSHUFPD $0x0F, Y7, Y6, Y6
|
|
VSHUFPD $0x00, Y9, Y8, Y7
|
|
VSHUFPD $0x0F, Y9, Y8, Y8
|
|
VSHUFPD $0x00, Y11, Y10, Y9
|
|
VSHUFPD $0x0F, Y11, Y10, Y10
|
|
|
|
// update
|
|
VMOVDQA Y3, Y12
|
|
VMOVDQA Y5, Y13
|
|
VMOVDQA Y7, Y14
|
|
VMOVDQA Y9, Y15
|
|
VPADDW Y3, Y4, Y3
|
|
VPADDW Y5, Y6, Y5
|
|
VPADDW Y7, Y8, Y7
|
|
VPADDW Y9, Y10, Y9
|
|
VPSUBW Y4, Y12, Y4
|
|
VPSUBW Y6, Y13, Y6
|
|
VPSUBW Y8, Y14, Y8
|
|
VPSUBW Y10, Y15, Y10
|
|
|
|
// zetas
|
|
VMOVDQU 768(R8), Y12
|
|
VMOVDQU 800(R8), Y13
|
|
VMOVDQU 832(R8), Y14
|
|
VMOVDQU 864(R8), Y15
|
|
|
|
// mul
|
|
VPMULLW Y12, Y4, Y11
|
|
VPMULHW Y12, Y4, Y4
|
|
VPMULLW Y13, Y6, Y12
|
|
VPMULHW Y13, Y6, Y6
|
|
VPMULLW Y14, Y8, Y13
|
|
VPMULHW Y14, Y8, Y8
|
|
VPMULLW Y15, Y10, Y14
|
|
VPMULHW Y15, Y10, Y10
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y11, Y11
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y11, Y4, Y4
|
|
VPSUBW Y12, Y6, Y6
|
|
VPSUBW Y13, Y8, Y8
|
|
VPSUBW Y14, Y10, Y10
|
|
|
|
// reduce 2
|
|
VPMULHW Y2, Y3, Y12
|
|
VPMULHW Y2, Y5, Y13
|
|
VPMULHW Y2, Y7, Y14
|
|
VPMULHW Y2, Y9, Y15
|
|
VPSRAW $11, Y12, Y12
|
|
VPSRAW $11, Y13, Y13
|
|
VPSRAW $11, Y14, Y14
|
|
VPSRAW $11, Y15, Y15
|
|
VPMULLW Y1, Y12, Y12
|
|
VPMULLW Y1, Y13, Y13
|
|
VPMULLW Y1, Y14, Y14
|
|
VPMULLW Y1, Y15, Y15
|
|
VPSUBW Y12, Y3, Y3
|
|
VPSUBW Y13, Y5, Y5
|
|
VPSUBW Y14, Y7, Y7
|
|
VPSUBW Y15, Y9, Y9
|
|
|
|
// level 4
|
|
// shuffle
|
|
VPERM2I128 $0x02, Y3, Y4, Y11
|
|
VPERM2I128 $0x13, Y3, Y4, Y3
|
|
VPERM2I128 $0x02, Y5, Y6, Y4
|
|
VPERM2I128 $0x13, Y5, Y6, Y5
|
|
VPERM2I128 $0x02, Y7, Y8, Y6
|
|
VPERM2I128 $0x13, Y7, Y8, Y7
|
|
VPERM2I128 $0x02, Y9, Y10, Y8
|
|
VPERM2I128 $0x13, Y9, Y10, Y9
|
|
|
|
// update
|
|
VMOVDQA Y11, Y12
|
|
VMOVDQA Y4, Y13
|
|
VMOVDQA Y6, Y14
|
|
VMOVDQA Y8, Y15
|
|
VPADDW Y11, Y3, Y10
|
|
VPADDW Y4, Y5, Y4
|
|
VPADDW Y6, Y7, Y6
|
|
VPADDW Y8, Y9, Y8
|
|
VPSUBW Y3, Y12, Y3
|
|
VPSUBW Y5, Y13, Y5
|
|
VPSUBW Y7, Y14, Y7
|
|
VPSUBW Y9, Y15, Y9
|
|
|
|
// zetas
|
|
VMOVDQU 1024(R8), Y12
|
|
VMOVDQU 1056(R8), Y13
|
|
VMOVDQU 1088(R8), Y14
|
|
VMOVDQU 1120(R8), Y15
|
|
|
|
// mul
|
|
VPMULLW Y12, Y3, Y11
|
|
VPMULHW Y12, Y3, Y3
|
|
VPMULLW Y13, Y5, Y12
|
|
VPMULHW Y13, Y5, Y5
|
|
VPMULLW Y14, Y7, Y13
|
|
VPMULHW Y14, Y7, Y7
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y11, Y11
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y11, Y3, Y3
|
|
VPSUBW Y12, Y5, Y5
|
|
VPSUBW Y13, Y7, Y7
|
|
VPSUBW Y14, Y9, Y9
|
|
|
|
// level 5
|
|
// update
|
|
VMOVDQA Y10, Y12
|
|
VMOVDQA Y3, Y13
|
|
VMOVDQA Y6, Y14
|
|
VMOVDQA Y7, Y15
|
|
VPADDW Y10, Y4, Y10
|
|
VPADDW Y3, Y5, Y3
|
|
VPADDW Y6, Y8, Y6
|
|
VPADDW Y7, Y9, Y7
|
|
VPSUBW Y4, Y12, Y4
|
|
VPSUBW Y5, Y13, Y5
|
|
VPSUBW Y8, Y14, Y8
|
|
VPSUBW Y9, Y15, Y9
|
|
|
|
// zetas
|
|
VMOVDQU 1344(SI), Y14
|
|
VMOVDQU 1376(SI), Y15
|
|
|
|
// mul
|
|
VPMULLW Y14, Y4, Y11
|
|
VPMULLW Y14, Y5, Y12
|
|
VPMULLW Y15, Y8, Y13
|
|
VPMULHW Y14, Y4, Y4
|
|
VPMULHW Y14, Y5, Y5
|
|
VPMULHW Y15, Y8, Y8
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y9, Y9
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y11, Y11
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y11, Y4, Y4
|
|
VPSUBW Y12, Y5, Y5
|
|
VPSUBW Y13, Y8, Y8
|
|
VPSUBW Y14, Y9, Y9
|
|
|
|
// reduce 2
|
|
VPMULHW Y2, Y10, Y12
|
|
VPMULHW Y2, Y6, Y13
|
|
VPSRAW $11, Y12, Y12
|
|
VPSRAW $11, Y13, Y13
|
|
VPMULLW Y1, Y12, Y12
|
|
VPMULLW Y1, Y13, Y13
|
|
VPSUBW Y12, Y10, Y10
|
|
VPSUBW Y13, Y6, Y6
|
|
|
|
// level 6
|
|
// update
|
|
VMOVDQA Y10, Y12
|
|
VMOVDQA Y3, Y13
|
|
VMOVDQA Y4, Y14
|
|
VMOVDQA Y5, Y15
|
|
VPADDW Y10, Y6, Y10
|
|
VPADDW Y3, Y7, Y3
|
|
VPADDW Y4, Y8, Y4
|
|
VPADDW Y5, Y9, Y5
|
|
VPSUBW Y6, Y12, Y6
|
|
VPSUBW Y7, Y13, Y7
|
|
VPSUBW Y8, Y14, Y8
|
|
VPSUBW Y9, Y15, Y9
|
|
|
|
// zetas
|
|
VMOVDQU 1440(SI), Y15
|
|
|
|
// mul
|
|
VPMULLW Y15, Y6, Y11
|
|
VPMULLW Y15, Y7, Y12
|
|
VPMULLW Y15, Y8, Y13
|
|
VPMULLW Y15, Y9, Y14
|
|
VPMULHW Y15, Y6, Y6
|
|
VPMULHW Y15, Y7, Y7
|
|
VPMULHW Y15, Y8, Y8
|
|
VPMULHW Y15, Y9, Y9
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y11, Y11
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULHW Y1, Y11, Y11
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPSUBW Y11, Y6, Y6
|
|
VPSUBW Y12, Y7, Y7
|
|
VPSUBW Y13, Y8, Y8
|
|
VPSUBW Y14, Y9, Y9
|
|
|
|
// reduce 2
|
|
VPMULHW Y2, Y3, Y12
|
|
VPSRAW $11, Y12, Y12
|
|
VPMULLW Y1, Y12, Y12
|
|
VPSUBW Y12, Y3, Y3
|
|
|
|
// store
|
|
VMOVDQU Y10, (DI)
|
|
VMOVDQU Y3, 32(DI)
|
|
VMOVDQU Y4, 64(DI)
|
|
VMOVDQU Y5, 96(DI)
|
|
VMOVDQU Y6, 128(DI)
|
|
VMOVDQU Y7, 160(DI)
|
|
VMOVDQU Y8, 192(DI)
|
|
VMOVDQU Y9, 224(DI)
|
|
|
|
SUBQ $256, DI
|
|
|
|
// f
|
|
VMOVDQU ·f_x16<>(SB), Y2
|
|
|
|
// first round
|
|
// load
|
|
VMOVDQU (DI), Y4
|
|
VMOVDQU 32(DI), Y5
|
|
VMOVDQU 64(DI), Y6
|
|
VMOVDQU 96(DI), Y7
|
|
VMOVDQU 256(DI), Y8
|
|
VMOVDQU 288(DI), Y9
|
|
VMOVDQU 320(DI), Y10
|
|
VMOVDQU 352(DI), Y11
|
|
|
|
// level 7
|
|
// update
|
|
VMOVDQA Y4, Y12
|
|
VMOVDQA Y5, Y13
|
|
VMOVDQA Y6, Y14
|
|
VMOVDQA Y7, Y15
|
|
VPADDW Y4, Y8, Y4
|
|
VPADDW Y5, Y9, Y5
|
|
VPADDW Y6, Y10, Y6
|
|
VPADDW Y7, Y11, Y7
|
|
VPSUBW Y8, Y12, Y8
|
|
VPSUBW Y9, Y13, Y9
|
|
VPSUBW Y10, Y14, Y10
|
|
VPSUBW Y11, Y15, Y11
|
|
|
|
// zeta
|
|
VMOVDQU 1472(SI), Y3
|
|
|
|
// mul
|
|
VPMULLW Y3, Y8, Y12
|
|
VPMULLW Y3, Y9, Y13
|
|
VPMULLW Y3, Y10, Y14
|
|
VPMULLW Y3, Y11, Y15
|
|
VPMULHW Y3, Y8, Y8
|
|
VPMULHW Y3, Y9, Y9
|
|
VPMULHW Y3, Y10, Y10
|
|
VPMULHW Y3, Y11, Y11
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y8, Y8
|
|
VPSUBW Y13, Y9, Y9
|
|
VPSUBW Y14, Y10, Y10
|
|
VPSUBW Y15, Y11, Y11
|
|
VPADDW Y1, Y8, Y8
|
|
VPADDW Y1, Y9, Y9
|
|
VPADDW Y1, Y10, Y10
|
|
VPADDW Y1, Y11, Y11
|
|
|
|
// mul
|
|
VPMULLW Y2, Y4, Y12
|
|
VPMULLW Y2, Y5, Y13
|
|
VPMULLW Y2, Y6, Y14
|
|
VPMULLW Y2, Y7, Y15
|
|
VPMULHW Y2, Y4, Y4
|
|
VPMULHW Y2, Y5, Y5
|
|
VPMULHW Y2, Y6, Y6
|
|
VPMULHW Y2, Y7, Y7
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y4, Y4
|
|
VPSUBW Y13, Y5, Y5
|
|
VPSUBW Y14, Y6, Y6
|
|
VPSUBW Y15, Y7, Y7
|
|
VPADDW Y1, Y4, Y4
|
|
VPADDW Y1, Y5, Y5
|
|
VPADDW Y1, Y6, Y6
|
|
VPADDW Y1, Y7, Y7
|
|
|
|
// store
|
|
VMOVDQU Y4, (DI)
|
|
VMOVDQU Y5, 32(DI)
|
|
VMOVDQU Y6, 64(DI)
|
|
VMOVDQU Y7, 96(DI)
|
|
VMOVDQU Y8, 256(DI)
|
|
VMOVDQU Y9, 288(DI)
|
|
VMOVDQU Y10, 320(DI)
|
|
VMOVDQU Y11, 352(DI)
|
|
|
|
ADDQ $128, DI
|
|
|
|
// second round
|
|
// load
|
|
VMOVDQU (DI), Y4
|
|
VMOVDQU 32(DI), Y5
|
|
VMOVDQU 64(DI), Y6
|
|
VMOVDQU 96(DI), Y7
|
|
VMOVDQU 256(DI), Y8
|
|
VMOVDQU 288(DI), Y9
|
|
VMOVDQU 320(DI), Y10
|
|
VMOVDQU 352(DI), Y11
|
|
|
|
// zeta
|
|
VMOVDQU 1472(SI), Y3
|
|
|
|
// level 7
|
|
// update
|
|
VMOVDQA Y4, Y12
|
|
VMOVDQA Y5, Y13
|
|
VMOVDQA Y6, Y14
|
|
VMOVDQA Y7, Y15
|
|
VPADDW Y4, Y8, Y4
|
|
VPADDW Y5, Y9, Y5
|
|
VPADDW Y6, Y10, Y6
|
|
VPADDW Y7, Y11, Y7
|
|
VPSUBW Y8, Y12, Y8
|
|
VPSUBW Y9, Y13, Y9
|
|
VPSUBW Y10, Y14, Y10
|
|
VPSUBW Y11, Y15, Y11
|
|
|
|
// mul
|
|
VPMULLW Y3, Y8, Y12
|
|
VPMULLW Y3, Y9, Y13
|
|
VPMULLW Y3, Y10, Y14
|
|
VPMULLW Y3, Y11, Y15
|
|
VPMULHW Y3, Y8, Y8
|
|
VPMULHW Y3, Y9, Y9
|
|
VPMULHW Y3, Y10, Y10
|
|
VPMULHW Y3, Y11, Y11
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y8, Y8
|
|
VPSUBW Y13, Y9, Y9
|
|
VPSUBW Y14, Y10, Y10
|
|
VPSUBW Y15, Y11, Y11
|
|
VPADDW Y1, Y8, Y8
|
|
VPADDW Y1, Y9, Y9
|
|
VPADDW Y1, Y10, Y10
|
|
VPADDW Y1, Y11, Y11
|
|
|
|
// mul
|
|
VPMULLW Y2, Y4, Y12
|
|
VPMULLW Y2, Y5, Y13
|
|
VPMULLW Y2, Y6, Y14
|
|
VPMULLW Y2, Y7, Y15
|
|
VPMULHW Y2, Y4, Y4
|
|
VPMULHW Y2, Y5, Y5
|
|
VPMULHW Y2, Y6, Y6
|
|
VPMULHW Y2, Y7, Y7
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y12, Y12
|
|
VPMULLW Y0, Y13, Y13
|
|
VPMULLW Y0, Y14, Y14
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULHW Y1, Y12, Y12
|
|
VPMULHW Y1, Y13, Y13
|
|
VPMULHW Y1, Y14, Y14
|
|
VPMULHW Y1, Y15, Y15
|
|
VPSUBW Y12, Y4, Y4
|
|
VPSUBW Y13, Y5, Y5
|
|
VPSUBW Y14, Y6, Y6
|
|
VPSUBW Y15, Y7, Y7
|
|
VPADDW Y1, Y4, Y4
|
|
VPADDW Y1, Y5, Y5
|
|
VPADDW Y1, Y6, Y6
|
|
VPADDW Y1, Y7, Y7
|
|
|
|
// store
|
|
VMOVDQU Y4, (DI)
|
|
VMOVDQU Y5, 32(DI)
|
|
VMOVDQU Y6, 64(DI)
|
|
VMOVDQU Y7, 96(DI)
|
|
VMOVDQU Y8, 256(DI)
|
|
VMOVDQU Y9, 288(DI)
|
|
VMOVDQU Y10, 320(DI)
|
|
VMOVDQU Y11, 352(DI)
|
|
|
|
VZEROUPPER
|
|
RET
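
// The pointwiseAccK{2,3,4}AVX2 routines below take a and b as vectors of
// pointers, one *uint16 per polynomial of the polyvec, as described in the
// conversion notes at the top of this file. An illustrative Go-side sketch of
// that calling convention for K = 3 (the polyvec layout and the wrapper shown
// here are assumptions, not this package's actual API):
//
//	func pointwiseAccK3(dst *[256]uint16, a, b *[3][256]uint16) {
//		var ap, bp [3]*uint16
//		for i := 0; i < 3; i++ {
//			ap[i] = &a[i][0] // one pointer per polynomial
//			bp[i] = &b[i][0]
//		}
//		pointwiseAccK3AVX2(&dst[0], &ap[0], &bp[0])
//	}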

// func pointwiseAccK2AVX2(dst *uint16, a, b **uint16)
TEXT ·pointwiseAccK2AVX2(SB), NOSPLIT, $0-24
|
|
MOVQ dst+0(FP), DI
|
|
MOVQ a+8(FP), SI
|
|
MOVQ b+16(FP), DX
|
|
|
|
VMOVDQU ·qinv_x16<>(SB), Y0
|
|
VMOVDQU ·q_x16<>(SB), Y1
|
|
VMOVDQU ·montsq_x16<>(SB), Y2
|
|
|
|
XORQ AX, AX
|
|
XORQ BX, BX
|
|
|
|
MOVQ 8(SI), R8 // a[1]
|
|
MOVQ (SI), SI // a[0]
|
|
MOVQ 8(DX), R11 // b[1]
|
|
MOVQ (DX), DX // b[0]
|
|
|
|
looptop2:
|
|
// load a
|
|
VMOVDQU (SI)(BX*1), Y4
|
|
VMOVDQU 32(SI)(BX*1), Y5
|
|
VMOVDQU 64(SI)(BX*1), Y6
|
|
VMOVDQU (R8)(BX*1), Y7
|
|
VMOVDQU 32(R8)(BX*1), Y8
|
|
VMOVDQU 64(R8)(BX*1), Y9
|
|
|
|
// mul montsq
|
|
VPMULLW Y2, Y4, Y3
|
|
VPMULHW Y2, Y4, Y10
|
|
VPMULLW Y2, Y5, Y4
|
|
VPMULHW Y2, Y5, Y11
|
|
VPMULLW Y2, Y6, Y5
|
|
VPMULHW Y2, Y6, Y12
|
|
VPMULLW Y2, Y7, Y6
|
|
VPMULHW Y2, Y7, Y13
|
|
VPMULLW Y2, Y8, Y7
|
|
VPMULHW Y2, Y8, Y14
|
|
VPMULLW Y2, Y9, Y8
|
|
VPMULHW Y2, Y9, Y15
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y3, Y3
|
|
VPMULLW Y0, Y4, Y4
|
|
VPMULLW Y0, Y5, Y5
|
|
VPMULLW Y0, Y6, Y6
|
|
VPMULLW Y0, Y7, Y7
|
|
VPMULLW Y0, Y8, Y8
|
|
VPMULHW Y1, Y3, Y3
|
|
VPMULHW Y1, Y4, Y4
|
|
VPMULHW Y1, Y5, Y5
|
|
VPMULHW Y1, Y6, Y6
|
|
VPMULHW Y1, Y7, Y7
|
|
VPMULHW Y1, Y8, Y8
|
|
VPSUBW Y3, Y10, Y3
|
|
VPSUBW Y4, Y11, Y4
|
|
VPSUBW Y5, Y12, Y5
|
|
VPSUBW Y6, Y13, Y6
|
|
VPSUBW Y7, Y14, Y7
|
|
VPSUBW Y8, Y15, Y8
|
|
|
|
// load b
|
|
VMOVDQU (DX)(BX*1), Y9
|
|
VMOVDQU 32(DX)(BX*1), Y10
|
|
VMOVDQU 64(DX)(BX*1), Y11
|
|
VMOVDQU (R11)(BX*1), Y12
|
|
VMOVDQU 32(R11)(BX*1), Y13
|
|
VMOVDQU 64(R11)(BX*1), Y14
|
|
|
|
// mul
|
|
VPMULLW Y3, Y9, Y15
|
|
VPMULHW Y3, Y9, Y9
|
|
VPMULLW Y4, Y10, Y3
|
|
VPMULHW Y4, Y10, Y10
|
|
VPMULLW Y5, Y11, Y4
|
|
VPMULHW Y5, Y11, Y11
|
|
VPMULLW Y6, Y12, Y5
|
|
VPMULHW Y6, Y12, Y12
|
|
VPMULLW Y7, Y13, Y6
|
|
VPMULHW Y7, Y13, Y13
|
|
VPMULLW Y8, Y14, Y7
|
|
VPMULHW Y8, Y14, Y14
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULLW Y0, Y3, Y3
|
|
VPMULLW Y0, Y4, Y4
|
|
VPMULLW Y0, Y5, Y5
|
|
VPMULLW Y0, Y6, Y6
|
|
VPMULLW Y0, Y7, Y7
|
|
VPMULHW Y1, Y15, Y15
|
|
VPMULHW Y1, Y3, Y3
|
|
VPMULHW Y1, Y4, Y4
|
|
VPMULHW Y1, Y5, Y5
|
|
VPMULHW Y1, Y6, Y6
|
|
VPMULHW Y1, Y7, Y7
|
|
VPSUBW Y15, Y9, Y15
|
|
VPSUBW Y3, Y10, Y3
|
|
VPSUBW Y4, Y11, Y4
|
|
VPSUBW Y5, Y12, Y5
|
|
VPSUBW Y6, Y13, Y6
|
|
VPSUBW Y7, Y14, Y7
|
|
|
|
// add
|
|
VPADDW Y15, Y5, Y5
|
|
VPADDW Y3, Y6, Y6
|
|
VPADDW Y4, Y7, Y7
|
|
|
|
// reduce 2
|
|
VMOVDQU ·v_x16<>(SB), Y3
|
|
VPMULHW Y3, Y5, Y8
|
|
VPMULHW Y3, Y6, Y9
|
|
VPMULHW Y3, Y7, Y10
|
|
VPSRAW $11, Y8, Y8
|
|
VPSRAW $11, Y9, Y9
|
|
VPSRAW $11, Y10, Y10
|
|
VPMULLW Y1, Y8, Y8
|
|
VPMULLW Y1, Y9, Y9
|
|
VPMULLW Y1, Y10, Y10
|
|
VPSUBW Y8, Y5, Y5
|
|
VPSUBW Y9, Y6, Y6
|
|
VPSUBW Y10, Y7, Y7
|
|
|
|
// store
|
|
VMOVDQU Y5, (DI)(BX*1)
|
|
VMOVDQU Y6, 32(DI)(BX*1)
|
|
VMOVDQU Y7, 64(DI)(BX*1)
|
|
|
|
ADDQ $1, AX
|
|
ADDQ $96, BX
|
|
CMPQ AX, $5
|
|
JB looptop2
|
|
|
|
// load
|
|
VMOVDQU (SI)(BX*1), Y4
|
|
VMOVDQU (R8)(BX*1), Y7
|
|
VMOVDQU (DX)(BX*1), Y9
|
|
VMOVDQU (R11)(BX*1), Y12
|
|
|
|
// mul montsq
|
|
VPMULLW Y2, Y4, Y3
|
|
VPMULHW Y2, Y4, Y10
|
|
VPMULLW Y2, Y7, Y6
|
|
VPMULHW Y2, Y7, Y13
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y3, Y3
|
|
VPMULLW Y0, Y6, Y6
|
|
VPMULHW Y1, Y3, Y3
|
|
VPMULHW Y1, Y6, Y6
|
|
VPSUBW Y3, Y10, Y3
|
|
VPSUBW Y6, Y13, Y6
|
|
|
|
// mul
|
|
VPMULLW Y3, Y9, Y15
|
|
VPMULHW Y3, Y9, Y9
|
|
VPMULLW Y6, Y12, Y5
|
|
VPMULHW Y6, Y12, Y12
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULLW Y0, Y5, Y5
|
|
VPMULHW Y1, Y15, Y15
|
|
VPMULHW Y1, Y5, Y5
|
|
VPSUBW Y15, Y9, Y15
|
|
VPSUBW Y5, Y12, Y5
|
|
|
|
// add
|
|
VPADDW Y15, Y5, Y5
|
|
|
|
// reduce 2
|
|
VMOVDQU ·v_x16<>(SB), Y3
|
|
VPMULHW Y3, Y5, Y8
|
|
VPSRAW $11, Y8, Y8
|
|
VPMULLW Y1, Y8, Y8
|
|
VPSUBW Y8, Y5, Y5
|
|
|
|
// store
|
|
VMOVDQU Y5, (DI)(BX*1)
|
|
|
|
VZEROUPPER
|
|
RET
|
|
|
|
// func pointwiseAccK3AVX2(dst *uint16, a, b **uint16)
|
|
TEXT ·pointwiseAccK3AVX2(SB), NOSPLIT, $0-24
|
|
MOVQ dst+0(FP), DI
|
|
MOVQ a+8(FP), SI
|
|
MOVQ b+16(FP), DX
|
|
|
|
VMOVDQU ·qinv_x16<>(SB), Y0
|
|
VMOVDQU ·q_x16<>(SB), Y1
|
|
VMOVDQU ·montsq_x16<>(SB), Y2
|
|
|
|
XORQ AX, AX
|
|
XORQ BX, BX
|
|
|
|
MOVQ (16)(SI), R9 // a[2]
|
|
MOVQ 8(SI), R8 // a[1]
|
|
MOVQ (SI), SI // a[0]
|
|
MOVQ 16(DX), R12 // b[2]
|
|
MOVQ 8(DX), R11 // b[1]
|
|
MOVQ (DX), DX // b[0]
|
|
|
|
looptop3:
|
|
// load a
|
|
VMOVDQU (SI)(BX*1), Y4
|
|
VMOVDQU 32(SI)(BX*1), Y5
|
|
VMOVDQU (R8)(BX*1), Y6
|
|
VMOVDQU 32(R8)(BX*1), Y7
|
|
VMOVDQU (R9)(BX*1), Y8
|
|
VMOVDQU 32(R9)(BX*1), Y9
|
|
|
|
// mul montsq
|
|
VPMULLW Y2, Y4, Y3
|
|
VPMULHW Y2, Y4, Y10
|
|
VPMULLW Y2, Y5, Y4
|
|
VPMULHW Y2, Y5, Y11
|
|
VPMULLW Y2, Y6, Y5
|
|
VPMULHW Y2, Y6, Y12
|
|
VPMULLW Y2, Y7, Y6
|
|
VPMULHW Y2, Y7, Y13
|
|
VPMULLW Y2, Y8, Y7
|
|
VPMULHW Y2, Y8, Y14
|
|
VPMULLW Y2, Y9, Y8
|
|
VPMULHW Y2, Y9, Y15
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y3, Y3
|
|
VPMULLW Y0, Y4, Y4
|
|
VPMULLW Y0, Y5, Y5
|
|
VPMULLW Y0, Y6, Y6
|
|
VPMULLW Y0, Y7, Y7
|
|
VPMULLW Y0, Y8, Y8
|
|
VPMULHW Y1, Y3, Y3
|
|
VPMULHW Y1, Y4, Y4
|
|
VPMULHW Y1, Y5, Y5
|
|
VPMULHW Y1, Y6, Y6
|
|
VPMULHW Y1, Y7, Y7
|
|
VPMULHW Y1, Y8, Y8
|
|
VPSUBW Y3, Y10, Y3
|
|
VPSUBW Y4, Y11, Y4
|
|
VPSUBW Y5, Y12, Y5
|
|
VPSUBW Y6, Y13, Y6
|
|
VPSUBW Y7, Y14, Y7
|
|
VPSUBW Y8, Y15, Y8
|
|
|
|
// load b
|
|
VMOVDQU (DX)(BX*1), Y9
|
|
VMOVDQU 32(DX)(BX*1), Y10
|
|
VMOVDQU (R11)(BX*1), Y11
|
|
VMOVDQU 32(R11)(BX*1), Y12
|
|
VMOVDQU (R12)(BX*1), Y13
|
|
VMOVDQU 32(R12)(BX*1), Y14
|
|
|
|
// mul
|
|
VPMULLW Y3, Y9, Y15
|
|
VPMULHW Y3, Y9, Y9
|
|
VPMULLW Y4, Y10, Y3
|
|
VPMULHW Y4, Y10, Y10
|
|
VPMULLW Y5, Y11, Y4
|
|
VPMULHW Y5, Y11, Y11
|
|
VPMULLW Y6, Y12, Y5
|
|
VPMULHW Y6, Y12, Y12
|
|
VPMULLW Y7, Y13, Y6
|
|
VPMULHW Y7, Y13, Y13
|
|
VPMULLW Y8, Y14, Y7
|
|
VPMULHW Y8, Y14, Y14
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y15, Y15
|
|
VPMULLW Y0, Y3, Y3
|
|
VPMULLW Y0, Y4, Y4
|
|
VPMULLW Y0, Y5, Y5
|
|
VPMULLW Y0, Y6, Y6
|
|
VPMULLW Y0, Y7, Y7
|
|
VPMULHW Y1, Y15, Y15
|
|
VPMULHW Y1, Y3, Y3
|
|
VPMULHW Y1, Y4, Y4
|
|
VPMULHW Y1, Y5, Y5
|
|
VPMULHW Y1, Y6, Y6
|
|
VPMULHW Y1, Y7, Y7
|
|
VPSUBW Y15, Y9, Y15
|
|
VPSUBW Y3, Y10, Y3
|
|
VPSUBW Y4, Y11, Y4
|
|
VPSUBW Y5, Y12, Y5
|
|
VPSUBW Y6, Y13, Y6
|
|
VPSUBW Y7, Y14, Y7
|
|
|
|
// add
|
|
VPADDW Y15, Y4, Y4
|
|
VPADDW Y3, Y5, Y5
|
|
VPADDW Y4, Y6, Y6
|
|
VPADDW Y5, Y7, Y7
|
|
|
|
// reduce 2
|
|
VMOVDQU ·v_x16<>(SB), Y3
|
|
VPMULHW Y3, Y6, Y8
|
|
VPMULHW Y3, Y7, Y9
|
|
VPSRAW $11, Y8, Y8
|
|
VPSRAW $11, Y9, Y9
|
|
VPMULLW Y1, Y8, Y8
|
|
VPMULLW Y1, Y9, Y9
|
|
VPSUBW Y8, Y6, Y6
|
|
VPSUBW Y9, Y7, Y7
|
|
|
|
// store
|
|
VMOVDQU Y6, (DI)(BX*1)
|
|
VMOVDQU Y7, 32(DI)(BX*1)
|
|
|
|
ADDQ $1, AX
|
|
ADDQ $64, BX
|
|
CMPQ AX, $8
|
|
JB looptop3
|
|
|
|
VZEROUPPER
|
|
RET
|
|
|
|
// func pointwiseAccK4AVX2(dst *uint16, a, b **uint16)
|
|
TEXT ·pointwiseAccK4AVX2(SB), NOSPLIT, $0-24
|
|
MOVQ dst+0(FP), DI
|
|
MOVQ a+8(FP), SI
|
|
MOVQ b+16(FP), DX
|
|
|
|
VMOVDQU ·qinv_x16<>(SB), Y0
|
|
VMOVDQU ·q_x16<>(SB), Y1
|
|
VMOVDQU ·montsq_x16<>(SB), Y2
|
|
VMOVDQU ·v_x16<>(SB), Y3
|
|
|
|
XORQ AX, AX
|
|
XORQ BX, BX
|
|
|
|
MOVQ 24(SI), R10 // a[3]
|
|
MOVQ 16(SI), R9 // a[2]
|
|
MOVQ 8(SI), R8 // a[1]
|
|
MOVQ (SI), SI // a[0]
|
|
MOVQ 24(DX), R13 // b[3]
|
|
MOVQ 16(DX), R12 // b[2]
|
|
MOVQ 8(DX), R11 // b[1]
|
|
MOVQ (DX), DX // b[0]
|
|
|
|
looptop4:
|
|
// load a
|
|
VMOVDQU (SI)(BX*1), Y6
|
|
VMOVDQU (R8)(BX*1), Y7
|
|
VMOVDQU (R9)(BX*1), Y8
|
|
VMOVDQU (R10)(BX*1), Y9
|
|
|
|
// mul montsq
|
|
VPMULLW Y2, Y6, Y5
|
|
VPMULHW Y2, Y6, Y10
|
|
VPMULLW Y2, Y7, Y6
|
|
VPMULHW Y2, Y7, Y11
|
|
VPMULLW Y2, Y8, Y7
|
|
VPMULHW Y2, Y8, Y12
|
|
VPMULLW Y2, Y9, Y8
|
|
VPMULHW Y2, Y9, Y13
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y5, Y5
|
|
VPMULLW Y0, Y6, Y6
|
|
VPMULLW Y0, Y7, Y7
|
|
VPMULLW Y0, Y8, Y8
|
|
VPMULHW Y1, Y5, Y5
|
|
VPMULHW Y1, Y6, Y6
|
|
VPMULHW Y1, Y7, Y7
|
|
VPMULHW Y1, Y8, Y8
|
|
VPSUBW Y5, Y10, Y5
|
|
VPSUBW Y6, Y11, Y6
|
|
VPSUBW Y7, Y12, Y7
|
|
VPSUBW Y8, Y13, Y8
|
|
|
|
// load b
|
|
VMOVDQU (DX)(BX*1), Y9
|
|
VMOVDQU (R11)(BX*1), Y10
|
|
VMOVDQU (R12)(BX*1), Y11
|
|
VMOVDQU (R13)(BX*1), Y12
|
|
|
|
// mul
|
|
VPMULLW Y5, Y9, Y4
|
|
VPMULHW Y5, Y9, Y9
|
|
VPMULLW Y6, Y10, Y5
|
|
VPMULHW Y6, Y10, Y10
|
|
VPMULLW Y7, Y11, Y6
|
|
VPMULHW Y7, Y11, Y11
|
|
VPMULLW Y8, Y12, Y7
|
|
VPMULHW Y8, Y12, Y12
|
|
|
|
// reduce
|
|
VPMULLW Y0, Y4, Y4
|
|
VPMULLW Y0, Y5, Y5
|
|
VPMULLW Y0, Y6, Y6
|
|
VPMULLW Y0, Y7, Y7
|
|
VPMULHW Y1, Y4, Y4
|
|
VPMULHW Y1, Y5, Y5
|
|
VPMULHW Y1, Y6, Y6
|
|
VPMULHW Y1, Y7, Y7
|
|
VPSUBW Y4, Y9, Y4
|
|
VPSUBW Y5, Y10, Y5
|
|
VPSUBW Y6, Y11, Y6
|
|
VPSUBW Y7, Y12, Y7
|
|
|
|
// add
|
|
VPADDW Y4, Y5, Y5
|
|
VPADDW Y5, Y6, Y6
|
|
VPADDW Y6, Y7, Y7
|
|
|
|
// reduce 2
|
|
VPMULHW Y3, Y7, Y8
|
|
VPSRAW $11, Y8, Y8
|
|
VPMULLW Y1, Y8, Y8
|
|
VPSUBW Y8, Y7, Y8
|
|
|
|
// store
|
|
VMOVDQU Y8, (DI)(BX*1)
|
|
|
|
ADDQ $1, AX
|
|
ADDQ $32, BX
|
|
CMPQ AX, $16
|
|
JB looptop4
|
|
|
|
VZEROUPPER
|
|
RET
|
|
|
|
// func cbdEta4AVX2(dst *uint16, b *byte)
|
|
TEXT ·cbdEta4AVX2(SB), NOSPLIT, $0-16
|
|
MOVQ dst+0(FP), DI
|
|
MOVQ b+8(FP), SI
|
|
|
|
VMOVDQU ·mask11<>(SB), Y0
|
|
VMOVDQU ·mask0f<>(SB), Y1
|
|
VMOVDQU ·q_x16<>(SB), Y2
|
|
|
|
MOVQ $256, DX
|
|
|
|
looptop:
|
|
VMOVUPD 0(SI), Y3
|
|
VPAND Y3, Y0, Y4
|
|
VPSRLW $1, Y3, Y3
|
|
VPAND Y3, Y0, Y5
|
|
VPADDB Y5, Y4, Y4
|
|
VPSRLW $1, Y3, Y3
|
|
VPAND Y3, Y0, Y5
|
|
VPADDB Y5, Y4, Y4
|
|
VPSRLW $1, Y3, Y3
|
|
VPAND Y3, Y0, Y3
|
|
VPADDB Y3, Y4, Y3
|
|
VPSRLW $4, Y3, Y4
|
|
VPAND Y3, Y1, Y3
|
|
VPAND Y4, Y1, Y4
|
|
VPSUBB Y4, Y3, Y3
|
|
VPMOVSXBW X3, Y4
|
|
VPADDW Y2, Y4, Y4
|
|
VMOVUPD Y4, 0(DI)
|
|
VPERM2F128 $0x21, Y3, Y3, Y3
|
|
VPMOVSXBW X3, Y4
|
|
VPADDW Y2, Y4, Y4
|
|
VMOVUPD Y4, 32(DI)
|
|
|
|
ADDQ $64, DI
|
|
ADDQ $32, SI
|
|
SUBQ $32, DX
|
|
JA looptop
|
|
|
|
VZEROUPPER
|
|
RET
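
// cbdEta4AVX2 above turns each input byte into one coefficient drawn from a
// centered binomial distribution with eta = 4, offset by q: the popcount of
// the low nibble minus the popcount of the high nibble, plus q. A scalar Go
// sketch of the per-byte mapping (illustrative only; the helper name is not
// from this package):
//
//	import "math/bits"
//
//	// cbdEta4Coeff maps one byte to one coefficient in [q-4, q+4].
//	func cbdEta4Coeff(b byte) uint16 {
//		const q = 7681
//		lo := bits.OnesCount8(b & 0x0f)
//		hi := bits.OnesCount8(b >> 4)
//		return uint16(q + lo - hi)
//	}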
|